0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (C) 2010 Red Hat, Inc.
0004  * Copyright (C) 2016-2019 Christoph Hellwig.
0005  */
0006 #include <linux/module.h>
0007 #include <linux/compiler.h>
0008 #include <linux/fs.h>
0009 #include <linux/iomap.h>
0010 #include <linux/pagemap.h>
0011 #include <linux/uio.h>
0012 #include <linux/buffer_head.h>
0013 #include <linux/dax.h>
0014 #include <linux/writeback.h>
0015 #include <linux/list_sort.h>
0016 #include <linux/swap.h>
0017 #include <linux/bio.h>
0018 #include <linux/sched/signal.h>
0019 #include <linux/migrate.h>
0020 #include "trace.h"
0021 
0022 #include "../internal.h"
0023 
0024 #define IOEND_BATCH_SIZE    4096
0025 
0026 /*
0027  * Structure allocated for each folio when block size < folio size
0028  * to track sub-folio uptodate status and I/O completions.
0029  */
0030 struct iomap_page {
0031     atomic_t        read_bytes_pending;
0032     atomic_t        write_bytes_pending;
0033     spinlock_t      uptodate_lock;
0034     unsigned long       uptodate[];
0035 };
0036 
0037 static inline struct iomap_page *to_iomap_page(struct folio *folio)
0038 {
0039     if (folio_test_private(folio))
0040         return folio_get_private(folio);
0041     return NULL;
0042 }
0043 
0044 static struct bio_set iomap_ioend_bioset;
0045 
0046 static struct iomap_page *
0047 iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
0048 {
0049     struct iomap_page *iop = to_iomap_page(folio);
0050     unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
0051     gfp_t gfp;
0052 
0053     if (iop || nr_blocks <= 1)
0054         return iop;
0055 
0056     if (flags & IOMAP_NOWAIT)
0057         gfp = GFP_NOWAIT;
0058     else
0059         gfp = GFP_NOFS | __GFP_NOFAIL;
0060 
0061     iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
0062               gfp);
0063     if (iop) {
0064         spin_lock_init(&iop->uptodate_lock);
0065         if (folio_test_uptodate(folio))
0066             bitmap_fill(iop->uptodate, nr_blocks);
0067         folio_attach_private(folio, iop);
0068     }
0069     return iop;
0070 }
0071 
0072 static void iomap_page_release(struct folio *folio)
0073 {
0074     struct iomap_page *iop = folio_detach_private(folio);
0075     struct inode *inode = folio->mapping->host;
0076     unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
0077 
0078     if (!iop)
0079         return;
0080     WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
0081     WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
0082     WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
0083             folio_test_uptodate(folio));
0084     kfree(iop);
0085 }
0086 
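/*
 * Editorial sketch, not part of the listed source: how the per-folio
 * state above maps onto sub-folio blocks.  For a 64KiB folio backed by
 * 4KiB blocks, i_blocks_per_folio() returns 16 and iop->uptodate holds
 * one bit per block.  The hypothetical helper below shows how a single
 * block's uptodate state would be queried.
 */
static inline bool example_iomap_block_is_uptodate(struct folio *folio,
		unsigned int block)
{
	struct iomap_page *iop = to_iomap_page(folio);

	/* no iop: either a single-block folio or no state was allocated,
	 * so fall back to the folio-wide uptodate flag */
	if (!iop)
		return folio_test_uptodate(folio);
	return test_bit(block, iop->uptodate);
}
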
0087 /*
0088  * Calculate the range inside the folio that we actually need to read.
0089  */
0090 static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
0091         loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
0092 {
0093     struct iomap_page *iop = to_iomap_page(folio);
0094     loff_t orig_pos = *pos;
0095     loff_t isize = i_size_read(inode);
0096     unsigned block_bits = inode->i_blkbits;
0097     unsigned block_size = (1 << block_bits);
0098     size_t poff = offset_in_folio(folio, *pos);
0099     size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
0100     unsigned first = poff >> block_bits;
0101     unsigned last = (poff + plen - 1) >> block_bits;
0102 
0103     /*
0104      * If the block size is smaller than the folio size, we need to check the
0105      * per-block uptodate status and adjust the offset and length if needed
0106      * to avoid reading in already uptodate ranges.
0107      */
0108     if (iop) {
0109         unsigned int i;
0110 
0111         /* move forward for each leading block marked uptodate */
0112         for (i = first; i <= last; i++) {
0113             if (!test_bit(i, iop->uptodate))
0114                 break;
0115             *pos += block_size;
0116             poff += block_size;
0117             plen -= block_size;
0118             first++;
0119         }
0120 
0121         /* truncate len if we find any trailing uptodate block(s) */
0122         for ( ; i <= last; i++) {
0123             if (test_bit(i, iop->uptodate)) {
0124                 plen -= (last - i + 1) * block_size;
0125                 last = i - 1;
0126                 break;
0127             }
0128         }
0129     }
0130 
0131     /*
0132      * If the extent spans the block that contains the i_size, we need to
0133      * handle both halves separately so that we properly zero data in the
0134      * page cache for blocks that are entirely outside of i_size.
0135      */
0136     if (orig_pos <= isize && orig_pos + length > isize) {
0137         unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
0138 
0139         if (first <= end && last > end)
0140             plen -= (last - end) * block_size;
0141     }
0142 
0143     *offp = poff;
0144     *lenp = plen;
0145 }
0146 
0147 static void iomap_iop_set_range_uptodate(struct folio *folio,
0148         struct iomap_page *iop, size_t off, size_t len)
0149 {
0150     struct inode *inode = folio->mapping->host;
0151     unsigned first = off >> inode->i_blkbits;
0152     unsigned last = (off + len - 1) >> inode->i_blkbits;
0153     unsigned long flags;
0154 
0155     spin_lock_irqsave(&iop->uptodate_lock, flags);
0156     bitmap_set(iop->uptodate, first, last - first + 1);
0157     if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
0158         folio_mark_uptodate(folio);
0159     spin_unlock_irqrestore(&iop->uptodate_lock, flags);
0160 }
0161 
0162 static void iomap_set_range_uptodate(struct folio *folio,
0163         struct iomap_page *iop, size_t off, size_t len)
0164 {
0165     if (iop)
0166         iomap_iop_set_range_uptodate(folio, iop, off, len);
0167     else
0168         folio_mark_uptodate(folio);
0169 }
0170 
0171 static void iomap_finish_folio_read(struct folio *folio, size_t offset,
0172         size_t len, int error)
0173 {
0174     struct iomap_page *iop = to_iomap_page(folio);
0175 
0176     if (unlikely(error)) {
0177         folio_clear_uptodate(folio);
0178         folio_set_error(folio);
0179     } else {
0180         iomap_set_range_uptodate(folio, iop, offset, len);
0181     }
0182 
0183     if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
0184         folio_unlock(folio);
0185 }
0186 
0187 static void iomap_read_end_io(struct bio *bio)
0188 {
0189     int error = blk_status_to_errno(bio->bi_status);
0190     struct folio_iter fi;
0191 
0192     bio_for_each_folio_all(fi, bio)
0193         iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
0194     bio_put(bio);
0195 }
0196 
0197 struct iomap_readpage_ctx {
0198     struct folio        *cur_folio;
0199     bool            cur_folio_in_bio;
0200     struct bio      *bio;
0201     struct readahead_control *rac;
0202 };
0203 
0204 /**
0205  * iomap_read_inline_data - copy inline data into the page cache
0206  * @iter: iteration structure
0207  * @folio: folio to copy to
0208  *
0209  * Copy the inline data in @iter into @folio and zero out the rest of the folio.
0210  * Only a single IOMAP_INLINE extent is allowed at the end of each file.
0211  * Returns zero for success to complete the read, or the usual negative errno.
0212  */
0213 static int iomap_read_inline_data(const struct iomap_iter *iter,
0214         struct folio *folio)
0215 {
0216     struct iomap_page *iop;
0217     const struct iomap *iomap = iomap_iter_srcmap(iter);
0218     size_t size = i_size_read(iter->inode) - iomap->offset;
0219     size_t poff = offset_in_page(iomap->offset);
0220     size_t offset = offset_in_folio(folio, iomap->offset);
0221     void *addr;
0222 
0223     if (folio_test_uptodate(folio))
0224         return 0;
0225 
0226     if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
0227         return -EIO;
0228     if (WARN_ON_ONCE(size > PAGE_SIZE -
0229              offset_in_page(iomap->inline_data)))
0230         return -EIO;
0231     if (WARN_ON_ONCE(size > iomap->length))
0232         return -EIO;
0233     if (offset > 0)
0234         iop = iomap_page_create(iter->inode, folio, iter->flags);
0235     else
0236         iop = to_iomap_page(folio);
0237 
0238     addr = kmap_local_folio(folio, offset);
0239     memcpy(addr, iomap->inline_data, size);
0240     memset(addr + size, 0, PAGE_SIZE - poff - size);
0241     kunmap_local(addr);
0242     iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
0243     return 0;
0244 }
0245 
0246 static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
0247         loff_t pos)
0248 {
0249     const struct iomap *srcmap = iomap_iter_srcmap(iter);
0250 
0251     return srcmap->type != IOMAP_MAPPED ||
0252         (srcmap->flags & IOMAP_F_NEW) ||
0253         pos >= i_size_read(iter->inode);
0254 }
0255 
0256 static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
0257         struct iomap_readpage_ctx *ctx, loff_t offset)
0258 {
0259     const struct iomap *iomap = &iter->iomap;
0260     loff_t pos = iter->pos + offset;
0261     loff_t length = iomap_length(iter) - offset;
0262     struct folio *folio = ctx->cur_folio;
0263     struct iomap_page *iop;
0264     loff_t orig_pos = pos;
0265     size_t poff, plen;
0266     sector_t sector;
0267 
0268     if (iomap->type == IOMAP_INLINE)
0269         return iomap_read_inline_data(iter, folio);
0270 
0271     /* zero post-eof blocks as the page may be mapped */
0272     iop = iomap_page_create(iter->inode, folio, iter->flags);
0273     iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
0274     if (plen == 0)
0275         goto done;
0276 
0277     if (iomap_block_needs_zeroing(iter, pos)) {
0278         folio_zero_range(folio, poff, plen);
0279         iomap_set_range_uptodate(folio, iop, poff, plen);
0280         goto done;
0281     }
0282 
0283     ctx->cur_folio_in_bio = true;
0284     if (iop)
0285         atomic_add(plen, &iop->read_bytes_pending);
0286 
0287     sector = iomap_sector(iomap, pos);
0288     if (!ctx->bio ||
0289         bio_end_sector(ctx->bio) != sector ||
0290         !bio_add_folio(ctx->bio, folio, plen, poff)) {
0291         gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
0292         gfp_t orig_gfp = gfp;
0293         unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
0294 
0295         if (ctx->bio)
0296             submit_bio(ctx->bio);
0297 
0298         if (ctx->rac) /* same as readahead_gfp_mask */
0299             gfp |= __GFP_NORETRY | __GFP_NOWARN;
0300         ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
0301                      REQ_OP_READ, gfp);
0302         /*
0303          * If the bio_alloc fails, try it again for a single page to
0304          * avoid having to deal with partial page reads.  This emulates
0305          * what do_mpage_read_folio does.
0306          */
0307         if (!ctx->bio) {
0308             ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
0309                          orig_gfp);
0310         }
0311         if (ctx->rac)
0312             ctx->bio->bi_opf |= REQ_RAHEAD;
0313         ctx->bio->bi_iter.bi_sector = sector;
0314         ctx->bio->bi_end_io = iomap_read_end_io;
0315         bio_add_folio(ctx->bio, folio, plen, poff);
0316     }
0317 
0318 done:
0319     /*
0320      * Move the caller beyond our range so that it keeps making progress.
0321      * For that, we have to include any leading non-uptodate ranges, but
0322      * we can skip trailing ones as they will be handled in the next
0323      * iteration.
0324      */
0325     return pos - orig_pos + plen;
0326 }
0327 
0328 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
0329 {
0330     struct iomap_iter iter = {
0331         .inode      = folio->mapping->host,
0332         .pos        = folio_pos(folio),
0333         .len        = folio_size(folio),
0334     };
0335     struct iomap_readpage_ctx ctx = {
0336         .cur_folio  = folio,
0337     };
0338     int ret;
0339 
0340     trace_iomap_readpage(iter.inode, 1);
0341 
0342     while ((ret = iomap_iter(&iter, ops)) > 0)
0343         iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
0344 
0345     if (ret < 0)
0346         folio_set_error(folio);
0347 
0348     if (ctx.bio) {
0349         submit_bio(ctx.bio);
0350         WARN_ON_ONCE(!ctx.cur_folio_in_bio);
0351     } else {
0352         WARN_ON_ONCE(ctx.cur_folio_in_bio);
0353         folio_unlock(folio);
0354     }
0355 
0356     /*
0357      * Just like mpage_readahead and block_read_full_folio, we always
0358      * return 0 and just set the folio error flag on errors.  This
0359      * should be cleaned up throughout the stack eventually.
0360      */
0361     return 0;
0362 }
0363 EXPORT_SYMBOL_GPL(iomap_read_folio);
0364 
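/*
 * Editorial sketch, not from this file: a filesystem using this code
 * typically wires its ->read_folio address_space operation straight to
 * iomap_read_folio(), passing its own iomap_ops ("myfs_read_iomap_ops"
 * is an assumed name).
 */
static int myfs_read_folio(struct file *file, struct folio *folio)
{
	return iomap_read_folio(folio, &myfs_read_iomap_ops);
}
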
0365 static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
0366         struct iomap_readpage_ctx *ctx)
0367 {
0368     loff_t length = iomap_length(iter);
0369     loff_t done, ret;
0370 
0371     for (done = 0; done < length; done += ret) {
0372         if (ctx->cur_folio &&
0373             offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
0374             if (!ctx->cur_folio_in_bio)
0375                 folio_unlock(ctx->cur_folio);
0376             ctx->cur_folio = NULL;
0377         }
0378         if (!ctx->cur_folio) {
0379             ctx->cur_folio = readahead_folio(ctx->rac);
0380             ctx->cur_folio_in_bio = false;
0381         }
0382         ret = iomap_readpage_iter(iter, ctx, done);
0383         if (ret <= 0)
0384             return ret;
0385     }
0386 
0387     return done;
0388 }
0389 
0390 /**
0391  * iomap_readahead - Attempt to read pages from a file.
0392  * @rac: Describes the pages to be read.
0393  * @ops: The operations vector for the filesystem.
0394  *
0395  * This function is for filesystems to call to implement their readahead
0396  * address_space operation.
0397  *
0398  * Context: The @ops callbacks may submit I/O (eg to read the addresses of
0399  * blocks from disc), and may wait for it.  The caller may be trying to
0400  * access a different page, and so sleeping excessively should be avoided.
0401  * It may allocate memory, but should avoid costly allocations.  This
0402  * function is called with memalloc_nofs set, so allocations will not cause
0403  * the filesystem to be reentered.
0404  */
0405 void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
0406 {
0407     struct iomap_iter iter = {
0408         .inode  = rac->mapping->host,
0409         .pos    = readahead_pos(rac),
0410         .len    = readahead_length(rac),
0411     };
0412     struct iomap_readpage_ctx ctx = {
0413         .rac    = rac,
0414     };
0415 
0416     trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
0417 
0418     while (iomap_iter(&iter, ops) > 0)
0419         iter.processed = iomap_readahead_iter(&iter, &ctx);
0420 
0421     if (ctx.bio)
0422         submit_bio(ctx.bio);
0423     if (ctx.cur_folio) {
0424         if (!ctx.cur_folio_in_bio)
0425             folio_unlock(ctx.cur_folio);
0426     }
0427 }
0428 EXPORT_SYMBOL_GPL(iomap_readahead);
0429 
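/*
 * Editorial sketch, not from this file: the matching ->readahead hook
 * is a one-liner as well ("myfs_read_iomap_ops" is an assumed name).
 */
static void myfs_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &myfs_read_iomap_ops);
}
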
0430 /*
0431  * iomap_is_partially_uptodate checks whether blocks within a folio are
0432  * uptodate or not.
0433  *
0434  * Returns true if all blocks which correspond to the specified part
0435  * of the folio are uptodate.
0436  */
0437 bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
0438 {
0439     struct iomap_page *iop = to_iomap_page(folio);
0440     struct inode *inode = folio->mapping->host;
0441     unsigned first, last, i;
0442 
0443     if (!iop)
0444         return false;
0445 
0446     /* Caller's range may extend past the end of this folio */
0447     count = min(folio_size(folio) - from, count);
0448 
0449     /* First and last blocks in range within folio */
0450     first = from >> inode->i_blkbits;
0451     last = (from + count - 1) >> inode->i_blkbits;
0452 
0453     for (i = first; i <= last; i++)
0454         if (!test_bit(i, iop->uptodate))
0455             return false;
0456     return true;
0457 }
0458 EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
0459 
0460 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
0461 {
0462     trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
0463             folio_size(folio));
0464 
0465     /*
0466      * mm accommodates an old ext3 case where clean folios might
0467      * not have had the dirty bit cleared.  Thus, it can send actual
0468      * dirty folios to ->release_folio() via shrink_active_list();
0469      * skip those here.
0470      */
0471     if (folio_test_dirty(folio) || folio_test_writeback(folio))
0472         return false;
0473     iomap_page_release(folio);
0474     return true;
0475 }
0476 EXPORT_SYMBOL_GPL(iomap_release_folio);
0477 
0478 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
0479 {
0480     trace_iomap_invalidate_folio(folio->mapping->host,
0481                     folio_pos(folio) + offset, len);
0482 
0483     /*
0484      * If we're invalidating the entire folio, clear the dirty state
0485      * from it and release it to avoid unnecessary buildup of the LRU.
0486      */
0487     if (offset == 0 && len == folio_size(folio)) {
0488         WARN_ON_ONCE(folio_test_writeback(folio));
0489         folio_cancel_dirty(folio);
0490         iomap_page_release(folio);
0491     } else if (folio_test_large(folio)) {
0492         /* Must release the iop so the page can be split */
0493         WARN_ON_ONCE(!folio_test_uptodate(folio) &&
0494                  folio_test_dirty(folio));
0495         iomap_page_release(folio);
0496     }
0497 }
0498 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
0499 
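/*
 * Editorial sketch, not from this file: several of the helpers exported
 * above are designed to be plugged into a filesystem's
 * address_space_operations verbatim (the "myfs_" name is assumed; the
 * remaining hooks would wrap iomap_read_folio(), iomap_readahead() and
 * iomap_writepages() as in the other sketches in this listing).
 */
static const struct address_space_operations myfs_aops_fragment = {
	.dirty_folio		= filemap_dirty_folio,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
};
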
0500 static void
0501 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
0502 {
0503     loff_t i_size = i_size_read(inode);
0504 
0505     /*
0506      * Only truncate newly allocated pages beyond EOF, even if the
0507      * write started inside the existing inode size.
0508      */
0509     if (pos + len > i_size)
0510         truncate_pagecache_range(inode, max(pos, i_size),
0511                      pos + len - 1);
0512 }
0513 
0514 static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
0515         size_t poff, size_t plen, const struct iomap *iomap)
0516 {
0517     struct bio_vec bvec;
0518     struct bio bio;
0519 
0520     bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
0521     bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
0522     bio_add_folio(&bio, folio, plen, poff);
0523     return submit_bio_wait(&bio);
0524 }
0525 
0526 static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
0527         size_t len, struct folio *folio)
0528 {
0529     const struct iomap *srcmap = iomap_iter_srcmap(iter);
0530     struct iomap_page *iop;
0531     loff_t block_size = i_blocksize(iter->inode);
0532     loff_t block_start = round_down(pos, block_size);
0533     loff_t block_end = round_up(pos + len, block_size);
0534     unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
0535     size_t from = offset_in_folio(folio, pos), to = from + len;
0536     size_t poff, plen;
0537 
0538     if (folio_test_uptodate(folio))
0539         return 0;
0540     folio_clear_error(folio);
0541 
0542     iop = iomap_page_create(iter->inode, folio, iter->flags);
0543     if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
0544         return -EAGAIN;
0545 
0546     do {
0547         iomap_adjust_read_range(iter->inode, folio, &block_start,
0548                 block_end - block_start, &poff, &plen);
0549         if (plen == 0)
0550             break;
0551 
0552         if (!(iter->flags & IOMAP_UNSHARE) &&
0553             (from <= poff || from >= poff + plen) &&
0554             (to <= poff || to >= poff + plen))
0555             continue;
0556 
0557         if (iomap_block_needs_zeroing(iter, block_start)) {
0558             if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
0559                 return -EIO;
0560             folio_zero_segments(folio, poff, from, to, poff + plen);
0561         } else {
0562             int status;
0563 
0564             if (iter->flags & IOMAP_NOWAIT)
0565                 return -EAGAIN;
0566 
0567             status = iomap_read_folio_sync(block_start, folio,
0568                     poff, plen, srcmap);
0569             if (status)
0570                 return status;
0571         }
0572         iomap_set_range_uptodate(folio, iop, poff, plen);
0573     } while ((block_start += plen) < block_end);
0574 
0575     return 0;
0576 }
0577 
0578 static int iomap_write_begin_inline(const struct iomap_iter *iter,
0579         struct folio *folio)
0580 {
0581     /* needs more work for the tailpacking case; disable for now */
0582     if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
0583         return -EIO;
0584     return iomap_read_inline_data(iter, folio);
0585 }
0586 
0587 static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
0588         size_t len, struct folio **foliop)
0589 {
0590     const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
0591     const struct iomap *srcmap = iomap_iter_srcmap(iter);
0592     struct folio *folio;
0593     unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
0594     int status = 0;
0595 
0596     if (iter->flags & IOMAP_NOWAIT)
0597         fgp |= FGP_NOWAIT;
0598 
0599     BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
0600     if (srcmap != &iter->iomap)
0601         BUG_ON(pos + len > srcmap->offset + srcmap->length);
0602 
0603     if (fatal_signal_pending(current))
0604         return -EINTR;
0605 
0606     if (!mapping_large_folio_support(iter->inode->i_mapping))
0607         len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
0608 
0609     if (page_ops && page_ops->page_prepare) {
0610         status = page_ops->page_prepare(iter->inode, pos, len);
0611         if (status)
0612             return status;
0613     }
0614 
0615     folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
0616             fgp, mapping_gfp_mask(iter->inode->i_mapping));
0617     if (!folio) {
0618         status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
0619         goto out_no_page;
0620     }
0621     if (pos + len > folio_pos(folio) + folio_size(folio))
0622         len = folio_pos(folio) + folio_size(folio) - pos;
0623 
0624     if (srcmap->type == IOMAP_INLINE)
0625         status = iomap_write_begin_inline(iter, folio);
0626     else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
0627         status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
0628     else
0629         status = __iomap_write_begin(iter, pos, len, folio);
0630 
0631     if (unlikely(status))
0632         goto out_unlock;
0633 
0634     *foliop = folio;
0635     return 0;
0636 
0637 out_unlock:
0638     folio_unlock(folio);
0639     folio_put(folio);
0640     iomap_write_failed(iter->inode, pos, len);
0641 
0642 out_no_page:
0643     if (page_ops && page_ops->page_done)
0644         page_ops->page_done(iter->inode, pos, 0, NULL);
0645     return status;
0646 }
0647 
0648 static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
0649         size_t copied, struct folio *folio)
0650 {
0651     struct iomap_page *iop = to_iomap_page(folio);
0652     flush_dcache_folio(folio);
0653 
0654     /*
0655      * The blocks that were entirely written will now be uptodate, so we
0656      * don't have to worry about a read_folio reading them and overwriting a
0657      * partial write.  However, if we've encountered a short write and only
0658      * partially written into a block, it will not be marked uptodate, so a
0659      * read_folio might come in and destroy our partial write.
0660      *
0661      * Do the simplest thing and just treat any short write to a
0662      * non-uptodate page as a zero-length write, and force the caller to
0663      * redo the whole thing.
0664      */
0665     if (unlikely(copied < len && !folio_test_uptodate(folio)))
0666         return 0;
0667     iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
0668     filemap_dirty_folio(inode->i_mapping, folio);
0669     return copied;
0670 }
0671 
0672 static size_t iomap_write_end_inline(const struct iomap_iter *iter,
0673         struct folio *folio, loff_t pos, size_t copied)
0674 {
0675     const struct iomap *iomap = &iter->iomap;
0676     void *addr;
0677 
0678     WARN_ON_ONCE(!folio_test_uptodate(folio));
0679     BUG_ON(!iomap_inline_data_valid(iomap));
0680 
0681     flush_dcache_folio(folio);
0682     addr = kmap_local_folio(folio, pos);
0683     memcpy(iomap_inline_data(iomap, pos), addr, copied);
0684     kunmap_local(addr);
0685 
0686     mark_inode_dirty(iter->inode);
0687     return copied;
0688 }
0689 
0690 /* Returns the number of bytes copied.  May be 0.  Cannot be an errno. */
0691 static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
0692         size_t copied, struct folio *folio)
0693 {
0694     const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
0695     const struct iomap *srcmap = iomap_iter_srcmap(iter);
0696     loff_t old_size = iter->inode->i_size;
0697     size_t ret;
0698 
0699     if (srcmap->type == IOMAP_INLINE) {
0700         ret = iomap_write_end_inline(iter, folio, pos, copied);
0701     } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
0702         ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
0703                 copied, &folio->page, NULL);
0704     } else {
0705         ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
0706     }
0707 
0708     /*
0709      * Update the in-memory inode size after copying the data into the page
0710      * cache.  It's up to the file system to write the updated size to disk,
0711      * preferably after I/O completion so that no stale data is exposed.
0712      */
0713     if (pos + ret > old_size) {
0714         i_size_write(iter->inode, pos + ret);
0715         iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
0716     }
0717     folio_unlock(folio);
0718 
0719     if (old_size < pos)
0720         pagecache_isize_extended(iter->inode, old_size, pos);
0721     if (page_ops && page_ops->page_done)
0722         page_ops->page_done(iter->inode, pos, ret, &folio->page);
0723     folio_put(folio);
0724 
0725     if (ret < len)
0726         iomap_write_failed(iter->inode, pos + ret, len - ret);
0727     return ret;
0728 }
0729 
0730 static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
0731 {
0732     loff_t length = iomap_length(iter);
0733     loff_t pos = iter->pos;
0734     ssize_t written = 0;
0735     long status = 0;
0736     struct address_space *mapping = iter->inode->i_mapping;
0737     unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
0738 
0739     do {
0740         struct folio *folio;
0741         struct page *page;
0742         unsigned long offset;   /* Offset into pagecache page */
0743         unsigned long bytes;    /* Bytes to write to page */
0744         size_t copied;      /* Bytes copied from user */
0745 
0746         offset = offset_in_page(pos);
0747         bytes = min_t(unsigned long, PAGE_SIZE - offset,
0748                         iov_iter_count(i));
0749 again:
0750         status = balance_dirty_pages_ratelimited_flags(mapping,
0751                                    bdp_flags);
0752         if (unlikely(status))
0753             break;
0754 
0755         if (bytes > length)
0756             bytes = length;
0757 
0758         /*
0759          * Bring in the user page that we'll copy from _first_.
0760          * Otherwise there's a nasty deadlock on copying from the
0761          * same page as we're writing to, without it being marked
0762          * up-to-date.
0763          *
0764          * For async buffered writes the assumption is that the user
0765          * page has already been faulted in. This can be optimized by
0766          * faulting the user page.
0767          */
0768         if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
0769             status = -EFAULT;
0770             break;
0771         }
0772 
0773         status = iomap_write_begin(iter, pos, bytes, &folio);
0774         if (unlikely(status))
0775             break;
0776 
0777         page = folio_file_page(folio, pos >> PAGE_SHIFT);
0778         if (mapping_writably_mapped(mapping))
0779             flush_dcache_page(page);
0780 
0781         copied = copy_page_from_iter_atomic(page, offset, bytes, i);
0782 
0783         status = iomap_write_end(iter, pos, bytes, copied, folio);
0784 
0785         if (unlikely(copied != status))
0786             iov_iter_revert(i, copied - status);
0787 
0788         cond_resched();
0789         if (unlikely(status == 0)) {
0790             /*
0791              * A short copy made iomap_write_end() reject the
0792              * thing entirely.  Might be memory poisoning
0793              * halfway through, might be a race with munmap,
0794              * might be severe memory pressure.
0795              */
0796             if (copied)
0797                 bytes = copied;
0798             goto again;
0799         }
0800         pos += status;
0801         written += status;
0802         length -= status;
0803     } while (iov_iter_count(i) && length);
0804 
0805     if (status == -EAGAIN) {
0806         iov_iter_revert(i, written);
0807         return -EAGAIN;
0808     }
0809     return written ? written : status;
0810 }
0811 
0812 ssize_t
0813 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
0814         const struct iomap_ops *ops)
0815 {
0816     struct iomap_iter iter = {
0817         .inode      = iocb->ki_filp->f_mapping->host,
0818         .pos        = iocb->ki_pos,
0819         .len        = iov_iter_count(i),
0820         .flags      = IOMAP_WRITE,
0821     };
0822     int ret;
0823 
0824     if (iocb->ki_flags & IOCB_NOWAIT)
0825         iter.flags |= IOMAP_NOWAIT;
0826 
0827     while ((ret = iomap_iter(&iter, ops)) > 0)
0828         iter.processed = iomap_write_iter(&iter, i);
0829     if (iter.pos == iocb->ki_pos)
0830         return ret;
0831     return iter.pos - iocb->ki_pos;
0832 }
0833 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
0834 
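/*
 * Editorial sketch, not from this file: a typical ->write_iter path
 * takes the inode lock, performs the generic checks, and then lets
 * iomap_file_buffered_write() drive the copy-in loop above.  All
 * "myfs_" names are assumed.
 */
static ssize_t myfs_buffered_write_iter(struct kiocb *iocb,
		struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0) {
		ret = iomap_file_buffered_write(iocb, from,
				&myfs_buffered_write_iomap_ops);
		/* this version of the helper does not advance ki_pos itself */
		if (ret > 0)
			iocb->ki_pos += ret;
	}
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
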
0835 static loff_t iomap_unshare_iter(struct iomap_iter *iter)
0836 {
0837     struct iomap *iomap = &iter->iomap;
0838     const struct iomap *srcmap = iomap_iter_srcmap(iter);
0839     loff_t pos = iter->pos;
0840     loff_t length = iomap_length(iter);
0841     long status = 0;
0842     loff_t written = 0;
0843 
0844     /* don't bother with blocks that are not shared to start with */
0845     if (!(iomap->flags & IOMAP_F_SHARED))
0846         return length;
0847     /* don't bother with holes or unwritten extents */
0848     if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
0849         return length;
0850 
0851     do {
0852         unsigned long offset = offset_in_page(pos);
0853         unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
0854         struct folio *folio;
0855 
0856         status = iomap_write_begin(iter, pos, bytes, &folio);
0857         if (unlikely(status))
0858             return status;
0859 
0860         status = iomap_write_end(iter, pos, bytes, bytes, folio);
0861         if (WARN_ON_ONCE(status == 0))
0862             return -EIO;
0863 
0864         cond_resched();
0865 
0866         pos += status;
0867         written += status;
0868         length -= status;
0869 
0870         balance_dirty_pages_ratelimited(iter->inode->i_mapping);
0871     } while (length);
0872 
0873     return written;
0874 }
0875 
0876 int
0877 iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
0878         const struct iomap_ops *ops)
0879 {
0880     struct iomap_iter iter = {
0881         .inode      = inode,
0882         .pos        = pos,
0883         .len        = len,
0884         .flags      = IOMAP_WRITE | IOMAP_UNSHARE,
0885     };
0886     int ret;
0887 
0888     while ((ret = iomap_iter(&iter, ops)) > 0)
0889         iter.processed = iomap_unshare_iter(&iter);
0890     return ret;
0891 }
0892 EXPORT_SYMBOL_GPL(iomap_file_unshare);
0893 
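/*
 * Editorial sketch, not from this file: a copy-on-write filesystem
 * would call this from its FALLOC_FL_UNSHARE path and then write the
 * now-dirty pages back so the blocks are really unshared on disk.
 * The "myfs_" names are assumed.
 */
static int myfs_unshare_range(struct inode *inode, loff_t pos, loff_t len)
{
	int error;

	error = iomap_file_unshare(inode, pos, len,
			&myfs_buffered_write_iomap_ops);
	if (error)
		return error;
	return filemap_write_and_wait_range(inode->i_mapping, pos,
			pos + len - 1);
}
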
0894 static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
0895 {
0896     const struct iomap *srcmap = iomap_iter_srcmap(iter);
0897     loff_t pos = iter->pos;
0898     loff_t length = iomap_length(iter);
0899     loff_t written = 0;
0900 
0901     /* already zeroed?  we're done. */
0902     if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
0903         return length;
0904 
0905     do {
0906         struct folio *folio;
0907         int status;
0908         size_t offset;
0909         size_t bytes = min_t(u64, SIZE_MAX, length);
0910 
0911         status = iomap_write_begin(iter, pos, bytes, &folio);
0912         if (status)
0913             return status;
0914 
0915         offset = offset_in_folio(folio, pos);
0916         if (bytes > folio_size(folio) - offset)
0917             bytes = folio_size(folio) - offset;
0918 
0919         folio_zero_range(folio, offset, bytes);
0920         folio_mark_accessed(folio);
0921 
0922         bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
0923         if (WARN_ON_ONCE(bytes == 0))
0924             return -EIO;
0925 
0926         pos += bytes;
0927         length -= bytes;
0928         written += bytes;
0929     } while (length > 0);
0930 
0931     if (did_zero)
0932         *did_zero = true;
0933     return written;
0934 }
0935 
0936 int
0937 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
0938         const struct iomap_ops *ops)
0939 {
0940     struct iomap_iter iter = {
0941         .inode      = inode,
0942         .pos        = pos,
0943         .len        = len,
0944         .flags      = IOMAP_ZERO,
0945     };
0946     int ret;
0947 
0948     while ((ret = iomap_iter(&iter, ops)) > 0)
0949         iter.processed = iomap_zero_iter(&iter, did_zero);
0950     return ret;
0951 }
0952 EXPORT_SYMBOL_GPL(iomap_zero_range);
0953 
0954 int
0955 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
0956         const struct iomap_ops *ops)
0957 {
0958     unsigned int blocksize = i_blocksize(inode);
0959     unsigned int off = pos & (blocksize - 1);
0960 
0961     /* Block boundary? Nothing to do */
0962     if (!off)
0963         return 0;
0964     return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
0965 }
0966 EXPORT_SYMBOL_GPL(iomap_truncate_page);
0967 
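/*
 * Editorial sketch, not from this file: a truncate-down path typically
 * zeroes the newly partial tail block before shrinking i_size so that a
 * later size extension cannot expose stale data.  "myfs_" names are
 * assumed.
 */
static int myfs_truncate_down(struct inode *inode, loff_t newsize)
{
	bool did_zero = false;
	int error;

	error = iomap_truncate_page(inode, newsize, &did_zero,
			&myfs_buffered_write_iomap_ops);
	if (error)
		return error;
	truncate_setsize(inode, newsize);
	return 0;
}
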
0968 static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
0969         struct folio *folio)
0970 {
0971     loff_t length = iomap_length(iter);
0972     int ret;
0973 
0974     if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
0975         ret = __block_write_begin_int(folio, iter->pos, length, NULL,
0976                           &iter->iomap);
0977         if (ret)
0978             return ret;
0979         block_commit_write(&folio->page, 0, length);
0980     } else {
0981         WARN_ON_ONCE(!folio_test_uptodate(folio));
0982         folio_mark_dirty(folio);
0983     }
0984 
0985     return length;
0986 }
0987 
0988 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
0989 {
0990     struct iomap_iter iter = {
0991         .inode      = file_inode(vmf->vma->vm_file),
0992         .flags      = IOMAP_WRITE | IOMAP_FAULT,
0993     };
0994     struct folio *folio = page_folio(vmf->page);
0995     ssize_t ret;
0996 
0997     folio_lock(folio);
0998     ret = folio_mkwrite_check_truncate(folio, iter.inode);
0999     if (ret < 0)
1000         goto out_unlock;
1001     iter.pos = folio_pos(folio);
1002     iter.len = ret;
1003     while ((ret = iomap_iter(&iter, ops)) > 0)
1004         iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
1005 
1006     if (ret < 0)
1007         goto out_unlock;
1008     folio_wait_stable(folio);
1009     return VM_FAULT_LOCKED;
1010 out_unlock:
1011     folio_unlock(folio);
1012     return block_page_mkwrite_return(ret);
1013 }
1014 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
1015 
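/*
 * Editorial sketch, not from this file: the usual ->page_mkwrite
 * wrapper brackets the call with the superblock pagefault freeze
 * protection and a timestamp update ("myfs_" names are assumed).
 */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	ret = iomap_page_mkwrite(vmf, &myfs_buffered_write_iomap_ops);
	sb_end_pagefault(inode->i_sb);
	return ret;
}
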
1016 static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
1017         size_t len, int error)
1018 {
1019     struct iomap_page *iop = to_iomap_page(folio);
1020 
1021     if (error) {
1022         folio_set_error(folio);
1023         mapping_set_error(inode->i_mapping, error);
1024     }
1025 
1026     WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop);
1027     WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
1028 
1029     if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
1030         folio_end_writeback(folio);
1031 }
1032 
1033 /*
1034  * We're now finished for good with this ioend structure.  Update the page
1035  * state, release holds on bios, and finally free up memory.  Do not use the
1036  * ioend after this.
1037  */
1038 static u32
1039 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
1040 {
1041     struct inode *inode = ioend->io_inode;
1042     struct bio *bio = &ioend->io_inline_bio;
1043     struct bio *last = ioend->io_bio, *next;
1044     u64 start = bio->bi_iter.bi_sector;
1045     loff_t offset = ioend->io_offset;
1046     bool quiet = bio_flagged(bio, BIO_QUIET);
1047     u32 folio_count = 0;
1048 
1049     for (bio = &ioend->io_inline_bio; bio; bio = next) {
1050         struct folio_iter fi;
1051 
1052         /*
1053          * For the last bio, bi_private points to the ioend, so we
1054          * need to explicitly end the iteration here.
1055          */
1056         if (bio == last)
1057             next = NULL;
1058         else
1059             next = bio->bi_private;
1060 
1061         /* walk all folios in bio, ending page IO on them */
1062         bio_for_each_folio_all(fi, bio) {
1063             iomap_finish_folio_write(inode, fi.folio, fi.length,
1064                     error);
1065             folio_count++;
1066         }
1067         bio_put(bio);
1068     }
1069     /* The ioend has been freed by bio_put() */
1070 
1071     if (unlikely(error && !quiet)) {
1072         printk_ratelimited(KERN_ERR
1073 "%s: writeback error on inode %lu, offset %lld, sector %llu",
1074             inode->i_sb->s_id, inode->i_ino, offset, start);
1075     }
1076     return folio_count;
1077 }
1078 
1079 /*
1080  * Ioend completion routine for merged bios. This can only be called from task
1081  * contexts as merged ioends can be of unbounded length. Hence we have to break up
1082  * the writeback completions into manageable chunks to avoid long scheduler
1083  * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
1084  * good batch processing throughput without creating adverse scheduler latency
1085  * conditions.
1086  */
1087 void
1088 iomap_finish_ioends(struct iomap_ioend *ioend, int error)
1089 {
1090     struct list_head tmp;
1091     u32 completions;
1092 
1093     might_sleep();
1094 
1095     list_replace_init(&ioend->io_list, &tmp);
1096     completions = iomap_finish_ioend(ioend, error);
1097 
1098     while (!list_empty(&tmp)) {
1099         if (completions > IOEND_BATCH_SIZE * 8) {
1100             cond_resched();
1101             completions = 0;
1102         }
1103         ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
1104         list_del_init(&ioend->io_list);
1105         completions += iomap_finish_ioend(ioend, error);
1106     }
1107 }
1108 EXPORT_SYMBOL_GPL(iomap_finish_ioends);
1109 
1110 /*
1111  * We can merge two adjacent ioends if they have the same set of work to do.
1112  */
1113 static bool
1114 iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
1115 {
1116     if (ioend->io_bio->bi_status != next->io_bio->bi_status)
1117         return false;
1118     if ((ioend->io_flags & IOMAP_F_SHARED) ^
1119         (next->io_flags & IOMAP_F_SHARED))
1120         return false;
1121     if ((ioend->io_type == IOMAP_UNWRITTEN) ^
1122         (next->io_type == IOMAP_UNWRITTEN))
1123         return false;
1124     if (ioend->io_offset + ioend->io_size != next->io_offset)
1125         return false;
1126     /*
1127      * Do not merge physically discontiguous ioends. The filesystem
1128      * completion functions will have to iterate the physical
1129      * discontiguities even if we merge the ioends at a logical level, so
1130      * we don't gain anything by merging physical discontiguities here.
1131      *
1132      * We cannot use bio->bi_iter.bi_sector here as it is modified during
1133      * submission so does not point to the start sector of the bio at
1134      * completion.
1135      */
1136     if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
1137         return false;
1138     return true;
1139 }
1140 
1141 void
1142 iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
1143 {
1144     struct iomap_ioend *next;
1145 
1146     INIT_LIST_HEAD(&ioend->io_list);
1147 
1148     while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
1149             io_list))) {
1150         if (!iomap_ioend_can_merge(ioend, next))
1151             break;
1152         list_move_tail(&next->io_list, &ioend->io_list);
1153         ioend->io_size += next->io_size;
1154     }
1155 }
1156 EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
1157 
1158 static int
1159 iomap_ioend_compare(void *priv, const struct list_head *a,
1160         const struct list_head *b)
1161 {
1162     struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
1163     struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
1164 
1165     if (ia->io_offset < ib->io_offset)
1166         return -1;
1167     if (ia->io_offset > ib->io_offset)
1168         return 1;
1169     return 0;
1170 }
1171 
1172 void
1173 iomap_sort_ioends(struct list_head *ioend_list)
1174 {
1175     list_sort(NULL, ioend_list, iomap_ioend_compare);
1176 }
1177 EXPORT_SYMBOL_GPL(iomap_sort_ioends);
1178 
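/*
 * Editorial sketch, not from this file: a filesystem's writeback
 * completion worker typically drains its list of finished ioends with
 * the three helpers above - sort, merge what is adjacent, then finish
 * each merged batch ("myfs_" name assumed, per-ioend error handling
 * kept minimal).
 */
static void myfs_finish_ioend_list(struct list_head *completed)
{
	struct iomap_ioend *ioend;

	iomap_sort_ioends(completed);
	while ((ioend = list_first_entry_or_null(completed,
			struct iomap_ioend, io_list))) {
		list_del_init(&ioend->io_list);
		iomap_ioend_try_merge(ioend, completed);
		iomap_finish_ioends(ioend,
				blk_status_to_errno(ioend->io_bio->bi_status));
	}
}
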
1179 static void iomap_writepage_end_bio(struct bio *bio)
1180 {
1181     struct iomap_ioend *ioend = bio->bi_private;
1182 
1183     iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
1184 }
1185 
1186 /*
1187  * Submit the final bio for an ioend.
1188  *
1189  * If @error is non-zero, it means that we have a situation where some part of
1190  * the submission process has failed after we've marked pages for writeback
1191  * and unlocked them.  In this situation, we need to fail the bio instead of
1192  * submitting it.  This typically only happens on a filesystem shutdown.
1193  */
1194 static int
1195 iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
1196         int error)
1197 {
1198     ioend->io_bio->bi_private = ioend;
1199     ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
1200 
1201     if (wpc->ops->prepare_ioend)
1202         error = wpc->ops->prepare_ioend(ioend, error);
1203     if (error) {
1204         /*
1205          * If we're failing the IO now, just mark the ioend with an
1206          * error and finish it.  This will run IO completion immediately
1207          * as there is only one reference to the ioend at this point in
1208          * time.
1209          */
1210         ioend->io_bio->bi_status = errno_to_blk_status(error);
1211         bio_endio(ioend->io_bio);
1212         return error;
1213     }
1214 
1215     submit_bio(ioend->io_bio);
1216     return 0;
1217 }
1218 
1219 static struct iomap_ioend *
1220 iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
1221         loff_t offset, sector_t sector, struct writeback_control *wbc)
1222 {
1223     struct iomap_ioend *ioend;
1224     struct bio *bio;
1225 
1226     bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
1227                    REQ_OP_WRITE | wbc_to_write_flags(wbc),
1228                    GFP_NOFS, &iomap_ioend_bioset);
1229     bio->bi_iter.bi_sector = sector;
1230     wbc_init_bio(wbc, bio);
1231 
1232     ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
1233     INIT_LIST_HEAD(&ioend->io_list);
1234     ioend->io_type = wpc->iomap.type;
1235     ioend->io_flags = wpc->iomap.flags;
1236     ioend->io_inode = inode;
1237     ioend->io_size = 0;
1238     ioend->io_folios = 0;
1239     ioend->io_offset = offset;
1240     ioend->io_bio = bio;
1241     ioend->io_sector = sector;
1242     return ioend;
1243 }
1244 
1245 /*
1246  * Allocate a new bio, and chain the old bio to the new one.
1247  *
1248  * Note that we have to perform the chaining in this unintuitive order
1249  * so that the bi_private linkage is set up in the right direction for the
1250  * traversal in iomap_finish_ioend().
1251  */
1252 static struct bio *
1253 iomap_chain_bio(struct bio *prev)
1254 {
1255     struct bio *new;
1256 
1257     new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
1258     bio_clone_blkg_association(new, prev);
1259     new->bi_iter.bi_sector = bio_end_sector(prev);
1260 
1261     bio_chain(prev, new);
1262     bio_get(prev);      /* for iomap_finish_ioend */
1263     submit_bio(prev);
1264     return new;
1265 }
1266 
1267 static bool
1268 iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
1269         sector_t sector)
1270 {
1271     if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
1272         (wpc->ioend->io_flags & IOMAP_F_SHARED))
1273         return false;
1274     if (wpc->iomap.type != wpc->ioend->io_type)
1275         return false;
1276     if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
1277         return false;
1278     if (sector != bio_end_sector(wpc->ioend->io_bio))
1279         return false;
1280     /*
1281      * Limit ioend bio chain lengths to minimise IO completion latency. This
1282      * also prevents long tight loops ending page writeback on all the
1283      * folios in the ioend.
1284      */
1285     if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
1286         return false;
1287     return true;
1288 }
1289 
1290 /*
1291  * Test to see if we have an existing ioend structure that we could append to
1292  * first; otherwise finish off the current ioend and start another.
1293  */
1294 static void
1295 iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
1296         struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
1297         struct writeback_control *wbc, struct list_head *iolist)
1298 {
1299     sector_t sector = iomap_sector(&wpc->iomap, pos);
1300     unsigned len = i_blocksize(inode);
1301     size_t poff = offset_in_folio(folio, pos);
1302 
1303     if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
1304         if (wpc->ioend)
1305             list_add(&wpc->ioend->io_list, iolist);
1306         wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
1307     }
1308 
1309     if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
1310         wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
1311         bio_add_folio(wpc->ioend->io_bio, folio, len, poff);
1312     }
1313 
1314     if (iop)
1315         atomic_add(len, &iop->write_bytes_pending);
1316     wpc->ioend->io_size += len;
1317     wbc_account_cgroup_owner(wbc, &folio->page, len);
1318 }
1319 
1320 /*
1321  * We implement an immediate ioend submission policy here to avoid needing to
1322  * chain multiple ioends and hence nest mempool allocations which can violate
1323  * the forward progress guarantees we need to provide. The current ioend we're
1324  * adding blocks to is cached in the writepage context, and if the new block
1325  * doesn't append to the cached ioend, it will create a new ioend and cache that
1326  * instead.
1327  *
1328  * If a new ioend is created and cached, the old ioend is returned and queued
1329  * locally for submission once the entire page is processed or an error has been
1330  * detected.  While ioends are submitted immediately after they are completed,
1331  * batching optimisations are provided by higher level block plugging.
1332  *
1333  * At the end of a writeback pass, there will be a cached ioend remaining on the
1334  * writepage context that the caller will need to submit.
1335  */
1336 static int
1337 iomap_writepage_map(struct iomap_writepage_ctx *wpc,
1338         struct writeback_control *wbc, struct inode *inode,
1339         struct folio *folio, u64 end_pos)
1340 {
1341     struct iomap_page *iop = iomap_page_create(inode, folio, 0);
1342     struct iomap_ioend *ioend, *next;
1343     unsigned len = i_blocksize(inode);
1344     unsigned nblocks = i_blocks_per_folio(inode, folio);
1345     u64 pos = folio_pos(folio);
1346     int error = 0, count = 0, i;
1347     LIST_HEAD(submit_list);
1348 
1349     WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
1350 
1351     /*
1352      * Walk through the folio to find areas to write back. If we
1353      * run off the end of the current map or find the current map
1354      * invalid, grab a new one.
1355      */
1356     for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
1357         if (iop && !test_bit(i, iop->uptodate))
1358             continue;
1359 
1360         error = wpc->ops->map_blocks(wpc, inode, pos);
1361         if (error)
1362             break;
1363         if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
1364             continue;
1365         if (wpc->iomap.type == IOMAP_HOLE)
1366             continue;
1367         iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc,
1368                  &submit_list);
1369         count++;
1370     }
1371     if (count)
1372         wpc->ioend->io_folios++;
1373 
1374     WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
1375     WARN_ON_ONCE(!folio_test_locked(folio));
1376     WARN_ON_ONCE(folio_test_writeback(folio));
1377     WARN_ON_ONCE(folio_test_dirty(folio));
1378 
1379     /*
1380      * We cannot cancel the ioend directly here on error.  We may have
1381      * already set other pages under writeback and hence we have to run I/O
1382      * completion to mark the error state of the pages under writeback
1383      * appropriately.
1384      */
1385     if (unlikely(error)) {
1386         /*
1387          * Let the filesystem know what portion of the current page
1388          * failed to map. If the page hasn't been added to ioend, it
1389          * won't be affected by I/O completion and we must unlock it
1390          * now.
1391          */
1392         if (wpc->ops->discard_folio)
1393             wpc->ops->discard_folio(folio, pos);
1394         if (!count) {
1395             folio_unlock(folio);
1396             goto done;
1397         }
1398     }
1399 
1400     folio_start_writeback(folio);
1401     folio_unlock(folio);
1402 
1403     /*
1404      * Preserve the original error if there was one; catch
1405      * submission errors here and propagate into subsequent ioend
1406      * submissions.
1407      */
1408     list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
1409         int error2;
1410 
1411         list_del_init(&ioend->io_list);
1412         error2 = iomap_submit_ioend(wpc, ioend, error);
1413         if (error2 && !error)
1414             error = error2;
1415     }
1416 
1417     /*
1418      * We can end up here with no error and nothing to write only if we race
1419      * with a partial page truncate on a sub-page block sized filesystem.
1420      */
1421     if (!count)
1422         folio_end_writeback(folio);
1423 done:
1424     mapping_set_error(folio->mapping, error);
1425     return error;
1426 }
1427 
1428 /*
1429  * Write out a dirty page.
1430  *
1431  * For delalloc space on the page, we need to allocate space and flush it.
1432  * For unwritten space on the page, we need to start the conversion to
1433  * regular allocated space.
1434  */
1435 static int
1436 iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
1437 {
1438     struct folio *folio = page_folio(page);
1439     struct iomap_writepage_ctx *wpc = data;
1440     struct inode *inode = folio->mapping->host;
1441     u64 end_pos, isize;
1442 
1443     trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
1444 
1445     /*
1446      * Refuse to write the folio out if we're called from reclaim context.
1447      *
1448      * This avoids stack overflows when called from deeply used stacks in
1449      * random callers for direct reclaim or memcg reclaim.  We explicitly
1450      * allow reclaim from kswapd as the stack usage there is relatively low.
1451      *
1452      * This should never happen except in the case of a VM regression so
1453      * warn about it.
1454      */
1455     if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1456             PF_MEMALLOC))
1457         goto redirty;
1458 
1459     /*
1460      * Is this folio beyond the end of the file?
1461      *
1462      * The folio index is less than the end_index, adjust the end_pos
1463      * to the highest offset that this folio should represent.
1464      * -----------------------------------------------------
1465      * |            file mapping           | <EOF> |
1466      * -----------------------------------------------------
1467      * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
1468      * ^--------------------------------^----------|--------
1469      * |     desired writeback range    |      see else    |
1470      * ---------------------------------^------------------|
1471      */
1472     isize = i_size_read(inode);
1473     end_pos = folio_pos(folio) + folio_size(folio);
1474     if (end_pos > isize) {
1475         /*
1476          * Check whether the page to write out is beyond or straddles
1477          * i_size or not.
1478          * -------------------------------------------------------
1479          * |        file mapping                | <EOF>  |
1480          * -------------------------------------------------------
1481          * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
1482          * ^--------------------------------^-----------|---------
1483          * |                    |      Straddles     |
1484          * ---------------------------------^-----------|--------|
1485          */
1486         size_t poff = offset_in_folio(folio, isize);
1487         pgoff_t end_index = isize >> PAGE_SHIFT;
1488 
1489         /*
1490          * Skip the page if it's fully outside i_size, e.g.
1491          * due to a truncate operation that's in progress.  We've
1492          * cleaned this page and truncate will finish things off for
1493          * us.
1494          *
1495          * Note that the end_index is unsigned long.  If the given
1496          * offset is greater than 16TB on a 32-bit system then if we
1497          * checked if the page is fully outside i_size with
1498          * "if (page->index >= end_index + 1)", "end_index + 1" would
1499          * overflow and evaluate to 0.  Hence this page would be
1500          * redirtied and written out repeatedly, which would result in
1501          * an infinite loop; the user program performing this operation
1502          * would hang.  Instead, we can detect this situation by
1503          * checking if the page is totally beyond i_size or if its
1504          * offset is just equal to the EOF.
1505          */
1506         if (folio->index > end_index ||
1507             (folio->index == end_index && poff == 0))
1508             goto unlock;
1509 
1510         /*
1511          * The page straddles i_size.  It must be zeroed out on each
1512          * and every writepage invocation because it may be mmapped.
1513          * "A file is mapped in multiples of the page size.  For a file
1514          * that is not a multiple of the page size, the remaining
1515          * memory is zeroed when mapped, and writes to that region are
1516          * not written out to the file."
1517          */
1518         folio_zero_segment(folio, poff, folio_size(folio));
1519         end_pos = isize;
1520     }
1521 
1522     return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
1523 
1524 redirty:
1525     folio_redirty_for_writepage(wbc, folio);
1526 unlock:
1527     folio_unlock(folio);
1528     return 0;
1529 }
1530 
1531 int
1532 iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
1533         struct iomap_writepage_ctx *wpc,
1534         const struct iomap_writeback_ops *ops)
1535 {
1536     int         ret;
1537 
1538     wpc->ops = ops;
1539     ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
1540     if (!wpc->ioend)
1541         return ret;
1542     return iomap_submit_ioend(wpc, wpc->ioend, ret);
1543 }
1544 EXPORT_SYMBOL_GPL(iomap_writepages);
1545 
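/*
 * Editorial sketch, not from this file: the ->writepages hook supplies
 * a writepage context and the filesystem's writeback ops
 * ("myfs_writeback_ops" is an assumed name).
 */
static int myfs_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &myfs_writeback_ops);
}
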
1546 static int __init iomap_init(void)
1547 {
1548     return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
1549                offsetof(struct iomap_ioend, io_inline_bio),
1550                BIOSET_NEED_BVECS);
1551 }
1552 fs_initcall(iomap_init);