// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/page-io.c
 *
 * This contains the new page_io functions for ext4
 *
 * Written by Theodore Ts'o, 2010.
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

static struct kmem_cache *io_end_cachep;
static struct kmem_cache *io_end_vec_cachep;

int __init ext4_init_pageio(void)
{
    io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
    if (io_end_cachep == NULL)
        return -ENOMEM;

    io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
    if (io_end_vec_cachep == NULL) {
        kmem_cache_destroy(io_end_cachep);
        return -ENOMEM;
    }
    return 0;
}

void ext4_exit_pageio(void)
{
    kmem_cache_destroy(io_end_cachep);
    kmem_cache_destroy(io_end_vec_cachep);
}
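
/*
 * A minimal sketch of how the two helpers above are meant to be paired by
 * their caller.  In mainline the real call sites live in ext4's module
 * init/exit path (fs/ext4/super.c); the wrapper and the extra cache below
 * are hypothetical and only illustrate the usual "init, then unwind in
 * reverse order on failure" pattern.
 */
static struct kmem_cache *example_cachep;	/* hypothetical later cache */

static int __init example_ext4_start_pageio(void)
{
    int err;

    err = ext4_init_pageio();
    if (err)
        return err;

    /* A later init step; any failure must unwind what already succeeded. */
    example_cachep = kmem_cache_create("example_cache", 64, 0, 0, NULL);
    if (!example_cachep) {
        ext4_exit_pageio();
        return -ENOMEM;
    }
    return 0;
}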

struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
{
    struct ext4_io_end_vec *io_end_vec;

    io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
    if (!io_end_vec)
        return ERR_PTR(-ENOMEM);
    INIT_LIST_HEAD(&io_end_vec->list);
    list_add_tail(&io_end_vec->list, &io_end->list_vec);
    return io_end_vec;
}
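
/*
 * Sketch of the typical allocation pattern for the two objects handled
 * above: one io_end per writeback batch, and one io_end_vec appended per
 * contiguous range that will later need unwritten extent conversion.  The
 * offset/size member names are assumed from the struct ext4_io_end_vec
 * definition in ext4.h, and error handling is reduced to the bare minimum;
 * this is an illustration, not a real call site.
 */
static int example_queue_conversion_range(struct inode *inode,
					  loff_t offset, ssize_t size)
{
    ext4_io_end_t *io_end;
    struct ext4_io_end_vec *io_end_vec;

    io_end = ext4_init_io_end(inode, GFP_NOFS);
    if (!io_end)
        return -ENOMEM;

    io_end_vec = ext4_alloc_io_end_vec(io_end);
    if (IS_ERR(io_end_vec)) {
        ext4_put_io_end(io_end);
        return PTR_ERR(io_end_vec);
    }
    io_end_vec->offset = offset;
    io_end_vec->size = size;

    /* ...bios referencing io_end would be submitted here... */
    return ext4_put_io_end(io_end);	/* drop the initial reference */
}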

static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
{
    struct ext4_io_end_vec *io_end_vec, *tmp;

    if (list_empty(&io_end->list_vec))
        return;
    list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
        list_del(&io_end_vec->list);
        kmem_cache_free(io_end_vec_cachep, io_end_vec);
    }
}

struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
{
    BUG_ON(list_empty(&io_end->list_vec));
    return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}

/*
 * Print a buffer I/O error compatible with the one in fs/buffer.c.  This
 * provides compatibility with dmesg scrapers that look for a specific
 * buffer I/O error message.  We really need a unified error reporting
 * structure to userspace a la Digital Unix's uerf system, but it's
 * probably not going to happen in my lifetime, due to LKML politics...
 */
static void buffer_io_error(struct buffer_head *bh)
{
    printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n",
               bh->b_bdev,
               (unsigned long long)bh->b_blocknr);
}

static void ext4_finish_bio(struct bio *bio)
{
    struct bio_vec *bvec;
    struct bvec_iter_all iter_all;

    bio_for_each_segment_all(bvec, bio, iter_all) {
        struct page *page = bvec->bv_page;
        struct page *bounce_page = NULL;
        struct buffer_head *bh, *head;
        unsigned bio_start = bvec->bv_offset;
        unsigned bio_end = bio_start + bvec->bv_len;
        unsigned under_io = 0;
        unsigned long flags;

        if (fscrypt_is_bounce_page(page)) {
            bounce_page = page;
            page = fscrypt_pagecache_page(bounce_page);
        }

        if (bio->bi_status) {
            SetPageError(page);
            mapping_set_error(page->mapping, -EIO);
        }
        bh = head = page_buffers(page);
        /*
         * We check all buffers in the page under b_uptodate_lock
         * to avoid races with other end io clearing async_write flags
         */
        spin_lock_irqsave(&head->b_uptodate_lock, flags);
        do {
            if (bh_offset(bh) < bio_start ||
                bh_offset(bh) + bh->b_size > bio_end) {
                if (buffer_async_write(bh))
                    under_io++;
                continue;
            }
            clear_buffer_async_write(bh);
            if (bio->bi_status) {
                set_buffer_write_io_error(bh);
                buffer_io_error(bh);
            }
        } while ((bh = bh->b_this_page) != head);
        spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
        if (!under_io) {
            fscrypt_free_bounce_page(bounce_page);
            end_page_writeback(page);
        }
    }
}
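
/*
 * Illustration of the partial-coverage check in ext4_finish_bio() for the
 * blocksize < PAGE_SIZE case.  With 1k buffers in a 4k page, a bio_vec with
 * bv_offset == 1024 and bv_len == 2048 covers only the buffers at offsets
 * 1024 and 2048; the buffers at 0 and 3072 are skipped and merely counted
 * as still under IO if they carry the async_write flag.  The helper below
 * is a hypothetical restatement of that containment test, not code used by
 * ext4.
 */
static bool example_bh_inside_bvec(struct buffer_head *bh,
				   unsigned int bio_start, unsigned int bio_end)
{
    return bh_offset(bh) >= bio_start &&
           bh_offset(bh) + bh->b_size <= bio_end;
}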

static void ext4_release_io_end(ext4_io_end_t *io_end)
{
    struct bio *bio, *next_bio;

    BUG_ON(!list_empty(&io_end->list));
    BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
    WARN_ON(io_end->handle);

    for (bio = io_end->bio; bio; bio = next_bio) {
        next_bio = bio->bi_private;
        ext4_finish_bio(bio);
        bio_put(bio);
    }
    ext4_free_io_end_vec(io_end);
    kmem_cache_free(io_end_cachep, io_end);
}

/*
 * Check a range of space and convert unwritten extents to written. Note that
 * we are protected from truncate touching the same part of the extent tree by
 * the fact that truncate code waits for all DIO to finish (thus exclusion from
 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
 * completed (happens from ext4_release_io_end()).
 */
static int ext4_end_io_end(ext4_io_end_t *io_end)
{
    struct inode *inode = io_end->inode;
    handle_t *handle = io_end->handle;
    int ret = 0;

    ext4_debug("ext4_end_io_end: io_end 0x%p from inode %lu,list->next 0x%p,"
           "list->prev 0x%p\n",
           io_end, inode->i_ino, io_end->list.next, io_end->list.prev);

    io_end->handle = NULL;  /* Following call will use up the handle */
    ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
    if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
        ext4_msg(inode->i_sb, KERN_EMERG,
             "failed to convert unwritten extents to written "
             "extents -- potential data loss!  "
             "(inode %lu, error %d)", inode->i_ino, ret);
    }
    ext4_clear_io_unwritten_flag(io_end);
    ext4_release_io_end(io_end);
    return ret;
}

static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef  EXT4FS_DEBUG
    struct list_head *cur, *before, *after;
    ext4_io_end_t *io_end, *io_end0, *io_end1;

    if (list_empty(head))
        return;

    ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
    list_for_each_entry(io_end, head, list) {
        cur = &io_end->list;
        before = cur->prev;
        io_end0 = container_of(before, ext4_io_end_t, list);
        after = cur->next;
        io_end1 = container_of(after, ext4_io_end_t, list);

        ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
                io_end, inode->i_ino, io_end0, io_end1);
    }
#endif
}

/* Add the io_end to per-inode completed end_io list. */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
    struct ext4_inode_info *ei = EXT4_I(io_end->inode);
    struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
    struct workqueue_struct *wq;
    unsigned long flags;

    /* Only reserved conversions from writeback should enter here */
    WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
    WARN_ON(!io_end->handle && sbi->s_journal);
    spin_lock_irqsave(&ei->i_completed_io_lock, flags);
    wq = sbi->rsv_conversion_wq;
    if (list_empty(&ei->i_rsv_conversion_list))
        queue_work(wq, &ei->i_rsv_conversion_work);
    list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
    spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
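
/*
 * The "queue the work only if the list was empty" idiom above relies on
 * every producer holding i_completed_io_lock: a non-empty list means an
 * earlier producer already queued i_rsv_conversion_work and the worker will
 * see the new entry when it splices the list.  A generic restatement of the
 * idiom with hypothetical names (not ext4 code):
 */
struct example_deferred {
    spinlock_t		lock;
    struct list_head	items;
    struct work_struct	work;
    struct workqueue_struct	*wq;
};

static void example_defer(struct example_deferred *d, struct list_head *item)
{
    unsigned long flags;

    spin_lock_irqsave(&d->lock, flags);
    if (list_empty(&d->items))		/* first item => arm the worker */
        queue_work(d->wq, &d->work);
    list_add_tail(item, &d->items);
    spin_unlock_irqrestore(&d->lock, flags);
}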

static int ext4_do_flush_completed_IO(struct inode *inode,
                      struct list_head *head)
{
    ext4_io_end_t *io_end;
    struct list_head unwritten;
    unsigned long flags;
    struct ext4_inode_info *ei = EXT4_I(inode);
    int err, ret = 0;

    spin_lock_irqsave(&ei->i_completed_io_lock, flags);
    dump_completed_IO(inode, head);
    list_replace_init(head, &unwritten);
    spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

    while (!list_empty(&unwritten)) {
        io_end = list_entry(unwritten.next, ext4_io_end_t, list);
        BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
        list_del_init(&io_end->list);

        err = ext4_end_io_end(io_end);
        if (unlikely(!ret && err))
            ret = err;
    }
    return ret;
}
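
/*
 * The function above uses the common "splice under the lock, process outside
 * it" pattern: list_replace_init() detaches the whole completed-io list onto
 * a local head while i_completed_io_lock is held, so each io_end can then be
 * converted (a sleeping operation) without holding a spinlock.  A stripped
 * down restatement, reusing the hypothetical example_deferred structure from
 * the sketch further up:
 */
static void example_drain(struct example_deferred *d,
			  void (*process)(struct list_head *item))
{
    LIST_HEAD(local);
    struct list_head *item, *tmp;
    unsigned long flags;

    spin_lock_irqsave(&d->lock, flags);
    list_replace_init(&d->items, &local);	/* detach everything at once */
    spin_unlock_irqrestore(&d->lock, flags);

    list_for_each_safe(item, tmp, &local) {
        list_del_init(item);
        process(item);			/* may sleep; the lock is dropped */
    }
}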

/*
 * work on completed IO, to convert unwritten extents to written extents
 */
void ext4_end_io_rsv_work(struct work_struct *work)
{
    struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
                          i_rsv_conversion_work);
    ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}

ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
    ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);

    if (io_end) {
        io_end->inode = inode;
        INIT_LIST_HEAD(&io_end->list);
        INIT_LIST_HEAD(&io_end->list_vec);
        refcount_set(&io_end->count, 1);
    }
    return io_end;
}

void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
    if (refcount_dec_and_test(&io_end->count)) {
        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
                list_empty(&io_end->list_vec)) {
            ext4_release_io_end(io_end);
            return;
        }
        ext4_add_complete_io(io_end);
    }
}

int ext4_put_io_end(ext4_io_end_t *io_end)
{
    int err = 0;

    if (refcount_dec_and_test(&io_end->count)) {
        if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
            err = ext4_convert_unwritten_io_end_vec(io_end->handle,
                                io_end);
            io_end->handle = NULL;
            ext4_clear_io_unwritten_flag(io_end);
        }
        ext4_release_io_end(io_end);
    }
    return err;
}

ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
    refcount_inc(&io_end->count);
    return io_end;
}
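
/*
 * Sketch of the reference-counting protocol implemented by the helpers
 * above, as the writeback path uses it: the submitter holds the reference
 * taken by ext4_init_io_end(), each bio takes its own reference via
 * ext4_get_io_end() (done in io_submit_init_bio() below) and drops it on
 * completion with ext4_put_io_end_defer(), and the submitter drops the last
 * reference with ext4_put_io_end() once all bios have been sent.  Condensed
 * into an illustrative function with error handling omitted:
 */
static int example_io_end_lifecycle(struct inode *inode, struct bio *bio)
{
    ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);

    if (!io_end)
        return -ENOMEM;

    bio->bi_private = ext4_get_io_end(io_end);	/* extra ref for the bio */
    /* bio->bi_end_io would be set to ext4_end_bio(), which drops that ref */
    submit_bio(bio);

    return ext4_put_io_end(io_end);		/* submitter's own ref */
}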

/* BIO completion function for page writeback */
static void ext4_end_bio(struct bio *bio)
{
    ext4_io_end_t *io_end = bio->bi_private;
    sector_t bi_sector = bio->bi_iter.bi_sector;

    if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
              bio->bi_bdev,
              (long long) bio->bi_iter.bi_sector,
              (unsigned) bio_sectors(bio),
              bio->bi_status)) {
        ext4_finish_bio(bio);
        bio_put(bio);
        return;
    }
    bio->bi_end_io = NULL;

    if (bio->bi_status) {
        struct inode *inode = io_end->inode;

        ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
                 "(starting block %llu)",
                 bio->bi_status, inode->i_ino,
                 (unsigned long long)
                 bi_sector >> (inode->i_blkbits - 9));
        mapping_set_error(inode->i_mapping,
                blk_status_to_errno(bio->bi_status));
    }

    if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
        /*
         * Link bio into list hanging from io_end. We have to do it
         * atomically as bio completions can be racing against each
         * other.
         */
        bio->bi_private = xchg(&io_end->bio, bio);
        ext4_put_io_end_defer(io_end);
    } else {
        /*
         * Drop io_end reference early. Inode can get freed once
         * we finish the bio.
         */
        ext4_put_io_end_defer(io_end);
        ext4_finish_bio(bio);
        bio_put(bio);
    }
}
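
/*
 * The xchg() above implements a lock-free "push onto a singly linked list",
 * with bi_private reused as the link field: each completing bio atomically
 * installs itself as the new list head and inherits the previous head as
 * its successor.  ext4_release_io_end() later walks and frees that chain.
 * The same idiom on a hypothetical node type, purely for illustration:
 */
struct example_node {
    struct example_node *next;
};

static void example_push(struct example_node **head, struct example_node *node)
{
    /* Atomically make *head point at node; node inherits the old head. */
    node->next = xchg(head, node);
}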

void ext4_io_submit(struct ext4_io_submit *io)
{
    struct bio *bio = io->io_bio;

    if (bio) {
        if (io->io_wbc->sync_mode == WB_SYNC_ALL)
            io->io_bio->bi_opf |= REQ_SYNC;
        submit_bio(io->io_bio);
    }
    io->io_bio = NULL;
}

void ext4_io_submit_init(struct ext4_io_submit *io,
             struct writeback_control *wbc)
{
    io->io_wbc = wbc;
    io->io_bio = NULL;
    io->io_end = NULL;
}
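
/*
 * Sketch of how the submission context set up by ext4_io_submit_init() is
 * typically driven by the writeback code (see the callers in
 * fs/ext4/inode.c): one ext4_io_submit spans many pages, bios are built up
 * incrementally by ext4_bio_write_page(), and whatever is still pending is
 * flushed with a final ext4_io_submit().  Simplified here to a single page;
 * the function name is hypothetical.
 */
static int example_writeback_one_page(struct page *page,
				      struct writeback_control *wbc,
				      ext4_io_end_t *io_end)
{
    struct ext4_io_submit io;
    int err;

    ext4_io_submit_init(&io, wbc);
    io.io_end = io_end;			/* conversion context, if any */
    err = ext4_bio_write_page(&io, page, PAGE_SIZE, false);
    ext4_io_submit(&io);		/* flush the last partial bio */
    return err;
}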

static void io_submit_init_bio(struct ext4_io_submit *io,
                   struct buffer_head *bh)
{
    struct bio *bio;

    /*
     * bio_alloc will _always_ be able to allocate a bio if
     * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
     */
    bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO);
    fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
    bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
    bio->bi_end_io = ext4_end_bio;
    bio->bi_private = ext4_get_io_end(io->io_end);
    io->io_bio = bio;
    io->io_next_block = bh->b_blocknr;
    wbc_init_bio(io->io_wbc, bio);
}
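
/*
 * The bi_sector computation above converts a filesystem block number into a
 * 512-byte sector number: b_size >> 9 is the number of sectors per block,
 * so with 4k blocks (b_size == 4096) block 100 starts at sector 800.  A
 * hypothetical helper spelling that out:
 */
static sector_t example_block_to_sector(struct buffer_head *bh)
{
    return bh->b_blocknr * (bh->b_size >> 9);
}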

static void io_submit_add_bh(struct ext4_io_submit *io,
                 struct inode *inode,
                 struct page *page,
                 struct buffer_head *bh)
{
    int ret;

    if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
               !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
submit_and_retry:
        ext4_io_submit(io);
    }
    if (io->io_bio == NULL)
        io_submit_init_bio(io, bh);
    ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
    if (ret != bh->b_size)
        goto submit_and_retry;
    wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
    io->io_next_block++;
}

int ext4_bio_write_page(struct ext4_io_submit *io,
            struct page *page,
            int len,
            bool keep_towrite)
{
    struct page *bounce_page = NULL;
    struct inode *inode = page->mapping->host;
    unsigned block_start;
    struct buffer_head *bh, *head;
    int ret = 0;
    int nr_submitted = 0;
    int nr_to_submit = 0;
    struct writeback_control *wbc = io->io_wbc;

    BUG_ON(!PageLocked(page));
    BUG_ON(PageWriteback(page));

    if (keep_towrite)
        set_page_writeback_keepwrite(page);
    else
        set_page_writeback(page);
    ClearPageError(page);

    /*
     * Comments copied from block_write_full_page:
     *
     * The page straddles i_size.  It must be zeroed out on each and every
     * writepage invocation because it may be mmapped.  "A file is mapped
     * in multiples of the page size.  For a file that is not a multiple of
     * the page size, the remaining memory is zeroed when mapped, and
     * writes to that region are not written out to the file."
     */
    if (len < PAGE_SIZE)
        zero_user_segment(page, len, PAGE_SIZE);
    /*
     * In the first loop we prepare and mark buffers to submit. We have to
     * mark all buffers in the page before submitting so that
     * end_page_writeback() cannot be called from ext4_end_bio() when IO
     * on the first buffer finishes and we are still working on submitting
     * the second buffer.
     */
    bh = head = page_buffers(page);
    do {
        block_start = bh_offset(bh);
        if (block_start >= len) {
            clear_buffer_dirty(bh);
            set_buffer_uptodate(bh);
            continue;
        }
        if (!buffer_dirty(bh) || buffer_delay(bh) ||
            !buffer_mapped(bh) || buffer_unwritten(bh)) {
            /* A hole? We can safely clear the dirty bit */
            if (!buffer_mapped(bh))
                clear_buffer_dirty(bh);
            if (io->io_bio)
                ext4_io_submit(io);
            continue;
        }
        if (buffer_new(bh))
            clear_buffer_new(bh);
        set_buffer_async_write(bh);
        nr_to_submit++;
    } while ((bh = bh->b_this_page) != head);

    bh = head = page_buffers(page);

    /*
     * If any blocks are being written to an encrypted file, encrypt them
     * into a bounce page.  For simplicity, just encrypt until the last
     * block which might be needed.  This may cause some unneeded blocks
     * (e.g. holes) to be unnecessarily encrypted, but this is rare and
     * can't happen in the common case of blocksize == PAGE_SIZE.
     */
    if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
        gfp_t gfp_flags = GFP_NOFS;
        unsigned int enc_bytes = round_up(len, i_blocksize(inode));

        /*
         * Since bounce page allocation uses a mempool, we can only use
         * a waiting mask (i.e. request guaranteed allocation) on the
         * first page of the bio.  Otherwise it can deadlock.
         */
        if (io->io_bio)
            gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
    retry_encrypt:
        bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
                                   0, gfp_flags);
        if (IS_ERR(bounce_page)) {
            ret = PTR_ERR(bounce_page);
            if (ret == -ENOMEM &&
                (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
                gfp_t new_gfp_flags = GFP_NOFS;
                if (io->io_bio)
                    ext4_io_submit(io);
                else
                    new_gfp_flags |= __GFP_NOFAIL;
                memalloc_retry_wait(gfp_flags);
                gfp_flags = new_gfp_flags;
                goto retry_encrypt;
            }

            printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
            redirty_page_for_writepage(wbc, page);
            do {
                clear_buffer_async_write(bh);
                bh = bh->b_this_page;
            } while (bh != head);
            goto unlock;
        }
    }

    /* Now submit buffers to write */
    do {
        if (!buffer_async_write(bh))
            continue;
        io_submit_add_bh(io, inode,
                 bounce_page ? bounce_page : page, bh);
        nr_submitted++;
        clear_buffer_dirty(bh);
    } while ((bh = bh->b_this_page) != head);

unlock:
    unlock_page(page);
    /* Nothing submitted - we have to end page writeback */
    if (!nr_submitted)
        end_page_writeback(page);
    return ret;
}
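
/*
 * The retry_encrypt logic in ext4_bio_write_page() above follows a common
 * pattern for mempool-backed allocations: once a bio already holds one
 * bounce page, further allocations must be opportunistic (GFP_NOWAIT) to
 * avoid deadlocking against the mempool, and on -ENOMEM the partially built
 * bio is submitted to release its pages before retrying with a blocking
 * mask.  A generic restatement of that pattern with a hypothetical
 * allocator callback, for illustration only:
 */
static void *example_alloc_with_retry(struct ext4_io_submit *io,
				      void *(*alloc)(gfp_t gfp))
{
    gfp_t gfp_flags = io->io_bio ? GFP_NOWAIT | __GFP_NOWARN : GFP_NOFS;
    void *p;

retry:
    p = alloc(gfp_flags);
    if (!p && io->io_bio) {
        ext4_io_submit(io);		/* release what pins the pool */
        gfp_flags = GFP_NOFS;		/* now safe to block */
        goto retry;
    }
    return p;
}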