// SPDX-License-Identifier: GPL-2.0-or-later
/**
 * aops.c - NTFS kernel address space operations and page cache handling.
 *
 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
 * Copyright (c) 2002 Richard Russon
 */

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
#include <linux/bio.h>

#include "aops.h"
#include "attrib.h"
#include "debug.h"
#include "inode.h"
#include "mft.h"
#include "runlist.h"
#include "types.h"
#include "ntfs.h"

/**
 * ntfs_end_buffer_async_read - async io completion for reading attributes
 * @bh:     buffer head on which io is completed
 * @uptodate:   whether @bh is now uptodate or not
 *
 * Asynchronous I/O completion handler for reading pages belonging to the
 * attribute address space of an inode.  The inodes can either be files or
 * directories or they can be fake inodes describing some attribute.
 *
 * If NInoMstProtected(), perform the post read mst fixups when all IO on the
 * page has been completed and mark the page uptodate or set the error bit on
 * the page.  To determine the size of the records that need fixing up, we
 * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
 * record size, and index_block_size_bits, to the log(base 2) of the ntfs
 * record size.
 */
static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
    unsigned long flags;
    struct buffer_head *first, *tmp;
    struct page *page;
    struct inode *vi;
    ntfs_inode *ni;
    int page_uptodate = 1;

    page = bh->b_page;
    vi = page->mapping->host;
    ni = NTFS_I(vi);

    if (likely(uptodate)) {
        loff_t i_size;
        s64 file_ofs, init_size;

        set_buffer_uptodate(bh);

        file_ofs = ((s64)page->index << PAGE_SHIFT) +
                bh_offset(bh);
        read_lock_irqsave(&ni->size_lock, flags);
        init_size = ni->initialized_size;
        i_size = i_size_read(vi);
        read_unlock_irqrestore(&ni->size_lock, flags);
        if (unlikely(init_size > i_size)) {
            /* Race with shrinking truncate. */
            init_size = i_size;
        }
        /* Check for the current buffer head overflowing. */
        if (unlikely(file_ofs + bh->b_size > init_size)) {
            int ofs;
            void *kaddr;

            ofs = 0;
            if (file_ofs < init_size)
                ofs = init_size - file_ofs;
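            /*
             * Only bytes [0, ofs) of this buffer are initialized
             * on disk; zero the tail so stale data beyond the
             * initialized size never reaches the page cache.
             */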
            kaddr = kmap_atomic(page);
            memset(kaddr + bh_offset(bh) + ofs, 0,
                    bh->b_size - ofs);
            flush_dcache_page(page);
            kunmap_atomic(kaddr);
        }
    } else {
        clear_buffer_uptodate(bh);
        SetPageError(page);
        ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
                "0x%llx.", (unsigned long long)bh->b_blocknr);
    }
    first = page_buffers(page);
    spin_lock_irqsave(&first->b_uptodate_lock, flags);
    clear_buffer_async_read(bh);
    unlock_buffer(bh);
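    /*
     * The buffer heads of a page form a circular, singly linked list
     * through b_this_page, so the loop below visits each buffer exactly
     * once.  Holding the first buffer's b_uptodate_lock makes the
     * check-and-clear of the async_read flags atomic with respect to
     * the completion handlers of the other buffers in the page.
     */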
    tmp = bh;
    do {
        if (!buffer_uptodate(tmp))
            page_uptodate = 0;
        if (buffer_async_read(tmp)) {
            if (likely(buffer_locked(tmp)))
                goto still_busy;
            /* Async buffers must be locked. */
            BUG();
        }
        tmp = tmp->b_this_page;
    } while (tmp != bh);
    spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
    /*
     * If none of the buffers had errors then we can set the page uptodate,
     * but we first have to perform the post read mst fixups, if the
     * attribute is mst protected, i.e. if NInoMstProtected(ni) is true.
     * Note we ignore fixup errors as those are detected when
     * map_mft_record() is called which gives us per record granularity
     * rather than per page granularity.
     */
    if (!NInoMstProtected(ni)) {
        if (likely(page_uptodate && !PageError(page)))
            SetPageUptodate(page);
    } else {
        u8 *kaddr;
        unsigned int i, recs;
        u32 rec_size;

        rec_size = ni->itype.index.block_size;
        recs = PAGE_SIZE / rec_size;
        /* Should have been verified before we got here... */
        BUG_ON(!recs);
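        /*
         * Example (assuming 4 KiB pages and 1 KiB ntfs records):
         * recs is 4 and the fixups are applied to the records at
         * page offsets 0x0, 0x400, 0x800 and 0xc00.
         */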
        kaddr = kmap_atomic(page);
        for (i = 0; i < recs; i++)
            post_read_mst_fixup((NTFS_RECORD*)(kaddr +
                    i * rec_size), rec_size);
        kunmap_atomic(kaddr);
        flush_dcache_page(page);
        if (likely(page_uptodate && !PageError(page)))
            SetPageUptodate(page);
    }
    unlock_page(page);
    return;
still_busy:
    spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
    return;
}

/**
 * ntfs_read_block - fill a @page of an address space with data
 * @page:   page cache page to fill with data
 *
 * Fill the page @page of the address space belonging to the @page->host inode.
 * We read each buffer asynchronously and when all buffers are read in, our io
 * completion handler ntfs_end_buffer_async_read(), if required, automatically
 * applies the mst fixups to the page before finally marking it uptodate and
 * unlocking it.
 *
 * We only enforce allocated_size limit because i_size is checked for in
 * generic_file_read().
 *
 * Return 0 on success and -errno on error.
 *
 * Contains an adapted version of fs/buffer.c::block_read_full_folio().
 */
static int ntfs_read_block(struct page *page)
{
    loff_t i_size;
    VCN vcn;
    LCN lcn;
    s64 init_size;
    struct inode *vi;
    ntfs_inode *ni;
    ntfs_volume *vol;
    runlist_element *rl;
    struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
    sector_t iblock, lblock, zblock;
    unsigned long flags;
    unsigned int blocksize, vcn_ofs;
    int i, nr;
    unsigned char blocksize_bits;

    vi = page->mapping->host;
    ni = NTFS_I(vi);
    vol = ni->vol;

    /* $MFT/$DATA must have its complete runlist in memory at all times. */
    BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));

    blocksize = vol->sb->s_blocksize;
    blocksize_bits = vol->sb->s_blocksize_bits;

    if (!page_has_buffers(page)) {
        create_empty_buffers(page, blocksize, 0);
        if (unlikely(!page_has_buffers(page))) {
            unlock_page(page);
            return -ENOMEM;
        }
    }
    bh = head = page_buffers(page);
    BUG_ON(!bh);

    /*
     * We may be racing with truncate.  To avoid some of the problems we
     * now take a snapshot of the various sizes and use those for the whole
     * of the function.  In case of an extending truncate it just means we
     * may leave some buffers unmapped which are now allocated.  This is
     * not a problem since these buffers will just get mapped when a write
     * occurs.  In case of a shrinking truncate, we will detect this later
     * on due to the runlist being incomplete and if the page is being
     * fully truncated, truncate will throw it away as soon as we unlock
     * it so no need to worry what we do with it.
     */
    iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
    read_lock_irqsave(&ni->size_lock, flags);
    lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
    init_size = ni->initialized_size;
    i_size = i_size_read(vi);
    read_unlock_irqrestore(&ni->size_lock, flags);
    if (unlikely(init_size > i_size)) {
        /* Race with shrinking truncate. */
        init_size = i_size;
    }
    zblock = (init_size + blocksize - 1) >> blocksize_bits;
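    /*
     * To summarize the snapshot: iblock is the first block covered by
     * this page, lblock is the first block beyond the allocated size,
     * and zblock is the first block beyond the initialized data.
     * Blocks at or beyond zblock read back as zeroes.
     */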

    /* Loop through all the buffers in the page. */
    rl = NULL;
    nr = i = 0;
    do {
        int err = 0;

        if (unlikely(buffer_uptodate(bh)))
            continue;
        if (unlikely(buffer_mapped(bh))) {
            arr[nr++] = bh;
            continue;
        }
        bh->b_bdev = vol->sb->s_bdev;
        /* Is the block within the allowed limits? */
        if (iblock < lblock) {
            bool is_retry = false;

            /* Convert iblock into corresponding vcn and offset. */
            vcn = (VCN)iblock << blocksize_bits >>
                    vol->cluster_size_bits;
            vcn_ofs = ((VCN)iblock << blocksize_bits) &
                    vol->cluster_size_mask;
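            /*
             * E.g., with 512-byte blocks and 4 KiB clusters,
             * iblock 9 is byte offset 4608 into the attribute,
             * which is vcn 1 at vcn_ofs 0x200.
             */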
            if (!rl) {
lock_retry_remap:
                down_read(&ni->runlist.lock);
                rl = ni->runlist.rl;
            }
            if (likely(rl != NULL)) {
                /* Seek to element containing target vcn. */
                while (rl->length && rl[1].vcn <= vcn)
                    rl++;
                lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
            } else
                lcn = LCN_RL_NOT_MAPPED;
            /* Successful remap. */
            if (lcn >= 0) {
                /* Setup buffer head to correct block. */
                bh->b_blocknr = ((lcn << vol->cluster_size_bits)
                        + vcn_ofs) >> blocksize_bits;
                set_buffer_mapped(bh);
                /* Only read initialized data blocks. */
                if (iblock < zblock) {
                    arr[nr++] = bh;
                    continue;
                }
                /* Fully non-initialized data block, zero it. */
                goto handle_zblock;
            }
            /* It is a hole, need to zero it. */
            if (lcn == LCN_HOLE)
                goto handle_hole;
            /* If first try and runlist unmapped, map and retry. */
            if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
                is_retry = true;
                /*
                 * Attempt to map runlist, dropping lock for
                 * the duration.
                 */
                up_read(&ni->runlist.lock);
                err = ntfs_map_runlist(ni, vcn);
                if (likely(!err))
                    goto lock_retry_remap;
                rl = NULL;
            } else if (!rl)
                up_read(&ni->runlist.lock);
            /*
             * If buffer is outside the runlist, treat it as a
             * hole.  This can happen due to concurrent truncate
             * for example.
             */
            if (err == -ENOENT || lcn == LCN_ENOENT) {
                err = 0;
                goto handle_hole;
            }
            /* Hard error, zero out region. */
            if (!err)
                err = -EIO;
            bh->b_blocknr = -1;
            SetPageError(page);
            ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
                    "attribute type 0x%x, vcn 0x%llx, "
                    "offset 0x%x because its location on "
                    "disk could not be determined%s "
                    "(error code %i).", ni->mft_no,
                    ni->type, (unsigned long long)vcn,
                    vcn_ofs, is_retry ? " even after "
                    "retrying" : "", err);
        }
        /*
         * Either iblock was outside lblock limits or
         * ntfs_rl_vcn_to_lcn() returned error.  Just zero that portion
         * of the page and set the buffer uptodate.
         */
handle_hole:
        bh->b_blocknr = -1UL;
        clear_buffer_mapped(bh);
handle_zblock:
        zero_user(page, i * blocksize, blocksize);
        if (likely(!err))
            set_buffer_uptodate(bh);
    } while (i++, iblock++, (bh = bh->b_this_page) != head);

    /* Release the lock if we took it. */
    if (rl)
        up_read(&ni->runlist.lock);

    /* Check we have at least one buffer ready for i/o. */
    if (nr) {
        struct buffer_head *tbh;

        /* Lock the buffers. */
        for (i = 0; i < nr; i++) {
            tbh = arr[i];
            lock_buffer(tbh);
            tbh->b_end_io = ntfs_end_buffer_async_read;
            set_buffer_async_read(tbh);
        }
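        /*
         * All buffers are locked and flagged async before any I/O
         * is submitted.  ntfs_end_buffer_async_read() treats the
         * presence of another locked async buffer as "page still
         * busy", so completing the first read cannot unlock the
         * page while later buffers are still being set up.
         */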
        /* Finally, start i/o on the buffers. */
        for (i = 0; i < nr; i++) {
            tbh = arr[i];
            if (likely(!buffer_uptodate(tbh)))
                submit_bh(REQ_OP_READ, tbh);
            else
                ntfs_end_buffer_async_read(tbh, 1);
        }
        return 0;
    }
    /* No i/o was scheduled on any of the buffers. */
    if (likely(!PageError(page)))
        SetPageUptodate(page);
    else /* Signal synchronous i/o error. */
        nr = -EIO;
    unlock_page(page);
    return nr;
}

/**
 * ntfs_read_folio - fill a @folio of a @file with data from the device
 * @file:   open file to which the folio @folio belongs or NULL
 * @folio:  page cache folio to fill with data
 *
 * For non-resident attributes, ntfs_read_folio() fills the @folio of the open
 * file @file by calling the ntfs version of the generic block_read_full_folio()
 * function, ntfs_read_block(), which in turn creates and reads in the buffers
 * associated with the folio asynchronously.
 *
 * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
 * data from the mft record (which at this stage is most likely in memory) and
 * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
 * even if the mft record is not cached at this point in time, we need to wait
 * for it to be read in before we can do the copy.
 *
 * Return 0 on success and -errno on error.
 */
static int ntfs_read_folio(struct file *file, struct folio *folio)
{
    struct page *page = &folio->page;
    loff_t i_size;
    struct inode *vi;
    ntfs_inode *ni, *base_ni;
    u8 *addr;
    ntfs_attr_search_ctx *ctx;
    MFT_RECORD *mrec;
    unsigned long flags;
    u32 attr_len;
    int err = 0;

retry_readpage:
    BUG_ON(!PageLocked(page));
    vi = page->mapping->host;
    i_size = i_size_read(vi);
    /* Is the page fully outside i_size? (truncate in progress) */
    if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
            PAGE_SHIFT)) {
        zero_user(page, 0, PAGE_SIZE);
        ntfs_debug("Read outside i_size - truncated?");
        goto done;
    }
    /*
     * This can potentially happen because we clear PageUptodate() during
     * ntfs_writepage() of MstProtected() attributes.
     */
    if (PageUptodate(page)) {
        unlock_page(page);
        return 0;
    }
    ni = NTFS_I(vi);
    /*
     * Only $DATA attributes can be encrypted and only unnamed $DATA
     * attributes can be compressed.  Index root can have the flags set but
     * this means to create compressed/encrypted files, not that the
     * attribute is compressed/encrypted.  Note we need to check for
     * AT_INDEX_ALLOCATION since this is the type of both directory and
     * index inodes.
     */
    if (ni->type != AT_INDEX_ALLOCATION) {
        /* If attribute is encrypted, deny access, just like NT4. */
        if (NInoEncrypted(ni)) {
            BUG_ON(ni->type != AT_DATA);
            err = -EACCES;
            goto err_out;
        }
        /* Compressed data streams are handled in compress.c. */
        if (NInoNonResident(ni) && NInoCompressed(ni)) {
            BUG_ON(ni->type != AT_DATA);
            BUG_ON(ni->name_len);
            return ntfs_read_compressed_block(page);
        }
    }
    /* NInoNonResident() == NInoIndexAllocPresent() */
    if (NInoNonResident(ni)) {
        /* Normal, non-resident data stream. */
        return ntfs_read_block(page);
    }
    /*
     * Attribute is resident, implying it is not compressed or encrypted.
     * This also means the attribute is smaller than an mft record and
     * hence smaller than a page, so we can simply zero out any pages with
     * index above 0.  Note the attribute can actually be marked compressed
     * but if it is resident the actual data is not compressed so we are
     * ok to ignore the compressed flag here.
     */
    if (unlikely(page->index > 0)) {
        zero_user(page, 0, PAGE_SIZE);
        goto done;
    }
    if (!NInoAttr(ni))
        base_ni = ni;
    else
        base_ni = ni->ext.base_ntfs_ino;
    /* Map, pin, and lock the mft record. */
    mrec = map_mft_record(base_ni);
    if (IS_ERR(mrec)) {
        err = PTR_ERR(mrec);
        goto err_out;
    }
    /*
     * If a parallel write made the attribute non-resident, drop the mft
     * record and retry the read_folio.
     */
    if (unlikely(NInoNonResident(ni))) {
        unmap_mft_record(base_ni);
        goto retry_readpage;
    }
    ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
    if (unlikely(!ctx)) {
        err = -ENOMEM;
        goto unm_err_out;
    }
    err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
            CASE_SENSITIVE, 0, NULL, 0, ctx);
    if (unlikely(err))
        goto put_unm_err_out;
    attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
    read_lock_irqsave(&ni->size_lock, flags);
    if (unlikely(attr_len > ni->initialized_size))
        attr_len = ni->initialized_size;
    i_size = i_size_read(vi);
    read_unlock_irqrestore(&ni->size_lock, flags);
    if (unlikely(attr_len > i_size)) {
        /* Race with shrinking truncate. */
        attr_len = i_size;
    }
    addr = kmap_atomic(page);
    /* Copy the data to the page. */
    memcpy(addr, (u8*)ctx->attr +
            le16_to_cpu(ctx->attr->data.resident.value_offset),
            attr_len);
    /* Zero the remainder of the page. */
    memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
    flush_dcache_page(page);
    kunmap_atomic(addr);
put_unm_err_out:
    ntfs_attr_put_search_ctx(ctx);
unm_err_out:
    unmap_mft_record(base_ni);
done:
    SetPageUptodate(page);
err_out:
    unlock_page(page);
    return err;
}

#ifdef NTFS_RW

/**
 * ntfs_write_block - write a @page to the backing store
 * @page:   page cache page to write out
 * @wbc:    writeback control structure
 *
 * This function is for writing pages belonging to non-resident, non-mst
 * protected attributes to their backing store.
 *
 * For a page with buffers, map and write the dirty buffers asynchronously
 * under page writeback. For a page without buffers, create buffers for the
 * page, then proceed as above.
 *
 * If a page doesn't have buffers the page dirty state is definitive. If a page
 * does have buffers, the page dirty state is just a hint, and the buffer dirty
 * state is definitive. (A hint which has rules: dirty buffers against a clean
 * page is illegal. Other combinations are legal and need to be handled. In
 * particular a dirty page containing clean buffers for example.)
 *
 * Return 0 on success and -errno on error.
 *
 * Based on ntfs_read_block() and __block_write_full_page().
 */
static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
{
    VCN vcn;
    LCN lcn;
    s64 initialized_size;
    loff_t i_size;
    sector_t block, dblock, iblock;
    struct inode *vi;
    ntfs_inode *ni;
    ntfs_volume *vol;
    runlist_element *rl;
    struct buffer_head *bh, *head;
    unsigned long flags;
    unsigned int blocksize, vcn_ofs;
    int err;
    bool need_end_writeback;
    unsigned char blocksize_bits;

    vi = page->mapping->host;
    ni = NTFS_I(vi);
    vol = ni->vol;

    ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
            "0x%lx.", ni->mft_no, ni->type, page->index);

    BUG_ON(!NInoNonResident(ni));
    BUG_ON(NInoMstProtected(ni));
    blocksize = vol->sb->s_blocksize;
    blocksize_bits = vol->sb->s_blocksize_bits;
    if (!page_has_buffers(page)) {
        BUG_ON(!PageUptodate(page));
        create_empty_buffers(page, blocksize,
                (1 << BH_Uptodate) | (1 << BH_Dirty));
        if (unlikely(!page_has_buffers(page))) {
            ntfs_warning(vol->sb, "Error allocating page "
                    "buffers.  Redirtying page so we try "
                    "again later.");
            /*
             * Put the page back on mapping->dirty_pages, but leave
             * its buffers' dirty state as-is.
             */
            redirty_page_for_writepage(wbc, page);
            unlock_page(page);
            return 0;
        }
    }
    bh = head = page_buffers(page);
    BUG_ON(!bh);

    /* NOTE: Different naming scheme to ntfs_read_block()! */

    /* The first block in the page. */
    block = (s64)page->index << (PAGE_SHIFT - blocksize_bits);

    read_lock_irqsave(&ni->size_lock, flags);
    i_size = i_size_read(vi);
    initialized_size = ni->initialized_size;
    read_unlock_irqrestore(&ni->size_lock, flags);

    /* The first out of bounds block for the data size. */
    dblock = (i_size + blocksize - 1) >> blocksize_bits;

    /* The last (fully or partially) initialized block. */
    iblock = initialized_size >> blocksize_bits;
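    /*
     * Note the rounding: dblock rounds up, so it is the first block
     * past i_size, while iblock rounds down, so it is the block in
     * which initialized_size falls (or the first fully uninitialized
     * block when initialized_size is block aligned).
     */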

    /*
     * Be very careful.  We have no exclusion from block_dirty_folio
     * here, and the (potentially unmapped) buffers may become dirty at
     * any time.  If a buffer becomes dirty here after we've inspected it
     * then we just miss that fact, and the page stays dirty.
     *
     * Buffers outside i_size may be dirtied by block_dirty_folio;
     * handle that here by just cleaning them.
     */

    /*
     * Loop through all the buffers in the page, mapping all the dirty
     * buffers to disk addresses and handling any aliases from the
     * underlying block device's mapping.
     */
    rl = NULL;
    err = 0;
    do {
        bool is_retry = false;

        if (unlikely(block >= dblock)) {
            /*
             * Mapped buffers outside i_size will occur, because
             * this page can be outside i_size when there is a
             * truncate in progress. The contents of such buffers
             * were zeroed by ntfs_writepage().
             *
             * FIXME: What about the small race window where
             * ntfs_writepage() has not done any clearing because
             * the page was within i_size but before we get here,
             * vmtruncate() modifies i_size?
             */
            clear_buffer_dirty(bh);
            set_buffer_uptodate(bh);
            continue;
        }

        /* Clean buffers are not written out, so no need to map them. */
        if (!buffer_dirty(bh))
            continue;

        /* Make sure we have enough initialized size. */
        if (unlikely((block >= iblock) &&
                (initialized_size < i_size))) {
            /*
             * If this page is fully outside initialized
             * size, zero out all pages between the current
             * initialized size and the current page. Just
             * use ntfs_read_folio() to do the zeroing
             * transparently.
             */
            if (block > iblock) {
                // TODO:
                // For each page do:
                // - read_cache_page()
                // Again for each page do:
                // - wait_on_page_locked()
                // - Check (PageUptodate(page) &&
                //          !PageError(page))
                // Update initialized size in the attribute and
                // in the inode.
                // Again, for each page do:
                //  block_dirty_folio();
                // put_page()
                // We don't need to wait on the writes.
                // Update iblock.
            }
            /*
             * The current page straddles initialized size. Zero
             * all non-uptodate buffers and set them uptodate (and
             * dirty?). Note, there aren't any non-uptodate buffers
             * if the page is uptodate.
             * FIXME: For an uptodate page, the buffers may need to
             * be written out because they were not initialized on
             * disk before.
             */
            if (!PageUptodate(page)) {
                // TODO:
                // Zero any non-uptodate buffers up to i_size.
                // Set them uptodate and dirty.
            }
            // TODO:
            // Update initialized size in the attribute and in the
            // inode (up to i_size).
            // Update iblock.
            // FIXME: This is inefficient. Try to batch the two
            // size changes to happen in one go.
            ntfs_error(vol->sb, "Writing beyond initialized size "
                    "is not supported yet. Sorry.");
            err = -EOPNOTSUPP;
            break;
            // Do NOT set_buffer_new() BUT DO clear buffer range
            // outside write request range.
            // set_buffer_uptodate() on complete buffers as well as
            // set_buffer_dirty().
        }

        /* No need to map buffers that are already mapped. */
        if (buffer_mapped(bh))
            continue;

        /* Unmapped, dirty buffer. Need to map it. */
        bh->b_bdev = vol->sb->s_bdev;

        /* Convert block into corresponding vcn and offset. */
        vcn = (VCN)block << blocksize_bits;
        vcn_ofs = vcn & vol->cluster_size_mask;
        vcn >>= vol->cluster_size_bits;
        if (!rl) {
lock_retry_remap:
            down_read(&ni->runlist.lock);
            rl = ni->runlist.rl;
        }
        if (likely(rl != NULL)) {
            /* Seek to element containing target vcn. */
            while (rl->length && rl[1].vcn <= vcn)
                rl++;
            lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
        } else
            lcn = LCN_RL_NOT_MAPPED;
        /* Successful remap. */
        if (lcn >= 0) {
            /* Setup buffer head to point to correct block. */
            bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
                    vcn_ofs) >> blocksize_bits;
            set_buffer_mapped(bh);
            continue;
        }
        /* It is a hole, need to instantiate it. */
        if (lcn == LCN_HOLE) {
            u8 *kaddr;
            unsigned long *bpos, *bend;

            /* Check if the buffer is zero. */
            kaddr = kmap_atomic(page);
            bpos = (unsigned long *)(kaddr + bh_offset(bh));
            bend = (unsigned long *)((u8*)bpos + blocksize);
            do {
                if (unlikely(*bpos))
                    break;
            } while (likely(++bpos < bend));
            kunmap_atomic(kaddr);
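            /*
             * The scan above compares the buffer one machine word
             * at a time; bpos only reaches bend if every word was
             * zero, in which case the buffer can stay a sparse
             * hole and nothing needs to be written.
             */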
            if (bpos == bend) {
                /*
                 * Buffer is zero and sparse, no need to write
                 * it.
                 */
                bh->b_blocknr = -1;
                clear_buffer_dirty(bh);
                continue;
            }
            // TODO: Instantiate the hole.
            // clear_buffer_new(bh);
            // clean_bdev_bh_alias(bh);
            ntfs_error(vol->sb, "Writing into sparse regions is "
                    "not supported yet. Sorry.");
            err = -EOPNOTSUPP;
            break;
        }
        /* If first try and runlist unmapped, map and retry. */
        if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
            is_retry = true;
            /*
             * Attempt to map runlist, dropping lock for
             * the duration.
             */
            up_read(&ni->runlist.lock);
            err = ntfs_map_runlist(ni, vcn);
            if (likely(!err))
                goto lock_retry_remap;
            rl = NULL;
        } else if (!rl)
            up_read(&ni->runlist.lock);
        /*
         * If buffer is outside the runlist, truncate has cut it out
         * of the runlist.  Just clean and clear the buffer and set it
         * uptodate so it can get discarded by the VM.
         */
        if (err == -ENOENT || lcn == LCN_ENOENT) {
            bh->b_blocknr = -1;
            clear_buffer_dirty(bh);
            zero_user(page, bh_offset(bh), blocksize);
            set_buffer_uptodate(bh);
            err = 0;
            continue;
        }
        /* Failed to map the buffer, even after retrying. */
        if (!err)
            err = -EIO;
        bh->b_blocknr = -1;
        ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
                "attribute type 0x%x, vcn 0x%llx, offset 0x%x "
                "because its location on disk could not be "
                "determined%s (error code %i).", ni->mft_no,
                ni->type, (unsigned long long)vcn,
                vcn_ofs, is_retry ? " even after "
                "retrying" : "", err);
        break;
    } while (block++, (bh = bh->b_this_page) != head);

    /* Release the lock if we took it. */
    if (rl)
        up_read(&ni->runlist.lock);

    /* For the error case, need to reset bh to the beginning. */
    bh = head;

    /* Just an optimization, so ->read_folio() is not called later. */
    if (unlikely(!PageUptodate(page))) {
        int uptodate = 1;
        do {
            if (!buffer_uptodate(bh)) {
                uptodate = 0;
                bh = head;
                break;
            }
        } while ((bh = bh->b_this_page) != head);
        if (uptodate)
            SetPageUptodate(page);
    }

    /* Setup all mapped, dirty buffers for async write i/o. */
    do {
        if (buffer_mapped(bh) && buffer_dirty(bh)) {
            lock_buffer(bh);
            if (test_clear_buffer_dirty(bh)) {
                BUG_ON(!buffer_uptodate(bh));
                mark_buffer_async_write(bh);
            } else
                unlock_buffer(bh);
        } else if (unlikely(err)) {
            /*
             * For the error case. The buffer may have been set
             * dirty during attachment to a dirty page.
             */
            if (err != -ENOMEM)
                clear_buffer_dirty(bh);
        }
    } while ((bh = bh->b_this_page) != head);

    if (unlikely(err)) {
        // TODO: Remove the -EOPNOTSUPP check later on...
        if (unlikely(err == -EOPNOTSUPP))
            err = 0;
        else if (err == -ENOMEM) {
            ntfs_warning(vol->sb, "Error allocating memory. "
                    "Redirtying page so we try again "
                    "later.");
            /*
             * Put the page back on mapping->dirty_pages, but
             * leave its buffer's dirty state as-is.
             */
            redirty_page_for_writepage(wbc, page);
            err = 0;
        } else
            SetPageError(page);
    }

    BUG_ON(PageWriteback(page));
    set_page_writeback(page);   /* Keeps try_to_free_buffers() away. */

    /* Submit the prepared buffers for i/o. */
    need_end_writeback = true;
    do {
        struct buffer_head *next = bh->b_this_page;
        if (buffer_async_write(bh)) {
            submit_bh(REQ_OP_WRITE, bh);
            need_end_writeback = false;
        }
        bh = next;
    } while (bh != head);
    unlock_page(page);

    /* If no i/o was started, need to end_page_writeback(). */
    if (unlikely(need_end_writeback))
        end_page_writeback(page);

    ntfs_debug("Done.");
    return err;
}

/**
 * ntfs_write_mst_block - write a @page to the backing store
 * @page:   page cache page to write out
 * @wbc:    writeback control structure
 *
 * This function is for writing pages belonging to non-resident, mst protected
 * attributes to their backing store.  The only supported attributes are index
 * allocation and $MFT/$DATA.  Both directory inodes and index inodes are
 * supported for the index allocation case.
 *
 * The page must remain locked for the duration of the write because we apply
 * the mst fixups, write, and then undo the fixups, so if we were to unlock the
 * page before undoing the fixups, any other user of the page will see the
 * page contents as corrupt.
 *
 * We clear the page uptodate flag for the duration of the function to ensure
 * exclusion for the $MFT/$DATA case against someone mapping an mft record we
 * are about to apply the mst fixups to.
 *
 * Return 0 on success and -errno on error.
 *
 * Based on ntfs_write_block(), ntfs_mft_writepage(), and
 * write_mft_record_nolock().
 */
static int ntfs_write_mst_block(struct page *page,
        struct writeback_control *wbc)
{
    sector_t block, dblock, rec_block;
    struct inode *vi = page->mapping->host;
    ntfs_inode *ni = NTFS_I(vi);
    ntfs_volume *vol = ni->vol;
    u8 *kaddr;
    unsigned int rec_size = ni->itype.index.block_size;
    ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
    struct buffer_head *bh, *head, *tbh, *rec_start_bh;
    struct buffer_head *bhs[MAX_BUF_PER_PAGE];
    runlist_element *rl;
    int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
    unsigned bh_size, rec_size_bits;
    bool sync, is_mft, page_is_dirty, rec_is_dirty;
    unsigned char bh_size_bits;

    if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
        return -EINVAL;

    ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
            "0x%lx.", vi->i_ino, ni->type, page->index);
    BUG_ON(!NInoNonResident(ni));
    BUG_ON(!NInoMstProtected(ni));
    is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
    /*
     * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
     * in its page cache were to be marked dirty.  However this should
     * never happen with the current driver and considering we do not
     * handle this case here we do want to BUG(), at least for now.
     */
    BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
            (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
    bh_size = vol->sb->s_blocksize;
    bh_size_bits = vol->sb->s_blocksize_bits;
    max_bhs = PAGE_SIZE / bh_size;
    BUG_ON(!max_bhs);
    BUG_ON(max_bhs > MAX_BUF_PER_PAGE);

    /* Were we called for sync purposes? */
    sync = (wbc->sync_mode == WB_SYNC_ALL);

    /* Make sure we have mapped buffers. */
    bh = head = page_buffers(page);
    BUG_ON(!bh);

    rec_size_bits = ni->itype.index.block_size_bits;
    BUG_ON(!(PAGE_SIZE >> rec_size_bits));
    bhs_per_rec = rec_size >> bh_size_bits;
    BUG_ON(!bhs_per_rec);
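    /*
     * E.g., with 1 KiB ntfs records on a volume with 512-byte blocks,
     * bhs_per_rec is 2, so buffers are handled in groups of two and
     * dirtiness is tracked per record via its first buffer.
     */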

    /* The first block in the page. */
    rec_block = block = (sector_t)page->index <<
            (PAGE_SHIFT - bh_size_bits);

    /* The first out of bounds block for the data size. */
    dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;

    rl = NULL;
    err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
    page_is_dirty = rec_is_dirty = false;
    rec_start_bh = NULL;
    do {
        bool is_retry = false;

        if (likely(block < rec_block)) {
            if (unlikely(block >= dblock)) {
                clear_buffer_dirty(bh);
                set_buffer_uptodate(bh);
                continue;
            }
            /*
             * This block is not the first one in the record.  We
             * ignore the buffer's dirty state because we could
             * have raced with a parallel mark_ntfs_record_dirty().
             */
            if (!rec_is_dirty)
                continue;
            if (unlikely(err2)) {
                if (err2 != -ENOMEM)
                    clear_buffer_dirty(bh);
                continue;
            }
        } else /* if (block == rec_block) */ {
            BUG_ON(block > rec_block);
            /* This block is the first one in the record. */
            rec_block += bhs_per_rec;
            err2 = 0;
            if (unlikely(block >= dblock)) {
                clear_buffer_dirty(bh);
                continue;
            }
            if (!buffer_dirty(bh)) {
                /* Clean records are not written out. */
                rec_is_dirty = false;
                continue;
            }
            rec_is_dirty = true;
            rec_start_bh = bh;
        }
        /* Need to map the buffer if it is not mapped already. */
        if (unlikely(!buffer_mapped(bh))) {
            VCN vcn;
            LCN lcn;
            unsigned int vcn_ofs;

            bh->b_bdev = vol->sb->s_bdev;
            /* Obtain the vcn and offset of the current block. */
            vcn = (VCN)block << bh_size_bits;
            vcn_ofs = vcn & vol->cluster_size_mask;
            vcn >>= vol->cluster_size_bits;
            if (!rl) {
lock_retry_remap:
                down_read(&ni->runlist.lock);
                rl = ni->runlist.rl;
            }
            if (likely(rl != NULL)) {
                /* Seek to element containing target vcn. */
                while (rl->length && rl[1].vcn <= vcn)
                    rl++;
                lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
            } else
                lcn = LCN_RL_NOT_MAPPED;
            /* Successful remap. */
            if (likely(lcn >= 0)) {
                /* Setup buffer head to correct block. */
                bh->b_blocknr = ((lcn <<
                        vol->cluster_size_bits) +
                        vcn_ofs) >> bh_size_bits;
                set_buffer_mapped(bh);
            } else {
                /*
                 * Remap failed.  Retry to map the runlist once
                 * unless we are working on $MFT which always
                 * has the whole of its runlist in memory.
                 */
                if (!is_mft && !is_retry &&
                        lcn == LCN_RL_NOT_MAPPED) {
                    is_retry = true;
                    /*
                     * Attempt to map runlist, dropping
                     * lock for the duration.
                     */
                    up_read(&ni->runlist.lock);
                    err2 = ntfs_map_runlist(ni, vcn);
                    if (likely(!err2))
                        goto lock_retry_remap;
                    if (err2 == -ENOMEM)
                        page_is_dirty = true;
                    lcn = err2;
                } else {
                    err2 = -EIO;
                    if (!rl)
                        up_read(&ni->runlist.lock);
                }
                /* Hard error.  Abort writing this record. */
                if (!err || err == -ENOMEM)
                    err = err2;
                bh->b_blocknr = -1;
                ntfs_error(vol->sb, "Cannot write ntfs record "
                        "0x%llx (inode 0x%lx, "
                        "attribute type 0x%x) because "
                        "its location on disk could "
                        "not be determined (error "
                        "code %lli).",
                        (long long)block <<
                        bh_size_bits >>
                        vol->mft_record_size_bits,
                        ni->mft_no, ni->type,
                        (long long)lcn);
                /*
                 * If this is not the first buffer, remove the
                 * buffers in this record from the list of
                 * buffers to write and clear their dirty bit
                 * if not error -ENOMEM.
                 */
                if (rec_start_bh != bh) {
                    while (bhs[--nr_bhs] != rec_start_bh)
                        ;
                    if (err2 != -ENOMEM) {
                        do {
                            clear_buffer_dirty(
                                rec_start_bh);
                        } while ((rec_start_bh =
                                rec_start_bh->
                                b_this_page) !=
                                bh);
                    }
                }
                continue;
            }
        }
        BUG_ON(!buffer_uptodate(bh));
        BUG_ON(nr_bhs >= max_bhs);
        bhs[nr_bhs++] = bh;
    } while (block++, (bh = bh->b_this_page) != head);
    if (unlikely(rl))
        up_read(&ni->runlist.lock);
    /* If there were no dirty buffers, we are done. */
    if (!nr_bhs)
        goto done;
    /* Map the page so we can access its contents. */
    kaddr = kmap(page);
    /* Clear the page uptodate flag whilst the mst fixups are applied. */
    BUG_ON(!PageUptodate(page));
    ClearPageUptodate(page);
    for (i = 0; i < nr_bhs; i++) {
        unsigned int ofs;

        /* Skip buffers which are not at the beginning of records. */
        if (i % bhs_per_rec)
            continue;
        tbh = bhs[i];
        ofs = bh_offset(tbh);
        if (is_mft) {
            ntfs_inode *tni;
            unsigned long mft_no;

            /* Get the mft record number. */
            mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
                    >> rec_size_bits;
            /* Check whether to write this mft record. */
            tni = NULL;
            if (!ntfs_may_write_mft_record(vol, mft_no,
                    (MFT_RECORD*)(kaddr + ofs), &tni)) {
                /*
                 * The record should not be written.  This
                 * means we need to redirty the page before
                 * returning.
                 */
                page_is_dirty = true;
                /*
                 * Remove the buffers in this mft record from
                 * the list of buffers to write.
                 */
                do {
                    bhs[i] = NULL;
                } while (++i % bhs_per_rec);
                continue;
            }
            /*
             * The record should be written.  If a locked ntfs
             * inode was returned, add it to the array of locked
             * ntfs inodes.
             */
            if (tni)
                locked_nis[nr_locked_nis++] = tni;
        }
        /* Apply the mst protection fixups. */
        err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
                rec_size);
        if (unlikely(err2)) {
            if (!err || err == -ENOMEM)
                err = -EIO;
            ntfs_error(vol->sb, "Failed to apply mst fixups "
                    "(inode 0x%lx, attribute type 0x%x, "
                    "page index 0x%lx, page offset 0x%x)!"
                    "  Unmount and run chkdsk.", vi->i_ino,
                    ni->type, page->index, ofs);
            /*
             * Mark all the buffers in this record clean as we do
             * not want to write corrupt data to disk.
             */
            do {
                clear_buffer_dirty(bhs[i]);
                bhs[i] = NULL;
            } while (++i % bhs_per_rec);
            continue;
        }
        nr_recs++;
    }
    /* If no records are to be written out, we are done. */
    if (!nr_recs)
        goto unm_done;
    flush_dcache_page(page);
    /* Lock buffers and start synchronous write i/o on them. */
    for (i = 0; i < nr_bhs; i++) {
        tbh = bhs[i];
        if (!tbh)
            continue;
        if (!trylock_buffer(tbh))
            BUG();
        /* The buffer dirty state is now irrelevant, just clean it. */
        clear_buffer_dirty(tbh);
        BUG_ON(!buffer_uptodate(tbh));
        BUG_ON(!buffer_mapped(tbh));
        get_bh(tbh);
        tbh->b_end_io = end_buffer_write_sync;
        submit_bh(REQ_OP_WRITE, tbh);
    }
    /* Synchronize the mft mirror now if not @sync. */
    if (is_mft && !sync)
        goto do_mirror;
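    /*
     * Note the control flow of the two gotos: in the non-@sync case the
     * mft mirror is brought up to date first, while the just submitted
     * buffer writes are still in flight, and only then do we wait on
     * them; in the @sync case we wait for the writes to complete before
     * synchronizing the mirror.
     */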
do_wait:
    /* Wait on i/o completion of buffers. */
    for (i = 0; i < nr_bhs; i++) {
        tbh = bhs[i];
        if (!tbh)
            continue;
        wait_on_buffer(tbh);
        if (unlikely(!buffer_uptodate(tbh))) {
            ntfs_error(vol->sb, "I/O error while writing ntfs "
                    "record buffer (inode 0x%lx, "
                    "attribute type 0x%x, page index "
                    "0x%lx, page offset 0x%lx)!  Unmount "
                    "and run chkdsk.", vi->i_ino, ni->type,
                    page->index, bh_offset(tbh));
            if (!err || err == -ENOMEM)
                err = -EIO;
            /*
             * Set the buffer uptodate so the page and buffer
             * states do not become out of sync.
             */
            set_buffer_uptodate(tbh);
        }
    }
    /* If @sync, now synchronize the mft mirror. */
    if (is_mft && sync) {
do_mirror:
        for (i = 0; i < nr_bhs; i++) {
            unsigned long mft_no;
            unsigned int ofs;

            /*
             * Skip buffers which are not at the beginning of
             * records.
             */
            if (i % bhs_per_rec)
                continue;
            tbh = bhs[i];
            /* Skip removed buffers (and hence records). */
            if (!tbh)
                continue;
            ofs = bh_offset(tbh);
            /* Get the mft record number. */
            mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
                    >> rec_size_bits;
            if (mft_no < vol->mftmirr_size)
                ntfs_sync_mft_mirror(vol, mft_no,
                        (MFT_RECORD*)(kaddr + ofs),
                        sync);
        }
        if (!sync)
            goto do_wait;
    }
    /* Remove the mst protection fixups again. */
    for (i = 0; i < nr_bhs; i++) {
        if (!(i % bhs_per_rec)) {
            tbh = bhs[i];
            if (!tbh)
                continue;
            post_write_mst_fixup((NTFS_RECORD*)(kaddr +
                    bh_offset(tbh)));
        }
    }
    flush_dcache_page(page);
unm_done:
    /* Unlock any locked inodes. */
    while (nr_locked_nis-- > 0) {
        ntfs_inode *tni, *base_tni;

        tni = locked_nis[nr_locked_nis];
        /* Get the base inode. */
        mutex_lock(&tni->extent_lock);
        if (tni->nr_extents >= 0)
            base_tni = tni;
        else {
            base_tni = tni->ext.base_ntfs_ino;
            BUG_ON(!base_tni);
        }
        mutex_unlock(&tni->extent_lock);
        ntfs_debug("Unlocking %s inode 0x%lx.",
                tni == base_tni ? "base" : "extent",
                tni->mft_no);
        mutex_unlock(&tni->mrec_lock);
        atomic_dec(&tni->count);
        iput(VFS_I(base_tni));
    }
    SetPageUptodate(page);
    kunmap(page);
done:
    if (unlikely(err && err != -ENOMEM)) {
        /*
         * Set page error if there is only one ntfs record in the page.
         * Otherwise we would lose per-record granularity.
         */
        if (ni->itype.index.block_size == PAGE_SIZE)
            SetPageError(page);
        NVolSetErrors(vol);
    }
1292     if (page_is_dirty) {
1293         ntfs_debug("Page still contains one or more dirty ntfs "
1294                 "records.  Redirtying the page starting at "
1295                 "record 0x%lx.", page->index <<
1296                 (PAGE_SHIFT - rec_size_bits));
1297         redirty_page_for_writepage(wbc, page);
1298         unlock_page(page);
1299     } else {
1300         /*
1301          * Keep the VM happy.  This must be done otherwise the
1302          * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
1303          * the page is clean.
1304          */
1305         BUG_ON(PageWriteback(page));
1306         set_page_writeback(page);
1307         unlock_page(page);
1308         end_page_writeback(page);
1309     }
1310     if (likely(!err))
1311         ntfs_debug("Done.");
1312     return err;
1313 }
1314 
1315 /**
1316  * ntfs_writepage - write a @page to the backing store
1317  * @page:   page cache page to write out
1318  * @wbc:    writeback control structure
1319  *
1320  * This is called from the VM when it wants to have a dirty ntfs page cache
1321  * page cleaned.  The VM has already locked the page and marked it clean.
1322  *
1323  * For non-resident attributes, ntfs_writepage() writes the @page by calling
1324  * the ntfs version of the generic block_write_full_page() function,
1325  * ntfs_write_block(), which in turn if necessary creates and writes the
1326  * buffers associated with the page asynchronously.
1327  *
1328  * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
1329  * the data to the mft record (which at this stage is most likely in memory).
1330  * The mft record is then marked dirty and written out asynchronously via the
1331  * vfs inode dirty code path for the inode the mft record belongs to or via the
1332  * vm page dirty code path for the page the mft record is in.
1333  *
1334  * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page().
1335  *
1336  * Return 0 on success and -errno on error.
1337  */
1338 static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
1339 {
1340     loff_t i_size;
1341     struct inode *vi = page->mapping->host;
1342     ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
1343     char *addr;
1344     ntfs_attr_search_ctx *ctx = NULL;
1345     MFT_RECORD *m = NULL;
1346     u32 attr_len;
1347     int err;
1348 
1349 retry_writepage:
1350     BUG_ON(!PageLocked(page));
1351     i_size = i_size_read(vi);
1352     /* Is the page fully outside i_size? (truncate in progress) */
1353     if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
1354             PAGE_SHIFT)) {
1355         struct folio *folio = page_folio(page);
1356         /*
1357          * The page may have dirty, unmapped buffers.  Make them
1358          * freeable here, so the page does not leak.
1359          */
1360         block_invalidate_folio(folio, 0, folio_size(folio));
1361         folio_unlock(folio);
1362         ntfs_debug("Write outside i_size - truncated?");
1363         return 0;
1364     }
1365     /*
1366      * Only $DATA attributes can be encrypted and only unnamed $DATA
1367      * attributes can be compressed.  Index root can have the flags set but
1368      * this means to create compressed/encrypted files, not that the
1369      * attribute is compressed/encrypted.  Note we need to check for
1370      * AT_INDEX_ALLOCATION since this is the type of both directory and
1371      * index inodes.
1372      */
1373     if (ni->type != AT_INDEX_ALLOCATION) {
1374         /* If file is encrypted, deny access, just like NT4. */
1375         if (NInoEncrypted(ni)) {
1376             unlock_page(page);
1377             BUG_ON(ni->type != AT_DATA);
1378             ntfs_debug("Denying write access to encrypted file.");
1379             return -EACCES;
1380         }
1381         /* Compressed data streams are handled in compress.c. */
1382         if (NInoNonResident(ni) && NInoCompressed(ni)) {
1383             BUG_ON(ni->type != AT_DATA);
1384             BUG_ON(ni->name_len);
1385             // TODO: Implement and replace this with
1386             // return ntfs_write_compressed_block(page);
1387             unlock_page(page);
1388             ntfs_error(vi->i_sb, "Writing to compressed files is "
1389                     "not supported yet.  Sorry.");
1390             return -EOPNOTSUPP;
1391         }
1392         // TODO: Implement and remove this check.
1393         if (NInoNonResident(ni) && NInoSparse(ni)) {
1394             unlock_page(page);
1395             ntfs_error(vi->i_sb, "Writing to sparse files is not "
1396                     "supported yet.  Sorry.");
1397             return -EOPNOTSUPP;
1398         }
1399     }
1400     /* NInoNonResident() == NInoIndexAllocPresent() */
1401     if (NInoNonResident(ni)) {
1402         /* We have to zero every time due to mmap-at-end-of-file. */
1403         if (page->index >= (i_size >> PAGE_SHIFT)) {
1404             /* The page straddles i_size. */
1405             unsigned int ofs = i_size & ~PAGE_MASK;
1406             zero_user_segment(page, ofs, PAGE_SIZE);
1407         }
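        /*
         * Worked example, continuing the PAGE_SIZE 4096 assumption: with
         * i_size 0x1234, the page at index 1 straddles i_size, giving
         * ofs = 0x1234 & 0xfff = 0x234, so bytes 0x234..0xfff are zeroed
         * before the page is written out.
         */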
1408         /* Handle mst protected attributes. */
1409         if (NInoMstProtected(ni))
1410             return ntfs_write_mst_block(page, wbc);
1411         /* Normal, non-resident data stream. */
1412         return ntfs_write_block(page, wbc);
1413     }
1414     /*
1415      * Attribute is resident, implying it is not compressed, encrypted, or
1416      * mst protected.  This also means the attribute is smaller than an mft
1417  * record and hence smaller than a page, so we can simply return an error
1418  * for any page with index above 0.  Note the attribute can actually be
1419  * marked compressed, but if it is resident the actual data is not
1420  * compressed, so it is ok to ignore the compressed flag here.
1421      */
1422     BUG_ON(page_has_buffers(page));
1423     BUG_ON(!PageUptodate(page));
1424     if (unlikely(page->index > 0)) {
1425         ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0.  "
1426                 "Aborting write.", page->index);
1427         BUG_ON(PageWriteback(page));
1428         set_page_writeback(page);
1429         unlock_page(page);
1430         end_page_writeback(page);
1431         return -EIO;
1432     }
1433     if (!NInoAttr(ni))
1434         base_ni = ni;
1435     else
1436         base_ni = ni->ext.base_ntfs_ino;
1437     /* Map, pin, and lock the mft record. */
1438     m = map_mft_record(base_ni);
1439     if (IS_ERR(m)) {
1440         err = PTR_ERR(m);
1441         m = NULL;
1442         ctx = NULL;
1443         goto err_out;
1444     }
1445     /*
1446      * If a parallel write made the attribute non-resident, drop the mft
1447      * record and retry the writepage.
1448      */
1449     if (unlikely(NInoNonResident(ni))) {
1450         unmap_mft_record(base_ni);
1451         goto retry_writepage;
1452     }
1453     ctx = ntfs_attr_get_search_ctx(base_ni, m);
1454     if (unlikely(!ctx)) {
1455         err = -ENOMEM;
1456         goto err_out;
1457     }
1458     err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1459             CASE_SENSITIVE, 0, NULL, 0, ctx);
1460     if (unlikely(err))
1461         goto err_out;
1462     /*
1463      * Keep the VM happy.  This must be done, otherwise the radix-tree tag
1464      * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
1465      */
1466     BUG_ON(PageWriteback(page));
1467     set_page_writeback(page);
1468     unlock_page(page);
1469     attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
1470     i_size = i_size_read(vi);
1471     if (unlikely(attr_len > i_size)) {
1472         /* Race with shrinking truncate or a failed truncate. */
1473         attr_len = i_size;
1474         /*
1475          * If the truncate failed, fix it up now.  If a truncate is in
1476          * progress, we do its job for it, so it has nothing left to do.
1477          */
1478         err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
1479                 attr_len);
1480         /* Shrinking cannot fail. */
1481         BUG_ON(err);
1482     }
1483     addr = kmap_atomic(page);
1484     /* Copy the data from the page to the mft record. */
1485     memcpy((u8*)ctx->attr +
1486             le16_to_cpu(ctx->attr->data.resident.value_offset),
1487             addr, attr_len);
1488     /* Zero out of bounds area in the page cache page. */
1489     memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
1490     kunmap_atomic(addr);
1491     flush_dcache_page(page);
1492     flush_dcache_mft_record_page(ctx->ntfs_ino);
1493     /* We are done with the page. */
1494     end_page_writeback(page);
1495     /* Finally, mark the mft record dirty, so it gets written back. */
1496     mark_mft_record_dirty(ctx->ntfs_ino);
1497     ntfs_attr_put_search_ctx(ctx);
1498     unmap_mft_record(base_ni);
1499     return 0;
1500 err_out:
1501     if (err == -ENOMEM) {
1502         ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
1503                 "page so we try again later.");
1504         /*
1505          * Put the page back on mapping->dirty_pages, but leave its
1506          * buffers' dirty state as-is.
1507          */
1508         redirty_page_for_writepage(wbc, page);
1509         err = 0;
1510     } else {
1511         ntfs_error(vi->i_sb, "Resident attribute write failed with "
1512                 "error %i.", err);
1513         SetPageError(page);
1514         NVolSetErrors(ni->vol);
1515     }
1516     unlock_page(page);
1517     if (ctx)
1518         ntfs_attr_put_search_ctx(ctx);
1519     if (m)
1520         unmap_mft_record(base_ni);
1521     return err;
1522 }
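
/*
 * Illustration only, not part of the driver: a sketch of how the VM
 * reaches ntfs_writepage().  Writeback locks each dirty page of the
 * attribute's address space and calls the ->writepage method, which
 * must unlock the page on every path, as ntfs_writepage() does above:
 *
 *	struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL };
 *	int err;
 *
 *	lock_page(page);
 *	err = page->mapping->a_ops->writepage(page, &wbc);
 *	// On return the page has been unlocked by the method itself.
 */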
1523 
1524 #endif  /* NTFS_RW */
1525 
1526 /**
1527  * ntfs_bmap - map logical file block to physical device block
1528  * @mapping:    address space mapping to which the block to be mapped belongs
1529  * @block:  logical block to map to its physical device block
1530  *
1531  * For regular, non-resident files (i.e. not compressed and not encrypted), map
1532  * the logical @block belonging to the file described by the address space
1533  * mapping @mapping to its physical device block.
1534  *
1535  * The size of the block is equal to the @s_blocksize field of the super block
1536  * of the mounted file system, which is guaranteed to be smaller than or equal
1537  * to the cluster size, thus the block is guaranteed to fit entirely inside the
1538  * cluster, which means we do not need to care how many contiguous bytes are
1539  * available after the beginning of the block.
1540  *
1541  * Return the physical device block if the mapping succeeded or 0 if the block
1542  * is sparse or there was an error.
1543  *
1544  * Note: This is a problem if someone tries to run bmap() on $Boot system file
1545  * as that really is in block zero but there is nothing we can do.  bmap() is
1546  * just broken in that respect (just like it cannot distinguish sparse from
1547  * not available or error).
1548  */
1549 static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
1550 {
1551     s64 ofs, size;
1552     loff_t i_size;
1553     LCN lcn;
1554     unsigned long blocksize, flags;
1555     ntfs_inode *ni = NTFS_I(mapping->host);
1556     ntfs_volume *vol = ni->vol;
1557     unsigned delta;
1558     unsigned char blocksize_bits, cluster_size_shift;
1559 
1560     ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
1561             ni->mft_no, (unsigned long long)block);
1562     if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
1563         ntfs_error(vol->sb, "BMAP does not make sense for %s "
1564                 "attributes, returning 0.",
1565                 (ni->type != AT_DATA) ? "non-data" :
1566                 (!NInoNonResident(ni) ? "resident" :
1567                 "encrypted"));
1568         return 0;
1569     }
1570     /* None of these can happen. */
1571     BUG_ON(NInoCompressed(ni));
1572     BUG_ON(NInoMstProtected(ni));
1573     blocksize = vol->sb->s_blocksize;
1574     blocksize_bits = vol->sb->s_blocksize_bits;
1575     ofs = (s64)block << blocksize_bits;
1576     read_lock_irqsave(&ni->size_lock, flags);
1577     size = ni->initialized_size;
1578     i_size = i_size_read(VFS_I(ni));
1579     read_unlock_irqrestore(&ni->size_lock, flags);
1580     /*
1581      * If the offset is outside the initialized size or the block straddles
1582      * the initialized size then pretend it is a hole unless the
1583      * initialized size equals the file size.
1584      */
1585     if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
1586         goto hole;
1587     cluster_size_shift = vol->cluster_size_bits;
1588     down_read(&ni->runlist.lock);
1589     lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
1590     up_read(&ni->runlist.lock);
1591     if (unlikely(lcn < LCN_HOLE)) {
1592         /*
1593          * Step down to an integer to avoid gcc doing a long long
1594          * comparison in the switch when we know @lcn is between
1595          * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
1596          *
1597          * Otherwise older gcc (at least on some architectures) will
1598          * try to use __cmpdi2() which is of course not available in
1599          * the kernel.
1600          */
1601         switch ((int)lcn) {
1602         case LCN_ENOENT:
1603             /*
1604              * If the offset is out of bounds then pretend it is a
1605              * hole.
1606              */
1607             goto hole;
1608         case LCN_ENOMEM:
1609             ntfs_error(vol->sb, "Not enough memory to complete "
1610                     "mapping for inode 0x%lx.  "
1611                     "Returning 0.", ni->mft_no);
1612             break;
1613         default:
1614             ntfs_error(vol->sb, "Failed to complete mapping for "
1615                     "inode 0x%lx.  Run chkdsk.  "
1616                     "Returning 0.", ni->mft_no);
1617             break;
1618         }
1619         return 0;
1620     }
1621     if (lcn < 0) {
1622         /* It is a hole. */
1623 hole:
1624         ntfs_debug("Done (returning hole).");
1625         return 0;
1626     }
1627     /*
1628      * The block is really allocated and fulfils all our criteria.
1629      * Convert the cluster to units of block size and return the result.
1630      */
1631     delta = ofs & vol->cluster_size_mask;
1632     if (unlikely(sizeof(block) < sizeof(lcn))) {
1633         block = lcn = ((lcn << cluster_size_shift) + delta) >>
1634                 blocksize_bits;
1635         /* If the block number was truncated return 0. */
1636         if (unlikely(block != lcn)) {
1637             ntfs_error(vol->sb, "Physical block 0x%llx is too "
1638                     "large to be returned, returning 0.",
1639                     (long long)lcn);
1640             return 0;
1641         }
1642     } else
1643         block = ((lcn << cluster_size_shift) + delta) >>
1644                 blocksize_bits;
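    /*
     * Worked example, assuming 4096-byte clusters (cluster_size_shift
     * 12) and 512-byte blocks (blocksize_bits 9): lcn 0x10 with delta
     * 0x600 yields block ((0x10 << 12) + 0x600) >> 9 = 0x83.
     */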
1645     ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)block);
1646     return block;
1647 }
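
/*
 * Illustration only: how ntfs_bmap() is typically reached from user
 * space.  The FIBMAP ioctl, which requires CAP_SYS_RAWIO, goes through
 * the VFS helper bmap(), which in turn calls mapping->a_ops->bmap:
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	int blk = 0;	// logical block in, physical block out
 *	if (ioctl(fd, FIBMAP, &blk) == 0 && blk == 0)
 *		;	// hole, unmapped, or error - caller cannot tell
 */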
1648 
1649 /**
1650  * ntfs_normal_aops - address space operations for normal inodes and attributes
1651  *
1652  * Note these are not used for compressed or mst protected inodes and
1653  * attributes.
1654  */
1655 const struct address_space_operations ntfs_normal_aops = {
1656     .read_folio = ntfs_read_folio,
1657 #ifdef NTFS_RW
1658     .writepage  = ntfs_writepage,
1659     .dirty_folio    = block_dirty_folio,
1660 #endif /* NTFS_RW */
1661     .bmap       = ntfs_bmap,
1662     .migrate_folio  = buffer_migrate_folio,
1663     .is_partially_uptodate = block_is_partially_uptodate,
1664     .error_remove_page = generic_error_remove_page,
1665 };
1666 
1667 /**
1668  * ntfs_compressed_aops - address space operations for compressed inodes
1669  */
1670 const struct address_space_operations ntfs_compressed_aops = {
1671     .read_folio = ntfs_read_folio,
1672 #ifdef NTFS_RW
1673     .writepage  = ntfs_writepage,
1674     .dirty_folio    = block_dirty_folio,
1675 #endif /* NTFS_RW */
1676     .migrate_folio  = buffer_migrate_folio,
1677     .is_partially_uptodate = block_is_partially_uptodate,
1678     .error_remove_page = generic_error_remove_page,
1679 };
1680 
1681 /**
1682  * ntfs_mst_aops - general address space operations for mst protected inodes
1683  *         and attributes
1684  */
1685 const struct address_space_operations ntfs_mst_aops = {
1686     .read_folio = ntfs_read_folio,  /* Fill page with data. */
1687 #ifdef NTFS_RW
1688     .writepage  = ntfs_writepage,   /* Write dirty page to disk. */
1689     .dirty_folio    = filemap_dirty_folio,
1690 #endif /* NTFS_RW */
1691     .migrate_folio  = buffer_migrate_folio,
1692     .is_partially_uptodate  = block_is_partially_uptodate,
1693     .error_remove_page = generic_error_remove_page,
1694 };
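
/*
 * A simplified sketch (cf. fs/ntfs/inode.c) of how an inode's address
 * space gets wired to one of the three tables above.  Note ntfs_mst_aops
 * uses filemap_dirty_folio because mst protected buffers are dirtied
 * explicitly via mark_ntfs_record_dirty() below:
 *
 *	if (NInoMstProtected(ni))
 *		vi->i_mapping->a_ops = &ntfs_mst_aops;
 *	else if (NInoCompressed(ni) && NInoNonResident(ni))
 *		vi->i_mapping->a_ops = &ntfs_compressed_aops;
 *	else
 *		vi->i_mapping->a_ops = &ntfs_normal_aops;
 */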
1695 
1696 #ifdef NTFS_RW
1697 
1698 /**
1699  * mark_ntfs_record_dirty - mark an ntfs record dirty
1700  * @page:   page containing the ntfs record to mark dirty
1701  * @ofs:    byte offset within @page at which the ntfs record begins
1702  *
1703  * Set the buffers and the page in which the ntfs record is located dirty.
1704  *
1705  * The latter also marks the vfs inode the ntfs record belongs to dirty
1706  * (I_DIRTY_PAGES only).
1707  *
1708  * If the page does not have buffers, we create them and set them uptodate.
1709  * The page may not be locked, which is why we need to handle the buffers under
1710  * the mapping->private_lock.  Once the buffers are marked dirty, we no longer
1711  * need the lock since try_to_free_buffers() does not free dirty buffers.
1712  */
1713 void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
1714     struct address_space *mapping = page->mapping;
1715     ntfs_inode *ni = NTFS_I(mapping->host);
1716     struct buffer_head *bh, *head, *buffers_to_free = NULL;
1717     unsigned int end, bh_size, bh_ofs;
1718 
1719     BUG_ON(!PageUptodate(page));
1720     end = ofs + ni->itype.index.block_size;
1721     bh_size = VFS_I(ni)->i_sb->s_blocksize;
1722     spin_lock(&mapping->private_lock);
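    /*
     * alloc_page_buffers() may sleep, so the private_lock is dropped
     * around the allocation below; if another thread attaches buffers
     * in the meantime, the freshly allocated ones are freed again at
     * the end via buffers_to_free.
     */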
1723     if (unlikely(!page_has_buffers(page))) {
1724         spin_unlock(&mapping->private_lock);
1725         bh = head = alloc_page_buffers(page, bh_size, true);
1726         spin_lock(&mapping->private_lock);
1727         if (likely(!page_has_buffers(page))) {
1728             struct buffer_head *tail;
1729 
1730             do {
1731                 set_buffer_uptodate(bh);
1732                 tail = bh;
1733                 bh = bh->b_this_page;
1734             } while (bh);
1735             tail->b_this_page = head;
1736             attach_page_private(page, head);
1737         } else
1738             buffers_to_free = bh;
1739     }
1740     bh = head = page_buffers(page);
1741     BUG_ON(!bh);
1742     do {
1743         bh_ofs = bh_offset(bh);
1744         if (bh_ofs + bh_size <= ofs)
1745             continue;
1746         if (unlikely(bh_ofs >= end))
1747             break;
1748         set_buffer_dirty(bh);
1749     } while ((bh = bh->b_this_page) != head);
1750     spin_unlock(&mapping->private_lock);
1751     filemap_dirty_folio(mapping, page_folio(page));
1752     if (unlikely(buffers_to_free)) {
1753         do {
1754             bh = buffers_to_free->b_this_page;
1755             free_buffer_head(buffers_to_free);
1756             buffers_to_free = bh;
1757         } while (buffers_to_free);
1758     }
1759 }
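
/*
 * Illustration of a typical caller (cf. ntfs_index_entry_mark_dirty()
 * in fs/ntfs/index.h): after modifying an index record inside a locked
 * page, the record is dirtied at its byte offset within that page:
 *
 *	mark_ntfs_record_dirty(ictx->page,
 *			(u8*)ictx->ia - (u8*)page_address(ictx->page));
 */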
1760 
1761 #endif /* NTFS_RW */