fs/ntfs/file.c

0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
0004  *
0005  * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
0006  */
0007
0008 #include <linux/blkdev.h>
0009 #include <linux/backing-dev.h>
0010 #include <linux/buffer_head.h>
0011 #include <linux/gfp.h>
0012 #include <linux/pagemap.h>
0013 #include <linux/pagevec.h>
0014 #include <linux/sched/signal.h>
0015 #include <linux/swap.h>
0016 #include <linux/uio.h>
0017 #include <linux/writeback.h>
0018
0019 #include <asm/page.h>
0020 #include <linux/uaccess.h>
0021
0022 #include "attrib.h"
0023 #include "bitmap.h"
0024 #include "inode.h"
0025 #include "debug.h"
0026 #include "lcnalloc.h"
0027 #include "malloc.h"
0028 #include "mft.h"
0029 #include "ntfs.h"
0030
0031 /**
0032  * ntfs_file_open - called when an inode is about to be opened
0033  * @vi:     inode to be opened
0034  * @filp:   file structure describing the inode
0035  *
0036  * Limit file size to the page cache limit on architectures where unsigned long
0037  * is 32-bits. This is the most we can do for now without overflowing the page
0038  * cache page index. Doing it this way means we don't run into problems because
0039  * of existing too large files. It would be better to allow the user to read
0040  * the beginning of the file but I doubt very much anyone is going to hit this
0041  * check on a 32-bit architecture, so there is no point in adding the extra
0042  * complexity required to support this.
0043  *
0044  * On 64-bit architectures, the check is hopefully optimized away by the
0045  * compiler.
0046  *
0047  * After the check passes, just call generic_file_open() to do its work.
0048  */
0049 static int ntfs_file_open(struct inode *vi, struct file *filp)
0050 {
0051     if (sizeof(unsigned long) < 8) {
0052         if (i_size_read(vi) > MAX_LFS_FILESIZE)
0053             return -EOVERFLOW;
0054     }
0055     return generic_file_open(vi, filp);
0056 }
0057
0058 #ifdef NTFS_RW
0059
0060 /**
0061  * ntfs_attr_extend_initialized - extend the initialized size of an attribute
0062  * @ni:         ntfs inode of the attribute to extend
0063  * @new_init_size:  requested new initialized size in bytes
0064  *
0065  * Extend the initialized size of an attribute described by the ntfs inode @ni
0066  * to @new_init_size bytes.  This involves zeroing any non-sparse space between
0067  * the old initialized size and @new_init_size both in the page cache and on
0068  * disk (if relevant complete pages are already uptodate in the page cache then
0069  * these are simply marked dirty).
0070  *
0071  * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
0072  * in the resident attribute case, it is tied to the initialized size and, in
0073  * the non-resident attribute case, it may not fall below the initialized size.
0074  *
0075  * Note that if the attribute is resident, we do not need to touch the page
0076  * cache at all.  This is because if the page cache page is not uptodate we
0077  * bring it uptodate later, when doing the write to the mft record since we
0078  * then already have the page mapped.  And if the page is uptodate, the
0079  * non-initialized region will already have been zeroed when the page was
0080  * brought uptodate and the region may in fact already have been overwritten
0081  * with new data via mmap() based writes, so we cannot just zero it.  And since
0082  * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
0083  * is unspecified, we choose not to do zeroing and thus we do not need to touch
0084  * the page at all.  For a more detailed explanation see ntfs_truncate() in
0085  * fs/ntfs/inode.c.
0086  *
0087  * Return 0 on success and -errno on error.  In the case that an error is
0088  * encountered it is possible that the initialized size will already have been
0089  * incremented some way towards @new_init_size but it is guaranteed that if
0090  * this is the case, the necessary zeroing will also have happened and that all
0091  * metadata is self-consistent.
0092  *
0093  * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
0094  *      held by the caller.
0095  */
0096 static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
0097 {
0098     s64 old_init_size;
0099     loff_t old_i_size;
0100     pgoff_t index, end_index;
0101     unsigned long flags;
0102     struct inode *vi = VFS_I(ni);
0103     ntfs_inode *base_ni;
0104     MFT_RECORD *m = NULL;
0105     ATTR_RECORD *a;
0106     ntfs_attr_search_ctx *ctx = NULL;
0107     struct address_space *mapping;
0108     struct page *page = NULL;
0109     u8 *kattr;
0110     int err;
0111     u32 attr_len;
0112
0113     read_lock_irqsave(&ni->size_lock, flags);
0114     old_init_size = ni->initialized_size;
0115     old_i_size = i_size_read(vi);
0116     BUG_ON(new_init_size > ni->allocated_size);
0117     read_unlock_irqrestore(&ni->size_lock, flags);
0118     ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
0119             "old_initialized_size 0x%llx, "
0120             "new_initialized_size 0x%llx, i_size 0x%llx.",
0121             vi->i_ino, (unsigned)le32_to_cpu(ni->type),
0122             (unsigned long long)old_init_size,
0123             (unsigned long long)new_init_size, old_i_size);
0124     if (!NInoAttr(ni))
0125         base_ni = ni;
0126     else
0127         base_ni = ni->ext.base_ntfs_ino;
0128     /* Use goto to reduce indentation and we need the label below anyway. */
0129     if (NInoNonResident(ni))
0130         goto do_non_resident_extend;
0131     BUG_ON(old_init_size != old_i_size);
0132     m = map_mft_record(base_ni);
0133     if (IS_ERR(m)) {
0134         err = PTR_ERR(m);
0135         m = NULL;
0136         goto err_out;
0137     }
0138     ctx = ntfs_attr_get_search_ctx(base_ni, m);
0139     if (unlikely(!ctx)) {
0140         err = -ENOMEM;
0141         goto err_out;
0142     }
0143     err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
0144             CASE_SENSITIVE, 0, NULL, 0, ctx);
0145     if (unlikely(err)) {
0146         if (err == -ENOENT)
0147             err = -EIO;
0148         goto err_out;
0149     }
0150     m = ctx->mrec;
0151     a = ctx->attr;
0152     BUG_ON(a->non_resident);
0153     /* The total length of the attribute value. */
0154     attr_len = le32_to_cpu(a->data.resident.value_length);
0155     BUG_ON(old_i_size != (loff_t)attr_len);
0156     /*
0157      * Do the zeroing in the mft record and update the attribute size in
0158      * the mft record.
0159      */
0160     kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
0161     memset(kattr + attr_len, 0, new_init_size - attr_len);
0162     a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
0163     /* Finally, update the sizes in the vfs and ntfs inodes. */
0164     write_lock_irqsave(&ni->size_lock, flags);
0165     i_size_write(vi, new_init_size);
0166     ni->initialized_size = new_init_size;
0167     write_unlock_irqrestore(&ni->size_lock, flags);
0168     goto done;
0169 do_non_resident_extend:
0170     /*
0171      * If the new initialized size @new_init_size exceeds the current file
0172      * size (vfs inode->i_size), we need to extend the file size to the
0173      * new initialized size.
0174      */
0175     if (new_init_size > old_i_size) {
0176         m = map_mft_record(base_ni);
0177         if (IS_ERR(m)) {
0178             err = PTR_ERR(m);
0179             m = NULL;
0180             goto err_out;
0181         }
0182         ctx = ntfs_attr_get_search_ctx(base_ni, m);
0183         if (unlikely(!ctx)) {
0184             err = -ENOMEM;
0185             goto err_out;
0186         }
0187         err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
0188                 CASE_SENSITIVE, 0, NULL, 0, ctx);
0189         if (unlikely(err)) {
0190             if (err == -ENOENT)
0191                 err = -EIO;
0192             goto err_out;
0193         }
0194         m = ctx->mrec;
0195         a = ctx->attr;
0196         BUG_ON(!a->non_resident);
0197         BUG_ON(old_i_size != (loff_t)
0198                 sle64_to_cpu(a->data.non_resident.data_size));
0199         a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
0200         flush_dcache_mft_record_page(ctx->ntfs_ino);
0201         mark_mft_record_dirty(ctx->ntfs_ino);
0202         /* Update the file size in the vfs inode. */
0203         i_size_write(vi, new_init_size);
0204         ntfs_attr_put_search_ctx(ctx);
0205         ctx = NULL;
0206         unmap_mft_record(base_ni);
0207         m = NULL;
0208     }
0209     mapping = vi->i_mapping;
0210     index = old_init_size >> PAGE_SHIFT;
0211     end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
0212     do {
0213         /*
0214          * Read the page.  If the page is not present, this will zero
0215          * the uninitialized regions for us.
0216          */
0217         page = read_mapping_page(mapping, index, NULL);
0218         if (IS_ERR(page)) {
0219             err = PTR_ERR(page);
0220             goto init_err_out;
0221         }
0222         /*
0223          * Update the initialized size in the ntfs inode.  This is
0224          * enough to make ntfs_writepage() work.
0225          */
0226         write_lock_irqsave(&ni->size_lock, flags);
0227         ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
0228         if (ni->initialized_size > new_init_size)
0229             ni->initialized_size = new_init_size;
0230         write_unlock_irqrestore(&ni->size_lock, flags);
0231         /* Set the page dirty so it gets written out. */
0232         set_page_dirty(page);
0233         put_page(page);
0234         /*
0235          * Play nice with the vm and the rest of the system.  This is
0236          * very much needed as we can potentially be modifying the
0237          * initialised size from a very small value to a really huge
0238          * value, e.g.
0239          *  f = open(somefile, O_TRUNC);
0240          *  truncate(f, 10GiB);
0241          *  seek(f, 10GiB);
0242          *  write(f, 1);
0243          * And this would mean we would be marking dirty hundreds of
0244          * thousands of pages or as in the above example more than
0245          * two and a half million pages!
0246          *
0247          * TODO: For sparse pages could optimize this workload by using
0248          * the FsMisc / MiscFs page bit as a "PageIsSparse" bit.  This
0249          * would be set in read_folio for sparse pages and here we would
0250          * not need to mark dirty any pages which have this bit set.
0251          * The only caveat is that we have to clear the bit everywhere
0252          * where we allocate any clusters that lie in the page or that
0253          * contain the page.
0254          *
0255          * TODO: An even greater optimization would be for us to only
0256          * call read_folio() on pages which are not in sparse regions as
0257          * determined from the runlist.  This would greatly reduce the
0258          * number of pages we read and make dirty in the case of sparse
0259          * files.
0260          */
0261         balance_dirty_pages_ratelimited(mapping);
0262         cond_resched();
0263     } while (++index < end_index);
0264     read_lock_irqsave(&ni->size_lock, flags);
0265     BUG_ON(ni->initialized_size != new_init_size);
0266     read_unlock_irqrestore(&ni->size_lock, flags);
0267     /* Now bring in sync the initialized_size in the mft record. */
0268     m = map_mft_record(base_ni);
0269     if (IS_ERR(m)) {
0270         err = PTR_ERR(m);
0271         m = NULL;
0272         goto init_err_out;
0273     }
0274     ctx = ntfs_attr_get_search_ctx(base_ni, m);
0275     if (unlikely(!ctx)) {
0276         err = -ENOMEM;
0277         goto init_err_out;
0278     }
0279     err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
0280             CASE_SENSITIVE, 0, NULL, 0, ctx);
0281     if (unlikely(err)) {
0282         if (err == -ENOENT)
0283             err = -EIO;
0284         goto init_err_out;
0285     }
0286     m = ctx->mrec;
0287     a = ctx->attr;
0288     BUG_ON(!a->non_resident);
0289     a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
0290 done:
0291     flush_dcache_mft_record_page(ctx->ntfs_ino);
0292     mark_mft_record_dirty(ctx->ntfs_ino);
0293     if (ctx)
0294         ntfs_attr_put_search_ctx(ctx);
0295     if (m)
0296         unmap_mft_record(base_ni);
0297     ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
0298             (unsigned long long)new_init_size, i_size_read(vi));
0299     return 0;
0300 init_err_out:
0301     write_lock_irqsave(&ni->size_lock, flags);
0302     ni->initialized_size = old_init_size;
0303     write_unlock_irqrestore(&ni->size_lock, flags);
0304 err_out:
0305     if (ctx)
0306         ntfs_attr_put_search_ctx(ctx);
0307     if (m)
0308         unmap_mft_record(base_ni);
0309     ntfs_debug("Failed.  Returning error code %i.", err);
0310     return err;
0311 }
0312
0313 static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
0314         struct iov_iter *from)
0315 {
0316     loff_t pos;
0317     s64 end, ll;
0318     ssize_t err;
0319     unsigned long flags;
0320     struct file *file = iocb->ki_filp;
0321     struct inode *vi = file_inode(file);
0322     ntfs_inode *ni = NTFS_I(vi);
0323     ntfs_volume *vol = ni->vol;
0324
0325     ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
0326             "0x%llx, count 0x%zx.", vi->i_ino,
0327             (unsigned)le32_to_cpu(ni->type),
0328             (unsigned long long)iocb->ki_pos,
0329             iov_iter_count(from));
0330     err = generic_write_checks(iocb, from);
0331     if (unlikely(err <= 0))
0332         goto out;
0333     /*
0334      * All checks have passed.  Before we start doing any writing we want
0335      * to abort any totally illegal writes.
0336      */
0337     BUG_ON(NInoMstProtected(ni));
0338     BUG_ON(ni->type != AT_DATA);
0339     /* If file is encrypted, deny access, just like NT4. */
0340     if (NInoEncrypted(ni)) {
0341         /* Only $DATA attributes can be encrypted. */
0342         /*
0343          * Reminder for later: Encrypted files are _always_
0344          * non-resident so that the content can always be encrypted.
0345          */
0346         ntfs_debug("Denying write access to encrypted file.");
0347         err = -EACCES;
0348         goto out;
0349     }
0350     if (NInoCompressed(ni)) {
0351         /* Only unnamed $DATA attribute can be compressed. */
0352         BUG_ON(ni->name_len);
0353         /*
0354          * Reminder for later: If resident, the data is not actually
0355          * compressed.  Only on the switch to non-resident does
0356          * compression kick in.  This is in contrast to encrypted files
0357          * (see above).
0358          */
0359         ntfs_error(vi->i_sb, "Writing to compressed files is not "
0360                 "implemented yet.  Sorry.");
0361         err = -EOPNOTSUPP;
0362         goto out;
0363     }
0364     err = file_remove_privs(file);
0365     if (unlikely(err))
0366         goto out;
0367     /*
0368      * Our ->update_time method always succeeds thus file_update_time()
0369      * cannot fail either so there is no need to check the return code.
0370      */
0371     file_update_time(file);
0372     pos = iocb->ki_pos;
0373     /* The first byte after the last cluster being written to. */
0374     end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
0375             ~(u64)vol->cluster_size_mask;
0376     /*
0377      * If the write goes beyond the allocated size, extend the allocation
0378      * to cover the whole of the write, rounded up to the nearest cluster.
0379      */
0380     read_lock_irqsave(&ni->size_lock, flags);
0381     ll = ni->allocated_size;
0382     read_unlock_irqrestore(&ni->size_lock, flags);
0383     if (end > ll) {
0384         /*
0385          * Extend the allocation without changing the data size.
0386          *
0387          * Note we ensure the allocation is big enough to at least
0388          * write some data but we do not require the allocation to be
0389          * complete, i.e. it may be partial.
0390          */
0391         ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
0392         if (likely(ll >= 0)) {
0393             BUG_ON(pos >= ll);
0394             /* If the extension was partial truncate the write. */
0395             if (end > ll) {
0396                 ntfs_debug("Truncating write to inode 0x%lx, "
0397                         "attribute type 0x%x, because "
0398                         "the allocation was only "
0399                         "partially extended.",
0400                         vi->i_ino, (unsigned)
0401                         le32_to_cpu(ni->type));
0402                 iov_iter_truncate(from, ll - pos);
0403             }
0404         } else {
0405             err = ll;
0406             read_lock_irqsave(&ni->size_lock, flags);
0407             ll = ni->allocated_size;
0408             read_unlock_irqrestore(&ni->size_lock, flags);
0409             /* Perform a partial write if possible or fail. */
0410             if (pos < ll) {
0411                 ntfs_debug("Truncating write to inode 0x%lx "
0412                         "attribute type 0x%x, because "
0413                         "extending the allocation "
0414                         "failed (error %d).",
0415                         vi->i_ino, (unsigned)
0416                         le32_to_cpu(ni->type),
0417                         (int)-err);
0418                 iov_iter_truncate(from, ll - pos);
0419             } else {
0420                 if (err != -ENOSPC)
0421                     ntfs_error(vi->i_sb, "Cannot perform "
0422                             "write to inode "
0423                             "0x%lx, attribute "
0424                             "type 0x%x, because "
0425                             "extending the "
0426                             "allocation failed "
0427                             "(error %ld).",
0428                             vi->i_ino, (unsigned)
0429                             le32_to_cpu(ni->type),
0430                             (long)-err);
0431                 else
0432                     ntfs_debug("Cannot perform write to "
0433                             "inode 0x%lx, "
0434                             "attribute type 0x%x, "
0435                             "because there is not "
0436                             "space left.",
0437                             vi->i_ino, (unsigned)
0438                             le32_to_cpu(ni->type));
0439                 goto out;
0440             }
0441         }
0442     }
0443     /*
0444      * If the write starts beyond the initialized size, extend it up to the
0445      * beginning of the write and initialize all non-sparse space between
0446      * the old initialized size and the new one.  This automatically also
0447      * increments the vfs inode->i_size to keep it above or equal to the
0448      * initialized_size.
0449      */
0450     read_lock_irqsave(&ni->size_lock, flags);
0451     ll = ni->initialized_size;
0452     read_unlock_irqrestore(&ni->size_lock, flags);
0453     if (pos > ll) {
0454         /*
0455          * Wait for ongoing direct i/o to complete before proceeding.
0456          * New direct i/o cannot start as we hold i_mutex.
0457          */
0458         inode_dio_wait(vi);
0459         err = ntfs_attr_extend_initialized(ni, pos);
0460         if (unlikely(err < 0))
0461             ntfs_error(vi->i_sb, "Cannot perform write to inode "
0462                     "0x%lx, attribute type 0x%x, because "
0463                     "extending the initialized size "
0464                     "failed (error %d).", vi->i_ino,
0465                     (unsigned)le32_to_cpu(ni->type),
0466                     (int)-err);
0467     }
0468 out:
0469     return err;
0470 }
0471
0472 /**
0473  * __ntfs_grab_cache_pages - obtain a number of locked pages
0474  * @mapping:    address space mapping from which to obtain page cache pages
0475  * @index:  starting index in @mapping at which to begin obtaining pages
0476  * @nr_pages:   number of page cache pages to obtain
0477  * @pages:  array of pages in which to return the obtained page cache pages
0478  * @cached_page: allocated but as yet unused page
0479  *
0480  * Obtain @nr_pages locked page cache pages from the mapping @mapping and
0481  * starting at index @index.
0482  *
0483  * If a page is newly created, add it to lru list
0484  *
0485  * Note, the page locks are obtained in ascending page index order.
0486  */
0487 static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
0488         pgoff_t index, const unsigned nr_pages, struct page **pages,
0489         struct page **cached_page)
0490 {
0491     int err, nr;
0492
0493     BUG_ON(!nr_pages);
0494     err = nr = 0;
0495     do {
0496         pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
0497                 FGP_ACCESSED);
0498         if (!pages[nr]) {
0499             if (!*cached_page) {
0500                 *cached_page = page_cache_alloc(mapping);
0501                 if (unlikely(!*cached_page)) {
0502                     err = -ENOMEM;
0503                     goto err_out;
0504                 }
0505             }
0506             err = add_to_page_cache_lru(*cached_page, mapping,
0507                    index,
0508                    mapping_gfp_constraint(mapping, GFP_KERNEL));
0509             if (unlikely(err)) {
0510                 if (err == -EEXIST)
0511                     continue;
0512                 goto err_out;
0513             }
0514             pages[nr] = *cached_page;
0515             *cached_page = NULL;
0516         }
0517         index++;
0518         nr++;
0519     } while (nr < nr_pages);
0520 out:
0521     return err;
0522 err_out:
0523     while (nr > 0) {
0524         unlock_page(pages[--nr]);
0525         put_page(pages[nr]);
0526     }
0527     goto out;
0528 }
0529
0530 static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
0531 {
0532     lock_buffer(bh);
0533     get_bh(bh);
0534     bh->b_end_io = end_buffer_read_sync;
0535     return submit_bh(REQ_OP_READ, bh);
0536 }
0537
0538 /**
0539  * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
0540  * @pages:  array of destination pages
0541  * @nr_pages:   number of pages in @pages
0542  * @pos:    byte position in file at which the write begins
0543  * @bytes:  number of bytes to be written
0544  *
0545  * This is called for non-resident attributes from ntfs_file_buffered_write()
0546  * with i_mutex held on the inode (@pages[0]->mapping->host).  There are
0547  * @nr_pages pages in @pages which are locked but not kmap()ped.  The source
0548  * data has not yet been copied into the @pages.
0549  *
0550  * Need to fill any holes with actual clusters, allocate buffers if necessary,
0551  * ensure all the buffers are mapped, and bring uptodate any buffers that are
0552  * only partially being written to.
0553  *
0554  * If @nr_pages is greater than one, we are guaranteed that the cluster size is
0555  * greater than PAGE_SIZE, that all pages in @pages are entirely inside
0556  * the same cluster and that they are the entirety of that cluster, and that
0557  * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
0558  *
0559  * i_size is not to be modified yet.
0560  *
0561  * Return 0 on success or -errno on error.
0562  */
0563 static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
0564         unsigned nr_pages, s64 pos, size_t bytes)
0565 {
0566     VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
0567     LCN lcn;
0568     s64 bh_pos, vcn_len, end, initialized_size;
0569     sector_t lcn_block;
0570     struct page *page;
0571     struct inode *vi;
0572     ntfs_inode *ni, *base_ni = NULL;
0573     ntfs_volume *vol;
0574     runlist_element *rl, *rl2;
0575     struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
0576     ntfs_attr_search_ctx *ctx = NULL;
0577     MFT_RECORD *m = NULL;
0578     ATTR_RECORD *a = NULL;
0579     unsigned long flags;
0580     u32 attr_rec_len = 0;
0581     unsigned blocksize, u;
0582     int err, mp_size;
0583     bool rl_write_locked, was_hole, is_retry;
0584     unsigned char blocksize_bits;
0585     struct {
0586         u8 runlist_merged:1;
0587         u8 mft_attr_mapped:1;
0588         u8 mp_rebuilt:1;
0589         u8 attr_switched:1;
0590     } status = { 0, 0, 0, 0 };
0591
0592     BUG_ON(!nr_pages);
0593     BUG_ON(!pages);
0594     BUG_ON(!*pages);
0595     vi = pages[0]->mapping->host;
0596     ni = NTFS_I(vi);
0597     vol = ni->vol;
0598     ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
0599             "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
0600             vi->i_ino, ni->type, pages[0]->index, nr_pages,
0601             (long long)pos, bytes);
0602     blocksize = vol->sb->s_blocksize;
0603     blocksize_bits = vol->sb->s_blocksize_bits;
0604     u = 0;
0605     do {
0606         page = pages[u];
0607         BUG_ON(!page);
0608         /*
0609          * create_empty_buffers() will create uptodate/dirty buffers if
0610          * the page is uptodate/dirty.
0611          */
0612         if (!page_has_buffers(page)) {
0613             create_empty_buffers(page, blocksize, 0);
0614             if (unlikely(!page_has_buffers(page)))
0615                 return -ENOMEM;
0616         }
0617     } while (++u < nr_pages);
0618     rl_write_locked = false;
0619     rl = NULL;
0620     err = 0;
0621     vcn = lcn = -1;
0622     vcn_len = 0;
0623     lcn_block = -1;
0624     was_hole = false;
0625     cpos = pos >> vol->cluster_size_bits;
0626     end = pos + bytes;
0627     cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
0628     /*
0629      * Loop over each page and for each page over each buffer.  Use goto to
0630      * reduce indentation.
0631      */
0632     u = 0;
0633 do_next_page:
0634     page = pages[u];
0635     bh_pos = (s64)page->index << PAGE_SHIFT;
0636     bh = head = page_buffers(page);
0637     do {
0638         VCN cdelta;
0639         s64 bh_end;
0640         unsigned bh_cofs;
0641
0642         /* Clear buffer_new on all buffers to reinitialise state. */
0643         if (buffer_new(bh))
0644             clear_buffer_new(bh);
0645         bh_end = bh_pos + blocksize;
0646         bh_cpos = bh_pos >> vol->cluster_size_bits;
0647         bh_cofs = bh_pos & vol->cluster_size_mask;
0648         if (buffer_mapped(bh)) {
0649             /*
0650              * The buffer is already mapped.  If it is uptodate,
0651              * ignore it.
0652              */
0653             if (buffer_uptodate(bh))
0654                 continue;
0655             /*
0656              * The buffer is not uptodate.  If the page is uptodate
0657              * set the buffer uptodate and otherwise ignore it.
0658              */
0659             if (PageUptodate(page)) {
0660                 set_buffer_uptodate(bh);
0661                 continue;
0662             }
0663             /*
0664              * Neither the page nor the buffer are uptodate.  If
0665              * the buffer is only partially being written to, we
0666              * need to read it in before the write, i.e. now.
0667              */
0668             if ((bh_pos < pos && bh_end > pos) ||
0669                     (bh_pos < end && bh_end > end)) {
0670                 /*
0671                  * If the buffer is fully or partially within
0672                  * the initialized size, do an actual read.
0673                  * Otherwise, simply zero the buffer.
0674                  */
0675                 read_lock_irqsave(&ni->size_lock, flags);
0676                 initialized_size = ni->initialized_size;
0677                 read_unlock_irqrestore(&ni->size_lock, flags);
0678                 if (bh_pos < initialized_size) {
0679                     ntfs_submit_bh_for_read(bh);
0680                     *wait_bh++ = bh;
0681                 } else {
0682                     zero_user(page, bh_offset(bh),
0683                             blocksize);
0684                     set_buffer_uptodate(bh);
0685                 }
0686             }
0687             continue;
0688         }
0689         /* Unmapped buffer.  Need to map it. */
0690         bh->b_bdev = vol->sb->s_bdev;
0691         /*
0692          * If the current buffer is in the same clusters as the map
0693          * cache, there is no need to check the runlist again.  The
0694          * map cache is made up of @vcn, which is the first cached file
0695          * cluster, @vcn_len which is the number of cached file
0696          * clusters, @lcn is the device cluster corresponding to @vcn,
0697          * and @lcn_block is the block number corresponding to @lcn.
0698          */
0699         cdelta = bh_cpos - vcn;
0700         if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
0701 map_buffer_cached:
0702             BUG_ON(lcn < 0);
0703             bh->b_blocknr = lcn_block +
0704                     (cdelta << (vol->cluster_size_bits -
0705                     blocksize_bits)) +
0706                     (bh_cofs >> blocksize_bits);
0707             set_buffer_mapped(bh);
0708             /*
0709              * If the page is uptodate so is the buffer.  If the
0710              * buffer is fully outside the write, we ignore it if
0711              * it was already allocated and we mark it dirty so it
0712              * gets written out if we allocated it.  On the other
0713              * hand, if we allocated the buffer but we are not
0714              * marking it dirty we set buffer_new so we can do
0715              * error recovery.
0716              */
0717             if (PageUptodate(page)) {
0718                 if (!buffer_uptodate(bh))
0719                     set_buffer_uptodate(bh);
0720                 if (unlikely(was_hole)) {
0721                     /* We allocated the buffer. */
0722                     clean_bdev_bh_alias(bh);
0723                     if (bh_end <= pos || bh_pos >= end)
0724                         mark_buffer_dirty(bh);
0725                     else
0726                         set_buffer_new(bh);
0727                 }
0728                 continue;
0729             }
0730             /* Page is _not_ uptodate. */
0731             if (likely(!was_hole)) {
0732                 /*
0733                  * Buffer was already allocated.  If it is not
0734                  * uptodate and is only partially being written
0735                  * to, we need to read it in before the write,
0736                  * i.e. now.
0737                  */
0738                 if (!buffer_uptodate(bh) && bh_pos < end &&
0739                         bh_end > pos &&
0740                         (bh_pos < pos ||
0741                         bh_end > end)) {
0742                     /*
0743                      * If the buffer is fully or partially
0744                      * within the initialized size, do an
0745                      * actual read.  Otherwise, simply zero
0746                      * the buffer.
0747                      */
0748                     read_lock_irqsave(&ni->size_lock,
0749                             flags);
0750                     initialized_size = ni->initialized_size;
0751                     read_unlock_irqrestore(&ni->size_lock,
0752                             flags);
0753                     if (bh_pos < initialized_size) {
0754                         ntfs_submit_bh_for_read(bh);
0755                         *wait_bh++ = bh;
0756                     } else {
0757                         zero_user(page, bh_offset(bh),
0758                                 blocksize);
0759                         set_buffer_uptodate(bh);
0760                     }
0761                 }
0762                 continue;
0763             }
0764             /* We allocated the buffer. */
0765             clean_bdev_bh_alias(bh);
0766             /*
0767              * If the buffer is fully outside the write, zero it,
0768              * set it uptodate, and mark it dirty so it gets
0769              * written out.  If it is partially being written to,
0770              * zero region surrounding the write but leave it to
0771              * commit write to do anything else.  Finally, if the
0772              * buffer is fully being overwritten, do nothing.
0773              */
0774             if (bh_end <= pos || bh_pos >= end) {
0775                 if (!buffer_uptodate(bh)) {
0776                     zero_user(page, bh_offset(bh),
0777                             blocksize);
0778                     set_buffer_uptodate(bh);
0779                 }
0780                 mark_buffer_dirty(bh);
0781                 continue;
0782             }
0783             set_buffer_new(bh);
0784             if (!buffer_uptodate(bh) &&
0785                     (bh_pos < pos || bh_end > end)) {
0786                 u8 *kaddr;
0787                 unsigned pofs;
0788
0789                 kaddr = kmap_atomic(page);
0790                 if (bh_pos < pos) {
0791                     pofs = bh_pos & ~PAGE_MASK;
0792                     memset(kaddr + pofs, 0, pos - bh_pos);
0793                 }
0794                 if (bh_end > end) {
0795                     pofs = end & ~PAGE_MASK;
0796                     memset(kaddr + pofs, 0, bh_end - end);
0797                 }
0798                 kunmap_atomic(kaddr);
0799                 flush_dcache_page(page);
0800             }
0801             continue;
0802         }
0803         /*
0804          * Slow path: this is the first buffer in the cluster.  If it
0805          * is outside allocated size and is not uptodate, zero it and
0806          * set it uptodate.
0807          */
0808         read_lock_irqsave(&ni->size_lock, flags);
0809         initialized_size = ni->allocated_size;
0810         read_unlock_irqrestore(&ni->size_lock, flags);
0811         if (bh_pos > initialized_size) {
0812             if (PageUptodate(page)) {
0813                 if (!buffer_uptodate(bh))
0814                     set_buffer_uptodate(bh);
0815             } else if (!buffer_uptodate(bh)) {
0816                 zero_user(page, bh_offset(bh), blocksize);
0817                 set_buffer_uptodate(bh);
0818             }
0819             continue;
0820         }
0821         is_retry = false;
0822         if (!rl) {
0823             down_read(&ni->runlist.lock);
0824 retry_remap:
0825             rl = ni->runlist.rl;
0826         }
0827         if (likely(rl != NULL)) {
0828             /* Seek to element containing target cluster. */
0829             while (rl->length && rl[1].vcn <= bh_cpos)
0830                 rl++;
0831             lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
0832             if (likely(lcn >= 0)) {
0833                 /*
0834                  * Successful remap, setup the map cache and
0835                  * use that to deal with the buffer.
0836                  */
0837                 was_hole = false;
0838                 vcn = bh_cpos;
0839                 vcn_len = rl[1].vcn - vcn;
0840                 lcn_block = lcn << (vol->cluster_size_bits -
0841                         blocksize_bits);
0842                 cdelta = 0;
0843                 /*
0844                  * If the number of remaining clusters touched
0845                  * by the write is smaller or equal to the
0846                  * number of cached clusters, unlock the
0847                  * runlist as the map cache will be used from
0848                  * now on.
0849                  */
0850                 if (likely(vcn + vcn_len >= cend)) {
0851                     if (rl_write_locked) {
0852                         up_write(&ni->runlist.lock);
0853                         rl_write_locked = false;
0854                     } else
0855                         up_read(&ni->runlist.lock);
0856                     rl = NULL;
0857                 }
0858                 goto map_buffer_cached;
0859             }
0860         } else
0861             lcn = LCN_RL_NOT_MAPPED;
0862         /*
0863          * If it is not a hole and not out of bounds, the runlist is
0864          * probably unmapped so try to map it now.
0865          */
0866         if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
0867             if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
0868                 /* Attempt to map runlist. */
0869                 if (!rl_write_locked) {
0870                     /*
0871                      * We need the runlist locked for
0872                      * writing, so if it is locked for
0873                      * reading relock it now and retry in
0874                      * case it changed whilst we dropped
0875                      * the lock.
0876                      */
0877                     up_read(&ni->runlist.lock);
0878                     down_write(&ni->runlist.lock);
0879                     rl_write_locked = true;
0880                     goto retry_remap;
0881                 }
0882                 err = ntfs_map_runlist_nolock(ni, bh_cpos,
0883                         NULL);
0884                 if (likely(!err)) {
0885                     is_retry = true;
0886                     goto retry_remap;
0887                 }
0888                 /*
0889                  * If @vcn is out of bounds, pretend @lcn is
0890                  * LCN_ENOENT.  As long as the buffer is out
0891                  * of bounds this will work fine.
0892                  */
0893                 if (err == -ENOENT) {
0894                     lcn = LCN_ENOENT;
0895                     err = 0;
0896                     goto rl_not_mapped_enoent;
0897                 }
0898             } else
0899                 err = -EIO;
0900             /* Failed to map the buffer, even after retrying. */
0901             bh->b_blocknr = -1;
0902             ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
0903                     "attribute type 0x%x, vcn 0x%llx, "
0904                     "vcn offset 0x%x, because its "
0905                     "location on disk could not be "
0906                     "determined%s (error code %i).",
0907                     ni->mft_no, ni->type,
0908                     (unsigned long long)bh_cpos,
0909                     (unsigned)bh_pos &
0910                     vol->cluster_size_mask,
0911                     is_retry ? " even after retrying" : "",
0912                     err);
0913             break;
0914         }
0915 rl_not_mapped_enoent:
0916         /*
0917          * The buffer is in a hole or out of bounds.  We need to fill
0918          * the hole, unless the buffer is in a cluster which is not
0919          * touched by the write, in which case we just leave the buffer
0920          * unmapped.  This can only happen when the cluster size is
0921          * less than the page cache size.
0922          */
0923         if (unlikely(vol->cluster_size < PAGE_SIZE)) {
0924             bh_cend = (bh_end + vol->cluster_size - 1) >>
0925                     vol->cluster_size_bits;
0926             if ((bh_cend <= cpos || bh_cpos >= cend)) {
0927                 bh->b_blocknr = -1;
0928                 /*
0929                  * If the buffer is uptodate we skip it.  If it
0930                  * is not but the page is uptodate, we can set
0931                  * the buffer uptodate.  If the page is not
0932                  * uptodate, we can clear the buffer and set it
0933                  * uptodate.  Whether this is worthwhile is
0934                  * debatable and this could be removed.
0935                  */
0936                 if (PageUptodate(page)) {
0937                     if (!buffer_uptodate(bh))
0938                         set_buffer_uptodate(bh);
0939                 } else if (!buffer_uptodate(bh)) {
0940                     zero_user(page, bh_offset(bh),
0941                         blocksize);
0942                     set_buffer_uptodate(bh);
0943                 }
0944                 continue;
0945             }
0946         }
0947         /*
0948          * Out of bounds buffer is invalid if it was not really out of
0949          * bounds.
0950          */
0951         BUG_ON(lcn != LCN_HOLE);
0952         /*
0953          * We need the runlist locked for writing, so if it is locked
0954          * for reading relock it now and retry in case it changed
0955          * whilst we dropped the lock.
0956          */
0957         BUG_ON(!rl);
0958         if (!rl_write_locked) {
0959             up_read(&ni->runlist.lock);
0960             down_write(&ni->runlist.lock);
0961             rl_write_locked = true;
0962             goto retry_remap;
0963         }
0964         /* Find the previous last allocated cluster. */
0965         BUG_ON(rl->lcn != LCN_HOLE);
0966         lcn = -1;
0967         rl2 = rl;
0968         while (--rl2 >= ni->runlist.rl) {
0969             if (rl2->lcn >= 0) {
0970                 lcn = rl2->lcn + rl2->length;
0971                 break;
0972             }
0973         }
0974         rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
0975                 false);
0976         if (IS_ERR(rl2)) {
0977             err = PTR_ERR(rl2);
0978             ntfs_debug("Failed to allocate cluster, error code %i.",
0979                     err);
0980             break;
0981         }
0982         lcn = rl2->lcn;
0983         rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
0984         if (IS_ERR(rl)) {
0985             err = PTR_ERR(rl);
0986             if (err != -ENOMEM)
0987                 err = -EIO;
0988             if (ntfs_cluster_free_from_rl(vol, rl2)) {
0989                 ntfs_error(vol->sb, "Failed to release "
0990                         "allocated cluster in error "
0991                         "code path.  Run chkdsk to "
0992                         "recover the lost cluster.");
0993                 NVolSetErrors(vol);
0994             }
0995             ntfs_free(rl2);
0996             break;
0997         }
0998         ni->runlist.rl = rl;
0999         status.runlist_merged = 1;
1000         ntfs_debug("Allocated cluster, lcn 0x%llx.",
1001                 (unsigned long long)lcn);
1002         /* Map and lock the mft record and get the attribute record. */
1003         if (!NInoAttr(ni))
1004             base_ni = ni;
1005         else
1006             base_ni = ni->ext.base_ntfs_ino;
1007         m = map_mft_record(base_ni);
1008         if (IS_ERR(m)) {
1009             err = PTR_ERR(m);
1010             break;
1011         }
1012         ctx = ntfs_attr_get_search_ctx(base_ni, m);
1013         if (unlikely(!ctx)) {
1014             err = -ENOMEM;
1015             unmap_mft_record(base_ni);
1016             break;
1017         }
1018         status.mft_attr_mapped = 1;
1019         err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1020                 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
1021         if (unlikely(err)) {
1022             if (err == -ENOENT)
1023                 err = -EIO;
1024             break;
1025         }
1026         m = ctx->mrec;
1027         a = ctx->attr;
1028         /*
1029          * Find the runlist element with which the attribute extent
1030          * starts.  Note, we cannot use the _attr_ version because we
1031          * have mapped the mft record.  That is ok because we know the
1032          * runlist fragment must be mapped already to have ever gotten
1033          * here, so we can just use the _rl_ version.
1034          */
1035         vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
1036         rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
1037         BUG_ON(!rl2);
1038         BUG_ON(!rl2->length);
1039         BUG_ON(rl2->lcn < LCN_HOLE);
1040         highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
1041         /*
1042          * If @highest_vcn is zero, calculate the real highest_vcn
1043          * (which can really be zero).
1044          */
1045         if (!highest_vcn)
1046             highest_vcn = (sle64_to_cpu(
1047                     a->data.non_resident.allocated_size) >>
1048                     vol->cluster_size_bits) - 1;
1049         /*
1050          * Determine the size of the mapping pairs array for the new
1051          * extent, i.e. the old extent with the hole filled.
1052          */
1053         mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
1054                 highest_vcn);
1055         if (unlikely(mp_size <= 0)) {
1056             if (!(err = mp_size))
1057                 err = -EIO;
1058             ntfs_debug("Failed to get size for mapping pairs "
1059                     "array, error code %i.", err);
1060             break;
1061         }
1062         /*
1063          * Resize the attribute record to fit the new mapping pairs
1064          * array.
1065          */
1066         attr_rec_len = le32_to_cpu(a->length);
1067         err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
1068                 a->data.non_resident.mapping_pairs_offset));
1069         if (unlikely(err)) {
1070             BUG_ON(err != -ENOSPC);
1071             // TODO: Deal with this by using the current attribute
1072             // and fill it with as much of the mapping pairs
1073             // array as possible.  Then loop over each attribute
1074             // extent rewriting the mapping pairs arrays as we go
1075             // along and if when we reach the end we have not
1076             // enough space, try to resize the last attribute
1077             // extent and if even that fails, add a new attribute
1078             // extent.
1079             // We could also try to resize at each step in the hope
1080             // that we will not need to rewrite every single extent.
1081             // Note, we may need to decompress some extents to fill
1082             // the runlist as we are walking the extents...
1083             ntfs_error(vol->sb, "Not enough space in the mft "
1084                     "record for the extended attribute "
1085                     "record.  This case is not "
1086                     "implemented yet.");
1087             err = -EOPNOTSUPP;
1088             break ;
1089         }
1090         status.mp_rebuilt = 1;
1091         /*
1092          * Generate the mapping pairs array directly into the attribute
1093          * record.
1094          */
1095         err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1096                 a->data.non_resident.mapping_pairs_offset),
1097                 mp_size, rl2, vcn, highest_vcn, NULL);
1098         if (unlikely(err)) {
1099             ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
1100                     "attribute type 0x%x, because building "
1101                     "the mapping pairs failed with error "
1102                     "code %i.", vi->i_ino,
1103                     (unsigned)le32_to_cpu(ni->type), err);
1104             err = -EIO;
1105             break;
1106         }
1107         /* Update the highest_vcn but only if it was not set. */
1108         if (unlikely(!a->data.non_resident.highest_vcn))
1109             a->data.non_resident.highest_vcn =
1110                     cpu_to_sle64(highest_vcn);
1111         /*
1112          * If the attribute is sparse/compressed, update the compressed
1113          * size in the ntfs_inode structure and the attribute record.
1114          */
1115         if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
1116             /*
1117              * If we are not in the first attribute extent, switch
1118              * to it, but first ensure the changes will make it to
1119              * disk later.
1120              */
1121             if (a->data.non_resident.lowest_vcn) {
1122                 flush_dcache_mft_record_page(ctx->ntfs_ino);
1123                 mark_mft_record_dirty(ctx->ntfs_ino);
1124                 ntfs_attr_reinit_search_ctx(ctx);
1125                 err = ntfs_attr_lookup(ni->type, ni->name,
1126                         ni->name_len, CASE_SENSITIVE,
1127                         0, NULL, 0, ctx);
1128                 if (unlikely(err)) {
1129                     status.attr_switched = 1;
1130                     break;
1131                 }
1132                 /* @m is not used any more so do not set it. */
1133                 a = ctx->attr;
1134             }
1135             write_lock_irqsave(&ni->size_lock, flags);
1136             ni->itype.compressed.size += vol->cluster_size;
1137             a->data.non_resident.compressed_size =
1138                     cpu_to_sle64(ni->itype.compressed.size);
1139             write_unlock_irqrestore(&ni->size_lock, flags);
1140         }
1141         /* Ensure the changes make it to disk. */
1142         flush_dcache_mft_record_page(ctx->ntfs_ino);
1143         mark_mft_record_dirty(ctx->ntfs_ino);
1144         ntfs_attr_put_search_ctx(ctx);
1145         unmap_mft_record(base_ni);
1146         /* Successfully filled the hole. */
1147         status.runlist_merged = 0;
1148         status.mft_attr_mapped = 0;
1149         status.mp_rebuilt = 0;
1150         /* Setup the map cache and use that to deal with the buffer. */
1151         was_hole = true;
1152         vcn = bh_cpos;
1153         vcn_len = 1;
1154         lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
1155         cdelta = 0;
1156         /*
1157          * If the number of remaining clusters in the @pages is smaller
1158          * or equal to the number of cached clusters, unlock the
1159          * runlist as the map cache will be used from now on.
1160          */
1161         if (likely(vcn + vcn_len >= cend)) {
1162             up_write(&ni->runlist.lock);
1163             rl_write_locked = false;
1164             rl = NULL;
1165         }
1166         goto map_buffer_cached;
1167     } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1168     /* If there are no errors, do the next page. */
1169     if (likely(!err && ++u < nr_pages))
1170         goto do_next_page;
1171     /* If there are no errors, release the runlist lock if we took it. */
1172     if (likely(!err)) {
1173         if (unlikely(rl_write_locked)) {
1174             up_write(&ni->runlist.lock);
1175             rl_write_locked = false;
1176         } else if (unlikely(rl))
1177             up_read(&ni->runlist.lock);
1178         rl = NULL;
1179     }
1180     /* If we issued read requests, let them complete. */
1181     read_lock_irqsave(&ni->size_lock, flags);
1182     initialized_size = ni->initialized_size;
1183     read_unlock_irqrestore(&ni->size_lock, flags);
1184     while (wait_bh > wait) {
1185         bh = *--wait_bh;
1186         wait_on_buffer(bh);
1187         if (likely(buffer_uptodate(bh))) {
1188             page = bh->b_page;
1189             bh_pos = ((s64)page->index << PAGE_SHIFT) +
1190                     bh_offset(bh);
1191             /*
1192              * If the buffer overflows the initialized size, need
1193              * to zero the overflowing region.
1194              */
1195             if (unlikely(bh_pos + blocksize > initialized_size)) {
1196                 int ofs = 0;
1197
1198                 if (likely(bh_pos < initialized_size))
1199                     ofs = initialized_size - bh_pos;
1200                 zero_user_segment(page, bh_offset(bh) + ofs,
1201                         blocksize);
1202             }
1203         } else /* if (unlikely(!buffer_uptodate(bh))) */
1204             err = -EIO;
1205     }
1206     if (likely(!err)) {
1207         /* Clear buffer_new on all buffers. */
1208         u = 0;
1209         do {
1210             bh = head = page_buffers(pages[u]);
1211             do {
1212                 if (buffer_new(bh))
1213                     clear_buffer_new(bh);
1214             } while ((bh = bh->b_this_page) != head);
1215         } while (++u < nr_pages);
1216         ntfs_debug("Done.");
1217         return err;
1218     }
1219     if (status.attr_switched) {
1220         /* Get back to the attribute extent we modified. */
1221         ntfs_attr_reinit_search_ctx(ctx);
1222         if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1223                 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
1224             ntfs_error(vol->sb, "Failed to find required "
1225                     "attribute extent of attribute in "
1226                     "error code path.  Run chkdsk to "
1227                     "recover.");
1228             write_lock_irqsave(&ni->size_lock, flags);
1229             ni->itype.compressed.size += vol->cluster_size;
1230             write_unlock_irqrestore(&ni->size_lock, flags);
1231             flush_dcache_mft_record_page(ctx->ntfs_ino);
1232             mark_mft_record_dirty(ctx->ntfs_ino);
1233             /*
1234              * The only thing that is now wrong is the compressed
1235              * size of the base attribute extent which chkdsk
1236              * should be able to fix.
1237              */
1238             NVolSetErrors(vol);
1239         } else {
1240             m = ctx->mrec;
1241             a = ctx->attr;
1242             status.attr_switched = 0;
1243         }
1244     }
1245     /*
1246      * If the runlist has been modified, need to restore it by punching a
1247      * hole into it and we then need to deallocate the on-disk cluster as
1248      * well.  Note, we only modify the runlist if we are able to generate a
1249      * new mapping pairs array, i.e. only when the mapped attribute extent
1250      * is not switched.
1251      */
1252     if (status.runlist_merged && !status.attr_switched) {
1253         BUG_ON(!rl_write_locked);
1254         /* Make the file cluster we allocated sparse in the runlist. */
1255         if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
1256             ntfs_error(vol->sb, "Failed to punch hole into "
1257                     "attribute runlist in error code "
1258                     "path.  Run chkdsk to recover the "
1259                     "lost cluster.");
1260             NVolSetErrors(vol);
1261         } else /* if (success) */ {
1262             status.runlist_merged = 0;
1263             /*
1264              * Deallocate the on-disk cluster we allocated but only
1265              * if we succeeded in punching its vcn out of the
1266              * runlist.
1267              */
1268             down_write(&vol->lcnbmp_lock);
1269             if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1270                 ntfs_error(vol->sb, "Failed to release "
1271                         "allocated cluster in error "
1272                         "code path.  Run chkdsk to "
1273                         "recover the lost cluster.");
1274                 NVolSetErrors(vol);
1275             }
1276             up_write(&vol->lcnbmp_lock);
1277         }
1278     }
1279     /*
1280      * Resize the attribute record to its old size and rebuild the mapping
1281      * pairs array.  Note, we only can do this if the runlist has been
1282      * restored to its old state which also implies that the mapped
1283      * attribute extent is not switched.
1284      */
1285     if (status.mp_rebuilt && !status.runlist_merged) {
1286         if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
1287             ntfs_error(vol->sb, "Failed to restore attribute "
1288                     "record in error code path.  Run "
1289                     "chkdsk to recover.");
1290             NVolSetErrors(vol);
1291         } else /* if (success) */ {
1292             if (ntfs_mapping_pairs_build(vol, (u8*)a +
1293                     le16_to_cpu(a->data.non_resident.
1294                     mapping_pairs_offset), attr_rec_len -
1295                     le16_to_cpu(a->data.non_resident.
1296                     mapping_pairs_offset), ni->runlist.rl,
1297                     vcn, highest_vcn, NULL)) {
1298                 ntfs_error(vol->sb, "Failed to restore "
1299                         "mapping pairs array in error "
1300                         "code path.  Run chkdsk to "
1301                         "recover.");
1302                 NVolSetErrors(vol);
1303             }
1304             flush_dcache_mft_record_page(ctx->ntfs_ino);
1305             mark_mft_record_dirty(ctx->ntfs_ino);
1306         }
1307     }
1308     /* Release the mft record and the attribute. */
1309     if (status.mft_attr_mapped) {
1310         ntfs_attr_put_search_ctx(ctx);
1311         unmap_mft_record(base_ni);
1312     }
1313     /* Release the runlist lock. */
1314     if (rl_write_locked)
1315         up_write(&ni->runlist.lock);
1316     else if (rl)
1317         up_read(&ni->runlist.lock);
1318     /*
1319      * Zero out any newly allocated blocks to avoid exposing stale data.
1320      * If BH_New is set, we know that the block was newly allocated above
1321      * and that it has not been fully zeroed and marked dirty yet.
1322      */
1323     nr_pages = u;
1324     u = 0;
1325     end = bh_cpos << vol->cluster_size_bits;
1326     do {
1327         page = pages[u];
1328         bh = head = page_buffers(page);
1329         do {
1330             if (u == nr_pages &&
1331                     ((s64)page->index << PAGE_SHIFT) +
1332                     bh_offset(bh) >= end)
1333                 break;
1334             if (!buffer_new(bh))
1335                 continue;
1336             clear_buffer_new(bh);
1337             if (!buffer_uptodate(bh)) {
1338                 if (PageUptodate(page))
1339                     set_buffer_uptodate(bh);
1340                 else {
1341                     zero_user(page, bh_offset(bh),
1342                             blocksize);
1343                     set_buffer_uptodate(bh);
1344                 }
1345             }
1346             mark_buffer_dirty(bh);
1347         } while ((bh = bh->b_this_page) != head);
1348     } while (++u <= nr_pages);
1349     ntfs_error(vol->sb, "Failed.  Returning error code %i.", err);
1350     return err;
1351 }
1352
1353 static inline void ntfs_flush_dcache_pages(struct page **pages,
1354         unsigned nr_pages)
1355 {
1356     BUG_ON(!nr_pages);
1357     /*
1358      * Warning: Do not do the decrement at the same time as the call to
1359      * flush_dcache_page() because it is a NULL macro on i386 and hence the
1360      * decrement never happens so the loop never terminates.
1361      */
1362     do {
1363         --nr_pages;
1364         flush_dcache_page(pages[nr_pages]);
1365     } while (nr_pages > 0);
1366 }
1367
1368 /**
1369  * ntfs_commit_pages_after_non_resident_write - commit the received data
1370  * @pages:  array of destination pages
1371  * @nr_pages:   number of pages in @pages
1372  * @pos:    byte position in file at which the write begins
1373  * @bytes:  number of bytes to be written
1374  *
1375  * See description of ntfs_commit_pages_after_write(), below.
1376  */
1377 static inline int ntfs_commit_pages_after_non_resident_write(
1378         struct page **pages, const unsigned nr_pages,
1379         s64 pos, size_t bytes)
1380 {
1381     s64 end, initialized_size;
1382     struct inode *vi;
1383     ntfs_inode *ni, *base_ni;
1384     struct buffer_head *bh, *head;
1385     ntfs_attr_search_ctx *ctx;
1386     MFT_RECORD *m;
1387     ATTR_RECORD *a;
1388     unsigned long flags;
1389     unsigned blocksize, u;
1390     int err;
1391
1392     vi = pages[0]->mapping->host;
1393     ni = NTFS_I(vi);
1394     blocksize = vi->i_sb->s_blocksize;
1395     end = pos + bytes;
1396     u = 0;
1397     do {
1398         s64 bh_pos;
1399         struct page *page;
1400         bool partial;
1401
1402         page = pages[u];
1403         bh_pos = (s64)page->index << PAGE_SHIFT;
1404         bh = head = page_buffers(page);
1405         partial = false;
1406         do {
1407             s64 bh_end;
1408
1409             bh_end = bh_pos + blocksize;
1410             if (bh_end <= pos || bh_pos >= end) {
1411                 if (!buffer_uptodate(bh))
1412                     partial = true;
1413             } else {
1414                 set_buffer_uptodate(bh);
1415                 mark_buffer_dirty(bh);
1416             }
1417         } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1418         /*
1419          * If all buffers are now uptodate but the page is not, set the
1420          * page uptodate.
1421          */
1422         if (!partial && !PageUptodate(page))
1423             SetPageUptodate(page);
1424     } while (++u < nr_pages);
1425     /*
1426      * Finally, if we do not need to update initialized_size or i_size we
1427      * are finished.
1428      */
1429     read_lock_irqsave(&ni->size_lock, flags);
1430     initialized_size = ni->initialized_size;
1431     read_unlock_irqrestore(&ni->size_lock, flags);
1432     if (end <= initialized_size) {
1433         ntfs_debug("Done.");
1434         return 0;
1435     }
1436     /*
1437      * Update initialized_size/i_size as appropriate, both in the inode and
1438      * the mft record.
1439      */
1440     if (!NInoAttr(ni))
1441         base_ni = ni;
1442     else
1443         base_ni = ni->ext.base_ntfs_ino;
1444     /* Map, pin, and lock the mft record. */
1445     m = map_mft_record(base_ni);
1446     if (IS_ERR(m)) {
1447         err = PTR_ERR(m);
1448         m = NULL;
1449         ctx = NULL;
1450         goto err_out;
1451     }
1452     BUG_ON(!NInoNonResident(ni));
1453     ctx = ntfs_attr_get_search_ctx(base_ni, m);
1454     if (unlikely(!ctx)) {
1455         err = -ENOMEM;
1456         goto err_out;
1457     }
1458     err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1459             CASE_SENSITIVE, 0, NULL, 0, ctx);
1460     if (unlikely(err)) {
1461         if (err == -ENOENT)
1462             err = -EIO;
1463         goto err_out;
1464     }
1465     a = ctx->attr;
1466     BUG_ON(!a->non_resident);
1467     write_lock_irqsave(&ni->size_lock, flags);
1468     BUG_ON(end > ni->allocated_size);
1469     ni->initialized_size = end;
1470     a->data.non_resident.initialized_size = cpu_to_sle64(end);
1471     if (end > i_size_read(vi)) {
1472         i_size_write(vi, end);
1473         a->data.non_resident.data_size =
1474                 a->data.non_resident.initialized_size;
1475     }
1476     write_unlock_irqrestore(&ni->size_lock, flags);
1477     /* Mark the mft record dirty, so it gets written back. */
1478     flush_dcache_mft_record_page(ctx->ntfs_ino);
1479     mark_mft_record_dirty(ctx->ntfs_ino);
1480     ntfs_attr_put_search_ctx(ctx);
1481     unmap_mft_record(base_ni);
1482     ntfs_debug("Done.");
1483     return 0;
1484 err_out:
1485     if (ctx)
1486         ntfs_attr_put_search_ctx(ctx);
1487     if (m)
1488         unmap_mft_record(base_ni);
1489     ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
1490             "code %i).", err);
1491     if (err != -ENOMEM)
1492         NVolSetErrors(ni->vol);
1493     return err;
1494 }
1495
1496 /**
1497  * ntfs_commit_pages_after_write - commit the received data
1498  * @pages:  array of destination pages
1499  * @nr_pages:   number of pages in @pages
1500  * @pos:    byte position in file at which the write begins
1501  * @bytes:  number of bytes to be written
1502  *
1503  * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
1504  * (@pages[0]->mapping->host).  There are @nr_pages pages in @pages which are
1505  * locked but not kmap()ped.  The source data has already been copied into the
1506  * @page.  ntfs_prepare_pages_for_non_resident_write() has been called before
1507  * the data was copied (for non-resident attributes only) and it returned
1508  * success.
1509  *
1510  * Need to set uptodate and mark dirty all buffers within the boundary of the
1511  * write.  If all buffers in a page are uptodate we set the page uptodate, too.
1512  *
1513  * Setting the buffers dirty ensures that they get written out later when
1514  * ntfs_writepage() is invoked by the VM.
1515  *
1516  * Finally, we need to update i_size and initialized_size as appropriate both
1517  * in the inode and the mft record.
1518  *
1519  * This is modelled after fs/buffer.c::generic_commit_write(), which marks
1520  * buffers uptodate and dirty, sets the page uptodate if all buffers in the
1521  * page are uptodate, and updates i_size if the end of io is beyond i_size.  In
1522  * that case, it also marks the inode dirty.
1523  *
1524  * If things have gone as outlined in
1525  * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
1526  * content modifications here for non-resident attributes.  For resident
1527  * attributes we need to do the uptodate bringing here which we combine with
1528  * the copying into the mft record which means we save one atomic kmap.
1529  *
1530  * Return 0 on success or -errno on error.
1531  */
1532 static int ntfs_commit_pages_after_write(struct page **pages,
1533         const unsigned nr_pages, s64 pos, size_t bytes)
1534 {
1535     s64 end, initialized_size;
1536     loff_t i_size;
1537     struct inode *vi;
1538     ntfs_inode *ni, *base_ni;
1539     struct page *page;
1540     ntfs_attr_search_ctx *ctx;
1541     MFT_RECORD *m;
1542     ATTR_RECORD *a;
1543     char *kattr, *kaddr;
1544     unsigned long flags;
1545     u32 attr_len;
1546     int err;
1547
1548     BUG_ON(!nr_pages);
1549     BUG_ON(!pages);
1550     page = pages[0];
1551     BUG_ON(!page);
1552     vi = page->mapping->host;
1553     ni = NTFS_I(vi);
1554     ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
1555             "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
1556             vi->i_ino, ni->type, page->index, nr_pages,
1557             (long long)pos, bytes);
1558     if (NInoNonResident(ni))
1559         return ntfs_commit_pages_after_non_resident_write(pages,
1560                 nr_pages, pos, bytes);
1561     BUG_ON(nr_pages > 1);
1562     /*
1563      * Attribute is resident, implying it is not compressed, encrypted, or
1564      * sparse.
1565      */
1566     if (!NInoAttr(ni))
1567         base_ni = ni;
1568     else
1569         base_ni = ni->ext.base_ntfs_ino;
1570     BUG_ON(NInoNonResident(ni));
1571     /* Map, pin, and lock the mft record. */
1572     m = map_mft_record(base_ni);
1573     if (IS_ERR(m)) {
1574         err = PTR_ERR(m);
1575         m = NULL;
1576         ctx = NULL;
1577         goto err_out;
1578     }
1579     ctx = ntfs_attr_get_search_ctx(base_ni, m);
1580     if (unlikely(!ctx)) {
1581         err = -ENOMEM;
1582         goto err_out;
1583     }
1584     err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1585             CASE_SENSITIVE, 0, NULL, 0, ctx);
1586     if (unlikely(err)) {
1587         if (err == -ENOENT)
1588             err = -EIO;
1589         goto err_out;
1590     }
1591     a = ctx->attr;
1592     BUG_ON(a->non_resident);
1593     /* The total length of the attribute value. */
1594     attr_len = le32_to_cpu(a->data.resident.value_length);
1595     i_size = i_size_read(vi);
1596     BUG_ON(attr_len != i_size);
1597     BUG_ON(pos > attr_len);
1598     end = pos + bytes;
1599     BUG_ON(end > le32_to_cpu(a->length) -
1600             le16_to_cpu(a->data.resident.value_offset));
1601     kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
1602     kaddr = kmap_atomic(page);
1603     /* Copy the received data from the page to the mft record. */
1604     memcpy(kattr + pos, kaddr + pos, bytes);
1605     /* Update the attribute length if necessary. */
1606     if (end > attr_len) {
1607         attr_len = end;
1608         a->data.resident.value_length = cpu_to_le32(attr_len);
1609     }
1610     /*
1611      * If the page is not uptodate, bring the out of bounds area(s)
1612      * uptodate by copying data from the mft record to the page.
1613      */
1614     if (!PageUptodate(page)) {
1615         if (pos > 0)
1616             memcpy(kaddr, kattr, pos);
1617         if (end < attr_len)
1618             memcpy(kaddr + end, kattr + end, attr_len - end);
1619         /* Zero the region outside the end of the attribute value. */
1620         memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
1621         flush_dcache_page(page);
1622         SetPageUptodate(page);
1623     }
1624     kunmap_atomic(kaddr);
1625     /* Update initialized_size/i_size if necessary. */
1626     read_lock_irqsave(&ni->size_lock, flags);
1627     initialized_size = ni->initialized_size;
1628     BUG_ON(end > ni->allocated_size);
1629     read_unlock_irqrestore(&ni->size_lock, flags);
1630     BUG_ON(initialized_size != i_size);
1631     if (end > initialized_size) {
1632         write_lock_irqsave(&ni->size_lock, flags);
1633         ni->initialized_size = end;
1634         i_size_write(vi, end);
1635         write_unlock_irqrestore(&ni->size_lock, flags);
1636     }
1637     /* Mark the mft record dirty, so it gets written back. */
1638     flush_dcache_mft_record_page(ctx->ntfs_ino);
1639     mark_mft_record_dirty(ctx->ntfs_ino);
1640     ntfs_attr_put_search_ctx(ctx);
1641     unmap_mft_record(base_ni);
1642     ntfs_debug("Done.");
1643     return 0;
1644 err_out:
1645     if (err == -ENOMEM) {
1646         ntfs_warning(vi->i_sb, "Error allocating memory required to "
1647                 "commit the write.");
1648         if (PageUptodate(page)) {
1649             ntfs_warning(vi->i_sb, "Page is uptodate, setting "
1650                     "dirty so the write will be retried "
1651                     "later on by the VM.");
1652             /*
1653              * Put the page on mapping->dirty_pages, but leave its
1654              * buffers' dirty state as-is.
1655              */
1656             __set_page_dirty_nobuffers(page);
1657             err = 0;
1658         } else
1659             ntfs_error(vi->i_sb, "Page is not uptodate.  Written "
1660                     "data has been lost.");
1661     } else {
1662         ntfs_error(vi->i_sb, "Resident attribute commit write failed "
1663                 "with error %i.", err);
1664         NVolSetErrors(ni->vol);
1665     }
1666     if (ctx)
1667         ntfs_attr_put_search_ctx(ctx);
1668     if (m)
1669         unmap_mft_record(base_ni);
1670     return err;
1671 }
1672
1673 /*
1674  * Copy as much as we can into the pages and return the number of bytes which
1675  * were successfully copied.  If a fault is encountered then clear the pages
1676  * out to (ofs + bytes) and return the number of bytes which were copied.
1677  */
1678 static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
1679         unsigned ofs, struct iov_iter *i, size_t bytes)
1680 {
1681     struct page **last_page = pages + nr_pages;
1682     size_t total = 0;
1683     unsigned len, copied;
1684
1685     do {
1686         len = PAGE_SIZE - ofs;
1687         if (len > bytes)
1688             len = bytes;
1689         copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
1690         total += copied;
1691         bytes -= copied;
1692         if (!bytes)
1693             break;
1694         if (copied < len)
1695             goto err;
1696         ofs = 0;
1697     } while (++pages < last_page);
1698 out:
1699     return total;
1700 err:
1701     /* Zero the rest of the target like __copy_from_user(). */
1702     len = PAGE_SIZE - copied;
1703     do {
1704         if (len > bytes)
1705             len = bytes;
1706         zero_user(*pages, copied, len);
1707         bytes -= len;
1708         copied = 0;
1709         len = PAGE_SIZE;
1710     } while (++pages < last_page);
1711     goto out;
1712 }
1713
1714 /**
1715  * ntfs_perform_write - perform buffered write to a file
1716  * @file:   file to write to
1717  * @i:      iov_iter with data to write
1718  * @pos:    byte offset in file at which to begin writing to
1719  */
1720 static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
1721         loff_t pos)
1722 {
1723     struct address_space *mapping = file->f_mapping;
1724     struct inode *vi = mapping->host;
1725     ntfs_inode *ni = NTFS_I(vi);
1726     ntfs_volume *vol = ni->vol;
1727     struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
1728     struct page *cached_page = NULL;
1729     VCN last_vcn;
1730     LCN lcn;
1731     size_t bytes;
1732     ssize_t status, written = 0;
1733     unsigned nr_pages;
1734
1735     ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
1736             "0x%llx, count 0x%lx.", vi->i_ino,
1737             (unsigned)le32_to_cpu(ni->type),
1738             (unsigned long long)pos,
1739             (unsigned long)iov_iter_count(i));
1740     /*
1741      * If a previous ntfs_truncate() failed, repeat it and abort if it
1742      * fails again.
1743      */
1744     if (unlikely(NInoTruncateFailed(ni))) {
1745         int err;
1746
1747         inode_dio_wait(vi);
1748         err = ntfs_truncate(vi);
1749         if (err || NInoTruncateFailed(ni)) {
1750             if (!err)
1751                 err = -EIO;
1752             ntfs_error(vol->sb, "Cannot perform write to inode "
1753                     "0x%lx, attribute type 0x%x, because "
1754                     "ntfs_truncate() failed (error code "
1755                     "%i).", vi->i_ino,
1756                     (unsigned)le32_to_cpu(ni->type), err);
1757             return err;
1758         }
1759     }
1760     /*
1761      * Determine the number of pages per cluster for non-resident
1762      * attributes.
1763      */
1764     nr_pages = 1;
1765     if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
1766         nr_pages = vol->cluster_size >> PAGE_SHIFT;
1767     last_vcn = -1;
1768     do {
1769         VCN vcn;
1770         pgoff_t start_idx;
1771         unsigned ofs, do_pages, u;
1772         size_t copied;
1773
1774         start_idx = pos >> PAGE_SHIFT;
1775         ofs = pos & ~PAGE_MASK;
1776         bytes = PAGE_SIZE - ofs;
1777         do_pages = 1;
1778         if (nr_pages > 1) {
1779             vcn = pos >> vol->cluster_size_bits;
1780             if (vcn != last_vcn) {
1781                 last_vcn = vcn;
1782                 /*
1783                  * Get the lcn of the vcn the write is in.  If
1784                  * it is a hole, need to lock down all pages in
1785                  * the cluster.
1786                  */
1787                 down_read(&ni->runlist.lock);
1788                 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
1789                         vol->cluster_size_bits, false);
1790                 up_read(&ni->runlist.lock);
1791                 if (unlikely(lcn < LCN_HOLE)) {
1792                     if (lcn == LCN_ENOMEM)
1793                         status = -ENOMEM;
1794                     else {
1795                         status = -EIO;
1796                         ntfs_error(vol->sb, "Cannot "
1797                             "perform write to "
1798                             "inode 0x%lx, "
1799                             "attribute type 0x%x, "
1800                             "because the attribute "
1801                             "is corrupt.",
1802                             vi->i_ino, (unsigned)
1803                             le32_to_cpu(ni->type));
1804                     }
1805                     break;
1806                 }
1807                 if (lcn == LCN_HOLE) {
1808                     start_idx = (pos & ~(s64)
1809                             vol->cluster_size_mask)
1810                             >> PAGE_SHIFT;
1811                     bytes = vol->cluster_size - (pos &
1812                             vol->cluster_size_mask);
1813                     do_pages = nr_pages;
1814                 }
1815             }
1816         }
1817         if (bytes > iov_iter_count(i))
1818             bytes = iov_iter_count(i);
1819 again:
1820         /*
1821          * Bring in the user page(s) that we will copy from _first_.
1822          * Otherwise there is a nasty deadlock on copying from the same
1823          * page(s) as we are writing to, without it/them being marked
1824          * up-to-date.  Note, at present there is nothing to stop the
1825          * pages being swapped out between us bringing them into memory
1826          * and doing the actual copying.
1827          */
1828         if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
1829             status = -EFAULT;
1830             break;
1831         }
1832         /* Get and lock @do_pages starting at index @start_idx. */
1833         status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
1834                 pages, &cached_page);
1835         if (unlikely(status))
1836             break;
1837         /*
1838          * For non-resident attributes, we need to fill any holes with
1839          * actual clusters and ensure all bufferes are mapped.  We also
1840          * need to bring uptodate any buffers that are only partially
1841          * being written to.
1842          */
1843         if (NInoNonResident(ni)) {
1844             status = ntfs_prepare_pages_for_non_resident_write(
1845                     pages, do_pages, pos, bytes);
1846             if (unlikely(status)) {
1847                 do {
1848                     unlock_page(pages[--do_pages]);
1849                     put_page(pages[do_pages]);
1850                 } while (do_pages);
1851                 break;
1852             }
1853         }
1854         u = (pos >> PAGE_SHIFT) - pages[0]->index;
1855         copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
1856                     i, bytes);
1857         ntfs_flush_dcache_pages(pages + u, do_pages - u);
1858         status = 0;
1859         if (likely(copied == bytes)) {
1860             status = ntfs_commit_pages_after_write(pages, do_pages,
1861                     pos, bytes);
1862         }
1863         do {
1864             unlock_page(pages[--do_pages]);
1865             put_page(pages[do_pages]);
1866         } while (do_pages);
1867         if (unlikely(status < 0)) {
1868             iov_iter_revert(i, copied);
1869             break;
1870         }
1871         cond_resched();
1872         if (unlikely(copied < bytes)) {
1873             iov_iter_revert(i, copied);
1874             if (copied)
1875                 bytes = copied;
1876             else if (bytes > PAGE_SIZE - ofs)
1877                 bytes = PAGE_SIZE - ofs;
1878             goto again;
1879         }
1880         pos += copied;
1881         written += copied;
1882         balance_dirty_pages_ratelimited(mapping);
1883         if (fatal_signal_pending(current)) {
1884             status = -EINTR;
1885             break;
1886         }
1887     } while (iov_iter_count(i));
1888     if (cached_page)
1889         put_page(cached_page);
1890     ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
1891             written ? "written" : "status", (unsigned long)written,
1892             (long)status);
1893     return written ? written : status;
1894 }
1895
1896 /**
1897  * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
1898  * @iocb:   IO state structure
1899  * @from:   iov_iter with data to write
1900  *
1901  * Basically the same as generic_file_write_iter() except that it ends up
1902  * up calling ntfs_perform_write() instead of generic_perform_write() and that
1903  * O_DIRECT is not implemented.
1904  */
1905 static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1906 {
1907     struct file *file = iocb->ki_filp;
1908     struct inode *vi = file_inode(file);
1909     ssize_t written = 0;
1910     ssize_t err;
1911
1912     inode_lock(vi);
1913     /* We can write back this queue in page reclaim. */
1914     current->backing_dev_info = inode_to_bdi(vi);
1915     err = ntfs_prepare_file_for_write(iocb, from);
1916     if (iov_iter_count(from) && !err)
1917         written = ntfs_perform_write(file, from, iocb->ki_pos);
1918     current->backing_dev_info = NULL;
1919     inode_unlock(vi);
1920     iocb->ki_pos += written;
1921     if (likely(written > 0))
1922         written = generic_write_sync(iocb, written);
1923     return written ? written : err;
1924 }
1925
1926 /**
1927  * ntfs_file_fsync - sync a file to disk
1928  * @filp:   file to be synced
1929  * @datasync:   if non-zero only flush user data and not metadata
1930  *
1931  * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
1932  * system calls.  This function is inspired by fs/buffer.c::file_fsync().
1933  *
1934  * If @datasync is false, write the mft record and all associated extent mft
1935  * records as well as the $DATA attribute and then sync the block device.
1936  *
1937  * If @datasync is true and the attribute is non-resident, we skip the writing
1938  * of the mft record and all associated extent mft records (this might still
1939  * happen due to the write_inode_now() call).
1940  *
1941  * Also, if @datasync is true, we do not wait on the inode to be written out
1942  * but we always wait on the page cache pages to be written out.
1943  *
1944  * Locking: Caller must hold i_mutex on the inode.
1945  *
1946  * TODO: We should probably also write all attribute/index inodes associated
1947  * with this inode but since we have no simple way of getting to them we ignore
1948  * this problem for now.
1949  */
1950 static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
1951                int datasync)
1952 {
1953     struct inode *vi = filp->f_mapping->host;
1954     int err, ret = 0;
1955
1956     ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
1957
1958     err = file_write_and_wait_range(filp, start, end);
1959     if (err)
1960         return err;
1961     inode_lock(vi);
1962
1963     BUG_ON(S_ISDIR(vi->i_mode));
1964     if (!datasync || !NInoNonResident(NTFS_I(vi)))
1965         ret = __ntfs_write_inode(vi, 1);
1966     write_inode_now(vi, !datasync);
1967     /*
1968      * NOTE: If we were to use mapping->private_list (see ext2 and
1969      * fs/buffer.c) for dirty blocks then we could optimize the below to be
1970      * sync_mapping_buffers(vi->i_mapping).
1971      */
1972     err = sync_blockdev(vi->i_sb->s_bdev);
1973     if (unlikely(err && !ret))
1974         ret = err;
1975     if (likely(!ret))
1976         ntfs_debug("Done.");
1977     else
1978         ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
1979                 "%u.", datasync ? "data" : "", vi->i_ino, -ret);
1980     inode_unlock(vi);
1981     return ret;
1982 }
1983
1984 #endif /* NTFS_RW */
1985
1986 const struct file_operations ntfs_file_ops = {
1987     .llseek     = generic_file_llseek,
1988     .read_iter  = generic_file_read_iter,
1989 #ifdef NTFS_RW
1990     .write_iter = ntfs_file_write_iter,
1991     .fsync      = ntfs_file_fsync,
1992 #endif /* NTFS_RW */
1993     .mmap       = generic_file_mmap,
1994     .open       = ntfs_file_open,
1995     .splice_read    = generic_file_splice_read,
1996 };
1997
1998 const struct inode_operations ntfs_file_inode_ops = {
1999 #ifdef NTFS_RW
2000     .setattr    = ntfs_setattr,
2001 #endif /* NTFS_RW */
2002 };
2003
2004 const struct file_operations ntfs_empty_file_ops = {};
2005
2006 const struct inode_operations ntfs_empty_inode_ops = {};