fs/ntfs/mft.c

0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /**
0003  * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
0004  *
0005  * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
0006  * Copyright (c) 2002 Richard Russon
0007  */
0008
0009 #include <linux/buffer_head.h>
0010 #include <linux/slab.h>
0011 #include <linux/swap.h>
0012 #include <linux/bio.h>
0013
0014 #include "attrib.h"
0015 #include "aops.h"
0016 #include "bitmap.h"
0017 #include "debug.h"
0018 #include "dir.h"
0019 #include "lcnalloc.h"
0020 #include "malloc.h"
0021 #include "mft.h"
0022 #include "ntfs.h"
0023
/*
 * Number of NTFS_BLOCK_SIZE sized blocks that fit into one page.  Used in this
 * file as the dimension of the on-stack buffer head arrays in the mft record
 * write paths.
 */
0024 #define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE)
0025
0026 /**
0027  * map_mft_record_page - map the page in which a specific mft record resides
0028  * @ni:     ntfs inode whose mft record page to map
0029  *
0030  * This maps the page in which the mft record of the ntfs inode @ni is situated
0031  * and returns a pointer to the mft record within the mapped page.
0032  *
0033  * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
0034  * contains the negative error code returned.
0035  */
0036 static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
0037 {
0038     loff_t i_size;
0039     ntfs_volume *vol = ni->vol;
0040     struct inode *mft_vi = vol->mft_ino;
0041     struct page *page;
0042     unsigned long index, end_index;
0043     unsigned ofs;
0044
0045     BUG_ON(ni->page);
0046     /*
0047      * The index into the page cache and the offset within the page cache
0048      * page of the wanted mft record. FIXME: We need to check for
0049      * overflowing the unsigned long, but I don't think we would ever get
0050      * here if the volume was that big...
0051      */
0052     index = (u64)ni->mft_no << vol->mft_record_size_bits >>
0053             PAGE_SHIFT;
0054     ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
0055
0056     i_size = i_size_read(mft_vi);
0057     /* The maximum valid index into the page cache for $MFT's data. */
0058     end_index = i_size >> PAGE_SHIFT;
0059
0060     /* If the wanted index is out of bounds the mft record doesn't exist. */
0061     if (unlikely(index >= end_index)) {
0062         if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
0063                 vol->mft_record_size) {
0064             page = ERR_PTR(-ENOENT);
0065             ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
0066                     "which is beyond the end of the mft.  "
0067                     "This is probably a bug in the ntfs "
0068                     "driver.", ni->mft_no);
0069             goto err_out;
0070         }
0071     }
0072     /* Read, map, and pin the page. */
0073     page = ntfs_map_page(mft_vi->i_mapping, index);
0074     if (!IS_ERR(page)) {
0075         /* Catch multi sector transfer fixup errors. */
0076         if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
0077                 ofs)))) {
0078             ni->page = page;
0079             ni->page_ofs = ofs;
0080             return page_address(page) + ofs;
0081         }
0082         ntfs_error(vol->sb, "Mft record 0x%lx is corrupt.  "
0083                 "Run chkdsk.", ni->mft_no);
0084         ntfs_unmap_page(page);
0085         page = ERR_PTR(-EIO);
0086         NVolSetErrors(vol);
0087     }
0088 err_out:
0089     ni->page = NULL;
0090     ni->page_ofs = 0;
0091     return (void*)page;
0092 }
0093
0094 /**
0095  * map_mft_record - map, pin and lock an mft record
0096  * @ni:     ntfs inode whose MFT record to map
0097  *
0098  * First, take the mrec_lock mutex.  We might now be sleeping, while waiting
0099  * for the mutex if it was already locked by someone else.
0100  *
0101  * The page of the record is mapped using map_mft_record_page() before being
0102  * returned to the caller.
0103  *
0104  * This in turn uses ntfs_map_page() to get the page containing the wanted mft
0105  * record (it in turn calls read_cache_page() which reads it in from disk if
0106  * necessary, increments the use count on the page so that it cannot disappear
0107  * under us and returns a reference to the page cache page).
0108  *
0109  * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
0110  * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
0111  * and the post-read mst fixups on each mft record in the page have been
0112  * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
0113  * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
0114  * ntfs_map_page() waits for PG_locked to become clear and checks if
0115  * PG_uptodate is set and returns an error code if not. This provides
0116  * sufficient protection against races when reading/using the page.
0117  *
0118  * However there is the write mapping to think about. Doing the above described
0119  * checking here will be fine, because when initiating the write we will set
0120  * PG_locked and clear PG_uptodate making sure nobody is touching the page
0121  * contents. Doing the locking this way means that the commit to disk code in
0122  * the page cache code paths is automatically sufficiently locked with us as
0123  * we will not touch a page that has been locked or is not uptodate. The only
0124  * locking problem then is them locking the page while we are accessing it.
0125  *
0126  * So that code will end up having to own the mrec_lock of all mft
0127  * records/inodes present in the page before I/O can proceed. In that case we
0128  * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
0129  * accessing anything without owning the mrec_lock mutex.  But we do need to
0130  * use them because of the read_cache_page() invocation and the code becomes so
0131  * much simpler this way that it is well worth it.
0132  *
0133  * The mft record is now ours and we return a pointer to it. You need to check
0134  * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
0135  * the error code.
0136  *
0137  * NOTE: Caller is responsible for setting the mft record dirty before calling
0138  * unmap_mft_record(). This is obviously only necessary if the caller really
0139  * modified the mft record...
0140  * Q: Do we want to recycle one of the VFS inode state bits instead?
0141  * A: No, the inode ones mean we want to change the mft record, not we want to
0142  * write it out.
0143  */
0144 MFT_RECORD *map_mft_record(ntfs_inode *ni)
0145 {
0146     MFT_RECORD *m;
0147
0148     ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
0149
0150     /* Make sure the ntfs inode doesn't go away. */
0151     atomic_inc(&ni->count);
0152
0153     /* Serialize access to this mft record. */
0154     mutex_lock(&ni->mrec_lock);
0155
0156     m = map_mft_record_page(ni);
0157     if (!IS_ERR(m))
0158         return m;
0159
0160     mutex_unlock(&ni->mrec_lock);
0161     atomic_dec(&ni->count);
0162     ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
0163     return m;
0164 }
0165
0166 /**
0167  * unmap_mft_record_page - unmap the page in which a specific mft record resides
0168  * @ni:     ntfs inode whose mft record page to unmap
0169  *
0170  * This unmaps the page in which the mft record of the ntfs inode @ni is
0171  * situated and returns. This is a NOOP if highmem is not configured.
0172  *
0173  * The unmap happens via ntfs_unmap_page() which in turn decrements the use
0174  * count on the page thus releasing it from the pinned state.
0175  *
0176  * We do not actually unmap the page from memory of course, as that will be
0177  * done by the page cache code itself when memory pressure increases or
0178  * whatever.
0179  */
0180 static inline void unmap_mft_record_page(ntfs_inode *ni)
0181 {
0182     BUG_ON(!ni->page);
0183
0184     // TODO: If dirty, blah...
0185     ntfs_unmap_page(ni->page);
0186     ni->page = NULL;
0187     ni->page_ofs = 0;
0188     return;
0189 }
0190
0191 /**
0192  * unmap_mft_record - release a mapped mft record
0193  * @ni:     ntfs inode whose MFT record to unmap
0194  *
0195  * We release the page mapping and the mrec_lock mutex which unmaps the mft
0196  * record and releases it for others to get hold of. We also release the ntfs
0197  * inode by decrementing the ntfs inode reference count.
0198  *
0199  * NOTE: If caller has modified the mft record, it is imperative to set the mft
0200  * record dirty BEFORE calling unmap_mft_record().
0201  */
0202 void unmap_mft_record(ntfs_inode *ni)
0203 {
0204     struct page *page = ni->page;
0205
0206     BUG_ON(!page);
0207
0208     ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
0209
0210     unmap_mft_record_page(ni);
0211     mutex_unlock(&ni->mrec_lock);
0212     atomic_dec(&ni->count);
0213     /*
0214      * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
0215      * ntfs_clear_extent_inode() in the extent inode case, and to the
0216      * caller in the non-extent, yet pure ntfs inode case, to do the actual
0217      * tear down of all structures and freeing of all allocated memory.
0218      */
0219     return;
0220 }
0221
0222 /**
0223  * map_extent_mft_record - load an extent inode and attach it to its base
0224  * @base_ni:    base ntfs inode
0225  * @mref:   mft reference of the extent inode to load
0226  * @ntfs_ino:   on successful return, pointer to the ntfs_inode structure
0227  *
0228  * Load the extent mft record @mref and attach it to its base inode @base_ni.
0229  * Return the mapped extent mft record if IS_ERR(result) is false.  Otherwise
0230  * PTR_ERR(result) gives the negative error code.
0231  *
0232  * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
0233  * structure of the mapped extent inode.
0234  */
0235 MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
0236         ntfs_inode **ntfs_ino)
0237 {
0238     MFT_RECORD *m;
0239     ntfs_inode *ni = NULL;
0240     ntfs_inode **extent_nis = NULL;
0241     int i;
0242     unsigned long mft_no = MREF(mref);
0243     u16 seq_no = MSEQNO(mref);
0244     bool destroy_ni = false;
0245
0246     ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
0247             mft_no, base_ni->mft_no);
0248     /* Make sure the base ntfs inode doesn't go away. */
0249     atomic_inc(&base_ni->count);
0250     /*
0251      * Check if this extent inode has already been added to the base inode,
0252      * in which case just return it. If not found, add it to the base
0253      * inode before returning it.
0254      */
0255     mutex_lock(&base_ni->extent_lock);
0256     if (base_ni->nr_extents > 0) {
0257         extent_nis = base_ni->ext.extent_ntfs_inos;
0258         for (i = 0; i < base_ni->nr_extents; i++) {
0259             if (mft_no != extent_nis[i]->mft_no)
0260                 continue;
0261             ni = extent_nis[i];
0262             /* Make sure the ntfs inode doesn't go away. */
0263             atomic_inc(&ni->count);
0264             break;
0265         }
0266     }
0267     if (likely(ni != NULL)) {
0268         mutex_unlock(&base_ni->extent_lock);
0269         atomic_dec(&base_ni->count);
0270         /* We found the record; just have to map and return it. */
0271         m = map_mft_record(ni);
0272         /* map_mft_record() has incremented this on success. */
0273         atomic_dec(&ni->count);
0274         if (!IS_ERR(m)) {
0275             /* Verify the sequence number. */
0276             if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
0277                 ntfs_debug("Done 1.");
0278                 *ntfs_ino = ni;
0279                 return m;
0280             }
0281             unmap_mft_record(ni);
0282             ntfs_error(base_ni->vol->sb, "Found stale extent mft "
0283                     "reference! Corrupt filesystem. "
0284                     "Run chkdsk.");
0285             return ERR_PTR(-EIO);
0286         }
0287 map_err_out:
0288         ntfs_error(base_ni->vol->sb, "Failed to map extent "
0289                 "mft record, error code %ld.", -PTR_ERR(m));
0290         return m;
0291     }
0292     /* Record wasn't there. Get a new ntfs inode and initialize it. */
0293     ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
0294     if (unlikely(!ni)) {
0295         mutex_unlock(&base_ni->extent_lock);
0296         atomic_dec(&base_ni->count);
0297         return ERR_PTR(-ENOMEM);
0298     }
0299     ni->vol = base_ni->vol;
0300     ni->seq_no = seq_no;
0301     ni->nr_extents = -1;
0302     ni->ext.base_ntfs_ino = base_ni;
0303     /* Now map the record. */
0304     m = map_mft_record(ni);
0305     if (IS_ERR(m)) {
0306         mutex_unlock(&base_ni->extent_lock);
0307         atomic_dec(&base_ni->count);
0308         ntfs_clear_extent_inode(ni);
0309         goto map_err_out;
0310     }
0311     /* Verify the sequence number if it is present. */
0312     if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
0313         ntfs_error(base_ni->vol->sb, "Found stale extent mft "
0314                 "reference! Corrupt filesystem. Run chkdsk.");
0315         destroy_ni = true;
0316         m = ERR_PTR(-EIO);
0317         goto unm_err_out;
0318     }
0319     /* Attach extent inode to base inode, reallocating memory if needed. */
0320     if (!(base_ni->nr_extents & 3)) {
0321         ntfs_inode **tmp;
0322         int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
0323
0324         tmp = kmalloc(new_size, GFP_NOFS);
0325         if (unlikely(!tmp)) {
0326             ntfs_error(base_ni->vol->sb, "Failed to allocate "
0327                     "internal buffer.");
0328             destroy_ni = true;
0329             m = ERR_PTR(-ENOMEM);
0330             goto unm_err_out;
0331         }
0332         if (base_ni->nr_extents) {
0333             BUG_ON(!base_ni->ext.extent_ntfs_inos);
0334             memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
0335                     4 * sizeof(ntfs_inode *));
0336             kfree(base_ni->ext.extent_ntfs_inos);
0337         }
0338         base_ni->ext.extent_ntfs_inos = tmp;
0339     }
0340     base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
0341     mutex_unlock(&base_ni->extent_lock);
0342     atomic_dec(&base_ni->count);
0343     ntfs_debug("Done 2.");
0344     *ntfs_ino = ni;
0345     return m;
0346 unm_err_out:
0347     unmap_mft_record(ni);
0348     mutex_unlock(&base_ni->extent_lock);
0349     atomic_dec(&base_ni->count);
0350     /*
0351      * If the extent inode was not attached to the base inode we need to
0352      * release it or we will leak memory.
0353      */
0354     if (destroy_ni)
0355         ntfs_clear_extent_inode(ni);
0356     return m;
0357 }
0358
0359 #ifdef NTFS_RW
0360
0361 /**
0362  * __mark_mft_record_dirty - set the mft record and the page containing it dirty
0363  * @ni:     ntfs inode describing the mapped mft record
0364  *
0365  * Internal function.  Users should call mark_mft_record_dirty() instead.
0366  *
0367  * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
0368  * as well as the page containing the mft record, dirty.  Also, mark the base
0369  * vfs inode dirty.  This ensures that any changes to the mft record are
0370  * written out to disk.
0371  *
0372  * NOTE:  We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
0373  * on the base vfs inode, because even though file data may have been modified,
0374  * it is dirty in the inode meta data rather than the data page cache of the
0375  * inode, and thus there are no data pages that need writing out.  Therefore, a
0376  * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
0377  * other hand, is not sufficient, because ->write_inode needs to be called even
0378  * in case of fdatasync. This needs to happen or the file data would not
0379  * necessarily hit the device synchronously, even though the vfs inode has the
0380  * O_SYNC flag set.  Also, I_DIRTY_DATASYNC simply "feels" better than just
0381  * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
0382  * which is not what I_DIRTY_SYNC on its own would suggest.
0383  */
0384 void __mark_mft_record_dirty(ntfs_inode *ni)
0385 {
0386     ntfs_inode *base_ni;
0387
0388     ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
0389     BUG_ON(NInoAttr(ni));
0390     mark_ntfs_record_dirty(ni->page, ni->page_ofs);
0391     /* Determine the base vfs inode and mark it dirty, too. */
0392     mutex_lock(&ni->extent_lock);
0393     if (likely(ni->nr_extents >= 0))
0394         base_ni = ni;
0395     else
0396         base_ni = ni->ext.base_ntfs_ino;
0397     mutex_unlock(&ni->extent_lock);
0398     __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
0399 }
0400
/*
 * Request to report the problem, appended via "%s" to error messages in this
 * file.  This is a const array rather than a pointer to const text so that
 * the symbol itself cannot be reassigned.
 */
static const char ntfs_please_email[] = "Please email "
		"linux-ntfs-dev@lists.sourceforge.net and say that you saw "
		"this message.  Thank you.";
0404
0405 /**
0406  * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
0407  * @vol:    ntfs volume on which the mft record to synchronize resides
0408  * @mft_no: mft record number of mft record to synchronize
0409  * @m:      mapped, mst protected (extent) mft record to synchronize
0410  *
0411  * Write the mapped, mst protected (extent) mft record @m with mft record
0412  * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
0413  * bypassing the page cache and the $MFTMirr inode itself.
0414  *
0415  * This function is only for use at umount time when the mft mirror inode has
0416  * already been disposed off.  We BUG() if we are called while the mft mirror
0417  * inode is still attached to the volume.
0418  *
0419  * On success return 0.  On error return -errno.
0420  *
0421  * NOTE:  This function is not implemented yet as I am not convinced it can
0422  * actually be triggered considering the sequence of commits we do in super.c::
0423  * ntfs_put_super().  But just in case we provide this place holder as the
0424  * alternative would be either to BUG() or to get a NULL pointer dereference
0425  * and Oops.
0426  */
0427 static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
0428         const unsigned long mft_no, MFT_RECORD *m)
0429 {
0430     BUG_ON(vol->mftmirr_ino);
0431     ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
0432             "implemented yet.  %s", ntfs_please_email);
0433     return -EOPNOTSUPP;
0434 }
0435
0436 /**
0437  * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
0438  * @vol:    ntfs volume on which the mft record to synchronize resides
0439  * @mft_no: mft record number of mft record to synchronize
0440  * @m:      mapped, mst protected (extent) mft record to synchronize
0441  * @sync:   if true, wait for i/o completion
0442  *
0443  * Write the mapped, mst protected (extent) mft record @m with mft record
0444  * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
0445  *
0446  * On success return 0.  On error return -errno and set the volume errors flag
0447  * in the ntfs volume @vol.
0448  *
0449  * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
0450  *
0451  * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
0452  * schedule i/o via ->writepage or do it via kntfsd or whatever.
0453  */
0454 int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
0455         MFT_RECORD *m, int sync)
0456 {
0457     struct page *page;
0458     unsigned int blocksize = vol->sb->s_blocksize;
0459     int max_bhs = vol->mft_record_size / blocksize;
0460     struct buffer_head *bhs[MAX_BHS];
0461     struct buffer_head *bh, *head;
0462     u8 *kmirr;
0463     runlist_element *rl;
0464     unsigned int block_start, block_end, m_start, m_end, page_ofs;
0465     int i_bhs, nr_bhs, err = 0;
0466     unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
0467
0468     ntfs_debug("Entering for inode 0x%lx.", mft_no);
0469     BUG_ON(!max_bhs);
0470     if (WARN_ON(max_bhs > MAX_BHS))
0471         return -EINVAL;
0472     if (unlikely(!vol->mftmirr_ino)) {
0473         /* This could happen during umount... */
0474         err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
0475         if (likely(!err))
0476             return err;
0477         goto err_out;
0478     }
0479     /* Get the page containing the mirror copy of the mft record @m. */
0480     page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
0481             (PAGE_SHIFT - vol->mft_record_size_bits));
0482     if (IS_ERR(page)) {
0483         ntfs_error(vol->sb, "Failed to map mft mirror page.");
0484         err = PTR_ERR(page);
0485         goto err_out;
0486     }
0487     lock_page(page);
0488     BUG_ON(!PageUptodate(page));
0489     ClearPageUptodate(page);
0490     /* Offset of the mft mirror record inside the page. */
0491     page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
0492     /* The address in the page of the mirror copy of the mft record @m. */
0493     kmirr = page_address(page) + page_ofs;
0494     /* Copy the mst protected mft record to the mirror. */
0495     memcpy(kmirr, m, vol->mft_record_size);
0496     /* Create uptodate buffers if not present. */
0497     if (unlikely(!page_has_buffers(page))) {
0498         struct buffer_head *tail;
0499
0500         bh = head = alloc_page_buffers(page, blocksize, true);
0501         do {
0502             set_buffer_uptodate(bh);
0503             tail = bh;
0504             bh = bh->b_this_page;
0505         } while (bh);
0506         tail->b_this_page = head;
0507         attach_page_private(page, head);
0508     }
0509     bh = head = page_buffers(page);
0510     BUG_ON(!bh);
0511     rl = NULL;
0512     nr_bhs = 0;
0513     block_start = 0;
0514     m_start = kmirr - (u8*)page_address(page);
0515     m_end = m_start + vol->mft_record_size;
0516     do {
0517         block_end = block_start + blocksize;
0518         /* If the buffer is outside the mft record, skip it. */
0519         if (block_end <= m_start)
0520             continue;
0521         if (unlikely(block_start >= m_end))
0522             break;
0523         /* Need to map the buffer if it is not mapped already. */
0524         if (unlikely(!buffer_mapped(bh))) {
0525             VCN vcn;
0526             LCN lcn;
0527             unsigned int vcn_ofs;
0528
0529             bh->b_bdev = vol->sb->s_bdev;
0530             /* Obtain the vcn and offset of the current block. */
0531             vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
0532                     (block_start - m_start);
0533             vcn_ofs = vcn & vol->cluster_size_mask;
0534             vcn >>= vol->cluster_size_bits;
0535             if (!rl) {
0536                 down_read(&NTFS_I(vol->mftmirr_ino)->
0537                         runlist.lock);
0538                 rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
0539                 /*
0540                  * $MFTMirr always has the whole of its runlist
0541                  * in memory.
0542                  */
0543                 BUG_ON(!rl);
0544             }
0545             /* Seek to element containing target vcn. */
0546             while (rl->length && rl[1].vcn <= vcn)
0547                 rl++;
0548             lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
0549             /* For $MFTMirr, only lcn >= 0 is a successful remap. */
0550             if (likely(lcn >= 0)) {
0551                 /* Setup buffer head to correct block. */
0552                 bh->b_blocknr = ((lcn <<
0553                         vol->cluster_size_bits) +
0554                         vcn_ofs) >> blocksize_bits;
0555                 set_buffer_mapped(bh);
0556             } else {
0557                 bh->b_blocknr = -1;
0558                 ntfs_error(vol->sb, "Cannot write mft mirror "
0559                         "record 0x%lx because its "
0560                         "location on disk could not "
0561                         "be determined (error code "
0562                         "%lli).", mft_no,
0563                         (long long)lcn);
0564                 err = -EIO;
0565             }
0566         }
0567         BUG_ON(!buffer_uptodate(bh));
0568         BUG_ON(!nr_bhs && (m_start != block_start));
0569         BUG_ON(nr_bhs >= max_bhs);
0570         bhs[nr_bhs++] = bh;
0571         BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
0572     } while (block_start = block_end, (bh = bh->b_this_page) != head);
0573     if (unlikely(rl))
0574         up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
0575     if (likely(!err)) {
0576         /* Lock buffers and start synchronous write i/o on them. */
0577         for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
0578             struct buffer_head *tbh = bhs[i_bhs];
0579
0580             if (!trylock_buffer(tbh))
0581                 BUG();
0582             BUG_ON(!buffer_uptodate(tbh));
0583             clear_buffer_dirty(tbh);
0584             get_bh(tbh);
0585             tbh->b_end_io = end_buffer_write_sync;
0586             submit_bh(REQ_OP_WRITE, tbh);
0587         }
0588         /* Wait on i/o completion of buffers. */
0589         for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
0590             struct buffer_head *tbh = bhs[i_bhs];
0591
0592             wait_on_buffer(tbh);
0593             if (unlikely(!buffer_uptodate(tbh))) {
0594                 err = -EIO;
0595                 /*
0596                  * Set the buffer uptodate so the page and
0597                  * buffer states do not become out of sync.
0598                  */
0599                 set_buffer_uptodate(tbh);
0600             }
0601         }
0602     } else /* if (unlikely(err)) */ {
0603         /* Clean the buffers. */
0604         for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
0605             clear_buffer_dirty(bhs[i_bhs]);
0606     }
0607     /* Current state: all buffers are clean, unlocked, and uptodate. */
0608     /* Remove the mst protection fixups again. */
0609     post_write_mst_fixup((NTFS_RECORD*)kmirr);
0610     flush_dcache_page(page);
0611     SetPageUptodate(page);
0612     unlock_page(page);
0613     ntfs_unmap_page(page);
0614     if (likely(!err)) {
0615         ntfs_debug("Done.");
0616     } else {
0617         ntfs_error(vol->sb, "I/O error while writing mft mirror "
0618                 "record 0x%lx!", mft_no);
0619 err_out:
0620         ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
0621                 "code %i).  Volume will be left marked dirty "
0622                 "on umount.  Run ntfsfix on the partition "
0623                 "after umounting to correct this.", -err);
0624         NVolSetErrors(vol);
0625     }
0626     return err;
0627 }
0628
0629 /**
0630  * write_mft_record_nolock - write out a mapped (extent) mft record
0631  * @ni:     ntfs inode describing the mapped (extent) mft record
0632  * @m:      mapped (extent) mft record to write
0633  * @sync:   if true, wait for i/o completion
0634  *
0635  * Write the mapped (extent) mft record @m described by the (regular or extent)
0636  * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
0637  * the mft mirror, that is also updated.
0638  *
0639  * We only write the mft record if the ntfs inode @ni is dirty and the first
0640  * buffer belonging to its mft record is dirty, too.  We ignore the dirty state
0641  * of subsequent buffers because we could have raced with
0642  * fs/ntfs/aops.c::mark_ntfs_record_dirty().
0643  *
0644  * On success, clean the mft record and return 0.  On error, leave the mft
0645  * record dirty and return -errno.
0646  *
0647  * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
0648  * However, if the mft record has a counterpart in the mft mirror and @sync is
0649  * true, we write the mft record, wait for i/o completion, and only then write
0650  * the mft mirror copy.  This ensures that if the system crashes either the mft
0651  * or the mft mirror will contain a self-consistent mft record @m.  If @sync is
0652  * false on the other hand, we start i/o on both and then wait for completion
0653  * on them.  This provides a speedup but no longer guarantees that you will end
0654  * up with a self-consistent mft record in the case of a crash but if you asked
0655  * for asynchronous writing you probably do not care about that anyway.
0656  *
0657  * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
0658  * schedule i/o via ->writepage or do it via kntfsd or whatever.
0659  */
0660 int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
0661 {
0662     ntfs_volume *vol = ni->vol;
0663     struct page *page = ni->page;
0664     unsigned int blocksize = vol->sb->s_blocksize;
0665     unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
0666     int max_bhs = vol->mft_record_size / blocksize;
0667     struct buffer_head *bhs[MAX_BHS];
0668     struct buffer_head *bh, *head;
0669     runlist_element *rl;
0670     unsigned int block_start, block_end, m_start, m_end;
0671     int i_bhs, nr_bhs, err = 0;
0672
0673     ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
0674     BUG_ON(NInoAttr(ni));
0675     BUG_ON(!max_bhs);
0676     BUG_ON(!PageLocked(page));
0677     if (WARN_ON(max_bhs > MAX_BHS)) {
0678         err = -EINVAL;
0679         goto err_out;
0680     }
0681     /*
0682      * If the ntfs_inode is clean no need to do anything.  If it is dirty,
0683      * mark it as clean now so that it can be redirtied later on if needed.
0684      * There is no danger of races since the caller is holding the locks
0685      * for the mft record @m and the page it is in.
0686      */
0687     if (!NInoTestClearDirty(ni))
0688         goto done;
0689     bh = head = page_buffers(page);
0690     BUG_ON(!bh);
0691     rl = NULL;
0692     nr_bhs = 0;
0693     block_start = 0;
0694     m_start = ni->page_ofs;
0695     m_end = m_start + vol->mft_record_size;
0696     do {
0697         block_end = block_start + blocksize;
0698         /* If the buffer is outside the mft record, skip it. */
0699         if (block_end <= m_start)
0700             continue;
0701         if (unlikely(block_start >= m_end))
0702             break;
0703         /*
0704          * If this block is not the first one in the record, we ignore
0705          * the buffer's dirty state because we could have raced with a
0706          * parallel mark_ntfs_record_dirty().
0707          */
0708         if (block_start == m_start) {
0709             /* This block is the first one in the record. */
0710             if (!buffer_dirty(bh)) {
0711                 BUG_ON(nr_bhs);
0712                 /* Clean records are not written out. */
0713                 break;
0714             }
0715         }
0716         /* Need to map the buffer if it is not mapped already. */
0717         if (unlikely(!buffer_mapped(bh))) {
0718             VCN vcn;
0719             LCN lcn;
0720             unsigned int vcn_ofs;
0721
0722             bh->b_bdev = vol->sb->s_bdev;
0723             /* Obtain the vcn and offset of the current block. */
0724             vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
0725                     (block_start - m_start);
0726             vcn_ofs = vcn & vol->cluster_size_mask;
0727             vcn >>= vol->cluster_size_bits;
0728             if (!rl) {
0729                 down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
0730                 rl = NTFS_I(vol->mft_ino)->runlist.rl;
0731                 BUG_ON(!rl);
0732             }
0733             /* Seek to element containing target vcn. */
0734             while (rl->length && rl[1].vcn <= vcn)
0735                 rl++;
0736             lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
0737             /* For $MFT, only lcn >= 0 is a successful remap. */
0738             if (likely(lcn >= 0)) {
0739                 /* Setup buffer head to correct block. */
0740                 bh->b_blocknr = ((lcn <<
0741                         vol->cluster_size_bits) +
0742                         vcn_ofs) >> blocksize_bits;
0743                 set_buffer_mapped(bh);
0744             } else {
0745                 bh->b_blocknr = -1;
0746                 ntfs_error(vol->sb, "Cannot write mft record "
0747                         "0x%lx because its location "
0748                         "on disk could not be "
0749                         "determined (error code %lli).",
0750                         ni->mft_no, (long long)lcn);
0751                 err = -EIO;
0752             }
0753         }
0754         BUG_ON(!buffer_uptodate(bh));
0755         BUG_ON(!nr_bhs && (m_start != block_start));
0756         BUG_ON(nr_bhs >= max_bhs);
0757         bhs[nr_bhs++] = bh;
0758         BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
0759     } while (block_start = block_end, (bh = bh->b_this_page) != head);
0760     if (unlikely(rl))
0761         up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
0762     if (!nr_bhs)
0763         goto done;
0764     if (unlikely(err))
0765         goto cleanup_out;
0766     /* Apply the mst protection fixups. */
0767     err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
0768     if (err) {
0769         ntfs_error(vol->sb, "Failed to apply mst fixups!");
0770         goto cleanup_out;
0771     }
0772     flush_dcache_mft_record_page(ni);
0773     /* Lock buffers and start synchronous write i/o on them. */
0774     for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
0775         struct buffer_head *tbh = bhs[i_bhs];
0776
0777         if (!trylock_buffer(tbh))
0778             BUG();
0779         BUG_ON(!buffer_uptodate(tbh));
0780         clear_buffer_dirty(tbh);
0781         get_bh(tbh);
0782         tbh->b_end_io = end_buffer_write_sync;
0783         submit_bh(REQ_OP_WRITE, tbh);
0784     }
0785     /* Synchronize the mft mirror now if not @sync. */
0786     if (!sync && ni->mft_no < vol->mftmirr_size)
0787         ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
0788     /* Wait on i/o completion of buffers. */
0789     for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
0790         struct buffer_head *tbh = bhs[i_bhs];
0791
0792         wait_on_buffer(tbh);
0793         if (unlikely(!buffer_uptodate(tbh))) {
0794             err = -EIO;
0795             /*
0796              * Set the buffer uptodate so the page and buffer
0797              * states do not become out of sync.
0798              */
0799             if (PageUptodate(page))
0800                 set_buffer_uptodate(tbh);
0801         }
0802     }
0803     /* If @sync, now synchronize the mft mirror. */
0804     if (sync && ni->mft_no < vol->mftmirr_size)
0805         ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
0806     /* Remove the mst protection fixups again. */
0807     post_write_mst_fixup((NTFS_RECORD*)m);
0808     flush_dcache_mft_record_page(ni);
0809     if (unlikely(err)) {
0810         /* I/O error during writing.  This is really bad! */
0811         ntfs_error(vol->sb, "I/O error while writing mft record "
0812                 "0x%lx!  Marking base inode as bad.  You "
0813                 "should unmount the volume and run chkdsk.",
0814                 ni->mft_no);
0815         goto err_out;
0816     }
0817 done:
0818     ntfs_debug("Done.");
0819     return 0;
0820 cleanup_out:
0821     /* Clean the buffers. */
0822     for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
0823         clear_buffer_dirty(bhs[i_bhs]);
0824 err_out:
0825     /*
0826      * Current state: all buffers are clean, unlocked, and uptodate.
0827      * The caller should mark the base inode as bad so that no more i/o
0828      * happens.  ->clear_inode() will still be invoked so all extent inodes
0829      * and other allocated memory will be freed.
0830      */
0831     if (err == -ENOMEM) {
0832         ntfs_error(vol->sb, "Not enough memory to write mft record.  "
0833                 "Redirtying so the write is retried later.");
0834         mark_mft_record_dirty(ni);
0835         err = 0;
0836     } else
0837         NVolSetErrors(vol);
0838     return err;
0839 }
0840
0841 /**
0842  * ntfs_may_write_mft_record - check if an mft record may be written out
0843  * @vol:    [IN]  ntfs volume on which the mft record to check resides
0844  * @mft_no: [IN]  mft record number of the mft record to check
0845  * @m:      [IN]  mapped mft record to check
0846  * @locked_ni:  [OUT] NULL, or the locked ntfs inode the caller must unlock
0847  *
0848  * Check if the mapped (base or extent) mft record @m with mft record number
0849  * @mft_no belonging to the ntfs volume @vol may be written out.  If necessary
0850  * and possible the ntfs inode of the mft record is locked, its ->count is
0851  * incremented and the base vfs inode is pinned.  The locked ntfs inode is then
0852  * returned in @locked_ni and the caller is responsible for unlocking it,
0853  * dropping the ->count reference and unpinning the base vfs inode.
0854  *
0855  * Return 'true' if the mft record may be written out and 'false' if not.
0856  *
0857  * The caller has locked the page and cleared the uptodate flag on it which
0858  * means that we can safely write out any dirty mft records that do not have
0859  * their inodes in icache as determined by ilookup5_nowait() as anyone
0860  * opening/creating such an inode would block when attempting to map the mft
0861  * record in read_cache_page() until we are finished with the write out.
0862  *
0863  * Here is a description of the tests we perform:
0864  *
0865  * If the inode is found in icache we know the mft record must be a base mft
0866  * record.  If it is dirty, we do not write it and return 'false' as the vfs
0867  * inode write paths will result in the access times being updated which would
0868  * cause the base mft record to be redirtied and written out again.  (We know
0869  * the access time update will modify the base mft record because Windows
0870  * chkdsk complains if the standard information attribute is not in the base
0871  * mft record.)
0872  *
0873  * If the inode is in icache and not dirty, we attempt to lock the mft record
0874  * and if we find the lock was already taken, it is not safe to write the mft
0875  * record and we return 'false'.
0876  *
0877  * If we manage to obtain the lock we have exclusive access to the mft record,
0878  * which also allows us safe writeout of the mft record.  We then set
0879  * @locked_ni to the locked ntfs inode and return 'true'.
0880  *
0881  * Note we cannot just lock the mft record and sleep while waiting for the lock
0882  * because this would deadlock due to lock reversal (normally the mft record is
0883  * locked before the page is locked but we already have the page locked here
0884  * when we try to lock the mft record).
0885  *
0886  * If the inode is not in icache we need to perform further checks.
0887  *
0888  * If the mft record is not a FILE record or it is a base mft record, we can
0889  * safely write it and return 'true'.
0890  *
0891  * We now know the mft record is an extent mft record.  We check if the inode
0892  * corresponding to its base mft record is in icache and obtain a reference to
0893  * it if it is.  If it is not, we can safely write it and return 'true'.
0894  *
0895  * We now have the base inode for the extent mft record.  We check if it has an
0896  * ntfs inode for the extent mft record attached and if not it is safe to write
0897  * the extent mft record and we return 'true'.
0898  *
0899  * The ntfs inode for the extent mft record is attached to the base inode so we
0900  * attempt to lock the extent mft record and if we find the lock was already
0901  * taken, it is not safe to write the extent mft record and we return 'false'.
0902  *
0903  * If we manage to obtain the lock we have exclusive access to the extent mft
0904  * record, which also allows us safe writeout of the extent mft record.  We
0905  * set the ntfs inode of the extent mft record clean and then set @locked_ni to
0906  * the now locked ntfs inode and return 'true'.
0907  *
0908  * Note, the reason for actually writing dirty mft records here and not just
0909  * relying on the vfs inode dirty code paths is that we can have mft records
0910  * modified without them ever having actual inodes in memory.  Also we can have
0911  * dirty mft records with clean ntfs inodes in memory.  None of the described
0912  * cases would result in the dirty mft records being written out if we only
0913  * relied on the vfs inode dirty code paths.  And these cases can really occur
0914  * during allocation of new mft records and in particular when the
0915  * initialized_size of the $MFT/$DATA attribute is extended and the new space
0916  * is initialized using ntfs_mft_record_format().  The clean inode can then
0917  * appear if the mft record is reused for a new inode before it got written
0918  * out.
0919  */
0920 bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
0921         const MFT_RECORD *m, ntfs_inode **locked_ni)
0922 {
0923     struct super_block *sb = vol->sb;
0924     struct inode *mft_vi = vol->mft_ino;
0925     struct inode *vi;
0926     ntfs_inode *ni, *eni, **extent_nis;
0927     int i;
0928     ntfs_attr na;
0929
0930     ntfs_debug("Entering for inode 0x%lx.", mft_no);
0931     /*
0932      * Normally we do not return a locked inode so set @locked_ni to NULL.
0933      */
0934     BUG_ON(!locked_ni);
0935     *locked_ni = NULL;
0936     /*
0937      * Check if the inode corresponding to this mft record is in the VFS
0938      * inode cache and obtain a reference to it if it is.
0939      */
0940     ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
0941     na.mft_no = mft_no;
0942     na.name = NULL;
0943     na.name_len = 0;
0944     na.type = AT_UNUSED;
0945     /*
0946      * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
0947      * we get here for it rather often.
0948      */
0949     if (!mft_no) {
0950         /* Balance the below iput(). */
0951         vi = igrab(mft_vi);
0952         BUG_ON(vi != mft_vi);
0953     } else {
0954         /*
0955          * Have to use ilookup5_nowait() since ilookup5() waits for the
0956          * inode lock which causes ntfs to deadlock when a concurrent
0957          * inode write via the inode dirty code paths and the page
0958          * dirty code path of the inode dirty code path when writing
0959          * $MFT occurs.
0960          */
0961         vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na);
0962     }
0963     if (vi) {
0964         ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
0965         /* The inode is in icache. */
0966         ni = NTFS_I(vi);
0967         /* Take a reference to the ntfs inode. */
0968         atomic_inc(&ni->count);
0969         /* If the inode is dirty, do not write this record. */
0970         if (NInoDirty(ni)) {
0971             ntfs_debug("Inode 0x%lx is dirty, do not write it.",
0972                     mft_no);
0973             atomic_dec(&ni->count);
0974             iput(vi);
0975             return false;
0976         }
0977         ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
0978         /* The inode is not dirty, try to take the mft record lock. */
0979         if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
0980             ntfs_debug("Mft record 0x%lx is already locked, do "
0981                     "not write it.", mft_no);
0982             atomic_dec(&ni->count);
0983             iput(vi);
0984             return false;
0985         }
0986         ntfs_debug("Managed to lock mft record 0x%lx, write it.",
0987                 mft_no);
0988         /*
0989          * The write has to occur while we hold the mft record lock so
0990          * return the locked ntfs inode.  vi and ni->count stay pinned.
0991          */
0992         *locked_ni = ni;
0993         return true;
0994     }
0995     ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
0996     /* The inode is not in icache. */
0997     /* Write the record if it is not a mft record (type "FILE"). */
0998     if (!ntfs_is_mft_record(m->magic)) {
0999         ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
1000                 mft_no);
1001         return true;
1002     }
1003     /* Write the mft record if it is a base inode. */
1004     if (!m->base_mft_record) {
1005         ntfs_debug("Mft record 0x%lx is a base record, write it.",
1006                 mft_no);
1007         return true;
1008     }
1009     /*
1010      * This is an extent mft record.  Check if the inode corresponding to
1011      * its base mft record is in icache and obtain a reference to it if it
1012      * is.
1013      */
1014     na.mft_no = MREF_LE(m->base_mft_record);
1015     ntfs_debug("Mft record 0x%lx is an extent record.  Looking for base "
1016             "inode 0x%lx in icache.", mft_no, na.mft_no);
1017     if (!na.mft_no) {
1018         /* Balance the below iput(). */
1019         vi = igrab(mft_vi);
1020         BUG_ON(vi != mft_vi);
1021     } else
1022         vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode,
1023                 &na);
1024     if (!vi) {
1025         /*
1026          * The base inode is not in icache, write this extent mft
1027          * record.
1028          */
1029         ntfs_debug("Base inode 0x%lx is not in icache, write the "
1030                 "extent record.", na.mft_no);
1031         return true;
1032     }
1033     ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
1034     /*
1035      * The base inode is in icache.  Check if it has the extent inode
1036      * corresponding to this extent mft record attached.
1037      */
1038     ni = NTFS_I(vi);
1039     mutex_lock(&ni->extent_lock);
1040     if (ni->nr_extents <= 0) {
1041         /*
1042          * The base inode has no attached extent inodes, write this
1043          * extent mft record.
1044          */
1045         mutex_unlock(&ni->extent_lock);
1046         iput(vi);
1047         ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
1048                 "write the extent record.", na.mft_no);
1049         return true;
1050     }
1051     /* Iterate over the attached extent inodes. */
1052     extent_nis = ni->ext.extent_ntfs_inos;
1053     for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
1054         if (mft_no == extent_nis[i]->mft_no) {
1055             /*
1056              * Found the extent inode corresponding to this extent
1057              * mft record.
1058              */
1059             eni = extent_nis[i];
1060             break;
1061         }
1062     }
1063     /*
1064      * If the extent inode was not attached to the base inode, write this
1065      * extent mft record.
1066      */
1067     if (!eni) {
1068         mutex_unlock(&ni->extent_lock);
1069         iput(vi);
1070         ntfs_debug("Extent inode 0x%lx is not attached to its base "
1071                 "inode 0x%lx, write the extent record.",
1072                 mft_no, na.mft_no);
1073         return true;
1074     }
1075     ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
1076             mft_no, na.mft_no);
1077     /* Take a reference to the extent ntfs inode before unlocking. */
1078     atomic_inc(&eni->count);
1079     mutex_unlock(&ni->extent_lock);
1080     /*
1081      * Found the extent inode corresponding to this extent mft record.
1082      * Try to take the mft record lock.
1083      */
1084     if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
1085         atomic_dec(&eni->count);
1086         iput(vi);
1087         ntfs_debug("Extent mft record 0x%lx is already locked, do "
1088                 "not write it.", mft_no);
1089         return false;
1090     }
1091     ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
1092             mft_no);
1093     if (NInoTestClearDirty(eni))
1094         ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
1095                 mft_no);
1096     /*
1097      * The write has to occur while we hold the mft record lock so return
1098      * the locked extent ntfs inode.  The base vfs inode stays pinned.
1099      */
1100     *locked_ni = eni;
1101     return true;
1102 }
1103
1104 static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
1105         "chkdsk."; /* Suffix appended via "%s" to ntfs_error() messages. */
1106
1107 /**
1108  * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
1109  * @vol:    volume on which to search for a free mft record
1110  * @base_ni:    open base inode if allocating an extent mft record or NULL
1111  *
1112  * Search for a clear bit in the mft bitmap attribute on the ntfs volume @vol,
1113  * set it and mark the bitmap page dirty.  Records below 24 are never returned.
1114  *
1115  * If @base_ni is NULL start the search at vol->mft_data_pos, otherwise start
1116  * at the mft record after @base_ni.  Pass 1 runs from there to the end of the
1117  * searchable bitmap, pass 2 from record 24 up to the pass 1 start position.
1118  *
1119  * Return the free mft record on success and -errno on error.  An error code of
1120  * -ENOSPC means that there are no free mft records in the currently
1121  * initialized mft bitmap, or that the free record found has a number of 2^31
1122  * or above and thus cannot be returned in an int.
1123  *
1124  * Locking: Caller must hold vol->mftbmp_lock for writing.
1125  */
1126 static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
1127         ntfs_inode *base_ni)
1128 {
1129     s64 pass_end, ll, data_pos, pass_start, ofs, bit;
1130     unsigned long flags;
1131     struct address_space *mftbmp_mapping;
1132     u8 *buf, *byte;
1133     struct page *page;
1134     unsigned int page_ofs, size;
1135     u8 pass, b;
1136
1137     ntfs_debug("Searching for free mft record in the currently "
1138             "initialized mft bitmap.");
1139     mftbmp_mapping = vol->mftbmp_ino->i_mapping;
1140     /*
1141      * Set the end of the pass making sure we do not overflow the mft
1142      * bitmap.
1143      */
1144     read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
1145     pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
1146             vol->mft_record_size_bits;
1147     read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
1148     read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
1149     ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
1150     read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
1151     if (pass_end > ll)
1152         pass_end = ll;
1153     pass = 1;
1154     if (!base_ni)
1155         data_pos = vol->mft_data_pos;
1156     else
1157         data_pos = base_ni->mft_no + 1;
1158     if (data_pos < 24)
1159         data_pos = 24;
1160     if (data_pos >= pass_end) {
1161         data_pos = 24;
1162         pass = 2;
1163         /* This happens on a freshly formatted volume. */
1164         if (data_pos >= pass_end)
1165             return -ENOSPC;
1166     }
1167     pass_start = data_pos;
1168     ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
1169             "pass_end 0x%llx, data_pos 0x%llx.", pass,
1170             (long long)pass_start, (long long)pass_end,
1171             (long long)data_pos);
1172     /* Loop until a free mft record is found. */
1173     for (; pass <= 2;) {
1174         /* Cap size (bytes left in this page) to pass_end. */
1175         ofs = data_pos >> 3;
1176         page_ofs = ofs & ~PAGE_MASK;
1177         size = PAGE_SIZE - page_ofs;
1178         ll = ((pass_end + 7) >> 3) - ofs;
1179         if (size > ll)
1180             size = ll;
1181         size <<= 3; /* Bytes to bits. */
1182         /*
1183          * If we are still within the active pass, search the next page
1184          * for a zero bit.
1185          */
1186         if (size) {
1187             page = ntfs_map_page(mftbmp_mapping,
1188                     ofs >> PAGE_SHIFT);
1189             if (IS_ERR(page)) {
1190                 ntfs_error(vol->sb, "Failed to read mft "
1191                         "bitmap, aborting.");
1192                 return PTR_ERR(page);
1193             }
1194             buf = (u8*)page_address(page) + page_ofs;
1195             bit = data_pos & 7;
1196             data_pos &= ~7ull;
1197             ntfs_debug("Before inner for loop: size 0x%x, "
1198                     "data_pos 0x%llx, bit 0x%llx", size,
1199                     (long long)data_pos, (long long)bit);
1200             for (; bit < size && data_pos + bit < pass_end;
1201                     bit &= ~7ull, bit += 8) {
1202                 byte = buf + (bit >> 3);
1203                 if (*byte == 0xff)
1204                     continue;
1205                 b = ffz((unsigned long)*byte);
1206                 if (b < 8 && b >= (bit & 7)) {
1207                     ll = data_pos + (bit & ~7ull) + b; /* Record number. */
1208                     if (unlikely(ll >= (1ll << 31))) {
1209                         ntfs_unmap_page(page);
1210                         return -ENOSPC; /* Would be truncated by the int return. */
1211                     }
1212                     *byte |= 1 << b;
1213                     flush_dcache_page(page);
1214                     set_page_dirty(page);
1215                     ntfs_unmap_page(page);
1216                     ntfs_debug("Done.  (Found and "
1217                             "allocated mft record "
1218                             "0x%llx.)",
1219                             (long long)ll);
1220                     return ll;
1221                 }
1222             }
1223             ntfs_debug("After inner for loop: size 0x%x, "
1224                     "data_pos 0x%llx, bit 0x%llx", size,
1225                     (long long)data_pos, (long long)bit);
1226             data_pos += size;
1227             ntfs_unmap_page(page);
1228             /*
1229              * If the end of the pass has not been reached yet,
1230              * continue searching the mft bitmap for a zero bit.
1231              */
1232             if (data_pos < pass_end)
1233                 continue;
1234         }
1235         /* Do the next pass. */
1236         if (++pass == 2) {
1237             /*
1238              * Starting the second pass, in which we scan the first
1239              * part of the zone which we omitted earlier.
1240              */
1241             pass_end = pass_start;
1242             data_pos = pass_start = 24;
1243             ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
1244                     "0x%llx.", pass, (long long)pass_start,
1245                     (long long)pass_end);
1246             if (data_pos >= pass_end)
1247                 break;
1248         }
1249     }
1250     /* No free mft records in currently initialized mft bitmap. */
1251     ntfs_debug("Done.  (No free mft records left in currently initialized "
1252             "mft bitmap.)");
1253     return -ENOSPC;
1254 }
1255
1256 /**
1257  * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
1258  * @vol:    volume on which to extend the mft bitmap attribute
1259  *
1260  * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
1261  *
1262  * Note: Only changes allocated_size, i.e. does not touch initialized_size or
1263  * data_size.
1264  *
1265  * Return 0 on success and -errno on error.
1266  *
1267  * Locking: - Caller must hold vol->mftbmp_lock for writing.
1268  *      - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
1269  *        writing and releases it before returning.
1270  *      - This function takes vol->lcnbmp_lock for writing and releases it
1271  *        before returning.
1272  */
1273 static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
1274 {
1275     LCN lcn;
1276     s64 ll;
1277     unsigned long flags;
1278     struct page *page;
1279     ntfs_inode *mft_ni, *mftbmp_ni;
1280     runlist_element *rl, *rl2 = NULL;
1281     ntfs_attr_search_ctx *ctx = NULL;
1282     MFT_RECORD *mrec;
1283     ATTR_RECORD *a = NULL;
1284     int ret, mp_size;
1285     u32 old_alen = 0;
1286     u8 *b, tb;
1287     struct {
1288         u8 added_cluster:1;
1289         u8 added_run:1;
1290         u8 mp_rebuilt:1;
1291     } status = { 0, 0, 0 };
1292
1293     ntfs_debug("Extending mft bitmap allocation.");
1294     mft_ni = NTFS_I(vol->mft_ino);
1295     mftbmp_ni = NTFS_I(vol->mftbmp_ino);
1296     /*
1297      * Determine the last lcn of the mft bitmap.  The allocated size of the
1298      * mft bitmap cannot be zero so we are ok to do this.
1299      */
1300     down_write(&mftbmp_ni->runlist.lock);
1301     read_lock_irqsave(&mftbmp_ni->size_lock, flags);
1302     ll = mftbmp_ni->allocated_size;
1303     read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1304     rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
1305             (ll - 1) >> vol->cluster_size_bits, NULL);
1306     if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
1307         up_write(&mftbmp_ni->runlist.lock);
1308         ntfs_error(vol->sb, "Failed to determine last allocated "
1309                 "cluster of mft bitmap attribute.");
1310         if (!IS_ERR(rl))
1311             ret = -EIO;
1312         else
1313             ret = PTR_ERR(rl);
1314         return ret;
1315     }
1316     lcn = rl->lcn + rl->length;
1317     ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
1318             (long long)lcn);
1319     /*
1320      * Attempt to get the cluster following the last allocated cluster by
1321      * hand as it may be in the MFT zone so the allocator would not give it
1322      * to us.
1323      */
1324     ll = lcn >> 3;
1325     page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
1326             ll >> PAGE_SHIFT);
1327     if (IS_ERR(page)) {
1328         up_write(&mftbmp_ni->runlist.lock);
1329         ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
1330         return PTR_ERR(page);
1331     }
1332     b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
1333     tb = 1 << (lcn & 7ull);
1334     down_write(&vol->lcnbmp_lock);
1335     if (*b != 0xff && !(*b & tb)) {
1336         /* Next cluster is free, allocate it. */
1337         *b |= tb;
1338         flush_dcache_page(page);
1339         set_page_dirty(page);
1340         up_write(&vol->lcnbmp_lock);
1341         ntfs_unmap_page(page);
1342         /* Update the mft bitmap runlist. */
1343         rl->length++;
1344         rl[1].vcn++;
1345         status.added_cluster = 1;
1346         ntfs_debug("Appending one cluster to mft bitmap.");
1347     } else {
1348         up_write(&vol->lcnbmp_lock);
1349         ntfs_unmap_page(page);
1350         /* Allocate a cluster from the DATA_ZONE. */
1351         rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
1352                 true);
1353         if (IS_ERR(rl2)) {
1354             up_write(&mftbmp_ni->runlist.lock);
1355             ntfs_error(vol->sb, "Failed to allocate a cluster for "
1356                     "the mft bitmap.");
1357             return PTR_ERR(rl2);
1358         }
1359         rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
1360         if (IS_ERR(rl)) {
1361             up_write(&mftbmp_ni->runlist.lock);
1362             ntfs_error(vol->sb, "Failed to merge runlists for mft "
1363                     "bitmap.");
1364             if (ntfs_cluster_free_from_rl(vol, rl2)) {
1365                 ntfs_error(vol->sb, "Failed to deallocate "
1366                         "allocated cluster.%s", es);
1367                 NVolSetErrors(vol);
1368             }
1369             ntfs_free(rl2);
1370             return PTR_ERR(rl);
1371         }
1372         mftbmp_ni->runlist.rl = rl;
1373         status.added_run = 1;
1374         ntfs_debug("Adding one run to mft bitmap.");
1375         /* Find the last run in the new runlist. */
1376         for (; rl[1].length; rl++)
1377             ;
1378     }
1379     /*
1380      * Update the attribute record as well.  Note: @rl is the last
1381      * (non-terminator) runlist element of mft bitmap.
1382      */
1383     mrec = map_mft_record(mft_ni);
1384     if (IS_ERR(mrec)) {
1385         ntfs_error(vol->sb, "Failed to map mft record.");
1386         ret = PTR_ERR(mrec);
1387         goto undo_alloc;
1388     }
1389     ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1390     if (unlikely(!ctx)) {
1391         ntfs_error(vol->sb, "Failed to get search context.");
1392         ret = -ENOMEM;
1393         goto undo_alloc;
1394     }
1395     ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1396             mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
1397             0, ctx);
1398     if (unlikely(ret)) {
1399         ntfs_error(vol->sb, "Failed to find last attribute extent of "
1400                 "mft bitmap attribute.");
1401         if (ret == -ENOENT)
1402             ret = -EIO;
1403         goto undo_alloc;
1404     }
1405     a = ctx->attr;
1406     ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
1407     /* Search back for the previous last allocated cluster of mft bitmap. */
1408     for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
1409         if (ll >= rl2->vcn)
1410             break;
1411     }
1412     BUG_ON(ll < rl2->vcn);
1413     BUG_ON(ll >= rl2->vcn + rl2->length);
1414     /* Get the size for the new mapping pairs array for this extent. */
1415     mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
1416     if (unlikely(mp_size <= 0)) {
1417         ntfs_error(vol->sb, "Get size for mapping pairs failed for "
1418                 "mft bitmap attribute extent.");
1419         ret = mp_size;
1420         if (!ret)
1421             ret = -EIO;
1422         goto undo_alloc;
1423     }
1424     /* Expand the attribute record if necessary. */
1425     old_alen = le32_to_cpu(a->length);
1426     ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
1427             le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
1428     if (unlikely(ret)) {
1429         if (ret != -ENOSPC) {
1430             ntfs_error(vol->sb, "Failed to resize attribute "
1431                     "record for mft bitmap attribute.");
1432             goto undo_alloc;
1433         }
1434         // TODO: Deal with this by moving this extent to a new mft
1435         // record or by starting a new extent in a new mft record or by
1436         // moving other attributes out of this mft record.
1437         // Note: It will need to be a special mft record and if none of
1438         // those are available it gets rather complicated...
1439         ntfs_error(vol->sb, "Not enough space in this mft record to "
1440                 "accommodate extended mft bitmap attribute "
1441                 "extent.  Cannot handle this yet.");
1442         ret = -EOPNOTSUPP;
1443         goto undo_alloc;
1444     }
1445     status.mp_rebuilt = 1;
1446     /* Generate the mapping pairs array directly into the attr record. */
1447     ret = ntfs_mapping_pairs_build(vol, (u8*)a +
1448             le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
1449             mp_size, rl2, ll, -1, NULL);
1450     if (unlikely(ret)) {
1451         ntfs_error(vol->sb, "Failed to build mapping pairs array for "
1452                 "mft bitmap attribute.");
1453         goto undo_alloc;
1454     }
1455     /* Update the highest_vcn. */
1456     a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
1457     /*
1458      * We now have extended the mft bitmap allocated_size by one cluster.
1459      * Reflect this in the ntfs_inode structure and the attribute record.
1460      */
1461     if (a->data.non_resident.lowest_vcn) {
1462         /*
1463          * We are not in the first attribute extent, switch to it, but
1464          * first ensure the changes will make it to disk later.
1465          */
1466         flush_dcache_mft_record_page(ctx->ntfs_ino);
1467         mark_mft_record_dirty(ctx->ntfs_ino);
1468         ntfs_attr_reinit_search_ctx(ctx);
1469         ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1470                 mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
1471                 0, ctx);
1472         if (unlikely(ret)) {
1473             ntfs_error(vol->sb, "Failed to find first attribute "
1474                     "extent of mft bitmap attribute.");
1475             goto restore_undo_alloc;
1476         }
1477         a = ctx->attr;
1478     }
1479     write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1480     mftbmp_ni->allocated_size += vol->cluster_size;
1481     a->data.non_resident.allocated_size =
1482             cpu_to_sle64(mftbmp_ni->allocated_size);
1483     write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1484     /* Ensure the changes make it to disk. */
1485     flush_dcache_mft_record_page(ctx->ntfs_ino);
1486     mark_mft_record_dirty(ctx->ntfs_ino);
1487     ntfs_attr_put_search_ctx(ctx);
1488     unmap_mft_record(mft_ni);
1489     up_write(&mftbmp_ni->runlist.lock);
1490     ntfs_debug("Done.");
1491     return 0;
1492 restore_undo_alloc:
1493     ntfs_attr_reinit_search_ctx(ctx);
1494     if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1495             mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
1496             0, ctx)) {
1497         ntfs_error(vol->sb, "Failed to find last attribute extent of "
1498                 "mft bitmap attribute.%s", es);
1499         write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1500         mftbmp_ni->allocated_size += vol->cluster_size;
1501         write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1502         ntfs_attr_put_search_ctx(ctx);
1503         unmap_mft_record(mft_ni);
1504         up_write(&mftbmp_ni->runlist.lock);
1505         /*
1506          * The only thing that is now wrong is ->allocated_size of the
1507          * base attribute extent which chkdsk should be able to fix.
1508          */
1509         NVolSetErrors(vol);
1510         return ret;
1511     }
1512     a = ctx->attr;
1513     a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
1514 undo_alloc:
1515     if (status.added_cluster) {
1516         /* Truncate the last run in the runlist by one cluster. */
1517         rl->length--;
1518         rl[1].vcn--;
1519     } else if (status.added_run) {
1520         lcn = rl->lcn;
1521         /* Remove the last run from the runlist. */
1522         rl->lcn = rl[1].lcn;
1523         rl->length = 0;
1524     }
1525     /* Deallocate the cluster. */
1526     down_write(&vol->lcnbmp_lock);
1527     if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1528         ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
1529         NVolSetErrors(vol);
1530     }
1531     up_write(&vol->lcnbmp_lock);
1532     if (status.mp_rebuilt) {
1533         if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1534                 a->data.non_resident.mapping_pairs_offset),
1535                 old_alen - le16_to_cpu(
1536                 a->data.non_resident.mapping_pairs_offset),
1537                 rl2, ll, -1, NULL)) {
1538             ntfs_error(vol->sb, "Failed to restore mapping pairs "
1539                     "array.%s", es);
1540             NVolSetErrors(vol);
1541         }
1542         if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
1543             ntfs_error(vol->sb, "Failed to restore attribute "
1544                     "record.%s", es);
1545             NVolSetErrors(vol);
1546         }
1547         flush_dcache_mft_record_page(ctx->ntfs_ino);
1548         mark_mft_record_dirty(ctx->ntfs_ino);
1549     }
1550     if (ctx)
1551         ntfs_attr_put_search_ctx(ctx);
1552     if (!IS_ERR(mrec))
1553         unmap_mft_record(mft_ni);
1554     up_write(&mftbmp_ni->runlist.lock);
1555     return ret;
1556 }
1557
1558 /**
1559  * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
1560  * @vol:    volume on which to extend the mft bitmap attribute
1561  *
1562  * Extend the initialized portion of the mft bitmap attribute on the ntfs
1563  * volume @vol by 8 bytes, zero them, and grow data_size to match if needed.
1564  *
1565  * Note:  Only changes initialized_size and data_size, i.e. requires that
1566  * allocated_size is big enough to fit the new initialized_size.
1567  *
1568  * Return 0 on success and -errno on error.  If writing the zeroes fails the
1569  * old sizes are restored; NVolSetErrors() is called if that is not possible.
1570  * Locking: Caller must hold vol->mftbmp_lock for writing.
1571  */
1572 static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
1573 {
1574     s64 old_data_size, old_initialized_size;
1575     unsigned long flags;
1576     struct inode *mftbmp_vi;
1577     ntfs_inode *mft_ni, *mftbmp_ni;
1578     ntfs_attr_search_ctx *ctx;
1579     MFT_RECORD *mrec;
1580     ATTR_RECORD *a;
1581     int ret;
1582
1583     ntfs_debug("Extending mft bitmap initiailized (and data) size.");
1584     mft_ni = NTFS_I(vol->mft_ino);
1585     mftbmp_vi = vol->mftbmp_ino;
1586     mftbmp_ni = NTFS_I(mftbmp_vi);
1587     /* Map the mft record of @mft_ni and look up the mft bitmap attribute. */
1588     mrec = map_mft_record(mft_ni);
1589     if (IS_ERR(mrec)) {
1590         ntfs_error(vol->sb, "Failed to map mft record.");
1591         return PTR_ERR(mrec);
1592     }
1593     ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1594     if (unlikely(!ctx)) {
1595         ntfs_error(vol->sb, "Failed to get search context.");
1596         ret = -ENOMEM;
1597         goto unm_err_out;
1598     }
1599     ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1600             mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
1601     if (unlikely(ret)) {
1602         ntfs_error(vol->sb, "Failed to find first attribute extent of "
1603                 "mft bitmap attribute.");
1604         if (ret == -ENOENT)
1605             ret = -EIO; /* a missing attribute is reported as -EIO */
1606         goto put_err_out;
1607     }
1608     a = ctx->attr;
1609     write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1610     old_data_size = i_size_read(mftbmp_vi);
1611     old_initialized_size = mftbmp_ni->initialized_size;
1612     /*
1613      * We can simply update the initialized_size before filling the space
1614      * with zeroes because the caller is holding the mft bitmap lock for
1615      * writing which ensures that no one else is trying to access the data.
1616      */
1617     mftbmp_ni->initialized_size += 8;
1618     a->data.non_resident.initialized_size =
1619             cpu_to_sle64(mftbmp_ni->initialized_size);
1620     if (mftbmp_ni->initialized_size > old_data_size) {
1621         i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
1622         a->data.non_resident.data_size =
1623                 cpu_to_sle64(mftbmp_ni->initialized_size);
1624     }
1625     write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1626     /* Ensure the changes make it to disk. */
1627     flush_dcache_mft_record_page(ctx->ntfs_ino);
1628     mark_mft_record_dirty(ctx->ntfs_ino);
1629     ntfs_attr_put_search_ctx(ctx);
1630     unmap_mft_record(mft_ni);
1631     /* Zero the 8 bytes starting at the old initialized_size. */
1632     ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
1633     if (likely(!ret)) {
1634         ntfs_debug("Done.  (Wrote eight initialized bytes to mft "
1635                 "bitmap.");
1636         return 0;
1637     }
1638     ntfs_error(vol->sb, "Failed to write to mft bitmap.");
1639     /* Try to recover by putting the old sizes back into the attribute. */
1640     mrec = map_mft_record(mft_ni);
1641     if (IS_ERR(mrec)) {
1642         ntfs_error(vol->sb, "Failed to map mft record.%s", es);
1643         NVolSetErrors(vol);
1644         return ret; /* ret is still the ntfs_attr_set() error */
1645     }
1646     ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1647     if (unlikely(!ctx)) {
1648         ntfs_error(vol->sb, "Failed to get search context.%s", es);
1649         NVolSetErrors(vol);
1650         goto unm_err_out;
1651     }
1652     if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1653             mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
1654         ntfs_error(vol->sb, "Failed to find first attribute extent of "
1655                 "mft bitmap attribute.%s", es);
1656         NVolSetErrors(vol);
1657 put_err_out: /* also the target of the lookup failure goto further up */
1658         ntfs_attr_put_search_ctx(ctx);
1659 unm_err_out: /* also the target of both search context failure gotos */
1660         unmap_mft_record(mft_ni);
1661         goto err_out;
1662     }
1663     a = ctx->attr;
1664     write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1665     mftbmp_ni->initialized_size = old_initialized_size;
1666     a->data.non_resident.initialized_size =
1667             cpu_to_sle64(old_initialized_size);
1668     if (i_size_read(mftbmp_vi) != old_data_size) {
1669         i_size_write(mftbmp_vi, old_data_size);
1670         a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
1671     }
1672     write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1673     flush_dcache_mft_record_page(ctx->ntfs_ino);
1674     mark_mft_record_dirty(ctx->ntfs_ino);
1675     ntfs_attr_put_search_ctx(ctx);
1676     unmap_mft_record(mft_ni);
1677 #ifdef DEBUG
1678     read_lock_irqsave(&mftbmp_ni->size_lock, flags);
1679     ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
1680             "data_size 0x%llx, initialized_size 0x%llx.",
1681             (long long)mftbmp_ni->allocated_size,
1682             (long long)i_size_read(mftbmp_vi),
1683             (long long)mftbmp_ni->initialized_size);
1684     read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1685 #endif /* DEBUG */
1686 err_out:
1687     return ret;
1688 }
1689
1690 /**
1691  * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
1692  * @vol:    volume on which to extend the mft data attribute
1693  *
1694  * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
1695  * worth of clusters or if not enough space for this by one mft record worth
1696  * of clusters.
1697  *
1698  * Note:  Only changes allocated_size, i.e. does not touch initialized_size or
1699  * data_size.
1700  *
1701  * Return 0 on success and -errno on error.  On error the new clusters are
1702  * freed again (best effort); NVolSetErrors() is called if the undo fails.
1703  * Locking: - Caller must hold vol->mftbmp_lock for writing.
1704  *      - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
1705  *        writing and releases it before returning.
1706  *      - This function calls functions which take vol->lcnbmp_lock for
1707  *        writing and release it before returning.
1708  */
1709 static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1710 {
1711     LCN lcn;
1712     VCN old_last_vcn;
1713     s64 min_nr, nr, ll;
1714     unsigned long flags;
1715     ntfs_inode *mft_ni;
1716     runlist_element *rl, *rl2;
1717     ntfs_attr_search_ctx *ctx = NULL;
1718     MFT_RECORD *mrec;
1719     ATTR_RECORD *a = NULL;
1720     int ret, mp_size;
1721     u32 old_alen = 0;
1722     bool mp_rebuilt = false;
1723
1724     ntfs_debug("Extending mft data allocation.");
1725     mft_ni = NTFS_I(vol->mft_ino);
1726     /*
1727      * Determine the preferred allocation location, i.e. the last lcn of
1728      * the mft data attribute.  The allocated size of the mft data
1729      * attribute cannot be zero so we are ok to do this.
1730      */
1731     down_write(&mft_ni->runlist.lock);
1732     read_lock_irqsave(&mft_ni->size_lock, flags);
1733     ll = mft_ni->allocated_size;
1734     read_unlock_irqrestore(&mft_ni->size_lock, flags);
1735     rl = ntfs_attr_find_vcn_nolock(mft_ni,
1736             (ll - 1) >> vol->cluster_size_bits, NULL);
1737     if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
1738         up_write(&mft_ni->runlist.lock);
1739         ntfs_error(vol->sb, "Failed to determine last allocated "
1740                 "cluster of mft data attribute.");
1741         if (!IS_ERR(rl))
1742             ret = -EIO;
1743         else
1744             ret = PTR_ERR(rl);
1745         return ret;
1746     }
1747     lcn = rl->lcn + rl->length;
1748     ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
1749     /* Minimum allocation is one mft record worth of clusters. */
1750     min_nr = vol->mft_record_size >> vol->cluster_size_bits;
1751     if (!min_nr)
1752         min_nr = 1;
1753     /* Want to allocate 16 mft records worth of clusters. */
1754     nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
1755     if (!nr)
1756         nr = min_nr;
1757     /* Ensure we do not go above 2^32-1 mft records. */
1758     read_lock_irqsave(&mft_ni->size_lock, flags);
1759     ll = mft_ni->allocated_size;
1760     read_unlock_irqrestore(&mft_ni->size_lock, flags);
1761     if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
1762             vol->mft_record_size_bits >= (1ll << 32))) {
1763         nr = min_nr;
1764         if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
1765                 vol->mft_record_size_bits >= (1ll << 32))) {
1766             ntfs_warning(vol->sb, "Cannot allocate mft record "
1767                     "because the maximum number of inodes "
1768                     "(2^32) has already been reached.");
1769             up_write(&mft_ni->runlist.lock);
1770             return -ENOSPC;
1771         }
1772     }
1773     ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
1774             nr > min_nr ? "default" : "minimal", (long long)nr);
1775     old_last_vcn = rl[1].vcn;
1776     do {
1777         rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
1778                 true);
1779         if (!IS_ERR(rl2))
1780             break;
1781         if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
1782             ntfs_error(vol->sb, "Failed to allocate the minimal "
1783                     "number of clusters (%lli) for the "
1784                     "mft data attribute.", (long long)nr);
1785             up_write(&mft_ni->runlist.lock);
1786             return PTR_ERR(rl2);
1787         }
1788         /*
1789          * There is not enough space to do the allocation, but there
1790          * might be enough space to do a minimal allocation so try that
1791          * before failing.
1792          */
1793         nr = min_nr;
1794         ntfs_debug("Retrying mft data allocation with minimal cluster "
1795                 "count %lli.", (long long)nr);
1796     } while (1);
1797     rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
1798     if (IS_ERR(rl)) {
1799         up_write(&mft_ni->runlist.lock);
1800         ntfs_error(vol->sb, "Failed to merge runlists for mft data "
1801                 "attribute.");
1802         if (ntfs_cluster_free_from_rl(vol, rl2)) {
1803             ntfs_error(vol->sb, "Failed to deallocate clusters "
1804                     "from the mft data attribute.%s", es);
1805             NVolSetErrors(vol);
1806         }
1807         ntfs_free(rl2);
1808         return PTR_ERR(rl);
1809     }
1810     mft_ni->runlist.rl = rl;
1811     ntfs_debug("Allocated %lli clusters.", (long long)nr);
1812     /* Find the last run in the new runlist. */
1813     for (; rl[1].length; rl++)
1814         ;
1815     /* Update the attribute record as well. */
1816     mrec = map_mft_record(mft_ni);
1817     if (IS_ERR(mrec)) {
1818         ntfs_error(vol->sb, "Failed to map mft record.");
1819         ret = PTR_ERR(mrec);
1820         goto undo_alloc;
1821     }
1822     ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1823     if (unlikely(!ctx)) {
1824         ntfs_error(vol->sb, "Failed to get search context.");
1825         ret = -ENOMEM;
1826         goto undo_alloc;
1827     }
1828     ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1829             CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
1830     if (unlikely(ret)) {
1831         ntfs_error(vol->sb, "Failed to find last attribute extent of "
1832                 "mft data attribute.");
1833         if (ret == -ENOENT)
1834             ret = -EIO;
1835         goto undo_alloc;
1836     }
1837     a = ctx->attr;
1838     ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
1839     /* Walk back to the runlist element containing this extent's lowest_vcn. */
1840     for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
1841         if (ll >= rl2->vcn)
1842             break;
1843     }
1844     BUG_ON(ll < rl2->vcn);
1845     BUG_ON(ll >= rl2->vcn + rl2->length);
1846     /* Get the size for the new mapping pairs array for this extent. */
1847     mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
1848     if (unlikely(mp_size <= 0)) {
1849         ntfs_error(vol->sb, "Get size for mapping pairs failed for "
1850                 "mft data attribute extent.");
1851         ret = mp_size;
1852         if (!ret)
1853             ret = -EIO;
1854         goto undo_alloc;
1855     }
1856     /* Expand the attribute record if necessary. */
1857     old_alen = le32_to_cpu(a->length);
1858     ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
1859             le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
1860     if (unlikely(ret)) {
1861         if (ret != -ENOSPC) {
1862             ntfs_error(vol->sb, "Failed to resize attribute "
1863                     "record for mft data attribute.");
1864             goto undo_alloc;
1865         }
1866         // TODO: Deal with this by moving this extent to a new mft
1867         // record or by starting a new extent in a new mft record or by
1868         // moving other attributes out of this mft record.
1869         // Note: Use the special reserved mft records and ensure that
1870         // this extent is not required to find the mft record in
1871         // question.  If no free special records left we would need to
1872         // move an existing record away, insert ours in its place, and
1873         // then place the moved record into the newly allocated space
1874         // and we would then need to update all references to this mft
1875         // record appropriately.  This is rather complicated...
1876         ntfs_error(vol->sb, "Not enough space in this mft record to "
1877                 "accommodate extended mft data attribute "
1878                 "extent.  Cannot handle this yet.");
1879         ret = -EOPNOTSUPP;
1880         goto undo_alloc;
1881     }
1882     mp_rebuilt = true;
1883     /* Generate the mapping pairs array directly into the attr record. */
1884     ret = ntfs_mapping_pairs_build(vol, (u8*)a +
1885             le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
1886             mp_size, rl2, ll, -1, NULL);
1887     if (unlikely(ret)) {
1888         ntfs_error(vol->sb, "Failed to build mapping pairs array of "
1889                 "mft data attribute.");
1890         goto undo_alloc;
1891     }
1892     /* Update the highest_vcn. */
1893     a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
1894     /*
1895      * We now have extended the mft data allocated_size by nr clusters.
1896      * Reflect this in the ntfs_inode structure and the attribute record.
1897      * @rl is the last (non-terminator) runlist element of mft data
1898      * attribute.
1899      */
1900     if (a->data.non_resident.lowest_vcn) {
1901         /*
1902          * We are not in the first attribute extent, switch to it, but
1903          * first ensure the changes will make it to disk later.
1904          */
1905         flush_dcache_mft_record_page(ctx->ntfs_ino);
1906         mark_mft_record_dirty(ctx->ntfs_ino);
1907         ntfs_attr_reinit_search_ctx(ctx);
1908         ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
1909                 mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
1910                 ctx);
1911         if (unlikely(ret)) {
1912             ntfs_error(vol->sb, "Failed to find first attribute "
1913                     "extent of mft data attribute.");
1914             goto restore_undo_alloc;
1915         }
1916         a = ctx->attr;
1917     }
1918     write_lock_irqsave(&mft_ni->size_lock, flags);
1919     mft_ni->allocated_size += nr << vol->cluster_size_bits;
1920     a->data.non_resident.allocated_size =
1921             cpu_to_sle64(mft_ni->allocated_size);
1922     write_unlock_irqrestore(&mft_ni->size_lock, flags);
1923     /* Ensure the changes make it to disk. */
1924     flush_dcache_mft_record_page(ctx->ntfs_ino);
1925     mark_mft_record_dirty(ctx->ntfs_ino);
1926     ntfs_attr_put_search_ctx(ctx);
1927     unmap_mft_record(mft_ni);
1928     up_write(&mft_ni->runlist.lock);
1929     ntfs_debug("Done.");
1930     return 0;
1931 restore_undo_alloc:
1932     ntfs_attr_reinit_search_ctx(ctx);
1933     if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1934             CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
1935         ntfs_error(vol->sb, "Failed to find last attribute extent of "
1936                 "mft data attribute.%s", es);
1937         write_lock_irqsave(&mft_ni->size_lock, flags);
1938         mft_ni->allocated_size += nr << vol->cluster_size_bits;
1939         write_unlock_irqrestore(&mft_ni->size_lock, flags);
1940         ntfs_attr_put_search_ctx(ctx);
1941         unmap_mft_record(mft_ni);
1942         up_write(&mft_ni->runlist.lock);
1943         /*
1944          * The only thing that is now wrong is ->allocated_size of the
1945          * base attribute extent which chkdsk should be able to fix.
1946          */
1947         NVolSetErrors(vol);
1948         return ret;
1949     }
1950     ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
1951 undo_alloc:
1952     if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
1953         ntfs_error(vol->sb, "Failed to free clusters from mft data "
1954                 "attribute.%s", es);
1955         NVolSetErrors(vol);
1956     }
1957     /* @ctx is still NULL if we failed before getting a search context. */
1958     a = ctx ? ctx->attr : NULL;
1959     if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
1960         ntfs_error(vol->sb, "Failed to truncate mft data attribute "
1961                 "runlist.%s", es);
1962         NVolSetErrors(vol);
1963     }
1964     if (ctx && mp_rebuilt && !IS_ERR(ctx->mrec)) {
1965         if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1966                 a->data.non_resident.mapping_pairs_offset),
1967                 old_alen - le16_to_cpu(
1968                 a->data.non_resident.mapping_pairs_offset),
1969                 rl2, ll, -1, NULL)) {
1970             ntfs_error(vol->sb, "Failed to restore mapping pairs "
1971                     "array.%s", es);
1972             NVolSetErrors(vol);
1973         }
1974         if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
1975             ntfs_error(vol->sb, "Failed to restore attribute "
1976                     "record.%s", es);
1977             NVolSetErrors(vol);
1978         }
1979         flush_dcache_mft_record_page(ctx->ntfs_ino);
1980         mark_mft_record_dirty(ctx->ntfs_ino);
1981     } else if (ctx && IS_ERR(ctx->mrec)) {
1982         ntfs_error(vol->sb, "Failed to restore attribute search "
1983                 "context.%s", es);
1984         NVolSetErrors(vol);
1985     }
1986     if (ctx)
1987         ntfs_attr_put_search_ctx(ctx);
1988     if (!IS_ERR(mrec))
1989         unmap_mft_record(mft_ni);
1990     up_write(&mft_ni->runlist.lock);
1991     return ret;
1992 }
1993
1994 /**
1995  * ntfs_mft_record_layout - layout an mft record into a memory buffer
1996  * @vol:    volume the mft record is destined for
1997  * @mft_no: number of the mft record being laid out
1998  * @m:      buffer to fill, at least @vol->mft_record_size bytes large
1999  *
2000  * Fill @m with an empty, unused mft record carrying the record number
2001  * @mft_no.  @vol supplies the record size and the volume version; the latter
2002  * matters because volumes newer than NTFS 3.0 use the MFT_RECORD header,
2003  * which stores the record number, while older ones use MFT_RECORD_OLD.
2004  *
2005  * Return 0 on success and -ERANGE if @mft_no does not fit into 32 bits.
2006  */
2007 static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
2008         MFT_RECORD *m)
2009 {
2010     ATTR_RECORD *end_attr;
2011     u16 usa_ofs, usa_count, attrs_ofs;
2012
2013     ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
2014     /* Record numbers of 2^32 and above are refused. */
2015     if (mft_no >= (1ll << 32)) {
2016         ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
2017                 "maximum of 2^32.", (long long)mft_no);
2018         return -ERANGE;
2019     }
2020     /* Zero the full record so every field not set below reads as zero. */
2021     memset(m, 0, vol->mft_record_size);
2022     /* The update sequence array offset is rounded up to a 2-byte boundary. */
2023     if (vol->major_ver > 3 || (vol->major_ver == 3 && vol->minor_ver)) {
2024         usa_ofs = (sizeof(MFT_RECORD) + 1) & ~1;
2025         /* These two fields are only set on NTFS 3.1+ volumes. */
2026         m->reserved = 0;
2027         m->mft_record_number = cpu_to_le32((u32)mft_no);
2028     } else {
2029         usa_ofs = (sizeof(MFT_RECORD_OLD) + 1) & ~1;
2030     }
2031     m->usa_ofs = cpu_to_le16(usa_ofs);
2032     m->magic = magic_FILE;
2033     if (vol->mft_record_size < NTFS_BLOCK_SIZE) {
2034         usa_count = 1;
2035         ntfs_warning(vol->sb, "Sector size is bigger than mft record "
2036                 "size.  Setting usa_count to 1.  If chkdsk "
2037                 "reports this as corruption, please email "
2038                 "linux-ntfs-dev@lists.sourceforge.net stating "
2039                 "that you saw this message and that the "
2040                 "modified filesystem created was corrupt.  "
2041                 "Thank you.");
2042     } else {
2043         usa_count = vol->mft_record_size / NTFS_BLOCK_SIZE + 1;
2044     }
2045     m->usa_count = cpu_to_le16(usa_count);
2046     /* The update sequence number sits at usa_ofs and starts out as 1. */
2047     *(le16*)((u8*)m + usa_ofs) = cpu_to_le16(1);
2048     /* Remaining header fields of a fresh, unused record. */
2049     m->lsn = 0;
2050     m->sequence_number = cpu_to_le16(1);
2051     m->link_count = 0;
2052     m->flags = 0;
2053     m->base_mft_record = 0;
2054     m->next_attr_instance = 0;
2055     m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
2056     /*
2057      * Attributes begin right behind the update sequence array (usa_count
2058      * entries of two bytes each), rounded up to an 8-byte boundary.
2059      */
2060     attrs_ofs = (usa_ofs + (usa_count << 1) + 7) & ~7;
2061     m->attrs_offset = cpu_to_le16(attrs_ofs);
2062     /*
2063      * Only the 8-byte termination attribute is in use so far.  attrs_ofs is
2064      * 8-byte aligned already, so the sum needs no further rounding.
2065      */
2066     m->bytes_in_use = cpu_to_le32(attrs_ofs + 8);
2067     /* Write the termination attribute. */
2068     end_attr = (ATTR_RECORD*)((u8*)m + attrs_ofs);
2069     end_attr->type = AT_END;
2070     end_attr->length = 0;
2071     ntfs_debug("Done.");
2072     return 0;
2073 }
2074
2075 /**
2076  * ntfs_mft_record_format - format an mft record on an ntfs volume
2077  * @vol:    volume on which to format the mft record
2078  * @mft_no: mft record number to format
2079  *
2080  * Lay out an empty, unused mft record in place of record @mft_no in the page
2081  * cache of $MFT/$DATA and mark it dirty.  Used when extending the mft data.
2082  *
2083  * Return 0 on success, -ENOENT if the record is not wholly inside i_size, or
2084  * the error returned by ntfs_map_page() or ntfs_mft_record_layout().
2085  */
2086 static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
2087 {
2088     loff_t i_size;
2089     struct inode *mft_vi = vol->mft_ino;
2090     struct page *page;
2091     MFT_RECORD *m;
2092     pgoff_t index, end_index;
2093     unsigned int ofs;
2094     int err;
2095
2096     ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
2097     /* Page cache index and in-page byte offset of the wanted mft record. */
2098     index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
2099     ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
2100     /*
2101      * Page end_index holds only the (i_size & ~PAGE_MASK) tail bytes of $MFT.
2102      * A record ending exactly at i_size is whole; only one past it is not.
2103      */
2104     i_size = i_size_read(mft_vi);
2105     end_index = i_size >> PAGE_SHIFT;
2106     if (unlikely(index >= end_index)) {
2107         if (unlikely(index > end_index || ofs + vol->mft_record_size >
2108                 (i_size & ~PAGE_MASK))) {
2109             ntfs_error(vol->sb, "Tried to format non-existing mft "
2110                     "record 0x%llx.", (long long)mft_no);
2111             return -ENOENT;
2112         }
2113     }
2114     /* Read, map, and pin the page containing the mft record. */
2115     page = ntfs_map_page(mft_vi->i_mapping, index);
2116     if (IS_ERR(page)) {
2117         ntfs_error(vol->sb, "Failed to map page containing mft record "
2118                 "to format 0x%llx.", (long long)mft_no);
2119         return PTR_ERR(page);
2120     }
2121     lock_page(page);
2122     BUG_ON(!PageUptodate(page));
2123     ClearPageUptodate(page);
2124     m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
2125     err = ntfs_mft_record_layout(vol, mft_no, m);
2126     if (unlikely(err)) {
2127         ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
2128                 (long long)mft_no);
2129         SetPageUptodate(page);
2130         unlock_page(page);
2131         ntfs_unmap_page(page);
2132         return err;
2133     }
2134     flush_dcache_page(page);
2135     SetPageUptodate(page);
2136     unlock_page(page);
2137     /*
2138      * Make sure the mft record is written out to disk.  We could use
2139      * ilookup5() to check if an inode is in icache and so on but this is
2140      * unnecessary as ntfs_writepage() will write the dirty record anyway.
2141      */
2142     mark_ntfs_record_dirty(page, ofs);
2143     ntfs_unmap_page(page);
2144     ntfs_debug("Done.");
2145     return 0;
2146 }
2147
2148 /**
2149  * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
2150  * @vol:    [IN]  volume on which to allocate the mft record
2151  * @mode:   [IN]  mode if want a file or directory, i.e. base inode or 0
2152  * @base_ni:    [IN]  open base inode if allocating an extent mft record or NULL
2153  * @mrec:   [OUT] on successful return this is the mapped mft record
2154  *
2155  * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
2156  *
2157  * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
2158  * directory inode, and allocate it at the default allocator position.  In
2159  * this case @mode is the file mode as given to us by the caller.  We in
2160  * particular use @mode to distinguish whether a file or a directory is being
2161  * created (S_IFREG(mode) and S_IFDIR(mode), respectively).
2162  *
2163  * If @base_ni is not NULL make the allocated mft record an extent record,
2164  * allocate it starting at the mft record after the base mft record and attach
2165  * the allocated and opened ntfs inode to the base inode @base_ni.  In this
2166  * case @mode must be 0 as it is meaningless for extent inodes.
2167  *
2168  * You need to check the return value with IS_ERR().  If false, the function
2169  * was successful and the return value is the now opened ntfs inode of the
2170  * allocated mft record.  *@mrec is then set to the allocated, mapped, pinned,
2171  * and locked mft record.  If IS_ERR() is true, the function failed and the
2172  * error code is obtained from PTR_ERR(return value).  *@mrec is undefined in
2173  * this case.
2174  *
2175  * Allocation strategy:
2176  *
2177  * To find a free mft record, we scan the mft bitmap for a zero bit.  To
2178  * optimize this we start scanning at the place specified by @base_ni or if
2179  * @base_ni is NULL we start where we last stopped and we perform wrap around
2180  * when we reach the end.  Note, we do not try to allocate mft records below
2181  * number 24 because numbers 0 to 15 are the defined system files anyway and 16
2182  * to 24 are special in that they are used for storing extension mft records
2183  * for the $DATA attribute of $MFT.  This is required to avoid the possibility
2184  * of creating a runlist with a circular dependency which once written to disk
2185  * can never be read in again.  Windows will only use records 16 to 24 for
2186  * normal files if the volume is completely out of space.  We never use them
2187  * which means that when the volume is really out of space we cannot create any
2188  * more files while Windows can still create up to 8 small files.  We can start
2189  * doing this at some later time, it does not matter much for now.
2190  *
2191  * When scanning the mft bitmap, we only search up to the last allocated mft
2192  * record.  If there are no free records left in the range 24 to number of
2193  * allocated mft records, then we extend the $MFT/$DATA attribute in order to
2194  * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
2195  * records at a time or one cluster, if cluster size is above 16kiB.  If there
2196  * is not sufficient space to do this, we try to extend by a single mft record
2197  * or one cluster, if cluster size is above the mft record size.
2198  *
2199  * No matter how many mft records we allocate, we initialize only the first
2200  * allocated mft record, incrementing mft data size and initialized size
2201  * accordingly, open an ntfs_inode for it and return it to the caller, unless
2202  * there are less than 24 mft records, in which case we allocate and initialize
2203  * mft records until we reach record 24 which we consider as the first free mft
2204  * record for use by normal files.
2205  *
2206  * If during any stage we overflow the initialized data in the mft bitmap, we
2207  * extend the initialized size (and data size) by 8 bytes, allocating another
2208  * cluster if required.  The bitmap data size has to be at least equal to the
2209  * number of mft records in the mft, but it can be bigger, in which case the
2210  * superfluous bits are padded with zeroes.
2211  *
2212  * Thus, when we return successfully (IS_ERR() is false), we will have:
2213  *  - initialized / extended the mft bitmap if necessary,
2214  *  - initialized / extended the mft data if necessary,
2215  *  - set the bit corresponding to the mft record being allocated in the
2216  *    mft bitmap,
2217  *  - opened an ntfs_inode for the allocated mft record, and we will have
2218  *  - returned the ntfs_inode as well as the allocated mapped, pinned, and
2219  *    locked mft record.
2220  *
2221  * On error, the volume will be left in a consistent state and no record will
2222  * be allocated.  If rolling back a partial operation fails, we may leave some
2223  * inconsistent metadata in which case we set NVolErrors() so the volume is
2224  * left dirty when unmounted.
2225  *
2226  * Note, this function cannot make use of most of the normal functions, like
2227  * for example for attribute resizing, etc, because when the run list overflows
2228  * the base mft record and an attribute list is used, it is very important that
2229  * the extension mft records used to store the $DATA attribute of $MFT can be
2230  * reached without having to read the information contained inside them, as
2231  * this would make it impossible to find them in the first place after the
2232  * volume is unmounted.  $MFT/$BITMAP probably does not need to follow this
2233  * rule because the bitmap is not essential for finding the mft records, but on
2234  * the other hand, handling the bitmap in this special way would make life
2235  * easier because otherwise there might be circular invocations of functions
2236  * when reading the bitmap.
2237  */
2238 ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
2239         ntfs_inode *base_ni, MFT_RECORD **mrec)
2240 {
2241     s64 ll, bit, old_data_initialized, old_data_size;
2242     unsigned long flags;
2243     struct inode *vi;
2244     struct page *page;
2245     ntfs_inode *mft_ni, *mftbmp_ni, *ni;
2246     ntfs_attr_search_ctx *ctx;
2247     MFT_RECORD *m;
2248     ATTR_RECORD *a;
2249     pgoff_t index;
2250     unsigned int ofs;
2251     int err;
2252     le16 seq_no, usn;
2253     bool record_formatted = false;
2254
2255     if (base_ni) {
2256         ntfs_debug("Entering (allocating an extent mft record for "
2257                 "base mft record 0x%llx).",
2258                 (long long)base_ni->mft_no);
2259         /* @mode and @base_ni are mutually exclusive. */
2260         BUG_ON(mode);
2261     } else
2262         ntfs_debug("Entering (allocating a base mft record).");
2263     if (mode) {
2264         /* @mode and @base_ni are mutually exclusive. */
2265         BUG_ON(base_ni);
2266         /* We only support creation of normal files and directories. */
2267         if (!S_ISREG(mode) && !S_ISDIR(mode))
2268             return ERR_PTR(-EOPNOTSUPP);
2269     }
2270     BUG_ON(!mrec);
2271     mft_ni = NTFS_I(vol->mft_ino);
2272     mftbmp_ni = NTFS_I(vol->mftbmp_ino);
2273     down_write(&vol->mftbmp_lock);
2274     bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
2275     if (bit >= 0) {
2276         ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
2277                 (long long)bit);
2278         goto have_alloc_rec;
2279     }
2280     if (bit != -ENOSPC) {
2281         up_write(&vol->mftbmp_lock);
2282         return ERR_PTR(bit);
2283     }
2284     /*
2285      * No free mft records left.  If the mft bitmap already covers more
2286      * than the currently used mft records, the next records are all free,
2287      * so we can simply allocate the first unused mft record.
2288      * Note: We also have to make sure that the mft bitmap at least covers
2289      * the first 24 mft records as they are special and whilst they may not
2290      * be in use, we do not allocate from them.
2291      */
2292     read_lock_irqsave(&mft_ni->size_lock, flags);
2293     ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
2294     read_unlock_irqrestore(&mft_ni->size_lock, flags);
2295     read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2296     old_data_initialized = mftbmp_ni->initialized_size;
2297     read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2298     if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
2299         bit = ll;
2300         if (bit < 24)
2301             bit = 24;
2302         if (unlikely(bit >= (1ll << 32)))
2303             goto max_err_out;
2304         ntfs_debug("Found free record (#2), bit 0x%llx.",
2305                 (long long)bit);
2306         goto found_free_rec;
2307     }
2308     /*
2309      * The mft bitmap needs to be expanded until it covers the first unused
2310      * mft record that we can allocate.
2311      * Note: The smallest mft record we allocate is mft record 24.
2312      */
2313     bit = old_data_initialized << 3;
2314     if (unlikely(bit >= (1ll << 32)))
2315         goto max_err_out;
2316     read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2317     old_data_size = mftbmp_ni->allocated_size;
2318     ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
2319             "data_size 0x%llx, initialized_size 0x%llx.",
2320             (long long)old_data_size,
2321             (long long)i_size_read(vol->mftbmp_ino),
2322             (long long)old_data_initialized);
2323     read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2324     if (old_data_initialized + 8 > old_data_size) {
2325         /* Need to extend bitmap by one more cluster. */
2326         ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
2327         err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
2328         if (unlikely(err)) {
2329             up_write(&vol->mftbmp_lock);
2330             goto err_out;
2331         }
2332 #ifdef DEBUG
2333         read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2334         ntfs_debug("Status of mftbmp after allocation extension: "
2335                 "allocated_size 0x%llx, data_size 0x%llx, "
2336                 "initialized_size 0x%llx.",
2337                 (long long)mftbmp_ni->allocated_size,
2338                 (long long)i_size_read(vol->mftbmp_ino),
2339                 (long long)mftbmp_ni->initialized_size);
2340         read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2341 #endif /* DEBUG */
2342     }
2343     /*
2344      * We now have sufficient allocated space, extend the initialized_size
2345      * as well as the data_size if necessary and fill the new space with
2346      * zeroes.
2347      */
2348     err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
2349     if (unlikely(err)) {
2350         up_write(&vol->mftbmp_lock);
2351         goto err_out;
2352     }
2353 #ifdef DEBUG
2354     read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2355     ntfs_debug("Status of mftbmp after initialized extension: "
2356             "allocated_size 0x%llx, data_size 0x%llx, "
2357             "initialized_size 0x%llx.",
2358             (long long)mftbmp_ni->allocated_size,
2359             (long long)i_size_read(vol->mftbmp_ino),
2360             (long long)mftbmp_ni->initialized_size);
2361     read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2362 #endif /* DEBUG */
2363     ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
2364 found_free_rec:
2365     /* @bit is the found free mft record, allocate it in the mft bitmap. */
2366     ntfs_debug("At found_free_rec.");
2367     err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
2368     if (unlikely(err)) {
2369         ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
2370         up_write(&vol->mftbmp_lock);
2371         goto err_out;
2372     }
2373     ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
2374 have_alloc_rec:
2375     /*
2376      * The mft bitmap is now uptodate.  Deal with mft data attribute now.
2377      * Note, we keep hold of the mft bitmap lock for writing until all
2378      * modifications to the mft data attribute are complete, too, as they
2379      * will impact decisions for mft bitmap and mft record allocation done
2380      * by a parallel allocation and if the lock is not maintained a
2381      * parallel allocation could allocate the same mft record as this one.
2382      */
2383     ll = (bit + 1) << vol->mft_record_size_bits;
2384     read_lock_irqsave(&mft_ni->size_lock, flags);
2385     old_data_initialized = mft_ni->initialized_size;
2386     read_unlock_irqrestore(&mft_ni->size_lock, flags);
2387     if (ll <= old_data_initialized) {
2388         ntfs_debug("Allocated mft record already initialized.");
2389         goto mft_rec_already_initialized;
2390     }
2391     ntfs_debug("Initializing allocated mft record.");
2392     /*
2393      * The mft record is outside the initialized data.  Extend the mft data
2394      * attribute until it covers the allocated record.  The loop is only
2395      * actually traversed more than once when a freshly formatted volume is
2396      * first written to so it optimizes away nicely in the common case.
2397      */
2398     read_lock_irqsave(&mft_ni->size_lock, flags);
2399     ntfs_debug("Status of mft data before extension: "
2400             "allocated_size 0x%llx, data_size 0x%llx, "
2401             "initialized_size 0x%llx.",
2402             (long long)mft_ni->allocated_size,
2403             (long long)i_size_read(vol->mft_ino),
2404             (long long)mft_ni->initialized_size);
2405     while (ll > mft_ni->allocated_size) {
2406         read_unlock_irqrestore(&mft_ni->size_lock, flags);
2407         err = ntfs_mft_data_extend_allocation_nolock(vol);
2408         if (unlikely(err)) {
2409             ntfs_error(vol->sb, "Failed to extend mft data "
2410                     "allocation.");
2411             goto undo_mftbmp_alloc_nolock;
2412         }
2413         read_lock_irqsave(&mft_ni->size_lock, flags);
2414         ntfs_debug("Status of mft data after allocation extension: "
2415                 "allocated_size 0x%llx, data_size 0x%llx, "
2416                 "initialized_size 0x%llx.",
2417                 (long long)mft_ni->allocated_size,
2418                 (long long)i_size_read(vol->mft_ino),
2419                 (long long)mft_ni->initialized_size);
2420     }
2421     read_unlock_irqrestore(&mft_ni->size_lock, flags);
2422     /*
2423      * Extend mft data initialized size (and data size of course) to reach
2424      * the allocated mft record, formatting the mft records allong the way.
2425      * Note: We only modify the ntfs_inode structure as that is all that is
2426      * needed by ntfs_mft_record_format().  We will update the attribute
2427      * record itself in one fell swoop later on.
2428      */
2429     write_lock_irqsave(&mft_ni->size_lock, flags);
2430     old_data_initialized = mft_ni->initialized_size;
2431     old_data_size = vol->mft_ino->i_size;
2432     while (ll > mft_ni->initialized_size) {
2433         s64 new_initialized_size, mft_no;
2434
2435         new_initialized_size = mft_ni->initialized_size +
2436                 vol->mft_record_size;
2437         mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
2438         if (new_initialized_size > i_size_read(vol->mft_ino))
2439             i_size_write(vol->mft_ino, new_initialized_size);
2440         write_unlock_irqrestore(&mft_ni->size_lock, flags);
2441         ntfs_debug("Initializing mft record 0x%llx.",
2442                 (long long)mft_no);
2443         err = ntfs_mft_record_format(vol, mft_no);
2444         if (unlikely(err)) {
2445             ntfs_error(vol->sb, "Failed to format mft record.");
2446             goto undo_data_init;
2447         }
2448         write_lock_irqsave(&mft_ni->size_lock, flags);
2449         mft_ni->initialized_size = new_initialized_size;
2450     }
2451     write_unlock_irqrestore(&mft_ni->size_lock, flags);
2452     record_formatted = true;
2453     /* Update the mft data attribute record to reflect the new sizes. */
2454     m = map_mft_record(mft_ni);
2455     if (IS_ERR(m)) {
2456         ntfs_error(vol->sb, "Failed to map mft record.");
2457         err = PTR_ERR(m);
2458         goto undo_data_init;
2459     }
2460     ctx = ntfs_attr_get_search_ctx(mft_ni, m);
2461     if (unlikely(!ctx)) {
2462         ntfs_error(vol->sb, "Failed to get search context.");
2463         err = -ENOMEM;
2464         unmap_mft_record(mft_ni);
2465         goto undo_data_init;
2466     }
2467     err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
2468             CASE_SENSITIVE, 0, NULL, 0, ctx);
2469     if (unlikely(err)) {
2470         ntfs_error(vol->sb, "Failed to find first attribute extent of "
2471                 "mft data attribute.");
2472         ntfs_attr_put_search_ctx(ctx);
2473         unmap_mft_record(mft_ni);
2474         goto undo_data_init;
2475     }
2476     a = ctx->attr;
2477     read_lock_irqsave(&mft_ni->size_lock, flags);
2478     a->data.non_resident.initialized_size =
2479             cpu_to_sle64(mft_ni->initialized_size);
2480     a->data.non_resident.data_size =
2481             cpu_to_sle64(i_size_read(vol->mft_ino));
2482     read_unlock_irqrestore(&mft_ni->size_lock, flags);
2483     /* Ensure the changes make it to disk. */
2484     flush_dcache_mft_record_page(ctx->ntfs_ino);
2485     mark_mft_record_dirty(ctx->ntfs_ino);
2486     ntfs_attr_put_search_ctx(ctx);
2487     unmap_mft_record(mft_ni);
2488     read_lock_irqsave(&mft_ni->size_lock, flags);
2489     ntfs_debug("Status of mft data after mft record initialization: "
2490             "allocated_size 0x%llx, data_size 0x%llx, "
2491             "initialized_size 0x%llx.",
2492             (long long)mft_ni->allocated_size,
2493             (long long)i_size_read(vol->mft_ino),
2494             (long long)mft_ni->initialized_size);
2495     BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
2496     BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
2497     read_unlock_irqrestore(&mft_ni->size_lock, flags);
2498 mft_rec_already_initialized:
2499     /*
2500      * We can finally drop the mft bitmap lock as the mft data attribute
2501      * has been fully updated.  The only disparity left is that the
2502      * allocated mft record still needs to be marked as in use to match the
2503      * set bit in the mft bitmap but this is actually not a problem since
2504      * this mft record is not referenced from anywhere yet and the fact
2505      * that it is allocated in the mft bitmap means that no-one will try to
2506      * allocate it either.
2507      */
2508     up_write(&vol->mftbmp_lock);
2509     /*
2510      * We now have allocated and initialized the mft record.  Calculate the
2511      * index of and the offset within the page cache page the record is in.
2512      */
2513     index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
2514     ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
2515     /* Read, map, and pin the page containing the mft record. */
2516     page = ntfs_map_page(vol->mft_ino->i_mapping, index);
2517     if (IS_ERR(page)) {
2518         ntfs_error(vol->sb, "Failed to map page containing allocated "
2519                 "mft record 0x%llx.", (long long)bit);
2520         err = PTR_ERR(page);
2521         goto undo_mftbmp_alloc;
2522     }
2523     lock_page(page);
2524     BUG_ON(!PageUptodate(page));
2525     ClearPageUptodate(page);
2526     m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
2527     /* If we just formatted the mft record no need to do it again. */
2528     if (!record_formatted) {
2529         /* Sanity check that the mft record is really not in use. */
2530         if (ntfs_is_file_record(m->magic) &&
2531                 (m->flags & MFT_RECORD_IN_USE)) {
2532             ntfs_error(vol->sb, "Mft record 0x%llx was marked "
2533                     "free in mft bitmap but is marked "
2534                     "used itself.  Corrupt filesystem.  "
2535                     "Unmount and run chkdsk.",
2536                     (long long)bit);
2537             err = -EIO;
2538             SetPageUptodate(page);
2539             unlock_page(page);
2540             ntfs_unmap_page(page);
2541             NVolSetErrors(vol);
2542             goto undo_mftbmp_alloc;
2543         }
2544         /*
2545          * We need to (re-)format the mft record, preserving the
2546          * sequence number if it is not zero as well as the update
2547          * sequence number if it is not zero or -1 (0xffff).  This
2548          * means we do not need to care whether or not something went
2549          * wrong with the previous mft record.
2550          */
2551         seq_no = m->sequence_number;
2552         usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
2553         err = ntfs_mft_record_layout(vol, bit, m);
2554         if (unlikely(err)) {
2555             ntfs_error(vol->sb, "Failed to layout allocated mft "
2556                     "record 0x%llx.", (long long)bit);
2557             SetPageUptodate(page);
2558             unlock_page(page);
2559             ntfs_unmap_page(page);
2560             goto undo_mftbmp_alloc;
2561         }
2562         if (seq_no)
2563             m->sequence_number = seq_no;
2564         if (usn && le16_to_cpu(usn) != 0xffff)
2565             *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
2566     }
2567     /* Set the mft record itself in use. */
2568     m->flags |= MFT_RECORD_IN_USE;
2569     if (S_ISDIR(mode))
2570         m->flags |= MFT_RECORD_IS_DIRECTORY;
2571     flush_dcache_page(page);
2572     SetPageUptodate(page);
2573     if (base_ni) {
2574         MFT_RECORD *m_tmp;
2575
2576         /*
2577          * Setup the base mft record in the extent mft record.  This
2578          * completes initialization of the allocated extent mft record
2579          * and we can simply use it with map_extent_mft_record().
2580          */
2581         m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
2582                 base_ni->seq_no);
2583         /*
2584          * Allocate an extent inode structure for the new mft record,
2585          * attach it to the base inode @base_ni and map, pin, and lock
2586          * its, i.e. the allocated, mft record.
2587          */
2588         m_tmp = map_extent_mft_record(base_ni, bit, &ni);
2589         if (IS_ERR(m_tmp)) {
2590             ntfs_error(vol->sb, "Failed to map allocated extent "
2591                     "mft record 0x%llx.", (long long)bit);
2592             err = PTR_ERR(m_tmp);
2593             /* Set the mft record itself not in use. */
2594             m->flags &= cpu_to_le16(
2595                     ~le16_to_cpu(MFT_RECORD_IN_USE));
2596             flush_dcache_page(page);
2597             /* Make sure the mft record is written out to disk. */
2598             mark_ntfs_record_dirty(page, ofs);
2599             unlock_page(page);
2600             ntfs_unmap_page(page);
2601             goto undo_mftbmp_alloc;
2602         }
2603         BUG_ON(m != m_tmp);
2604         /*
2605          * Make sure the allocated mft record is written out to disk.
2606          * No need to set the inode dirty because the caller is going
2607          * to do that anyway after finishing with the new extent mft
2608          * record (e.g. at a minimum a new attribute will be added to
2609          * the mft record.
2610          */
2611         mark_ntfs_record_dirty(page, ofs);
2612         unlock_page(page);
2613         /*
2614          * Need to unmap the page since map_extent_mft_record() mapped
2615          * it as well so we have it mapped twice at the moment.
2616          */
2617         ntfs_unmap_page(page);
2618     } else {
2619         /*
2620          * Allocate a new VFS inode and set it up.  NOTE: @vi->i_nlink
2621          * is set to 1 but the mft record->link_count is 0.  The caller
2622          * needs to bear this in mind.
2623          */
2624         vi = new_inode(vol->sb);
2625         if (unlikely(!vi)) {
2626             err = -ENOMEM;
2627             /* Set the mft record itself not in use. */
2628             m->flags &= cpu_to_le16(
2629                     ~le16_to_cpu(MFT_RECORD_IN_USE));
2630             flush_dcache_page(page);
2631             /* Make sure the mft record is written out to disk. */
2632             mark_ntfs_record_dirty(page, ofs);
2633             unlock_page(page);
2634             ntfs_unmap_page(page);
2635             goto undo_mftbmp_alloc;
2636         }
2637         vi->i_ino = bit;
2638
2639         /* The owner and group come from the ntfs volume. */
2640         vi->i_uid = vol->uid;
2641         vi->i_gid = vol->gid;
2642
2643         /* Initialize the ntfs specific part of @vi. */
2644         ntfs_init_big_inode(vi);
2645         ni = NTFS_I(vi);
2646         /*
2647          * Set the appropriate mode, attribute type, and name.  For
2648          * directories, also setup the index values to the defaults.
2649          */
2650         if (S_ISDIR(mode)) {
2651             vi->i_mode = S_IFDIR | S_IRWXUGO;
2652             vi->i_mode &= ~vol->dmask;
2653
2654             NInoSetMstProtected(ni);
2655             ni->type = AT_INDEX_ALLOCATION;
2656             ni->name = I30;
2657             ni->name_len = 4;
2658
2659             ni->itype.index.block_size = 4096;
2660             ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
2661             ni->itype.index.collation_rule = COLLATION_FILE_NAME;
2662             if (vol->cluster_size <= ni->itype.index.block_size) {
2663                 ni->itype.index.vcn_size = vol->cluster_size;
2664                 ni->itype.index.vcn_size_bits =
2665                         vol->cluster_size_bits;
2666             } else {
2667                 ni->itype.index.vcn_size = vol->sector_size;
2668                 ni->itype.index.vcn_size_bits =
2669                         vol->sector_size_bits;
2670             }
2671         } else {
2672             vi->i_mode = S_IFREG | S_IRWXUGO;
2673             vi->i_mode &= ~vol->fmask;
2674
2675             ni->type = AT_DATA;
2676             ni->name = NULL;
2677             ni->name_len = 0;
2678         }
2679         if (IS_RDONLY(vi))
2680             vi->i_mode &= ~S_IWUGO;
2681
2682         /* Set the inode times to the current time. */
2683         vi->i_atime = vi->i_mtime = vi->i_ctime =
2684             current_time(vi);
2685         /*
2686          * Set the file size to 0, the ntfs inode sizes are set to 0 by
2687          * the call to ntfs_init_big_inode() below.
2688          */
2689         vi->i_size = 0;
2690         vi->i_blocks = 0;
2691
2692         /* Set the sequence number. */
2693         vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
2694         /*
2695          * Manually map, pin, and lock the mft record as we already
2696          * have its page mapped and it is very easy to do.
2697          */
2698         atomic_inc(&ni->count);
2699         mutex_lock(&ni->mrec_lock);
2700         ni->page = page;
2701         ni->page_ofs = ofs;
2702         /*
2703          * Make sure the allocated mft record is written out to disk.
2704          * NOTE: We do not set the ntfs inode dirty because this would
2705          * fail in ntfs_write_inode() because the inode does not have a
2706          * standard information attribute yet.  Also, there is no need
2707          * to set the inode dirty because the caller is going to do
2708          * that anyway after finishing with the new mft record (e.g. at
2709          * a minimum some new attributes will be added to the mft
2710          * record.
2711          */
2712         mark_ntfs_record_dirty(page, ofs);
2713         unlock_page(page);
2714
2715         /* Add the inode to the inode hash for the superblock. */
2716         insert_inode_hash(vi);
2717
2718         /* Update the default mft allocation position. */
2719         vol->mft_data_pos = bit + 1;
2720     }
2721     /*
2722      * Return the opened, allocated inode of the allocated mft record as
2723      * well as the mapped, pinned, and locked mft record.
2724      */
2725     ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
2726             base_ni ? "extent " : "", (long long)bit);
2727     *mrec = m;
2728     return ni;
2729 undo_data_init:
2730     write_lock_irqsave(&mft_ni->size_lock, flags);
2731     mft_ni->initialized_size = old_data_initialized;
2732     i_size_write(vol->mft_ino, old_data_size);
2733     write_unlock_irqrestore(&mft_ni->size_lock, flags);
2734     goto undo_mftbmp_alloc_nolock;
2735 undo_mftbmp_alloc:
2736     down_write(&vol->mftbmp_lock);
2737 undo_mftbmp_alloc_nolock:
2738     if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
2739         ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
2740         NVolSetErrors(vol);
2741     }
2742     up_write(&vol->mftbmp_lock);
2743 err_out:
2744     return ERR_PTR(err);
2745 max_err_out:
2746     ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
2747             "number of inodes (2^32) has already been reached.");
2748     up_write(&vol->mftbmp_lock);
2749     return ERR_PTR(-ENOSPC);
2750 }
2751
2752 /**
2753  * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
2754  * @ni:     ntfs inode of the mapped extent mft record to free
2755  * @m:      mapped extent mft record of the ntfs inode @ni
2756  *
2757  * Free the mapped extent mft record @m of the extent ntfs inode @ni.
2758  *
2759  * Note that this function unmaps the mft record and closes and destroys @ni
2760  * internally and hence you cannot use either @ni nor @m any more after this
2761  * function returns success.
2762  *
2763  * On success return 0 and on error return -errno.  @ni and @m are still valid
2764  * in this case and have not been freed.
2765  *
2766  * For some errors an error message is displayed and the success code 0 is
2767  * returned and the volume is then left dirty on umount.  This makes sense in
2768  * case we could not rollback the changes that were already done since the
2769  * caller no longer wants to reference this mft record so it does not matter to
2770  * the caller if something is wrong with it as long as it is properly detached
2771  * from the base inode.
2772  */
2773 int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
2774 {
2775     unsigned long mft_no = ni->mft_no;
2776     ntfs_volume *vol = ni->vol;
2777     ntfs_inode *base_ni;
2778     ntfs_inode **extent_nis;
2779     int i, err;
2780     le16 old_seq_no;
2781     u16 seq_no;
2782
2783     BUG_ON(NInoAttr(ni));
2784     BUG_ON(ni->nr_extents != -1);
2785
2786     mutex_lock(&ni->extent_lock);
2787     base_ni = ni->ext.base_ntfs_ino;
2788     mutex_unlock(&ni->extent_lock);
2789
2790     BUG_ON(base_ni->nr_extents <= 0);
2791
2792     ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
2793             mft_no, base_ni->mft_no);
2794
2795     mutex_lock(&base_ni->extent_lock);
2796
2797     /* Make sure we are holding the only reference to the extent inode. */
2798     if (atomic_read(&ni->count) > 2) {
2799         ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
2800                 "not freeing.", base_ni->mft_no);
2801         mutex_unlock(&base_ni->extent_lock);
2802         return -EBUSY;
2803     }
2804
2805     /* Dissociate the ntfs inode from the base inode. */
2806     extent_nis = base_ni->ext.extent_ntfs_inos;
2807     err = -ENOENT;
2808     for (i = 0; i < base_ni->nr_extents; i++) {
2809         if (ni != extent_nis[i])
2810             continue;
2811         extent_nis += i;
2812         base_ni->nr_extents--;
2813         memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
2814                 sizeof(ntfs_inode*));
2815         err = 0;
2816         break;
2817     }
2818
2819     mutex_unlock(&base_ni->extent_lock);
2820
2821     if (unlikely(err)) {
2822         ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
2823                 "its base inode 0x%lx.", mft_no,
2824                 base_ni->mft_no);
2825         BUG();
2826     }
2827
2828     /*
2829      * The extent inode is no longer attached to the base inode so no one
2830      * can get a reference to it any more.
2831      */
2832
2833     /* Mark the mft record as not in use. */
2834     m->flags &= ~MFT_RECORD_IN_USE;
2835
2836     /* Increment the sequence number, skipping zero, if it is not zero. */
2837     old_seq_no = m->sequence_number;
2838     seq_no = le16_to_cpu(old_seq_no);
2839     if (seq_no == 0xffff)
2840         seq_no = 1;
2841     else if (seq_no)
2842         seq_no++;
2843     m->sequence_number = cpu_to_le16(seq_no);
2844
2845     /*
2846      * Set the ntfs inode dirty and write it out.  We do not need to worry
2847      * about the base inode here since whatever caused the extent mft
2848      * record to be freed is guaranteed to do it already.
2849      */
2850     NInoSetDirty(ni);
2851     err = write_mft_record(ni, m, 0);
2852     if (unlikely(err)) {
2853         ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
2854                 "freeing.", mft_no);
2855         goto rollback;
2856     }
2857 rollback_error:
2858     /* Unmap and throw away the now freed extent inode. */
2859     unmap_extent_mft_record(ni);
2860     ntfs_clear_extent_inode(ni);
2861
2862     /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
2863     down_write(&vol->mftbmp_lock);
2864     err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
2865     up_write(&vol->mftbmp_lock);
2866     if (unlikely(err)) {
2867         /*
2868          * The extent inode is gone but we failed to deallocate it in
2869          * the mft bitmap.  Just emit a warning and leave the volume
2870          * dirty on umount.
2871          */
2872         ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
2873         NVolSetErrors(vol);
2874     }
2875     return 0;
2876 rollback:
2877     /* Rollback what we did... */
2878     mutex_lock(&base_ni->extent_lock);
2879     extent_nis = base_ni->ext.extent_ntfs_inos;
2880     if (!(base_ni->nr_extents & 3)) {
2881         int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
2882
2883         extent_nis = kmalloc(new_size, GFP_NOFS);
2884         if (unlikely(!extent_nis)) {
2885             ntfs_error(vol->sb, "Failed to allocate internal "
2886                     "buffer during rollback.%s", es);
2887             mutex_unlock(&base_ni->extent_lock);
2888             NVolSetErrors(vol);
2889             goto rollback_error;
2890         }
2891         if (base_ni->nr_extents) {
2892             BUG_ON(!base_ni->ext.extent_ntfs_inos);
2893             memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
2894                     new_size - 4 * sizeof(ntfs_inode*));
2895             kfree(base_ni->ext.extent_ntfs_inos);
2896         }
2897         base_ni->ext.extent_ntfs_inos = extent_nis;
2898     }
2899     m->flags |= MFT_RECORD_IN_USE;
2900     m->sequence_number = old_seq_no;
2901     extent_nis[base_ni->nr_extents++] = ni;
2902     mutex_unlock(&base_ni->extent_lock);
2903     mark_mft_record_dirty(ni);
2904     return err;
2905 }
2906 #endif /* NTFS_RW */