Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 #include <linux/slab.h>
0003 #include <linux/stat.h>
0004 #include <linux/sched/xacct.h>
0005 #include <linux/fcntl.h>
0006 #include <linux/file.h>
0007 #include <linux/uio.h>
0008 #include <linux/fsnotify.h>
0009 #include <linux/security.h>
0010 #include <linux/export.h>
0011 #include <linux/syscalls.h>
0012 #include <linux/pagemap.h>
0013 #include <linux/splice.h>
0014 #include <linux/compat.h>
0015 #include <linux/mount.h>
0016 #include <linux/fs.h>
0017 #include <linux/dax.h>
0018 #include "internal.h"
0019 
0020 #include <linux/uaccess.h>
0021 #include <asm/unistd.h>
0022 
0023 /*
0024  * Performs necessary checks before doing a clone.
0025  *
0026  * Can adjust amount of bytes to clone via @req_count argument.
0027  * Returns appropriate error code that caller should return or
0028  * zero in case the clone should be allowed.
0029  */
0030 static int generic_remap_checks(struct file *file_in, loff_t pos_in,
0031                 struct file *file_out, loff_t pos_out,
0032                 loff_t *req_count, unsigned int remap_flags)
0033 {
0034     struct inode *inode_in = file_in->f_mapping->host;
0035     struct inode *inode_out = file_out->f_mapping->host;
0036     uint64_t count = *req_count;
0037     uint64_t bcount;
0038     loff_t size_in, size_out;
0039     loff_t bs = inode_out->i_sb->s_blocksize;
0040     int ret;
0041 
0042     /* The start of both ranges must be aligned to an fs block. */
0043     if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
0044         return -EINVAL;
0045 
0046     /* Ensure offsets don't wrap. */
0047     if (pos_in + count < pos_in || pos_out + count < pos_out)
0048         return -EINVAL;
0049 
0050     size_in = i_size_read(inode_in);
0051     size_out = i_size_read(inode_out);
0052 
0053     /* Dedupe requires both ranges to be within EOF. */
0054     if ((remap_flags & REMAP_FILE_DEDUP) &&
0055         (pos_in >= size_in || pos_in + count > size_in ||
0056          pos_out >= size_out || pos_out + count > size_out))
0057         return -EINVAL;
0058 
0059     /* Ensure the infile range is within the infile. */
0060     if (pos_in >= size_in)
0061         return -EINVAL;
0062     count = min(count, size_in - (uint64_t)pos_in);
0063 
0064     ret = generic_write_check_limits(file_out, pos_out, &count);
0065     if (ret)
0066         return ret;
0067 
0068     /*
0069      * If the user wanted us to link to the infile's EOF, round up to the
0070      * next block boundary for this check.
0071      *
0072      * Otherwise, make sure the count is also block-aligned, having
0073      * already confirmed the starting offsets' block alignment.
0074      */
0075     if (pos_in + count == size_in &&
0076         (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
0077         bcount = ALIGN(size_in, bs) - pos_in;
0078     } else {
0079         if (!IS_ALIGNED(count, bs))
0080             count = ALIGN_DOWN(count, bs);
0081         bcount = count;
0082     }
0083 
0084     /* Don't allow overlapped cloning within the same file. */
0085     if (inode_in == inode_out &&
0086         pos_out + bcount > pos_in &&
0087         pos_out < pos_in + bcount)
0088         return -EINVAL;
0089 
0090     /*
0091      * We shortened the request but the caller can't deal with that, so
0092      * bounce the request back to userspace.
0093      */
0094     if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
0095         return -EINVAL;
0096 
0097     *req_count = count;
0098     return 0;
0099 }
0100 
0101 static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
0102                  bool write)
0103 {
0104     if (unlikely(pos < 0 || len < 0))
0105         return -EINVAL;
0106 
0107     if (unlikely((loff_t) (pos + len) < 0))
0108         return -EINVAL;
0109 
0110     return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
0111 }
0112 
0113 /*
0114  * Ensure that we don't remap a partial EOF block in the middle of something
0115  * else.  Assume that the offsets have already been checked for block
0116  * alignment.
0117  *
0118  * For clone we only link a partial EOF block above or at the destination file's
0119  * EOF.  For deduplication we accept a partial EOF block only if it ends at the
0120  * destination file's EOF (can not link it into the middle of a file).
0121  *
0122  * Shorten the request if possible.
0123  */
0124 static int generic_remap_check_len(struct inode *inode_in,
0125                    struct inode *inode_out,
0126                    loff_t pos_out,
0127                    loff_t *len,
0128                    unsigned int remap_flags)
0129 {
0130     u64 blkmask = i_blocksize(inode_in) - 1;
0131     loff_t new_len = *len;
0132 
0133     if ((*len & blkmask) == 0)
0134         return 0;
0135 
0136     if (pos_out + *len < i_size_read(inode_out))
0137         new_len &= ~blkmask;
0138 
0139     if (new_len == *len)
0140         return 0;
0141 
0142     if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
0143         *len = new_len;
0144         return 0;
0145     }
0146 
0147     return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
0148 }
0149 
0150 /* Read a page's worth of file data into the page cache. */
0151 static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos)
0152 {
0153     return read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
0154 }
0155 
0156 /*
0157  * Lock two folios, ensuring that we lock in offset order if the folios
0158  * are from the same file.
0159  */
0160 static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2)
0161 {
0162     /* Always lock in order of increasing index. */
0163     if (folio1->index > folio2->index)
0164         swap(folio1, folio2);
0165 
0166     folio_lock(folio1);
0167     if (folio1 != folio2)
0168         folio_lock(folio2);
0169 }
0170 
/*
 * Unlock a pair of folios.  When both arguments are the same folio it is
 * unlocked exactly once.
 */
static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
{
    folio_unlock(folio1);

    if (folio2 == folio1)
        return;

    folio_unlock(folio2);
}
0178 
0179 /*
0180  * Compare extents of two files to see if they are the same.
0181  * Caller must have locked both inodes to prevent write races.
0182  */
0183 static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
0184                      struct file *dest, loff_t dstoff,
0185                      loff_t len, bool *is_same)
0186 {
0187     bool same = true;
0188     int error = -EINVAL;
0189 
0190     while (len) {
0191         struct folio *src_folio, *dst_folio;
0192         void *src_addr, *dst_addr;
0193         loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff),
0194                      PAGE_SIZE - offset_in_page(dstoff));
0195 
0196         cmp_len = min(cmp_len, len);
0197         if (cmp_len <= 0)
0198             goto out_error;
0199 
0200         src_folio = vfs_dedupe_get_folio(src, srcoff);
0201         if (IS_ERR(src_folio)) {
0202             error = PTR_ERR(src_folio);
0203             goto out_error;
0204         }
0205         dst_folio = vfs_dedupe_get_folio(dest, dstoff);
0206         if (IS_ERR(dst_folio)) {
0207             error = PTR_ERR(dst_folio);
0208             folio_put(src_folio);
0209             goto out_error;
0210         }
0211 
0212         vfs_lock_two_folios(src_folio, dst_folio);
0213 
0214         /*
0215          * Now that we've locked both folios, make sure they're still
0216          * mapped to the file data we're interested in.  If not,
0217          * someone is invalidating pages on us and we lose.
0218          */
0219         if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) ||
0220             src_folio->mapping != src->f_mapping ||
0221             dst_folio->mapping != dest->f_mapping) {
0222             same = false;
0223             goto unlock;
0224         }
0225 
0226         src_addr = kmap_local_folio(src_folio,
0227                     offset_in_folio(src_folio, srcoff));
0228         dst_addr = kmap_local_folio(dst_folio,
0229                     offset_in_folio(dst_folio, dstoff));
0230 
0231         flush_dcache_folio(src_folio);
0232         flush_dcache_folio(dst_folio);
0233 
0234         if (memcmp(src_addr, dst_addr, cmp_len))
0235             same = false;
0236 
0237         kunmap_local(dst_addr);
0238         kunmap_local(src_addr);
0239 unlock:
0240         vfs_unlock_two_folios(src_folio, dst_folio);
0241         folio_put(dst_folio);
0242         folio_put(src_folio);
0243 
0244         if (!same)
0245             break;
0246 
0247         srcoff += cmp_len;
0248         dstoff += cmp_len;
0249         len -= cmp_len;
0250     }
0251 
0252     *is_same = same;
0253     return 0;
0254 
0255 out_error:
0256     return error;
0257 }
0258 
0259 /*
0260  * Check that the two inodes are eligible for cloning, the ranges make
0261  * sense, and then flush all dirty data.  Caller must ensure that the
0262  * inodes have been locked against any other modifications.
0263  *
0264  * If there's an error, then the usual negative error code is returned.
0265  * Otherwise returns 0 with *len set to the request length.
0266  */
0267 int
0268 __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
0269                 struct file *file_out, loff_t pos_out,
0270                 loff_t *len, unsigned int remap_flags,
0271                 const struct iomap_ops *dax_read_ops)
0272 {
0273     struct inode *inode_in = file_inode(file_in);
0274     struct inode *inode_out = file_inode(file_out);
0275     bool same_inode = (inode_in == inode_out);
0276     int ret;
0277 
0278     /* Don't touch certain kinds of inodes */
0279     if (IS_IMMUTABLE(inode_out))
0280         return -EPERM;
0281 
0282     if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
0283         return -ETXTBSY;
0284 
0285     /* Don't reflink dirs, pipes, sockets... */
0286     if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
0287         return -EISDIR;
0288     if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
0289         return -EINVAL;
0290 
0291     /* Zero length dedupe exits immediately; reflink goes to EOF. */
0292     if (*len == 0) {
0293         loff_t isize = i_size_read(inode_in);
0294 
0295         if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
0296             return 0;
0297         if (pos_in > isize)
0298             return -EINVAL;
0299         *len = isize - pos_in;
0300         if (*len == 0)
0301             return 0;
0302     }
0303 
0304     /* Check that we don't violate system file offset limits. */
0305     ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
0306             remap_flags);
0307     if (ret)
0308         return ret;
0309 
0310     /* Wait for the completion of any pending IOs on both files */
0311     inode_dio_wait(inode_in);
0312     if (!same_inode)
0313         inode_dio_wait(inode_out);
0314 
0315     ret = filemap_write_and_wait_range(inode_in->i_mapping,
0316             pos_in, pos_in + *len - 1);
0317     if (ret)
0318         return ret;
0319 
0320     ret = filemap_write_and_wait_range(inode_out->i_mapping,
0321             pos_out, pos_out + *len - 1);
0322     if (ret)
0323         return ret;
0324 
0325     /*
0326      * Check that the extents are the same.
0327      */
0328     if (remap_flags & REMAP_FILE_DEDUP) {
0329         bool        is_same = false;
0330 
0331         if (*len == 0)
0332             return 0;
0333 
0334         if (!IS_DAX(inode_in))
0335             ret = vfs_dedupe_file_range_compare(file_in, pos_in,
0336                     file_out, pos_out, *len, &is_same);
0337         else if (dax_read_ops)
0338             ret = dax_dedupe_file_range_compare(inode_in, pos_in,
0339                     inode_out, pos_out, *len, &is_same,
0340                     dax_read_ops);
0341         else
0342             return -EINVAL;
0343         if (ret)
0344             return ret;
0345         if (!is_same)
0346             return -EBADE;
0347     }
0348 
0349     ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
0350             remap_flags);
0351     if (ret)
0352         return ret;
0353 
0354     /* If can't alter the file contents, we're done. */
0355     if (!(remap_flags & REMAP_FILE_DEDUP))
0356         ret = file_modified(file_out);
0357 
0358     return ret;
0359 }
0360 
0361 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
0362                   struct file *file_out, loff_t pos_out,
0363                   loff_t *len, unsigned int remap_flags)
0364 {
0365     return __generic_remap_file_range_prep(file_in, pos_in, file_out,
0366                            pos_out, len, remap_flags, NULL);
0367 }
0368 EXPORT_SYMBOL(generic_remap_file_range_prep);
0369 
0370 loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
0371                struct file *file_out, loff_t pos_out,
0372                loff_t len, unsigned int remap_flags)
0373 {
0374     loff_t ret;
0375 
0376     WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
0377 
0378     if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
0379         return -EXDEV;
0380 
0381     ret = generic_file_rw_checks(file_in, file_out);
0382     if (ret < 0)
0383         return ret;
0384 
0385     if (!file_in->f_op->remap_file_range)
0386         return -EOPNOTSUPP;
0387 
0388     ret = remap_verify_area(file_in, pos_in, len, false);
0389     if (ret)
0390         return ret;
0391 
0392     ret = remap_verify_area(file_out, pos_out, len, true);
0393     if (ret)
0394         return ret;
0395 
0396     ret = file_in->f_op->remap_file_range(file_in, pos_in,
0397             file_out, pos_out, len, remap_flags);
0398     if (ret < 0)
0399         return ret;
0400 
0401     fsnotify_access(file_in);
0402     fsnotify_modify(file_out);
0403     return ret;
0404 }
0405 EXPORT_SYMBOL(do_clone_file_range);
0406 
0407 loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
0408                 struct file *file_out, loff_t pos_out,
0409                 loff_t len, unsigned int remap_flags)
0410 {
0411     loff_t ret;
0412 
0413     file_start_write(file_out);
0414     ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
0415                   remap_flags);
0416     file_end_write(file_out);
0417 
0418     return ret;
0419 }
0420 EXPORT_SYMBOL(vfs_clone_file_range);
0421 
0422 /* Check whether we are allowed to dedupe the destination file */
0423 static bool allow_file_dedupe(struct file *file)
0424 {
0425     struct user_namespace *mnt_userns = file_mnt_user_ns(file);
0426     struct inode *inode = file_inode(file);
0427 
0428     if (capable(CAP_SYS_ADMIN))
0429         return true;
0430     if (file->f_mode & FMODE_WRITE)
0431         return true;
0432     if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
0433         return true;
0434     if (!inode_permission(mnt_userns, inode, MAY_WRITE))
0435         return true;
0436     return false;
0437 }
0438 
0439 loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
0440                  struct file *dst_file, loff_t dst_pos,
0441                  loff_t len, unsigned int remap_flags)
0442 {
0443     loff_t ret;
0444 
0445     WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
0446                      REMAP_FILE_CAN_SHORTEN));
0447 
0448     ret = mnt_want_write_file(dst_file);
0449     if (ret)
0450         return ret;
0451 
0452     /*
0453      * This is redundant if called from vfs_dedupe_file_range(), but other
0454      * callers need it and it's not performance sesitive...
0455      */
0456     ret = remap_verify_area(src_file, src_pos, len, false);
0457     if (ret)
0458         goto out_drop_write;
0459 
0460     ret = remap_verify_area(dst_file, dst_pos, len, true);
0461     if (ret)
0462         goto out_drop_write;
0463 
0464     ret = -EPERM;
0465     if (!allow_file_dedupe(dst_file))
0466         goto out_drop_write;
0467 
0468     ret = -EXDEV;
0469     if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb)
0470         goto out_drop_write;
0471 
0472     ret = -EISDIR;
0473     if (S_ISDIR(file_inode(dst_file)->i_mode))
0474         goto out_drop_write;
0475 
0476     ret = -EINVAL;
0477     if (!dst_file->f_op->remap_file_range)
0478         goto out_drop_write;
0479 
0480     if (len == 0) {
0481         ret = 0;
0482         goto out_drop_write;
0483     }
0484 
0485     ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
0486             dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
0487 out_drop_write:
0488     mnt_drop_write_file(dst_file);
0489 
0490     return ret;
0491 }
0492 EXPORT_SYMBOL(vfs_dedupe_file_range_one);
0493 
0494 int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
0495 {
0496     struct file_dedupe_range_info *info;
0497     struct inode *src = file_inode(file);
0498     u64 off;
0499     u64 len;
0500     int i;
0501     int ret;
0502     u16 count = same->dest_count;
0503     loff_t deduped;
0504 
0505     if (!(file->f_mode & FMODE_READ))
0506         return -EINVAL;
0507 
0508     if (same->reserved1 || same->reserved2)
0509         return -EINVAL;
0510 
0511     off = same->src_offset;
0512     len = same->src_length;
0513 
0514     if (S_ISDIR(src->i_mode))
0515         return -EISDIR;
0516 
0517     if (!S_ISREG(src->i_mode))
0518         return -EINVAL;
0519 
0520     if (!file->f_op->remap_file_range)
0521         return -EOPNOTSUPP;
0522 
0523     ret = remap_verify_area(file, off, len, false);
0524     if (ret < 0)
0525         return ret;
0526     ret = 0;
0527 
0528     if (off + len > i_size_read(src))
0529         return -EINVAL;
0530 
0531     /* Arbitrary 1G limit on a single dedupe request, can be raised. */
0532     len = min_t(u64, len, 1 << 30);
0533 
0534     /* pre-format output fields to sane values */
0535     for (i = 0; i < count; i++) {
0536         same->info[i].bytes_deduped = 0ULL;
0537         same->info[i].status = FILE_DEDUPE_RANGE_SAME;
0538     }
0539 
0540     for (i = 0, info = same->info; i < count; i++, info++) {
0541         struct fd dst_fd = fdget(info->dest_fd);
0542         struct file *dst_file = dst_fd.file;
0543 
0544         if (!dst_file) {
0545             info->status = -EBADF;
0546             goto next_loop;
0547         }
0548 
0549         if (info->reserved) {
0550             info->status = -EINVAL;
0551             goto next_fdput;
0552         }
0553 
0554         deduped = vfs_dedupe_file_range_one(file, off, dst_file,
0555                             info->dest_offset, len,
0556                             REMAP_FILE_CAN_SHORTEN);
0557         if (deduped == -EBADE)
0558             info->status = FILE_DEDUPE_RANGE_DIFFERS;
0559         else if (deduped < 0)
0560             info->status = deduped;
0561         else
0562             info->bytes_deduped = len;
0563 
0564 next_fdput:
0565         fdput(dst_fd);
0566 next_loop:
0567         if (fatal_signal_pending(current))
0568             break;
0569     }
0570     return ret;
0571 }
0572 EXPORT_SYMBOL(vfs_dedupe_file_range);