Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * (C) 2001 Clemson University and The University of Chicago
0004  * Copyright 2018 Omnibond Systems, L.L.C.
0005  *
0006  * See COPYING in top-level directory.
0007  */
0008 
0009 /*
0010  *  Linux VFS file operations.
0011  */
0012 
0013 #include "protocol.h"
0014 #include "orangefs-kernel.h"
0015 #include "orangefs-bufmap.h"
0016 #include <linux/fs.h>
0017 #include <linux/pagemap.h>
0018 
0019 static int flush_racache(struct inode *inode)
0020 {
0021     struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
0022     struct orangefs_kernel_op_s *new_op;
0023     int ret;
0024 
0025     gossip_debug(GOSSIP_UTILS_DEBUG,
0026         "%s: %pU: Handle is %pU | fs_id %d\n", __func__,
0027         get_khandle_from_ino(inode), &orangefs_inode->refn.khandle,
0028         orangefs_inode->refn.fs_id);
0029 
0030     new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH);
0031     if (!new_op)
0032         return -ENOMEM;
0033     new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn;
0034 
0035     ret = service_operation(new_op, "orangefs_flush_racache",
0036         get_interruptible_flag(inode));
0037 
0038     gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n",
0039         __func__, ret);
0040 
0041     op_release(new_op);
0042     return ret;
0043 }
0044 
0045 /*
0046  * Post and wait for the I/O upcall to finish
0047  */
0048 ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
0049     loff_t *offset, struct iov_iter *iter, size_t total_size,
0050     loff_t readahead_size, struct orangefs_write_range *wr,
0051     int *index_return, struct file *file)
0052 {
0053     struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
0054     struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
0055     struct orangefs_kernel_op_s *new_op = NULL;
0056     int buffer_index;
0057     ssize_t ret;
0058     size_t copy_amount;
0059     int open_for_read;
0060     int open_for_write;
0061 
0062     new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
0063     if (!new_op)
0064         return -ENOMEM;
0065 
0066     /* synchronous I/O */
0067     new_op->upcall.req.io.readahead_size = readahead_size;
0068     new_op->upcall.req.io.io_type = type;
0069     new_op->upcall.req.io.refn = orangefs_inode->refn;
0070 
0071 populate_shared_memory:
0072     /* get a shared buffer index */
0073     buffer_index = orangefs_bufmap_get();
0074     if (buffer_index < 0) {
0075         ret = buffer_index;
0076         gossip_debug(GOSSIP_FILE_DEBUG,
0077                  "%s: orangefs_bufmap_get failure (%zd)\n",
0078                  __func__, ret);
0079         goto out;
0080     }
0081     gossip_debug(GOSSIP_FILE_DEBUG,
0082              "%s(%pU): GET op %p -> buffer_index %d\n",
0083              __func__,
0084              handle,
0085              new_op,
0086              buffer_index);
0087 
0088     new_op->uses_shared_memory = 1;
0089     new_op->upcall.req.io.buf_index = buffer_index;
0090     new_op->upcall.req.io.count = total_size;
0091     new_op->upcall.req.io.offset = *offset;
0092     if (type == ORANGEFS_IO_WRITE && wr) {
0093         new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
0094         new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
0095     }
0096     /*
0097      * Orangefs has no open, and orangefs checks file permissions
0098      * on each file access. Posix requires that file permissions
0099      * be checked on open and nowhere else. Orangefs-through-the-kernel
0100      * needs to seem posix compliant.
0101      *
0102      * The VFS opens files, even if the filesystem provides no
0103      * method. We can see if a file was successfully opened for
0104      * read and or for write by looking at file->f_mode.
0105      *
0106      * When writes are flowing from the page cache, file is no
0107      * longer available. We can trust the VFS to have checked
0108      * file->f_mode before writing to the page cache.
0109      *
0110      * The mode of a file might change between when it is opened
0111      * and IO commences, or it might be created with an arbitrary mode.
0112      *
0113      * We'll make sure we don't hit EACCES during the IO stage by
0114      * using UID 0. Some of the time we have access without changing
0115      * to UID 0 - how to check?
0116      */
0117     if (file) {
0118         open_for_write = file->f_mode & FMODE_WRITE;
0119         open_for_read = file->f_mode & FMODE_READ;
0120     } else {
0121         open_for_write = 1;
0122         open_for_read = 0; /* not relevant? */
0123     }
0124     if ((type == ORANGEFS_IO_WRITE) && open_for_write)
0125         new_op->upcall.uid = 0;
0126     if ((type == ORANGEFS_IO_READ) && open_for_read)
0127         new_op->upcall.uid = 0;
0128 
0129     gossip_debug(GOSSIP_FILE_DEBUG,
0130              "%s(%pU): offset: %llu total_size: %zd\n",
0131              __func__,
0132              handle,
0133              llu(*offset),
0134              total_size);
0135     /*
0136      * Stage 1: copy the buffers into client-core's address space
0137      */
0138     if (type == ORANGEFS_IO_WRITE && total_size) {
0139         ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index,
0140             total_size);
0141         if (ret < 0) {
0142             gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
0143                 __func__, (long)ret);
0144             goto out;
0145         }
0146     }
0147 
0148     gossip_debug(GOSSIP_FILE_DEBUG,
0149              "%s(%pU): Calling post_io_request with tag (%llu)\n",
0150              __func__,
0151              handle,
0152              llu(new_op->tag));
0153 
0154     /* Stage 2: Service the I/O operation */
0155     ret = service_operation(new_op,
0156                 type == ORANGEFS_IO_WRITE ?
0157                     "file_write" :
0158                     "file_read",
0159                 get_interruptible_flag(inode));
0160 
0161     /*
0162      * If service_operation() returns -EAGAIN #and# the operation was
0163      * purged from orangefs_request_list or htable_ops_in_progress, then
0164      * we know that the client was restarted, causing the shared memory
0165      * area to be wiped clean.  To restart a  write operation in this
0166      * case, we must re-copy the data from the user's iovec to a NEW
0167      * shared memory location. To restart a read operation, we must get
0168      * a new shared memory location.
0169      */
0170     if (ret == -EAGAIN && op_state_purged(new_op)) {
0171         orangefs_bufmap_put(buffer_index);
0172         if (type == ORANGEFS_IO_WRITE)
0173             iov_iter_revert(iter, total_size);
0174         gossip_debug(GOSSIP_FILE_DEBUG,
0175                  "%s:going to repopulate_shared_memory.\n",
0176                  __func__);
0177         goto populate_shared_memory;
0178     }
0179 
0180     if (ret < 0) {
0181         if (ret == -EINTR) {
0182             /*
0183              * We can't return EINTR if any data was written,
0184              * it's not POSIX. It is minimally acceptable
0185              * to give a partial write, the way NFS does.
0186              *
0187              * It would be optimal to return all or nothing,
0188              * but if a userspace write is bigger than
0189              * an IO buffer, and the interrupt occurs
0190              * between buffer writes, that would not be
0191              * possible.
0192              */
0193             switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) {
0194             /*
0195              * If the op was waiting when the interrupt
0196              * occurred, then the client-core did not
0197              * trigger the write.
0198              */
0199             case OP_VFS_STATE_WAITING:
0200                 if (*offset == 0)
0201                     ret = -EINTR;
0202                 else
0203                     ret = 0;
0204                 break;
0205             /*
0206              * If the op was in progress when the interrupt
0207              * occurred, then the client-core was able to
0208              * trigger the write.
0209              */
0210             case OP_VFS_STATE_INPROGR:
0211                 if (type == ORANGEFS_IO_READ)
0212                     ret = -EINTR;
0213                 else
0214                     ret = total_size;
0215                 break;
0216             default:
0217                 gossip_err("%s: unexpected op state :%d:.\n",
0218                        __func__,
0219                        new_op->op_state);
0220                 ret = 0;
0221                 break;
0222             }
0223             gossip_debug(GOSSIP_FILE_DEBUG,
0224                      "%s: got EINTR, state:%d: %p\n",
0225                      __func__,
0226                      new_op->op_state,
0227                      new_op);
0228         } else {
0229             gossip_err("%s: error in %s handle %pU, returning %zd\n",
0230                 __func__,
0231                 type == ORANGEFS_IO_READ ?
0232                     "read from" : "write to",
0233                 handle, ret);
0234         }
0235         if (orangefs_cancel_op_in_progress(new_op))
0236             return ret;
0237 
0238         goto out;
0239     }
0240 
0241     /*
0242      * Stage 3: Post copy buffers from client-core's address space
0243      */
0244     if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) {
0245         /*
0246          * NOTE: the iovector can either contain addresses which
0247          *       can futher be kernel-space or user-space addresses.
0248          *       or it can pointers to struct page's
0249          */
0250 
0251         copy_amount = new_op->downcall.resp.io.amt_complete;
0252 
0253         ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
0254             copy_amount);
0255         if (ret < 0) {
0256             gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
0257                 __func__, (long)ret);
0258             goto out;
0259         }
0260     }
0261     gossip_debug(GOSSIP_FILE_DEBUG,
0262         "%s(%pU): Amount %s, returned by the sys-io call:%d\n",
0263         __func__,
0264         handle,
0265         type == ORANGEFS_IO_READ ?  "read" : "written",
0266         (int)new_op->downcall.resp.io.amt_complete);
0267 
0268     ret = new_op->downcall.resp.io.amt_complete;
0269 
0270 out:
0271     if (buffer_index >= 0) {
0272         orangefs_bufmap_put(buffer_index);
0273         gossip_debug(GOSSIP_FILE_DEBUG,
0274             "%s(%pU): PUT buffer_index %d\n",
0275             __func__, handle, buffer_index);
0276         buffer_index = -1;
0277     }
0278     op_release(new_op);
0279     return ret;
0280 }
0281 
0282 int orangefs_revalidate_mapping(struct inode *inode)
0283 {
0284     struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
0285     struct address_space *mapping = inode->i_mapping;
0286     unsigned long *bitlock = &orangefs_inode->bitlock;
0287     int ret;
0288 
0289     while (1) {
0290         ret = wait_on_bit(bitlock, 1, TASK_KILLABLE);
0291         if (ret)
0292             return ret;
0293         spin_lock(&inode->i_lock);
0294         if (test_bit(1, bitlock)) {
0295             spin_unlock(&inode->i_lock);
0296             continue;
0297         }
0298         if (!time_before(jiffies, orangefs_inode->mapping_time))
0299             break;
0300         spin_unlock(&inode->i_lock);
0301         return 0;
0302     }
0303 
0304     set_bit(1, bitlock);
0305     smp_wmb();
0306     spin_unlock(&inode->i_lock);
0307 
0308     unmap_mapping_range(mapping, 0, 0, 0);
0309     ret = filemap_write_and_wait(mapping);
0310     if (!ret)
0311         ret = invalidate_inode_pages2(mapping);
0312 
0313     orangefs_inode->mapping_time = jiffies +
0314         orangefs_cache_timeout_msecs*HZ/1000;
0315 
0316     clear_bit(1, bitlock);
0317     smp_mb__after_atomic();
0318     wake_up_bit(bitlock, 1);
0319 
0320     return ret;
0321 }
0322 
0323 static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
0324     struct iov_iter *iter)
0325 {
0326     int ret;
0327     orangefs_stats.reads++;
0328 
0329     down_read(&file_inode(iocb->ki_filp)->i_rwsem);
0330     ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
0331     if (ret)
0332         goto out;
0333 
0334     ret = generic_file_read_iter(iocb, iter);
0335 out:
0336     up_read(&file_inode(iocb->ki_filp)->i_rwsem);
0337     return ret;
0338 }
0339 
0340 static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
0341     struct iov_iter *iter)
0342 {
0343     int ret;
0344     orangefs_stats.writes++;
0345 
0346     if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) {
0347         ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
0348         if (ret)
0349             return ret;
0350     }
0351 
0352     ret = generic_file_write_iter(iocb, iter);
0353     return ret;
0354 }
0355 
0356 static vm_fault_t orangefs_fault(struct vm_fault *vmf)
0357 {
0358     struct file *file = vmf->vma->vm_file;
0359     int ret;
0360     ret = orangefs_inode_getattr(file->f_mapping->host,
0361         ORANGEFS_GETATTR_SIZE);
0362     if (ret == -ESTALE)
0363         ret = -EIO;
0364     if (ret) {
0365         gossip_err("%s: orangefs_inode_getattr failed, "
0366             "ret:%d:.\n", __func__, ret);
0367         return VM_FAULT_SIGBUS;
0368     }
0369     return filemap_fault(vmf);
0370 }
0371 
0372 static const struct vm_operations_struct orangefs_file_vm_ops = {
0373     .fault = orangefs_fault,
0374     .map_pages = filemap_map_pages,
0375     .page_mkwrite = orangefs_page_mkwrite,
0376 };
0377 
0378 /*
0379  * Memory map a region of a file.
0380  */
0381 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
0382 {
0383     int ret;
0384 
0385     ret = orangefs_revalidate_mapping(file_inode(file));
0386     if (ret)
0387         return ret;
0388 
0389     gossip_debug(GOSSIP_FILE_DEBUG,
0390              "orangefs_file_mmap: called on %pD\n", file);
0391 
0392     /* set the sequential readahead hint */
0393     vma->vm_flags |= VM_SEQ_READ;
0394     vma->vm_flags &= ~VM_RAND_READ;
0395 
0396     file_accessed(file);
0397     vma->vm_ops = &orangefs_file_vm_ops;
0398     return 0;
0399 }
0400 
0401 #define mapping_nrpages(idata) ((idata)->nrpages)
0402 
0403 /*
0404  * Called to notify the module that there are no more references to
0405  * this file (i.e. no processes have it open).
0406  *
0407  * \note Not called when each file is closed.
0408  */
0409 static int orangefs_file_release(struct inode *inode, struct file *file)
0410 {
0411     gossip_debug(GOSSIP_FILE_DEBUG,
0412              "orangefs_file_release: called on %pD\n",
0413              file);
0414 
0415     /*
0416      * remove all associated inode pages from the page cache and
0417      * readahead cache (if any); this forces an expensive refresh of
0418      * data for the next caller of mmap (or 'get_block' accesses)
0419      */
0420     if (file_inode(file) &&
0421         file_inode(file)->i_mapping &&
0422         mapping_nrpages(&file_inode(file)->i_data)) {
0423         if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
0424             gossip_debug(GOSSIP_INODE_DEBUG,
0425                 "calling flush_racache on %pU\n",
0426                 get_khandle_from_ino(inode));
0427             flush_racache(inode);
0428             gossip_debug(GOSSIP_INODE_DEBUG,
0429                 "flush_racache finished\n");
0430         }
0431 
0432     }
0433     return 0;
0434 }
0435 
0436 /*
0437  * Push all data for a specific file onto permanent storage.
0438  */
0439 static int orangefs_fsync(struct file *file,
0440                loff_t start,
0441                loff_t end,
0442                int datasync)
0443 {
0444     int ret;
0445     struct orangefs_inode_s *orangefs_inode =
0446         ORANGEFS_I(file_inode(file));
0447     struct orangefs_kernel_op_s *new_op = NULL;
0448 
0449     ret = filemap_write_and_wait_range(file_inode(file)->i_mapping,
0450         start, end);
0451     if (ret < 0)
0452         return ret;
0453 
0454     new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
0455     if (!new_op)
0456         return -ENOMEM;
0457     new_op->upcall.req.fsync.refn = orangefs_inode->refn;
0458 
0459     ret = service_operation(new_op,
0460             "orangefs_fsync",
0461             get_interruptible_flag(file_inode(file)));
0462 
0463     gossip_debug(GOSSIP_FILE_DEBUG,
0464              "orangefs_fsync got return value of %d\n",
0465              ret);
0466 
0467     op_release(new_op);
0468     return ret;
0469 }
0470 
0471 /*
0472  * Change the file pointer position for an instance of an open file.
0473  *
0474  * \note If .llseek is overriden, we must acquire lock as described in
0475  *       Documentation/filesystems/locking.rst.
0476  *
0477  * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
0478  * require much changes to the FS
0479  */
0480 static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
0481 {
0482     int ret = -EINVAL;
0483     struct inode *inode = file_inode(file);
0484 
0485     if (origin == SEEK_END) {
0486         /*
0487          * revalidate the inode's file size.
0488          * NOTE: We are only interested in file size here,
0489          * so we set mask accordingly.
0490          */
0491         ret = orangefs_inode_getattr(file->f_mapping->host,
0492             ORANGEFS_GETATTR_SIZE);
0493         if (ret == -ESTALE)
0494             ret = -EIO;
0495         if (ret) {
0496             gossip_debug(GOSSIP_FILE_DEBUG,
0497                      "%s:%s:%d calling make bad inode\n",
0498                      __FILE__,
0499                      __func__,
0500                      __LINE__);
0501             return ret;
0502         }
0503     }
0504 
0505     gossip_debug(GOSSIP_FILE_DEBUG,
0506              "orangefs_file_llseek: offset is %ld | origin is %d"
0507              " | inode size is %lu\n",
0508              (long)offset,
0509              origin,
0510              (unsigned long)i_size_read(inode));
0511 
0512     return generic_file_llseek(file, offset, origin);
0513 }
0514 
0515 /*
0516  * Support local locks (locks that only this kernel knows about)
0517  * if Orangefs was mounted -o local_lock.
0518  */
0519 static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
0520 {
0521     int rc = -EINVAL;
0522 
0523     if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
0524         if (cmd == F_GETLK) {
0525             rc = 0;
0526             posix_test_lock(filp, fl);
0527         } else {
0528             rc = posix_lock_file(filp, fl, NULL);
0529         }
0530     }
0531 
0532     return rc;
0533 }
0534 
0535 static int orangefs_flush(struct file *file, fl_owner_t id)
0536 {
0537     /*
0538      * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the
0539      * service_operation in orangefs_fsync.
0540      *
0541      * Do not send fsync to OrangeFS server on a close.  Do send fsync
0542      * on an explicit fsync call.  This duplicates historical OrangeFS
0543      * behavior.
0544      */
0545     int r;
0546 
0547     r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
0548     if (r > 0)
0549         return 0;
0550     else
0551         return r;
0552 }
0553 
0554 /** ORANGEFS implementation of VFS file operations */
0555 const struct file_operations orangefs_file_operations = {
0556     .llseek     = orangefs_file_llseek,
0557     .read_iter  = orangefs_file_read_iter,
0558     .write_iter = orangefs_file_write_iter,
0559     .lock       = orangefs_lock,
0560     .mmap       = orangefs_file_mmap,
0561     .open       = generic_file_open,
0562     .splice_read    = generic_file_splice_read,
0563     .splice_write   = iter_file_splice_write,
0564     .flush      = orangefs_flush,
0565     .release    = orangefs_file_release,
0566     .fsync      = orangefs_fsync,
0567 };