Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * memfd_create system call and file sealing support
0003  *
0004  * Code was originally included in shmem.c, and broken out to facilitate
0005  * use by hugetlbfs as well as tmpfs.
0006  *
0007  * This file is released under the GPL.
0008  */
0009 
0010 #include <linux/fs.h>
0011 #include <linux/vfs.h>
0012 #include <linux/pagemap.h>
0013 #include <linux/file.h>
0014 #include <linux/mm.h>
0015 #include <linux/sched/signal.h>
0016 #include <linux/khugepaged.h>
0017 #include <linux/syscalls.h>
0018 #include <linux/hugetlb.h>
0019 #include <linux/shmem_fs.h>
0020 #include <linux/memfd.h>
0021 #include <uapi/linux/memfd.h>
0022 
0023 /*
0024  * We need a tag: a new tag would expand every xa_node by 8 bytes,
0025  * so reuse a tag which we firmly believe is never set or cleared on tmpfs
0026  * or hugetlbfs because they are memory only filesystems.
0027  */
0028 #define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
0029 #define LAST_SCAN               4       /* about 150ms max */
0030 
/*
 * Walk the entire mapping and set MEMFD_TAG_PINNED on every entry whose
 * page appears pinned, i.e. whose refcount exceeds its total mapcount
 * plus the expected number of page-cache references.  The tags are
 * consumed later by memfd_wait_for_pins().
 */
static void memfd_tag_pins(struct xa_state *xas)
{
    struct page *page;
    int latency = 0;    /* work since we last dropped the lock to resched */
    int cache_count;

    /* Flush per-CPU LRU pagevecs so their transient page refs are dropped. */
    lru_add_drain();

    xas_lock_irq(xas);
    xas_for_each(xas, page, ULONG_MAX) {
        /*
         * A transparent huge page is one xarray entry but is accounted
         * as HPAGE_PMD_NR cache references; hugetlbfs pages are not.
         */
        cache_count = 1;
        if (!xa_is_value(page) &&
            PageTransHuge(page) && !PageHuge(page))
            cache_count = HPAGE_PMD_NR;

        /* Any reference beyond mapcount + cache refs counts as a pin. */
        if (!xa_is_value(page) &&
            page_count(page) - total_mapcount(page) != cache_count)
            xas_set_mark(xas, MEMFD_TAG_PINNED);
        /* Jump past the index range covered by a huge-page entry. */
        if (cache_count != 1)
            xas_set(xas, page->index + cache_count);

        latency += cache_count;
        if (latency < XA_CHECK_SCHED)
            continue;
        latency = 0;

        /* Periodically drop the irq lock so we can reschedule. */
        xas_pause(xas);
        xas_unlock_irq(xas);
        cond_resched();
        xas_lock_irq(xas);
    }
    xas_unlock_irq(xas);
}
0064 
/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
 * and see whether it has an elevated ref-count. If so, we tag them and wait for
 * them to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those pages to avoid races.
 *
 * Returns 0 once no pinned pages remain, or -EBUSY if pages were still
 * pinned after the final scan (tags are cleaned up in that case too).
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
    XA_STATE(xas, &mapping->i_pages, 0);
    struct page *page;
    int error, scan;

    /* Mark every currently-pinned entry with MEMFD_TAG_PINNED. */
    memfd_tag_pins(&xas);

    error = 0;
    for (scan = 0; scan <= LAST_SCAN; scan++) {
        int latency = 0;    /* work since we last dropped the lock */
        int cache_count;

        /* No tags left means every pin has been released. */
        if (!xas_marked(&xas, MEMFD_TAG_PINNED))
            break;

        if (!scan)
            lru_add_drain_all();
        else if (schedule_timeout_killable((HZ << scan) / 200))
            /* Fatal signal: force this to be the last iteration. */
            scan = LAST_SCAN;

        /* Restart the walk from index 0 for this scan. */
        xas_set(&xas, 0);
        xas_lock_irq(&xas);
        xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
            bool clear = true;

            /* Same THP cache-reference accounting as memfd_tag_pins(). */
            cache_count = 1;
            if (!xa_is_value(page) &&
                PageTransHuge(page) && !PageHuge(page))
                cache_count = HPAGE_PMD_NR;

            if (!xa_is_value(page) && cache_count !=
                page_count(page) - total_mapcount(page)) {
                /*
                 * On the last scan, we clean up all those tags
                 * we inserted; but make a note that we still
                 * found pages pinned.
                 */
                if (scan == LAST_SCAN)
                    error = -EBUSY;
                else
                    clear = false;
            }
            if (clear)
                xas_clear_mark(&xas, MEMFD_TAG_PINNED);

            latency += cache_count;
            if (latency < XA_CHECK_SCHED)
                continue;
            latency = 0;

            /* Drop the irq lock periodically to allow rescheduling. */
            xas_pause(&xas);
            xas_unlock_irq(&xas);
            cond_resched();
            xas_lock_irq(&xas);
        }
        xas_unlock_irq(&xas);
    }

    return error;
}
0135 
0136 static unsigned int *memfd_file_seals_ptr(struct file *file)
0137 {
0138     if (shmem_file(file))
0139         return &SHMEM_I(file_inode(file))->seals;
0140 
0141 #ifdef CONFIG_HUGETLBFS
0142     if (is_file_hugepages(file))
0143         return &HUGETLBFS_I(file_inode(file))->seals;
0144 #endif
0145 
0146     return NULL;
0147 }
0148 
0149 #define F_ALL_SEALS (F_SEAL_SEAL | \
0150              F_SEAL_SHRINK | \
0151              F_SEAL_GROW | \
0152              F_SEAL_WRITE | \
0153              F_SEAL_FUTURE_WRITE)
0154 
/*
 * Add @seals to the seal set of @file's inode.
 * Returns 0 on success, -EPERM if the file is not writable or already
 * sealed against sealing, -EINVAL for unknown seals or unsealable files,
 * and -EBUSY if SEAL_WRITE is requested while pages remain pinned.
 */
static int memfd_add_seals(struct file *file, unsigned int seals)
{
    struct inode *inode = file_inode(file);
    unsigned int *file_seals;
    int error;

    /*
     * SEALING
     * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
     * but restrict access to a specific subset of file operations. Seals
     * can only be added, but never removed. This way, mutually untrusted
     * parties can share common memory regions with a well-defined policy.
     * A malicious peer can thus never perform unwanted operations on a
     * shared object.
     *
     * Seals are only supported on special tmpfs or hugetlbfs files and
     * always affect the whole underlying inode. Once a seal is set, it
     * may prevent some kinds of access to the file. Currently, the
     * following seals are defined:
     *   SEAL_SEAL: Prevent further seals from being set on this file
     *   SEAL_SHRINK: Prevent the file from shrinking
     *   SEAL_GROW: Prevent the file from growing
     *   SEAL_WRITE: Prevent write access to the file
     *   SEAL_FUTURE_WRITE: accepted here as a valid seal (part of
     *       F_ALL_SEALS) but enforced in the write/mmap paths, not in
     *       this function
     *
     * As we don't require any trust relationship between two parties, we
     * must prevent seals from being removed. Therefore, sealing a file
     * only adds a given set of seals to the file, it never touches
     * existing seals. Furthermore, the "setting seals"-operation can be
     * sealed itself, which basically prevents any further seal from being
     * added.
     *
     * Semantics of sealing are only defined on volatile files. Only
     * anonymous tmpfs and hugetlbfs files support sealing. More
     * importantly, seals are never written to disk. Therefore, there's
     * no plan to support it on other file types.
     */

    /* Only a writable fd may add seals. */
    if (!(file->f_mode & FMODE_WRITE))
        return -EPERM;
    /* Reject any bit outside the defined seal set. */
    if (seals & ~(unsigned int)F_ALL_SEALS)
        return -EINVAL;

    /* Serialize against concurrent seal updates on the same inode. */
    inode_lock(inode);

    file_seals = memfd_file_seals_ptr(file);
    if (!file_seals) {
        error = -EINVAL;
        goto unlock;
    }

    if (*file_seals & F_SEAL_SEAL) {
        error = -EPERM;
        goto unlock;
    }

    if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
        /*
         * Block new writable mappings first, then wait for any
         * remaining elevated page refs (e.g. in-flight direct-IO)
         * to drain.  Roll back the deny on failure.
         */
        error = mapping_deny_writable(file->f_mapping);
        if (error)
            goto unlock;

        error = memfd_wait_for_pins(file->f_mapping);
        if (error) {
            mapping_allow_writable(file->f_mapping);
            goto unlock;
        }
    }

    *file_seals |= seals;
    error = 0;

unlock:
    inode_unlock(inode);
    return error;
}
0229 
0230 static int memfd_get_seals(struct file *file)
0231 {
0232     unsigned int *seals = memfd_file_seals_ptr(file);
0233 
0234     return seals ? *seals : -EINVAL;
0235 }
0236 
0237 long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
0238 {
0239     long error;
0240 
0241     switch (cmd) {
0242     case F_ADD_SEALS:
0243         /* disallow upper 32bit */
0244         if (arg > UINT_MAX)
0245             return -EINVAL;
0246 
0247         error = memfd_add_seals(file, arg);
0248         break;
0249     case F_GET_SEALS:
0250         error = memfd_get_seals(file);
0251         break;
0252     default:
0253         error = -EINVAL;
0254         break;
0255     }
0256 
0257     return error;
0258 }
0259 
0260 #define MFD_NAME_PREFIX "memfd:"
0261 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
0262 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
0263 
0264 #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
0265 
0266 SYSCALL_DEFINE2(memfd_create,
0267         const char __user *, uname,
0268         unsigned int, flags)
0269 {
0270     unsigned int *file_seals;
0271     struct file *file;
0272     int fd, error;
0273     char *name;
0274     long len;
0275 
0276     if (!(flags & MFD_HUGETLB)) {
0277         if (flags & ~(unsigned int)MFD_ALL_FLAGS)
0278             return -EINVAL;
0279     } else {
0280         /* Allow huge page size encoding in flags. */
0281         if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
0282                 (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
0283             return -EINVAL;
0284     }
0285 
0286     /* length includes terminating zero */
0287     len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
0288     if (len <= 0)
0289         return -EFAULT;
0290     if (len > MFD_NAME_MAX_LEN + 1)
0291         return -EINVAL;
0292 
0293     name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
0294     if (!name)
0295         return -ENOMEM;
0296 
0297     strcpy(name, MFD_NAME_PREFIX);
0298     if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
0299         error = -EFAULT;
0300         goto err_name;
0301     }
0302 
0303     /* terminating-zero may have changed after strnlen_user() returned */
0304     if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
0305         error = -EFAULT;
0306         goto err_name;
0307     }
0308 
0309     fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
0310     if (fd < 0) {
0311         error = fd;
0312         goto err_name;
0313     }
0314 
0315     if (flags & MFD_HUGETLB) {
0316         file = hugetlb_file_setup(name, 0, VM_NORESERVE,
0317                     HUGETLB_ANONHUGE_INODE,
0318                     (flags >> MFD_HUGE_SHIFT) &
0319                     MFD_HUGE_MASK);
0320     } else
0321         file = shmem_file_setup(name, 0, VM_NORESERVE);
0322     if (IS_ERR(file)) {
0323         error = PTR_ERR(file);
0324         goto err_fd;
0325     }
0326     file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
0327     file->f_flags |= O_LARGEFILE;
0328 
0329     if (flags & MFD_ALLOW_SEALING) {
0330         file_seals = memfd_file_seals_ptr(file);
0331         *file_seals &= ~F_SEAL_SEAL;
0332     }
0333 
0334     fd_install(fd, file);
0335     kfree(name);
0336     return fd;
0337 
0338 err_fd:
0339     put_unused_fd(fd);
0340 err_name:
0341     kfree(name);
0342     return error;
0343 }