0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #include <linux/fs.h>
0011 #include <linux/vfs.h>
0012 #include <linux/pagemap.h>
0013 #include <linux/file.h>
0014 #include <linux/mm.h>
0015 #include <linux/sched/signal.h>
0016 #include <linux/khugepaged.h>
0017 #include <linux/syscalls.h>
0018 #include <linux/hugetlb.h>
0019 #include <linux/shmem_fs.h>
0020 #include <linux/memfd.h>
0021 #include <uapi/linux/memfd.h>
0022
0023
0024
0025
0026
0027
/*
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on
 * tmpfs or hugetlbfs because they are memory-only filesystems.
 */
#define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
#define LAST_SCAN               4       /* about 150ms max */

/*
 * Walk the whole mapping under xa_lock (IRQ-off) and tag every page
 * that carries references beyond those held by the page cache and by
 * mappings — i.e. pages pinned elsewhere (e.g. get_user_pages()).
 * The lock is dropped periodically to keep latency bounded.
 */
static void memfd_tag_pins(struct xa_state *xas)
{
	struct page *page;
	int latency = 0;	/* cache entries visited since last resched */
	int cache_count;

	/* Flush per-CPU LRU pagevecs so transient references are dropped. */
	lru_add_drain();

	xas_lock_irq(xas);
	xas_for_each(xas, page, ULONG_MAX) {
		/* A transparent-huge head page accounts for HPAGE_PMD_NR
		 * page-cache references; everything else counts as one. */
		cache_count = 1;
		if (!xa_is_value(page) &&
		    PageTransHuge(page) && !PageHuge(page))
			cache_count = HPAGE_PMD_NR;

		/* References not explained by the cache itself plus the
		 * mappings mean someone else holds the page pinned. */
		if (!xa_is_value(page) &&
		    page_count(page) - total_mapcount(page) != cache_count)
			xas_set_mark(xas, MEMFD_TAG_PINNED);
		/* Skip past the remaining subpages of a compound page. */
		if (cache_count != 1)
			xas_set(xas, page->index + cache_count);

		latency += cache_count;
		if (latency < XA_CHECK_SCHED)
			continue;
		latency = 0;

		/* Briefly drop the lock so other CPUs / IRQs can run. */
		xas_pause(xas);
		xas_unlock_irq(xas);
		cond_resched();
		xas_lock_irq(xas);
	}
	xas_unlock_irq(xas);
}
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer.
 * However, via get_user_pages(), drivers might have still pinned pages
 * after the seal was (apparently) successfully checked. So we tag all
 * pinned pages, then re-scan with exponential backoff, waiting for the
 * transient pins to drain.
 *
 * Returns 0 once no tagged page remains pinned, -EBUSY if pins persist
 * through the final scan.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, 0);
	struct page *page;
	int error, scan;

	memfd_tag_pins(&xas);

	error = 0;
	for (scan = 0; scan <= LAST_SCAN; scan++) {
		int latency = 0;	/* entries visited since last resched */
		int cache_count;

		/* All tags cleared on an earlier pass: nothing left pinned. */
		if (!xas_marked(&xas, MEMFD_TAG_PINNED))
			break;

		if (!scan)
			lru_add_drain_all();
		else if (schedule_timeout_killable((HZ << scan) / 200))
			scan = LAST_SCAN;	/* killed: force final verdict */

		/* Restart the walk from index 0 for this pass. */
		xas_set(&xas, 0);
		xas_lock_irq(&xas);
		xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
			bool clear = true;

			/* THP head pages account for HPAGE_PMD_NR cache refs. */
			cache_count = 1;
			if (!xa_is_value(page) &&
			    PageTransHuge(page) && !PageHuge(page))
				cache_count = HPAGE_PMD_NR;

			if (!xa_is_value(page) && cache_count !=
			    page_count(page) - total_mapcount(page)) {
				/*
				 * On the last scan, we clear the failing
				 * entries anyway, as we are done waiting:
				 * the caller gets -EBUSY and the tag must
				 * not be left behind for a later attempt.
				 */
				if (scan == LAST_SCAN)
					error = -EBUSY;
				else
					clear = false;
			}
			if (clear)
				xas_clear_mark(&xas, MEMFD_TAG_PINNED);

			latency += cache_count;
			if (latency < XA_CHECK_SCHED)
				continue;
			latency = 0;

			/* Bounded lock hold time, as in memfd_tag_pins(). */
			xas_pause(&xas);
			xas_unlock_irq(&xas);
			cond_resched();
			xas_lock_irq(&xas);
		}
		xas_unlock_irq(&xas);
	}

	return error;
}
0135
0136 static unsigned int *memfd_file_seals_ptr(struct file *file)
0137 {
0138 if (shmem_file(file))
0139 return &SHMEM_I(file_inode(file))->seals;
0140
0141 #ifdef CONFIG_HUGETLBFS
0142 if (is_file_hugepages(file))
0143 return &HUGETLBFS_I(file_inode(file))->seals;
0144 #endif
0145
0146 return NULL;
0147 }
0148
#define F_ALL_SEALS (F_SEAL_SEAL | \
		     F_SEAL_SHRINK | \
		     F_SEAL_GROW | \
		     F_SEAL_WRITE | \
		     F_SEAL_FUTURE_WRITE)

/*
 * Add the seals in @seals to the inode backing @file.
 *
 * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
 * but restrict access to a specific subset of file operations. Seals
 * can only be added, never removed, so once set a party can rely on
 * the sealed aspects never changing again. Currently supported seals:
 *
 *   SEAL_SEAL:   prevent further seals from being set on this file
 *   SEAL_SHRINK: prevent the file from shrinking
 *   SEAL_GROW:   prevent the file from growing
 *   SEAL_WRITE:  prevent write access to the file
 *   SEAL_FUTURE_WRITE: prevent future write access to the file
 *
 * Sealing is only supported on special memfd-style files (shmem or
 * hugetlbfs) and always applies to the whole underlying inode — not a
 * single file descriptor or file-table entry.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int memfd_add_seals(struct file *file, unsigned int seals)
{
	struct inode *inode = file_inode(file);
	unsigned int *file_seals;
	int error;

	/* Sealing requires write access to the underlying file. */
	if (!(file->f_mode & FMODE_WRITE))
		return -EPERM;
	/* Reject unknown seal bits. */
	if (seals & ~(unsigned int)F_ALL_SEALS)
		return -EINVAL;

	inode_lock(inode);

	file_seals = memfd_file_seals_ptr(file);
	if (!file_seals) {
		/* Not a shmem/hugetlbfs file: sealing unsupported. */
		error = -EINVAL;
		goto unlock;
	}

	if (*file_seals & F_SEAL_SEAL) {
		error = -EPERM;
		goto unlock;
	}

	if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
		/*
		 * Adding SEAL_WRITE: first block any new writable
		 * mappings, then wait for pages pinned elsewhere (e.g.
		 * via get_user_pages()) to drain before committing.
		 * On failure, undo the writable-mapping denial.
		 */
		error = mapping_deny_writable(file->f_mapping);
		if (error)
			goto unlock;

		error = memfd_wait_for_pins(file->f_mapping);
		if (error) {
			mapping_allow_writable(file->f_mapping);
			goto unlock;
		}
	}

	*file_seals |= seals;
	error = 0;

unlock:
	inode_unlock(inode);
	return error;
}
0229
0230 static int memfd_get_seals(struct file *file)
0231 {
0232 unsigned int *seals = memfd_file_seals_ptr(file);
0233
0234 return seals ? *seals : -EINVAL;
0235 }
0236
0237 long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
0238 {
0239 long error;
0240
0241 switch (cmd) {
0242 case F_ADD_SEALS:
0243
0244 if (arg > UINT_MAX)
0245 return -EINVAL;
0246
0247 error = memfd_add_seals(file, arg);
0248 break;
0249 case F_GET_SEALS:
0250 error = memfd_get_seals(file);
0251 break;
0252 default:
0253 error = -EINVAL;
0254 break;
0255 }
0256
0257 return error;
0258 }
0259
0260 #define MFD_NAME_PREFIX "memfd:"
0261 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
0262 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
0263
0264 #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
0265
0266 SYSCALL_DEFINE2(memfd_create,
0267 const char __user *, uname,
0268 unsigned int, flags)
0269 {
0270 unsigned int *file_seals;
0271 struct file *file;
0272 int fd, error;
0273 char *name;
0274 long len;
0275
0276 if (!(flags & MFD_HUGETLB)) {
0277 if (flags & ~(unsigned int)MFD_ALL_FLAGS)
0278 return -EINVAL;
0279 } else {
0280
0281 if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
0282 (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
0283 return -EINVAL;
0284 }
0285
0286
0287 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
0288 if (len <= 0)
0289 return -EFAULT;
0290 if (len > MFD_NAME_MAX_LEN + 1)
0291 return -EINVAL;
0292
0293 name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
0294 if (!name)
0295 return -ENOMEM;
0296
0297 strcpy(name, MFD_NAME_PREFIX);
0298 if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
0299 error = -EFAULT;
0300 goto err_name;
0301 }
0302
0303
0304 if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
0305 error = -EFAULT;
0306 goto err_name;
0307 }
0308
0309 fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
0310 if (fd < 0) {
0311 error = fd;
0312 goto err_name;
0313 }
0314
0315 if (flags & MFD_HUGETLB) {
0316 file = hugetlb_file_setup(name, 0, VM_NORESERVE,
0317 HUGETLB_ANONHUGE_INODE,
0318 (flags >> MFD_HUGE_SHIFT) &
0319 MFD_HUGE_MASK);
0320 } else
0321 file = shmem_file_setup(name, 0, VM_NORESERVE);
0322 if (IS_ERR(file)) {
0323 error = PTR_ERR(file);
0324 goto err_fd;
0325 }
0326 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
0327 file->f_flags |= O_LARGEFILE;
0328
0329 if (flags & MFD_ALLOW_SEALING) {
0330 file_seals = memfd_file_seals_ptr(file);
0331 *file_seals &= ~F_SEAL_SEAL;
0332 }
0333
0334 fd_install(fd, file);
0335 kfree(name);
0336 return fd;
0337
0338 err_fd:
0339 put_unused_fd(fd);
0340 err_name:
0341 kfree(name);
0342 return error;
0343 }