// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Virtual memory mapping code: the mmap()/brk()/munmap() entry points, VMA
 * lookup, merging and splitting, and the augmented rbtree bookkeeping
 * (rb_subtree_gap) used to find free address-space gaps.
 */
0010 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0011
0012 #include <linux/kernel.h>
0013 #include <linux/slab.h>
0014 #include <linux/backing-dev.h>
0015 #include <linux/mm.h>
0016 #include <linux/mm_inline.h>
0017 #include <linux/vmacache.h>
0018 #include <linux/shm.h>
0019 #include <linux/mman.h>
0020 #include <linux/pagemap.h>
0021 #include <linux/swap.h>
0022 #include <linux/syscalls.h>
0023 #include <linux/capability.h>
0024 #include <linux/init.h>
0025 #include <linux/file.h>
0026 #include <linux/fs.h>
0027 #include <linux/personality.h>
0028 #include <linux/security.h>
0029 #include <linux/hugetlb.h>
0030 #include <linux/shmem_fs.h>
0031 #include <linux/profile.h>
0032 #include <linux/export.h>
0033 #include <linux/mount.h>
0034 #include <linux/mempolicy.h>
0035 #include <linux/rmap.h>
0036 #include <linux/mmu_notifier.h>
0037 #include <linux/mmdebug.h>
0038 #include <linux/perf_event.h>
0039 #include <linux/audit.h>
0040 #include <linux/khugepaged.h>
0041 #include <linux/uprobes.h>
0042 #include <linux/rbtree_augmented.h>
0043 #include <linux/notifier.h>
0044 #include <linux/memory.h>
0045 #include <linux/printk.h>
0046 #include <linux/userfaultfd_k.h>
0047 #include <linux/moduleparam.h>
0048 #include <linux/pkeys.h>
0049 #include <linux/oom.h>
0050 #include <linux/sched/mm.h>
0051
0052 #include <linux/uaccess.h>
0053 #include <asm/cacheflush.h>
0054 #include <asm/tlb.h>
0055 #include <asm/mmu_context.h>
0056
0057 #define CREATE_TRACE_POINTS
0058 #include <trace/events/mmap.h>
0059
0060 #include "internal.h"
0061
0062 #ifndef arch_mmap_check
0063 #define arch_mmap_check(addr, len, flags) (0)
0064 #endif
0065
0066 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
0067 const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
0068 const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
0069 int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
0070 #endif
0071 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
0072 const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
0073 const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
0074 int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
0075 #endif
0076
0077 static bool ignore_rlimit_data;
0078 core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
0079
0080 static void unmap_region(struct mm_struct *mm,
0081 struct vm_area_struct *vma, struct vm_area_struct *prev,
0082 unsigned long start, unsigned long end);
0083
0084 static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
0085 {
0086 return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
0087 }

/*
 * Update vma->vm_page_prot to reflect vma->vm_flags, dropping write
 * permission from the cached protection when write-notification is
 * wanted on a shared mapping (see vma_wants_writenotify()).
 */
0090 void vma_set_page_prot(struct vm_area_struct *vma)
0091 {
0092 unsigned long vm_flags = vma->vm_flags;
0093 pgprot_t vm_page_prot;
0094
0095 vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
0096 if (vma_wants_writenotify(vma, vm_page_prot)) {
0097 vm_flags &= ~VM_SHARED;
0098 vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
0099 }
0100
0101 WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
0102 }

/*
 * Remove @vma from the file's i_mmap interval tree.  The caller must
 * hold the mapping's i_mmap_rwsem for writing.
 */
0107 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
0108 struct file *file, struct address_space *mapping)
0109 {
0110 if (vma->vm_flags & VM_SHARED)
0111 mapping_unmap_writable(mapping);
0112
0113 flush_dcache_mmap_lock(mapping);
0114 vma_interval_tree_remove(vma, &mapping->i_mmap);
0115 flush_dcache_mmap_unlock(mapping);
0116 }

/*
 * Unlink a file-based vm structure from its interval tree, to hide the
 * vma from rmap and vmtruncate before freeing its page tables.
 */
0122 void unlink_file_vma(struct vm_area_struct *vma)
0123 {
0124 struct file *file = vma->vm_file;
0125
0126 if (file) {
0127 struct address_space *mapping = file->f_mapping;
0128 i_mmap_lock_write(mapping);
0129 __remove_shared_vm_struct(vma, file, mapping);
0130 i_mmap_unlock_write(mapping);
0131 }
0132 }

/*
 * Close a vm structure and free it, returning the next vma in the list.
 */
0137 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
0138 {
0139 struct vm_area_struct *next = vma->vm_next;
0140
0141 might_sleep();
0142 if (vma->vm_ops && vma->vm_ops->close)
0143 vma->vm_ops->close(vma);
0144 if (vma->vm_file)
0145 fput(vma->vm_file);
0146 mpol_put(vma_policy(vma));
0147 vm_area_free(vma);
0148 return next;
0149 }
0150
0151 static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
0152 struct list_head *uf);
0153 SYSCALL_DEFINE1(brk, unsigned long, brk)
0154 {
0155 unsigned long newbrk, oldbrk, origbrk;
0156 struct mm_struct *mm = current->mm;
0157 struct vm_area_struct *next;
0158 unsigned long min_brk;
0159 bool populate;
0160 bool downgraded = false;
0161 LIST_HEAD(uf);
0162
0163 if (mmap_write_lock_killable(mm))
0164 return -EINTR;
0165
0166 origbrk = mm->brk;
0167
0168 #ifdef CONFIG_COMPAT_BRK
	/*
	 * CONFIG_COMPAT_BRK can still be overridden by setting
	 * randomize_va_space to 2, which will still cause mm->start_brk
	 * to be arbitrarily shifted.
	 */
0174 if (current->brk_randomized)
0175 min_brk = mm->start_brk;
0176 else
0177 min_brk = mm->end_data;
0178 #else
0179 min_brk = mm->start_brk;
0180 #endif
0181 if (brk < min_brk)
0182 goto out;

	/*
	 * Check against rlimit here.  If this check were done later, after
	 * the oldbrk/newbrk comparison, a request that stays within the same
	 * page could let the data segment slip past a limit that has been
	 * lowered in the meantime.
	 */
0190 if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
0191 mm->end_data, mm->start_data))
0192 goto out;
0193
0194 newbrk = PAGE_ALIGN(brk);
0195 oldbrk = PAGE_ALIGN(mm->brk);
0196 if (oldbrk == newbrk) {
0197 mm->brk = brk;
0198 goto success;
0199 }

	/*
	 * Always allow shrinking brk.
	 * __do_munmap() may downgrade mmap_lock to read.
	 */
0205 if (brk <= mm->brk) {
0206 int ret;

		/*
		 * mm->brk must be protected by write mmap_lock, so update it
		 * before downgrading the lock.  When __do_munmap() fails,
		 * mm->brk will be restored from origbrk.
		 */
0213 mm->brk = brk;
0214 ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
0215 if (ret < 0) {
0216 mm->brk = origbrk;
0217 goto out;
0218 } else if (ret == 1) {
0219 downgraded = true;
0220 }
0221 goto success;
0222 }

	/* Check against existing mmap mappings (and the guard gap of the next vma). */
0225 next = find_vma(mm, oldbrk);
0226 if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
0227 goto out;

	/* Ok, looks good - let it rip. */
0230 if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
0231 goto out;
0232 mm->brk = brk;
0233
0234 success:
0235 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
0236 if (downgraded)
0237 mmap_read_unlock(mm);
0238 else
0239 mmap_write_unlock(mm);
0240 userfaultfd_unmap_complete(mm, &uf);
0241 if (populate)
0242 mm_populate(oldbrk, newbrk - oldbrk);
0243 return brk;
0244
0245 out:
0246 mmap_write_unlock(mm);
0247 return origbrk;
0248 }
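
/*
 * Illustrative userspace view of the syscall above (a sketch, not kernel
 * code; sbrk() is the usual libc interface built on top of brk):
 *
 *	unsigned long old = (unsigned long)sbrk(0);	// current break
 *	brk((void *)(old + 4096));	// grows the brk vma via do_brk_flags()
 *	brk((void *)old);		// shrinks it again via __do_munmap()
 *
 * Note that on failure the syscall returns the unchanged break (origbrk)
 * rather than a negative errno.
 */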
0249
0250 static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
0251 {
0252 unsigned long gap, prev_end;
	/*
	 * Usable gap between this vma and the previous one, with the stack
	 * guard gaps already excluded via vm_start_gap()/vm_end_gap().  Note
	 * that a VM_GROWSDOWN vma sitting above a VM_GROWSUP one is charged
	 * two guard gaps here, even though expansion only ever requires one.
	 */
0260 gap = vm_start_gap(vma);
0261 if (vma->vm_prev) {
0262 prev_end = vm_end_gap(vma->vm_prev);
0263 if (gap > prev_end)
0264 gap -= prev_end;
0265 else
0266 gap = 0;
0267 }
0268 return gap;
0269 }
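
/*
 * Worked example for the helper above (hypothetical addresses): if the
 * previous vma ends at 0x7f0000200000 and this vma starts at
 * 0x7f0000600000, and neither side needs a stack guard gap, the usable
 * gap recorded for this node is 0x400000 bytes.  vm_start_gap() and
 * vm_end_gap() shrink that figure when either vma is a growing stack.
 */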
0270
0271 #ifdef CONFIG_DEBUG_VM_RB
0272 static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
0273 {
0274 unsigned long max = vma_compute_gap(vma), subtree_gap;
0275 if (vma->vm_rb.rb_left) {
0276 subtree_gap = rb_entry(vma->vm_rb.rb_left,
0277 struct vm_area_struct, vm_rb)->rb_subtree_gap;
0278 if (subtree_gap > max)
0279 max = subtree_gap;
0280 }
0281 if (vma->vm_rb.rb_right) {
0282 subtree_gap = rb_entry(vma->vm_rb.rb_right,
0283 struct vm_area_struct, vm_rb)->rb_subtree_gap;
0284 if (subtree_gap > max)
0285 max = subtree_gap;
0286 }
0287 return max;
0288 }
0289
0290 static int browse_rb(struct mm_struct *mm)
0291 {
0292 struct rb_root *root = &mm->mm_rb;
0293 int i = 0, j, bug = 0;
0294 struct rb_node *nd, *pn = NULL;
0295 unsigned long prev = 0, pend = 0;
0296
0297 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
0298 struct vm_area_struct *vma;
0299 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
0300 if (vma->vm_start < prev) {
0301 pr_emerg("vm_start %lx < prev %lx\n",
0302 vma->vm_start, prev);
0303 bug = 1;
0304 }
0305 if (vma->vm_start < pend) {
0306 pr_emerg("vm_start %lx < pend %lx\n",
0307 vma->vm_start, pend);
0308 bug = 1;
0309 }
0310 if (vma->vm_start > vma->vm_end) {
0311 pr_emerg("vm_start %lx > vm_end %lx\n",
0312 vma->vm_start, vma->vm_end);
0313 bug = 1;
0314 }
0315 spin_lock(&mm->page_table_lock);
0316 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
0317 pr_emerg("free gap %lx, correct %lx\n",
0318 vma->rb_subtree_gap,
0319 vma_compute_subtree_gap(vma));
0320 bug = 1;
0321 }
0322 spin_unlock(&mm->page_table_lock);
0323 i++;
0324 pn = nd;
0325 prev = vma->vm_start;
0326 pend = vma->vm_end;
0327 }
0328 j = 0;
0329 for (nd = pn; nd; nd = rb_prev(nd))
0330 j++;
0331 if (i != j) {
0332 pr_emerg("backwards %d, forwards %d\n", j, i);
0333 bug = 1;
0334 }
0335 return bug ? -1 : i;
0336 }
0337
0338 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
0339 {
0340 struct rb_node *nd;
0341
0342 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
0343 struct vm_area_struct *vma;
0344 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
0345 VM_BUG_ON_VMA(vma != ignore &&
0346 vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
0347 vma);
0348 }
0349 }
0350
0351 static void validate_mm(struct mm_struct *mm)
0352 {
0353 int bug = 0;
0354 int i = 0;
0355 unsigned long highest_address = 0;
0356 struct vm_area_struct *vma = mm->mmap;
0357
0358 while (vma) {
0359 struct anon_vma *anon_vma = vma->anon_vma;
0360 struct anon_vma_chain *avc;
0361
0362 if (anon_vma) {
0363 anon_vma_lock_read(anon_vma);
0364 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
0365 anon_vma_interval_tree_verify(avc);
0366 anon_vma_unlock_read(anon_vma);
0367 }
0368
0369 highest_address = vm_end_gap(vma);
0370 vma = vma->vm_next;
0371 i++;
0372 }
0373 if (i != mm->map_count) {
0374 pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
0375 bug = 1;
0376 }
0377 if (highest_address != mm->highest_vm_end) {
0378 pr_emerg("mm->highest_vm_end %lx, found %lx\n",
0379 mm->highest_vm_end, highest_address);
0380 bug = 1;
0381 }
0382 i = browse_rb(mm);
0383 if (i != mm->map_count) {
0384 if (i != -1)
0385 pr_emerg("map_count %d rb %d\n", mm->map_count, i);
0386 bug = 1;
0387 }
0388 VM_BUG_ON_MM(bug, mm);
0389 }
0390 #else
0391 #define validate_mm_rb(root, ignore) do { } while (0)
0392 #define validate_mm(mm) do { } while (0)
0393 #endif
0394
0395 RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
0396 struct vm_area_struct, vm_rb,
0397 unsigned long, rb_subtree_gap, vma_compute_gap)
0398
0399
0400
0401
0402
0403
0404 static void vma_gap_update(struct vm_area_struct *vma)
0405 {
0406
0407
0408
0409
0410 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
0411 }
0412
0413 static inline void vma_rb_insert(struct vm_area_struct *vma,
0414 struct rb_root *root)
0415 {
0416
0417 validate_mm_rb(root, NULL);
0418
0419 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
0420 }
0421
0422 static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
0423 {
0424
0425
0426
0427
0428
0429 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
0430 }
0431
0432 static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
0433 struct rb_root *root,
0434 struct vm_area_struct *ignore)
0435 {
0436
0437
0438
0439
0440
0441
0442
0443
0444
0445 validate_mm_rb(root, ignore);
0446
0447 __vma_rb_erase(vma, root);
0448 }
0449
0450 static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
0451 struct rb_root *root)
0452 {
0453 vma_rb_erase_ignore(vma, root, vma);
0454 }

/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start, vm_end or vm_pgoff, the vma must
 * be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_lock and by
 * the root anon_vma's mutex.
 */
0470 static inline void
0471 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
0472 {
0473 struct anon_vma_chain *avc;
0474
0475 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
0476 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
0477 }
0478
0479 static inline void
0480 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
0481 {
0482 struct anon_vma_chain *avc;
0483
0484 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
0485 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);

/*
 * Find the place in the rbtree where a mapping of [addr, end) would go.
 * On success *pprev points at the vma preceding addr (or NULL), and
 * *rb_link / *rb_parent identify the empty slot to link a new vma into.
 * Returns -ENOMEM if the range overlaps an existing vma.
 */
0488 static int find_vma_links(struct mm_struct *mm, unsigned long addr,
0489 unsigned long end, struct vm_area_struct **pprev,
0490 struct rb_node ***rb_link, struct rb_node **rb_parent)
0491 {
0492 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
0493
0494 mmap_assert_locked(mm);
0495 __rb_link = &mm->mm_rb.rb_node;
0496 rb_prev = __rb_parent = NULL;
0497
0498 while (*__rb_link) {
0499 struct vm_area_struct *vma_tmp;
0500
0501 __rb_parent = *__rb_link;
0502 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
0503
0504 if (vma_tmp->vm_end > addr) {
0505
0506 if (vma_tmp->vm_start < end)
0507 return -ENOMEM;
0508 __rb_link = &__rb_parent->rb_left;
0509 } else {
0510 rb_prev = __rb_parent;
0511 __rb_link = &__rb_parent->rb_right;
0512 }
0513 }
0514
0515 *pprev = NULL;
0516 if (rb_prev)
0517 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
0518 *rb_link = __rb_link;
0519 *rb_parent = __rb_parent;
0520 return 0;
0521 }

/*
 * vma_next() - Get the next VMA.
 * @mm: The mm_struct.
 * @vma: The current vma.
 *
 * If @vma is NULL, return the first vma in the mm.
 *
 * Returns: The next VMA after @vma.
 */
0532 static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
0533 struct vm_area_struct *vma)
0534 {
0535 if (!vma)
0536 return mm->mmap;
0537
0538 return vma->vm_next;
0539 }

/*
 * munmap_vma_range() - munmap VMAs that overlap a range.
 * @mm: The mm struct
 * @start: The start of the range.
 * @len: The length of the range.
 * @pprev: pointer to the pointer that will be set to the previous vma
 * @link: the rb_node slot for a later insertion
 * @parent: the parent rb_node for a later insertion
 * @uf: the userfaultfd unmap list
 *
 * Unmap everything that overlaps [start, start + len) and leave *pprev,
 * *link and *parent ready for linking a new vma into the resulting hole.
 *
 * Returns: -ENOMEM on munmap failure or 0 on success.
 */
0555 static inline int
0556 munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
0557 struct vm_area_struct **pprev, struct rb_node ***link,
0558 struct rb_node **parent, struct list_head *uf)
0559 {
0560
0561 while (find_vma_links(mm, start, start + len, pprev, link, parent))
0562 if (do_munmap(mm, start, len, uf))
0563 return -ENOMEM;
0564
0565 return 0;
0566 }
0567 static unsigned long count_vma_pages_range(struct mm_struct *mm,
0568 unsigned long addr, unsigned long end)
0569 {
0570 unsigned long nr_pages = 0;
0571 struct vm_area_struct *vma;
0572
0573
0574 vma = find_vma_intersection(mm, addr, end);
0575 if (!vma)
0576 return 0;
0577
0578 nr_pages = (min(end, vma->vm_end) -
0579 max(addr, vma->vm_start)) >> PAGE_SHIFT;
0580
0581
0582 for (vma = vma->vm_next; vma; vma = vma->vm_next) {
0583 unsigned long overlap_len;
0584
0585 if (vma->vm_start > end)
0586 break;
0587
0588 overlap_len = min(end, vma->vm_end) - vma->vm_start;
0589 nr_pages += overlap_len >> PAGE_SHIFT;
0590 }
0591
0592 return nr_pages;
0593 }
0594
0595 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
0596 struct rb_node **rb_link, struct rb_node *rb_parent)
0597 {
0598
0599 if (vma->vm_next)
0600 vma_gap_update(vma->vm_next);
0601 else
0602 mm->highest_vm_end = vm_end_gap(vma);
0603
0604
0605
0606
0607
0608
0609
0610
0611
0612
0613 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
0614 vma->rb_subtree_gap = 0;
0615 vma_gap_update(vma);
0616 vma_rb_insert(vma, &mm->mm_rb);
0617 }
0618
0619 static void __vma_link_file(struct vm_area_struct *vma)
0620 {
0621 struct file *file;
0622
0623 file = vma->vm_file;
0624 if (file) {
0625 struct address_space *mapping = file->f_mapping;
0626
0627 if (vma->vm_flags & VM_SHARED)
0628 mapping_allow_writable(mapping);
0629
0630 flush_dcache_mmap_lock(mapping);
0631 vma_interval_tree_insert(vma, &mapping->i_mmap);
0632 flush_dcache_mmap_unlock(mapping);
0633 }
0634 }
0635
0636 static void
0637 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
0638 struct vm_area_struct *prev, struct rb_node **rb_link,
0639 struct rb_node *rb_parent)
0640 {
0641 __vma_link_list(mm, vma, prev);
0642 __vma_link_rb(mm, vma, rb_link, rb_parent);
0643 }
0644
0645 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
0646 struct vm_area_struct *prev, struct rb_node **rb_link,
0647 struct rb_node *rb_parent)
0648 {
0649 struct address_space *mapping = NULL;
0650
0651 if (vma->vm_file) {
0652 mapping = vma->vm_file->f_mapping;
0653 i_mmap_lock_write(mapping);
0654 }
0655
0656 __vma_link(mm, vma, prev, rb_link, rb_parent);
0657 __vma_link_file(vma);
0658
0659 if (mapping)
0660 i_mmap_unlock_write(mapping);
0661
0662 mm->map_count++;
0663 validate_mm(mm);
0664 }
0665
0666
0667
0668
0669
0670 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
0671 {
0672 struct vm_area_struct *prev;
0673 struct rb_node **rb_link, *rb_parent;
0674
0675 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
0676 &prev, &rb_link, &rb_parent))
0677 BUG();
0678 __vma_link(mm, vma, prev, rb_link, rb_parent);
0679 mm->map_count++;
0680 }
0681
0682 static __always_inline void __vma_unlink(struct mm_struct *mm,
0683 struct vm_area_struct *vma,
0684 struct vm_area_struct *ignore)
0685 {
0686 vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
0687 __vma_unlink_list(mm, vma);
0688
0689 vmacache_invalidate(mm);
0690 }

/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 */
0699 int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
0700 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
0701 struct vm_area_struct *expand)
0702 {
0703 struct mm_struct *mm = vma->vm_mm;
0704 struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
0705 struct address_space *mapping = NULL;
0706 struct rb_root_cached *root = NULL;
0707 struct anon_vma *anon_vma = NULL;
0708 struct file *file = vma->vm_file;
0709 bool start_changed = false, end_changed = false;
0710 long adjust_next = 0;
0711 int remove_next = 0;
0712
0713 if (next && !insert) {
0714 struct vm_area_struct *exporter = NULL, *importer = NULL;
0715
0716 if (end >= next->vm_end) {
0717
0718
0719
0720
0721
0722
0723 if (next == expand) {
0724
0725
0726
0727
0728 VM_WARN_ON(end != next->vm_end);
0729
0730
0731
0732
0733
0734 remove_next = 3;
0735 VM_WARN_ON(file != next->vm_file);
0736 swap(vma, next);
0737 } else {
0738 VM_WARN_ON(expand != vma);
0739
0740
0741
0742
0743 remove_next = 1 + (end > next->vm_end);
0744 VM_WARN_ON(remove_next == 2 &&
0745 end != next->vm_next->vm_end);
0746
0747 end = next->vm_end;
0748 }
0749
0750 exporter = next;
0751 importer = vma;
0752
0753
0754
0755
0756
0757 if (remove_next == 2 && !next->anon_vma)
0758 exporter = next->vm_next;
0759
0760 } else if (end > next->vm_start) {
0761
0762
0763
0764
0765 adjust_next = (end - next->vm_start);
0766 exporter = next;
0767 importer = vma;
0768 VM_WARN_ON(expand != importer);
0769 } else if (end < vma->vm_end) {
0770
0771
0772
0773
0774
0775 adjust_next = -(vma->vm_end - end);
0776 exporter = vma;
0777 importer = next;
0778 VM_WARN_ON(expand != importer);
0779 }
0780
0781
0782
0783
0784
0785
0786 if (exporter && exporter->anon_vma && !importer->anon_vma) {
0787 int error;
0788
0789 importer->anon_vma = exporter->anon_vma;
0790 error = anon_vma_clone(importer, exporter);
0791 if (error)
0792 return error;
0793 }
0794 }
0795 again:
0796 vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
0797
0798 if (file) {
0799 mapping = file->f_mapping;
0800 root = &mapping->i_mmap;
0801 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
0802
0803 if (adjust_next)
0804 uprobe_munmap(next, next->vm_start, next->vm_end);
0805
0806 i_mmap_lock_write(mapping);
0807 if (insert) {
0808
0809
0810
0811
0812
0813
0814 __vma_link_file(insert);
0815 }
0816 }
0817
0818 anon_vma = vma->anon_vma;
0819 if (!anon_vma && adjust_next)
0820 anon_vma = next->anon_vma;
0821 if (anon_vma) {
0822 VM_WARN_ON(adjust_next && next->anon_vma &&
0823 anon_vma != next->anon_vma);
0824 anon_vma_lock_write(anon_vma);
0825 anon_vma_interval_tree_pre_update_vma(vma);
0826 if (adjust_next)
0827 anon_vma_interval_tree_pre_update_vma(next);
0828 }
0829
0830 if (file) {
0831 flush_dcache_mmap_lock(mapping);
0832 vma_interval_tree_remove(vma, root);
0833 if (adjust_next)
0834 vma_interval_tree_remove(next, root);
0835 }
0836
0837 if (start != vma->vm_start) {
0838 vma->vm_start = start;
0839 start_changed = true;
0840 }
0841 if (end != vma->vm_end) {
0842 vma->vm_end = end;
0843 end_changed = true;
0844 }
0845 vma->vm_pgoff = pgoff;
0846 if (adjust_next) {
0847 next->vm_start += adjust_next;
0848 next->vm_pgoff += adjust_next >> PAGE_SHIFT;
0849 }
0850
0851 if (file) {
0852 if (adjust_next)
0853 vma_interval_tree_insert(next, root);
0854 vma_interval_tree_insert(vma, root);
0855 flush_dcache_mmap_unlock(mapping);
0856 }
0857
0858 if (remove_next) {
0859
0860
0861
0862
0863 if (remove_next != 3)
0864 __vma_unlink(mm, next, next);
0865 else
0866
0867
0868
0869
0870
0871
0872
0873
0874
0875 __vma_unlink(mm, next, vma);
0876 if (file)
0877 __remove_shared_vm_struct(next, file, mapping);
0878 } else if (insert) {
0879
0880
0881
0882
0883
0884 __insert_vm_struct(mm, insert);
0885 } else {
0886 if (start_changed)
0887 vma_gap_update(vma);
0888 if (end_changed) {
0889 if (!next)
0890 mm->highest_vm_end = vm_end_gap(vma);
0891 else if (!adjust_next)
0892 vma_gap_update(next);
0893 }
0894 }
0895
0896 if (anon_vma) {
0897 anon_vma_interval_tree_post_update_vma(vma);
0898 if (adjust_next)
0899 anon_vma_interval_tree_post_update_vma(next);
0900 anon_vma_unlock_write(anon_vma);
0901 }
0902
0903 if (file) {
0904 i_mmap_unlock_write(mapping);
0905 uprobe_mmap(vma);
0906
0907 if (adjust_next)
0908 uprobe_mmap(next);
0909 }
0910
0911 if (remove_next) {
0912 if (file) {
0913 uprobe_munmap(next, next->vm_start, next->vm_end);
0914 fput(file);
0915 }
0916 if (next->anon_vma)
0917 anon_vma_merge(vma, next);
0918 mm->map_count--;
0919 mpol_put(vma_policy(next));
0920 vm_area_free(next);
0921
0922
0923
0924
0925
0926 if (remove_next != 3) {
0927
0928
0929
0930
0931
0932
0933 next = vma->vm_next;
0934 } else {
0935
0936
0937
0938
0939
0940
0941
0942
0943
0944
0945 next = vma;
0946 }
0947 if (remove_next == 2) {
0948 remove_next = 1;
0949 end = next->vm_end;
0950 goto again;
0951 }
0952 else if (next)
0953 vma_gap_update(next);
0954 else {
0955
0956
0957
0958
0959
0960
0961
0962
0963
0964
0965
0966
0967
0968
0969
0970
0971
0972
0973
0974 VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
0975 }
0976 }
0977 if (insert && file)
0978 uprobe_mmap(insert);
0979
0980 validate_mm(mm);
0981
0982 return 0;
0983 }

/*
 * If the vma has a ->close operation then the driver probably needs to
 * release per-vma resources, so we don't attempt to merge those.
 */
0989 static inline int is_mergeable_vma(struct vm_area_struct *vma,
0990 struct file *file, unsigned long vm_flags,
0991 struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
0992 struct anon_vma_name *anon_name)
0993 {
0994
0995
0996
0997
0998
0999
1000
1001
1002 if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
1003 return 0;
1004 if (vma->vm_file != file)
1005 return 0;
1006 if (vma->vm_ops && vma->vm_ops->close)
1007 return 0;
1008 if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
1009 return 0;
1010 if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
1011 return 0;
1012 return 1;
1013 }
1014
1015 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
1016 struct anon_vma *anon_vma2,
1017 struct vm_area_struct *vma)
1018 {
1019
1020
1021
1022
1023 if ((!anon_vma1 || !anon_vma2) && (!vma ||
1024 list_is_singular(&vma->anon_vma_chain)))
1025 return 1;
1026 return anon_vma1 == anon_vma2;
1027 }

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * in front of (at a lower virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if the same anon_vma is assigned but the offsets are
 * incompatible.
 */
1040 static int
1041 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
1042 struct anon_vma *anon_vma, struct file *file,
1043 pgoff_t vm_pgoff,
1044 struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
1045 struct anon_vma_name *anon_name)
1046 {
1047 if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
1048 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
1049 if (vma->vm_pgoff == vm_pgoff)
1050 return 1;
1051 }
1052 return 0;
1053 }

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * beyond (at a higher virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if the same anon_vma is assigned but the offsets are
 * incompatible.
 */
1062 static int
1063 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
1064 struct anon_vma *anon_vma, struct file *file,
1065 pgoff_t vm_pgoff,
1066 struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
1067 struct anon_vma_name *anon_name)
1068 {
1069 if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
1070 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
1071 pgoff_t vm_pglen;
1072 vm_pglen = vma_pages(vma);
1073 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
1074 return 1;
1075 }
1076 return 0;
1077 }

/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
 * figure out whether that can be merged with its predecessor and/or its
 * successor, or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The code below first tries to extend prev over the new range (and, if
 * next also abuts and is compatible, to swallow next as well), and only
 * if that is not possible does it try to extend next backwards over the
 * range.  All the heavy lifting - list, rbtree, interval tree and
 * anon_vma fixups - is done by __vma_adjust().
 */
1122 struct vm_area_struct *vma_merge(struct mm_struct *mm,
1123 struct vm_area_struct *prev, unsigned long addr,
1124 unsigned long end, unsigned long vm_flags,
1125 struct anon_vma *anon_vma, struct file *file,
1126 pgoff_t pgoff, struct mempolicy *policy,
1127 struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
1128 struct anon_vma_name *anon_name)
1129 {
1130 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1131 struct vm_area_struct *area, *next;
1132 int err;

	/*
	 * We later require that vma->vm_flags == vm_flags,
	 * so this tests vma->vm_flags & VM_SPECIAL, too.
	 */
1138 if (vm_flags & VM_SPECIAL)
1139 return NULL;
1140
1141 next = vma_next(mm, prev);
1142 area = next;
1143 if (area && area->vm_end == end)
1144 next = next->vm_next;
1145
1146
1147 VM_WARN_ON(prev && addr <= prev->vm_start);
1148 VM_WARN_ON(area && end > area->vm_end);
1149 VM_WARN_ON(addr >= end);
1150
1151
1152
1153
1154 if (prev && prev->vm_end == addr &&
1155 mpol_equal(vma_policy(prev), policy) &&
1156 can_vma_merge_after(prev, vm_flags,
1157 anon_vma, file, pgoff,
1158 vm_userfaultfd_ctx, anon_name)) {
1159
1160
1161
1162 if (next && end == next->vm_start &&
1163 mpol_equal(policy, vma_policy(next)) &&
1164 can_vma_merge_before(next, vm_flags,
1165 anon_vma, file,
1166 pgoff+pglen,
1167 vm_userfaultfd_ctx, anon_name) &&
1168 is_mergeable_anon_vma(prev->anon_vma,
1169 next->anon_vma, NULL)) {
1170
1171 err = __vma_adjust(prev, prev->vm_start,
1172 next->vm_end, prev->vm_pgoff, NULL,
1173 prev);
1174 } else
1175 err = __vma_adjust(prev, prev->vm_start,
1176 end, prev->vm_pgoff, NULL, prev);
1177 if (err)
1178 return NULL;
1179 khugepaged_enter_vma(prev, vm_flags);
1180 return prev;
1181 }
1182
1183
1184
1185
1186 if (next && end == next->vm_start &&
1187 mpol_equal(policy, vma_policy(next)) &&
1188 can_vma_merge_before(next, vm_flags,
1189 anon_vma, file, pgoff+pglen,
1190 vm_userfaultfd_ctx, anon_name)) {
1191 if (prev && addr < prev->vm_end)
1192 err = __vma_adjust(prev, prev->vm_start,
1193 addr, prev->vm_pgoff, NULL, next);
1194 else {
1195 err = __vma_adjust(area, addr, next->vm_end,
1196 next->vm_pgoff - pglen, NULL, next);
1197
1198
1199
1200
1201
1202 area = next;
1203 }
1204 if (err)
1205 return NULL;
1206 khugepaged_enter_vma(area, vm_flags);
1207 return area;
1208 }
1209
1210 return NULL;
1211 }
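
/*
 * Concrete (hypothetical) example of the simplest merge above: prev is an
 * anonymous, private, read-write vma covering [0x1000000, 0x1200000) and
 * mmap_region() asks to map [0x1200000, 0x1300000) with identical flags,
 * no file and a compatible anon_vma.  can_vma_merge_after() succeeds, so
 * __vma_adjust(prev, prev->vm_start, 0x1300000, prev->vm_pgoff, NULL, prev)
 * simply stretches prev instead of allocating a new vm_area_struct.
 */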
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1227 {
1228 return a->vm_end == b->vm_start &&
1229 mpol_equal(vma_policy(a), vma_policy(b)) &&
1230 a->vm_file == b->vm_file &&
1231 !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
1232 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1233 }
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1258 {
1259 if (anon_vma_compatible(a, b)) {
1260 struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1261
1262 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1263 return anon_vma;
1264 }
1265 return NULL;
1266 }
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1277 {
1278 struct anon_vma *anon_vma = NULL;
1279
1280
1281 if (vma->vm_next) {
1282 anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
1283 if (anon_vma)
1284 return anon_vma;
1285 }
1286
1287
1288 if (vma->vm_prev)
1289 anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301 return anon_vma;
1302 }

/*
 * If a hint addr is less than mmap_min_addr change hint to be as low as
 * possible but still greater than mmap_min_addr.
 */
1308 static inline unsigned long round_hint_to_min(unsigned long hint)
1309 {
1310 hint &= PAGE_MASK;
1311 if (((void *)hint != NULL) &&
1312 (hint < mmap_min_addr))
1313 return PAGE_ALIGN(mmap_min_addr);
1314 return hint;
1315 }
1316
1317 int mlock_future_check(struct mm_struct *mm, unsigned long flags,
1318 unsigned long len)
1319 {
1320 unsigned long locked, lock_limit;
1321
1322
1323 if (flags & VM_LOCKED) {
1324 locked = len >> PAGE_SHIFT;
1325 locked += mm->locked_vm;
1326 lock_limit = rlimit(RLIMIT_MEMLOCK);
1327 lock_limit >>= PAGE_SHIFT;
1328 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1329 return -EAGAIN;
1330 }
1331 return 0;
1332 }
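
/*
 * Arithmetic sketch for the check above (assuming 4 KiB pages): with
 * RLIMIT_MEMLOCK = 64 KiB, lock_limit becomes 16 pages.  If the task has
 * already locked 10 pages and now maps 8 more with VM_LOCKED, locked = 18
 * exceeds the limit, so the caller gets -EAGAIN unless it has CAP_IPC_LOCK.
 */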
1333
1334 static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
1335 {
1336 if (S_ISREG(inode->i_mode))
1337 return MAX_LFS_FILESIZE;
1338
1339 if (S_ISBLK(inode->i_mode))
1340 return MAX_LFS_FILESIZE;
1341
1342 if (S_ISSOCK(inode->i_mode))
1343 return MAX_LFS_FILESIZE;
1344
1345
1346 if (file->f_mode & FMODE_UNSIGNED_OFFSET)
1347 return 0;
1348
1349
1350 return ULONG_MAX;
1351 }
1352
1353 static inline bool file_mmap_ok(struct file *file, struct inode *inode,
1354 unsigned long pgoff, unsigned long len)
1355 {
1356 u64 maxsize = file_mmap_size_max(file, inode);
1357
1358 if (maxsize && len > maxsize)
1359 return false;
1360 maxsize -= len;
1361 if (pgoff > maxsize >> PAGE_SHIFT)
1362 return false;
1363 return true;
1364 }
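
/*
 * Example of the limit check above: for a regular file maxsize is
 * MAX_LFS_FILESIZE, so mapping len bytes at page offset pgoff is allowed
 * only while pgoff <= (MAX_LFS_FILESIZE - len) >> PAGE_SHIFT.  A file
 * with FMODE_UNSIGNED_OFFSET reports maxsize == 0, which effectively
 * disables the limit.
 */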

/*
 * The caller must write-lock current->mm->mmap_lock.  On success the
 * mapping is in place and *populate tells the caller how much of it
 * should be pre-faulted.
 */
1369 unsigned long do_mmap(struct file *file, unsigned long addr,
1370 unsigned long len, unsigned long prot,
1371 unsigned long flags, unsigned long pgoff,
1372 unsigned long *populate, struct list_head *uf)
1373 {
1374 struct mm_struct *mm = current->mm;
1375 vm_flags_t vm_flags;
1376 int pkey = 0;
1377
1378 *populate = 0;
1379
1380 if (!len)
1381 return -EINVAL;
1382
1383
1384
1385
1386
1387
1388
1389 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1390 if (!(file && path_noexec(&file->f_path)))
1391 prot |= PROT_EXEC;
1392
1393
1394 if (flags & MAP_FIXED_NOREPLACE)
1395 flags |= MAP_FIXED;
1396
1397 if (!(flags & MAP_FIXED))
1398 addr = round_hint_to_min(addr);
1399
1400
1401 len = PAGE_ALIGN(len);
1402 if (!len)
1403 return -ENOMEM;
1404
1405
1406 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1407 return -EOVERFLOW;
1408
1409
1410 if (mm->map_count > sysctl_max_map_count)
1411 return -ENOMEM;
1412
1413
1414
1415
1416 addr = get_unmapped_area(file, addr, len, pgoff, flags);
1417 if (IS_ERR_VALUE(addr))
1418 return addr;
1419
1420 if (flags & MAP_FIXED_NOREPLACE) {
1421 if (find_vma_intersection(mm, addr, addr + len))
1422 return -EEXIST;
1423 }
1424
1425 if (prot == PROT_EXEC) {
1426 pkey = execute_only_pkey(mm);
1427 if (pkey < 0)
1428 pkey = 0;
1429 }
1430
1431
1432
1433
1434
1435 vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1436 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1437
1438 if (flags & MAP_LOCKED)
1439 if (!can_do_mlock())
1440 return -EPERM;
1441
1442 if (mlock_future_check(mm, vm_flags, len))
1443 return -EAGAIN;
1444
1445 if (file) {
1446 struct inode *inode = file_inode(file);
1447 unsigned long flags_mask;
1448
1449 if (!file_mmap_ok(file, inode, pgoff, len))
1450 return -EOVERFLOW;
1451
1452 flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
1453
1454 switch (flags & MAP_TYPE) {
1455 case MAP_SHARED:
1456
1457
1458
1459
1460
1461
1462
1463 flags &= LEGACY_MAP_MASK;
1464 fallthrough;
1465 case MAP_SHARED_VALIDATE:
1466 if (flags & ~flags_mask)
1467 return -EOPNOTSUPP;
1468 if (prot & PROT_WRITE) {
1469 if (!(file->f_mode & FMODE_WRITE))
1470 return -EACCES;
1471 if (IS_SWAPFILE(file->f_mapping->host))
1472 return -ETXTBSY;
1473 }
1474
1475
1476
1477
1478
1479 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1480 return -EACCES;
1481
1482 vm_flags |= VM_SHARED | VM_MAYSHARE;
1483 if (!(file->f_mode & FMODE_WRITE))
1484 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1485 fallthrough;
1486 case MAP_PRIVATE:
1487 if (!(file->f_mode & FMODE_READ))
1488 return -EACCES;
1489 if (path_noexec(&file->f_path)) {
1490 if (vm_flags & VM_EXEC)
1491 return -EPERM;
1492 vm_flags &= ~VM_MAYEXEC;
1493 }
1494
1495 if (!file->f_op->mmap)
1496 return -ENODEV;
1497 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1498 return -EINVAL;
1499 break;
1500
1501 default:
1502 return -EINVAL;
1503 }
1504 } else {
1505 switch (flags & MAP_TYPE) {
1506 case MAP_SHARED:
1507 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1508 return -EINVAL;
1509
1510
1511
1512 pgoff = 0;
1513 vm_flags |= VM_SHARED | VM_MAYSHARE;
1514 break;
1515 case MAP_PRIVATE:
1516
1517
1518
1519 pgoff = addr >> PAGE_SHIFT;
1520 break;
1521 default:
1522 return -EINVAL;
1523 }
1524 }
1525
1526
1527
1528
1529
1530 if (flags & MAP_NORESERVE) {
1531
1532 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1533 vm_flags |= VM_NORESERVE;
1534
1535
1536 if (file && is_file_hugepages(file))
1537 vm_flags |= VM_NORESERVE;
1538 }
1539
1540 addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
1541 if (!IS_ERR_VALUE(addr) &&
1542 ((vm_flags & VM_LOCKED) ||
1543 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1544 *populate = len;
1545 return addr;
1546 }
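
/*
 * Flag-derivation sketch for do_mmap() (hypothetical values): a
 * PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS request with default
 * mm->def_flags ends up with vm_flags containing VM_READ | VM_WRITE |
 * VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, no VM_SHARED, and pgoff set to
 * addr >> PAGE_SHIFT by the anonymous MAP_PRIVATE case above.
 */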
1547
1548 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
1549 unsigned long prot, unsigned long flags,
1550 unsigned long fd, unsigned long pgoff)
1551 {
1552 struct file *file = NULL;
1553 unsigned long retval;
1554
1555 if (!(flags & MAP_ANONYMOUS)) {
1556 audit_mmap_fd(fd, flags);
1557 file = fget(fd);
1558 if (!file)
1559 return -EBADF;
1560 if (is_file_hugepages(file)) {
1561 len = ALIGN(len, huge_page_size(hstate_file(file)));
1562 } else if (unlikely(flags & MAP_HUGETLB)) {
1563 retval = -EINVAL;
1564 goto out_fput;
1565 }
1566 } else if (flags & MAP_HUGETLB) {
1567 struct hstate *hs;
1568
1569 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1570 if (!hs)
1571 return -EINVAL;
1572
1573 len = ALIGN(len, huge_page_size(hs));
1574
1575
1576
1577
1578 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
1579 VM_NORESERVE,
1580 HUGETLB_ANONHUGE_INODE,
1581 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1582 if (IS_ERR(file))
1583 return PTR_ERR(file);
1584 }
1585
1586 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1587 out_fput:
1588 if (file)
1589 fput(file);
1590 return retval;
1591 }
1592
1593 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1594 unsigned long, prot, unsigned long, flags,
1595 unsigned long, fd, unsigned long, pgoff)
1596 {
1597 return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
1598 }
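
/*
 * Userspace sketch (not kernel code): the C library converts the byte
 * offset of mmap() into the page offset consumed here, e.g.
 *
 *	p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 * reaches ksys_mmap_pgoff() with the fd ignored (MAP_ANONYMOUS) and pgoff 0.
 */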
1599
1600 #ifdef __ARCH_WANT_SYS_OLD_MMAP
1601 struct mmap_arg_struct {
1602 unsigned long addr;
1603 unsigned long len;
1604 unsigned long prot;
1605 unsigned long flags;
1606 unsigned long fd;
1607 unsigned long offset;
1608 };
1609
1610 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1611 {
1612 struct mmap_arg_struct a;
1613
1614 if (copy_from_user(&a, arg, sizeof(a)))
1615 return -EFAULT;
1616 if (offset_in_page(a.offset))
1617 return -EINVAL;
1618
1619 return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1620 a.offset >> PAGE_SHIFT);
1621 }
1622 #endif

/*
 * Some shared mappings will want the pages marked read-only
 * to track write events.  If so, we'll downgrade vm_page_prot
 * to the private version (using protection_map[] without the
 * VM_SHARED bit).
 */
1630 int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
1631 {
1632 vm_flags_t vm_flags = vma->vm_flags;
1633 const struct vm_operations_struct *vm_ops = vma->vm_ops;
1634
1635
1636 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
1637 return 0;
1638
1639
1640 if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
1641 return 1;
1642
1643
1644
1645 if (pgprot_val(vm_page_prot) !=
1646 pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
1647 return 0;
1648
1649
1650
1651
1652
1653 if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
1654 return 1;
1655
1656
1657 if (vm_flags & VM_PFNMAP)
1658 return 0;
1659
1660
1661 return vma->vm_file && vma->vm_file->f_mapping &&
1662 mapping_can_writeback(vma->vm_file->f_mapping);
1663 }

/*
 * We account for memory if it's a private writeable mapping,
 * not hugepages and VM_NORESERVE wasn't set.
 */
1669 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1670 {
1671
1672
1673
1674
1675 if (file && is_file_hugepages(file))
1676 return 0;
1677
1678 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1679 }
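
/*
 * Examples of the rule above: a private, writable, non-hugetlb mapping
 * (i.e. (vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) is
 * charged against the overcommit accounting; shared mappings, read-only
 * private mappings and anything marked VM_NORESERVE are not charged here.
 */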
1680
1681 unsigned long mmap_region(struct file *file, unsigned long addr,
1682 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
1683 struct list_head *uf)
1684 {
1685 struct mm_struct *mm = current->mm;
1686 struct vm_area_struct *vma, *prev, *merge;
1687 int error;
1688 struct rb_node **rb_link, *rb_parent;
1689 unsigned long charged = 0;
1690
1691
1692 if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
1693 unsigned long nr_pages;
1694
1695
1696
1697
1698
1699 nr_pages = count_vma_pages_range(mm, addr, addr + len);
1700
1701 if (!may_expand_vm(mm, vm_flags,
1702 (len >> PAGE_SHIFT) - nr_pages))
1703 return -ENOMEM;
1704 }
1705
1706
1707 if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
1708 return -ENOMEM;
1709
1710
1711
1712 if (accountable_mapping(file, vm_flags)) {
1713 charged = len >> PAGE_SHIFT;
1714 if (security_vm_enough_memory_mm(mm, charged))
1715 return -ENOMEM;
1716 vm_flags |= VM_ACCOUNT;
1717 }
1718
1719
1720
1721
1722 vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1723 NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
1724 if (vma)
1725 goto out;
1726
1727
1728
1729
1730
1731
1732 vma = vm_area_alloc(mm);
1733 if (!vma) {
1734 error = -ENOMEM;
1735 goto unacct_error;
1736 }
1737
1738 vma->vm_start = addr;
1739 vma->vm_end = addr + len;
1740 vma->vm_flags = vm_flags;
1741 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1742 vma->vm_pgoff = pgoff;
1743
1744 if (file) {
1745 if (vm_flags & VM_SHARED) {
1746 error = mapping_map_writable(file->f_mapping);
1747 if (error)
1748 goto free_vma;
1749 }
1750
1751 vma->vm_file = get_file(file);
1752 error = call_mmap(file, vma);
1753 if (error)
1754 goto unmap_and_free_vma;
1755
1756
1757
1758
1759
1760
1761
1762
1763 WARN_ON_ONCE(addr != vma->vm_start);
1764
1765 addr = vma->vm_start;
1766
1767
1768
1769
1770 if (unlikely(vm_flags != vma->vm_flags && prev)) {
1771 merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
1772 NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
1773 if (merge) {
1774
1775
1776
1777
1778 fput(vma->vm_file);
1779 vm_area_free(vma);
1780 vma = merge;
1781
1782 vm_flags = vma->vm_flags;
1783 goto unmap_writable;
1784 }
1785 }
1786
1787 vm_flags = vma->vm_flags;
1788 } else if (vm_flags & VM_SHARED) {
1789 error = shmem_zero_setup(vma);
1790 if (error)
1791 goto free_vma;
1792 } else {
1793 vma_set_anonymous(vma);
1794 }
1795
1796
1797 if (!arch_validate_flags(vma->vm_flags)) {
1798 error = -EINVAL;
1799 if (file)
1800 goto unmap_and_free_vma;
1801 else
1802 goto free_vma;
1803 }
1804
1805 vma_link(mm, vma, prev, rb_link, rb_parent);
1806
1807
1808
1809
1810
1811 khugepaged_enter_vma(vma, vma->vm_flags);
1812
1813
1814 unmap_writable:
1815 if (file && vm_flags & VM_SHARED)
1816 mapping_unmap_writable(file->f_mapping);
1817 file = vma->vm_file;
1818 out:
1819 perf_event_mmap(vma);
1820
1821 vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
1822 if (vm_flags & VM_LOCKED) {
1823 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
1824 is_vm_hugetlb_page(vma) ||
1825 vma == get_gate_vma(current->mm))
1826 vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
1827 else
1828 mm->locked_vm += (len >> PAGE_SHIFT);
1829 }
1830
1831 if (file)
1832 uprobe_mmap(vma);
1833
1834
1835
1836
1837
1838
1839
1840
1841 vma->vm_flags |= VM_SOFTDIRTY;
1842
1843 vma_set_page_prot(vma);
1844
1845 return addr;
1846
1847 unmap_and_free_vma:
1848 fput(vma->vm_file);
1849 vma->vm_file = NULL;
1850
1851
1852 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1853 if (vm_flags & VM_SHARED)
1854 mapping_unmap_writable(file->f_mapping);
1855 free_vma:
1856 vm_area_free(vma);
1857 unacct_error:
1858 if (charged)
1859 vm_unacct_memory(charged);
1860 return error;
1861 }
1862
1863 static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1864 {
	/*
	 * We implement the search by looking for an rbtree node that
	 * immediately follows a suitable gap. That is,
	 * - gap_start = vm_end_gap(vma->vm_prev) <= info->high_limit - length;
	 * - gap_end   = vm_start_gap(vma)        >= info->low_limit  + length;
	 * - gap_end - gap_start >= length
	 */
1873 struct mm_struct *mm = current->mm;
1874 struct vm_area_struct *vma;
1875 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1876
1877
1878 length = info->length + info->align_mask;
1879 if (length < info->length)
1880 return -ENOMEM;
1881
1882
1883 if (info->high_limit < length)
1884 return -ENOMEM;
1885 high_limit = info->high_limit - length;
1886
1887 if (info->low_limit > high_limit)
1888 return -ENOMEM;
1889 low_limit = info->low_limit + length;
1890
1891
1892 if (RB_EMPTY_ROOT(&mm->mm_rb))
1893 goto check_highest;
1894 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1895 if (vma->rb_subtree_gap < length)
1896 goto check_highest;
1897
1898 while (true) {
1899
1900 gap_end = vm_start_gap(vma);
1901 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1902 struct vm_area_struct *left =
1903 rb_entry(vma->vm_rb.rb_left,
1904 struct vm_area_struct, vm_rb);
1905 if (left->rb_subtree_gap >= length) {
1906 vma = left;
1907 continue;
1908 }
1909 }
1910
1911 gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
1912 check_current:
1913
1914 if (gap_start > high_limit)
1915 return -ENOMEM;
1916 if (gap_end >= low_limit &&
1917 gap_end > gap_start && gap_end - gap_start >= length)
1918 goto found;
1919
1920
1921 if (vma->vm_rb.rb_right) {
1922 struct vm_area_struct *right =
1923 rb_entry(vma->vm_rb.rb_right,
1924 struct vm_area_struct, vm_rb);
1925 if (right->rb_subtree_gap >= length) {
1926 vma = right;
1927 continue;
1928 }
1929 }
1930
1931
1932 while (true) {
1933 struct rb_node *prev = &vma->vm_rb;
1934 if (!rb_parent(prev))
1935 goto check_highest;
1936 vma = rb_entry(rb_parent(prev),
1937 struct vm_area_struct, vm_rb);
1938 if (prev == vma->vm_rb.rb_left) {
1939 gap_start = vm_end_gap(vma->vm_prev);
1940 gap_end = vm_start_gap(vma);
1941 goto check_current;
1942 }
1943 }
1944 }
1945
1946 check_highest:
1947
1948 gap_start = mm->highest_vm_end;
1949 gap_end = ULONG_MAX;
1950 if (gap_start > high_limit)
1951 return -ENOMEM;
1952
1953 found:
1954
1955 if (gap_start < info->low_limit)
1956 gap_start = info->low_limit;
1957
1958
1959 gap_start += (info->align_offset - gap_start) & info->align_mask;
1960
1961 VM_BUG_ON(gap_start + info->length > info->high_limit);
1962 VM_BUG_ON(gap_start + info->length > gap_end);
1963 return gap_start;
1964 }
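
/*
 * Alignment arithmetic used above, with hypothetical numbers: for a 64 KiB
 * alignment request info->align_mask is 0xffff and info->align_offset 0,
 * so a candidate gap_start of 0x12345000 is rounded up to 0x12350000 by
 * gap_start += (info->align_offset - gap_start) & info->align_mask.
 */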
1965
1966 static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1967 {
1968 struct mm_struct *mm = current->mm;
1969 struct vm_area_struct *vma;
1970 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1971
1972
1973 length = info->length + info->align_mask;
1974 if (length < info->length)
1975 return -ENOMEM;
1976
1977
1978
1979
1980
1981 gap_end = info->high_limit;
1982 if (gap_end < length)
1983 return -ENOMEM;
1984 high_limit = gap_end - length;
1985
1986 if (info->low_limit > high_limit)
1987 return -ENOMEM;
1988 low_limit = info->low_limit + length;
1989
1990
1991 gap_start = mm->highest_vm_end;
1992 if (gap_start <= high_limit)
1993 goto found_highest;
1994
1995
1996 if (RB_EMPTY_ROOT(&mm->mm_rb))
1997 return -ENOMEM;
1998 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1999 if (vma->rb_subtree_gap < length)
2000 return -ENOMEM;
2001
2002 while (true) {
2003
2004 gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
2005 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
2006 struct vm_area_struct *right =
2007 rb_entry(vma->vm_rb.rb_right,
2008 struct vm_area_struct, vm_rb);
2009 if (right->rb_subtree_gap >= length) {
2010 vma = right;
2011 continue;
2012 }
2013 }
2014
2015 check_current:
2016
2017 gap_end = vm_start_gap(vma);
2018 if (gap_end < low_limit)
2019 return -ENOMEM;
2020 if (gap_start <= high_limit &&
2021 gap_end > gap_start && gap_end - gap_start >= length)
2022 goto found;
2023
2024
2025 if (vma->vm_rb.rb_left) {
2026 struct vm_area_struct *left =
2027 rb_entry(vma->vm_rb.rb_left,
2028 struct vm_area_struct, vm_rb);
2029 if (left->rb_subtree_gap >= length) {
2030 vma = left;
2031 continue;
2032 }
2033 }
2034
2035
2036 while (true) {
2037 struct rb_node *prev = &vma->vm_rb;
2038 if (!rb_parent(prev))
2039 return -ENOMEM;
2040 vma = rb_entry(rb_parent(prev),
2041 struct vm_area_struct, vm_rb);
2042 if (prev == vma->vm_rb.rb_right) {
2043 gap_start = vma->vm_prev ?
2044 vm_end_gap(vma->vm_prev) : 0;
2045 goto check_current;
2046 }
2047 }
2048 }
2049
2050 found:
2051
2052 if (gap_end > info->high_limit)
2053 gap_end = info->high_limit;
2054
2055 found_highest:
2056
2057 gap_end -= info->length;
2058 gap_end -= (gap_end - info->align_offset) & info->align_mask;
2059
2060 VM_BUG_ON(gap_end < info->low_limit);
2061 VM_BUG_ON(gap_end < gap_start);
2062 return gap_end;
2063 }

/*
 * Search for an unmapped address range.
 *
 * We are looking for a range that:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size;
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask).
 */
2074 unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
2075 {
2076 unsigned long addr;
2077
2078 if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
2079 addr = unmapped_area_topdown(info);
2080 else
2081 addr = unmapped_area(info);
2082
2083 trace_vm_unmapped_area(addr, info);
2084 return addr;
2085 }

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *	if (ret & ~PAGE_MASK)
 *		error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 */
2098 unsigned long
2099 generic_get_unmapped_area(struct file *filp, unsigned long addr,
2100 unsigned long len, unsigned long pgoff,
2101 unsigned long flags)
2102 {
2103 struct mm_struct *mm = current->mm;
2104 struct vm_area_struct *vma, *prev;
2105 struct vm_unmapped_area_info info;
2106 const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
2107
2108 if (len > mmap_end - mmap_min_addr)
2109 return -ENOMEM;
2110
2111 if (flags & MAP_FIXED)
2112 return addr;
2113
2114 if (addr) {
2115 addr = PAGE_ALIGN(addr);
2116 vma = find_vma_prev(mm, addr, &prev);
2117 if (mmap_end - len >= addr && addr >= mmap_min_addr &&
2118 (!vma || addr + len <= vm_start_gap(vma)) &&
2119 (!prev || addr >= vm_end_gap(prev)))
2120 return addr;
2121 }
2122
2123 info.flags = 0;
2124 info.length = len;
2125 info.low_limit = mm->mmap_base;
2126 info.high_limit = mmap_end;
2127 info.align_mask = 0;
2128 info.align_offset = 0;
2129 return vm_unmapped_area(&info);
2130 }
2131
2132 #ifndef HAVE_ARCH_UNMAPPED_AREA
2133 unsigned long
2134 arch_get_unmapped_area(struct file *filp, unsigned long addr,
2135 unsigned long len, unsigned long pgoff,
2136 unsigned long flags)
2137 {
2138 return generic_get_unmapped_area(filp, addr, len, pgoff, flags);
2139 }
2140 #endif
2141
2142
2143
2144
2145
2146 unsigned long
2147 generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
2148 unsigned long len, unsigned long pgoff,
2149 unsigned long flags)
2150 {
2151 struct vm_area_struct *vma, *prev;
2152 struct mm_struct *mm = current->mm;
2153 struct vm_unmapped_area_info info;
2154 const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
2155
2156
2157 if (len > mmap_end - mmap_min_addr)
2158 return -ENOMEM;
2159
2160 if (flags & MAP_FIXED)
2161 return addr;
2162
2163
2164 if (addr) {
2165 addr = PAGE_ALIGN(addr);
2166 vma = find_vma_prev(mm, addr, &prev);
2167 if (mmap_end - len >= addr && addr >= mmap_min_addr &&
2168 (!vma || addr + len <= vm_start_gap(vma)) &&
2169 (!prev || addr >= vm_end_gap(prev)))
2170 return addr;
2171 }
2172
2173 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
2174 info.length = len;
2175 info.low_limit = max(PAGE_SIZE, mmap_min_addr);
2176 info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
2177 info.align_mask = 0;
2178 info.align_offset = 0;
2179 addr = vm_unmapped_area(&info);
2180
2181
2182
2183
2184
2185
2186
2187 if (offset_in_page(addr)) {
2188 VM_BUG_ON(addr != -ENOMEM);
2189 info.flags = 0;
2190 info.low_limit = TASK_UNMAPPED_BASE;
2191 info.high_limit = mmap_end;
2192 addr = vm_unmapped_area(&info);
2193 }
2194
2195 return addr;
2196 }
2197
2198 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
2199 unsigned long
2200 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
2201 unsigned long len, unsigned long pgoff,
2202 unsigned long flags)
2203 {
2204 return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
2205 }
2206 #endif
2207
2208 unsigned long
2209 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
2210 unsigned long pgoff, unsigned long flags)
2211 {
2212 unsigned long (*get_area)(struct file *, unsigned long,
2213 unsigned long, unsigned long, unsigned long);
2214
2215 unsigned long error = arch_mmap_check(addr, len, flags);
2216 if (error)
2217 return error;
2218
2219
2220 if (len > TASK_SIZE)
2221 return -ENOMEM;
2222
2223 get_area = current->mm->get_unmapped_area;
2224 if (file) {
2225 if (file->f_op->get_unmapped_area)
2226 get_area = file->f_op->get_unmapped_area;
2227 } else if (flags & MAP_SHARED) {
2228
2229
2230
2231
2232
2233 pgoff = 0;
2234 get_area = shmem_get_unmapped_area;
2235 }
2236
2237 addr = get_area(file, addr, len, pgoff, flags);
2238 if (IS_ERR_VALUE(addr))
2239 return addr;
2240
2241 if (addr > TASK_SIZE - len)
2242 return -ENOMEM;
2243 if (offset_in_page(addr))
2244 return -EINVAL;
2245
2246 error = security_mmap_addr(addr);
2247 return error ? error : addr;
2248 }
2249
2250 EXPORT_SYMBOL(get_unmapped_area);

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
2253 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
2254 {
2255 struct rb_node *rb_node;
2256 struct vm_area_struct *vma;
2257
2258 mmap_assert_locked(mm);
2259
2260 vma = vmacache_find(mm, addr);
2261 if (likely(vma))
2262 return vma;
2263
2264 rb_node = mm->mm_rb.rb_node;
2265
2266 while (rb_node) {
2267 struct vm_area_struct *tmp;
2268
2269 tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2270
2271 if (tmp->vm_end > addr) {
2272 vma = tmp;
2273 if (tmp->vm_start <= addr)
2274 break;
2275 rb_node = rb_node->rb_left;
2276 } else
2277 rb_node = rb_node->rb_right;
2278 }
2279
2280 if (vma)
2281 vmacache_update(addr, vma);
2282 return vma;
2283 }
2284
2285 EXPORT_SYMBOL(find_vma);
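
/*
 * Typical caller pattern for find_vma() (a sketch; callers must hold
 * mmap_lock, as the mmap_assert_locked() above enforces):
 *
 *	mmap_read_lock(mm);
 *	vma = find_vma(mm, addr);
 *	if (vma && vma->vm_start <= addr)
 *		;	// addr lies inside vma
 *	mmap_read_unlock(mm);
 *
 * A non-NULL result only guarantees addr < vma->vm_end; the address may
 * still fall in the gap below vma->vm_start.
 */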

/*
 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
 */
2290 struct vm_area_struct *
2291 find_vma_prev(struct mm_struct *mm, unsigned long addr,
2292 struct vm_area_struct **pprev)
2293 {
2294 struct vm_area_struct *vma;
2295
2296 vma = find_vma(mm, addr);
2297 if (vma) {
2298 *pprev = vma->vm_prev;
2299 } else {
2300 struct rb_node *rb_node = rb_last(&mm->mm_rb);
2301
2302 *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
2303 }
2304 return vma;
2305 }

/*
 * Verify that the stack growth is acceptable and
 * update accounting. This is shared with both the
 * grow-up and grow-down cases.
 */
2312 static int acct_stack_growth(struct vm_area_struct *vma,
2313 unsigned long size, unsigned long grow)
2314 {
2315 struct mm_struct *mm = vma->vm_mm;
2316 unsigned long new_start;
2317
2318
2319 if (!may_expand_vm(mm, vma->vm_flags, grow))
2320 return -ENOMEM;
2321
2322
2323 if (size > rlimit(RLIMIT_STACK))
2324 return -ENOMEM;
2325
2326
2327 if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT))
2328 return -ENOMEM;
2329
2330
2331 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2332 vma->vm_end - size;
2333 if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2334 return -EFAULT;
2335
2336
2337
2338
2339
2340 if (security_vm_enough_memory_mm(mm, grow))
2341 return -ENOMEM;
2342
2343 return 0;
2344 }
2345
2346 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
/*
 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
 * vma is the last one with address > vma->vm_end.  Have to extend vma.
 */
2351 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2352 {
2353 struct mm_struct *mm = vma->vm_mm;
2354 struct vm_area_struct *next;
2355 unsigned long gap_addr;
2356 int error = 0;
2357
2358 if (!(vma->vm_flags & VM_GROWSUP))
2359 return -EFAULT;
2360
2361
2362 address &= PAGE_MASK;
2363 if (address >= (TASK_SIZE & PAGE_MASK))
2364 return -ENOMEM;
2365 address += PAGE_SIZE;
2366
2367
2368 gap_addr = address + stack_guard_gap;
2369
2370
2371 if (gap_addr < address || gap_addr > TASK_SIZE)
2372 gap_addr = TASK_SIZE;
2373
2374 next = vma->vm_next;
2375 if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
2376 if (!(next->vm_flags & VM_GROWSUP))
2377 return -ENOMEM;
2378
2379 }
2380
2381
2382 if (unlikely(anon_vma_prepare(vma)))
2383 return -ENOMEM;
2384
2385
2386
2387
2388
2389
2390 anon_vma_lock_write(vma->anon_vma);
2391
2392
2393 if (address > vma->vm_end) {
2394 unsigned long size, grow;
2395
2396 size = address - vma->vm_start;
2397 grow = (address - vma->vm_end) >> PAGE_SHIFT;
2398
2399 error = -ENOMEM;
2400 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2401 error = acct_stack_growth(vma, size, grow);
2402 if (!error) {
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414 spin_lock(&mm->page_table_lock);
2415 if (vma->vm_flags & VM_LOCKED)
2416 mm->locked_vm += grow;
2417 vm_stat_account(mm, vma->vm_flags, grow);
2418 anon_vma_interval_tree_pre_update_vma(vma);
2419 vma->vm_end = address;
2420 anon_vma_interval_tree_post_update_vma(vma);
2421 if (vma->vm_next)
2422 vma_gap_update(vma->vm_next);
2423 else
2424 mm->highest_vm_end = vm_end_gap(vma);
2425 spin_unlock(&mm->page_table_lock);
2426
2427 perf_event_mmap(vma);
2428 }
2429 }
2430 }
2431 anon_vma_unlock_write(vma->anon_vma);
2432 khugepaged_enter_vma(vma, vma->vm_flags);
2433 validate_mm(mm);
2434 return error;
2435 }
2436 #endif

/*
 * vma is the first one with address < vma->vm_start.  Have to extend vma.
 */
2441 int expand_downwards(struct vm_area_struct *vma,
2442 unsigned long address)
2443 {
2444 struct mm_struct *mm = vma->vm_mm;
2445 struct vm_area_struct *prev;
2446 int error = 0;
2447
2448 address &= PAGE_MASK;
2449 if (address < mmap_min_addr)
2450 return -EPERM;
2451
2452
2453 prev = vma->vm_prev;
2454
2455 if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
2456 vma_is_accessible(prev)) {
2457 if (address - prev->vm_end < stack_guard_gap)
2458 return -ENOMEM;
2459 }
2460
2461
2462 if (unlikely(anon_vma_prepare(vma)))
2463 return -ENOMEM;
2464
2465
2466
2467
2468
2469
2470 anon_vma_lock_write(vma->anon_vma);
2471
2472
2473 if (address < vma->vm_start) {
2474 unsigned long size, grow;
2475
2476 size = vma->vm_end - address;
2477 grow = (vma->vm_start - address) >> PAGE_SHIFT;
2478
2479 error = -ENOMEM;
2480 if (grow <= vma->vm_pgoff) {
2481 error = acct_stack_growth(vma, size, grow);
2482 if (!error) {
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494 spin_lock(&mm->page_table_lock);
2495 if (vma->vm_flags & VM_LOCKED)
2496 mm->locked_vm += grow;
2497 vm_stat_account(mm, vma->vm_flags, grow);
2498 anon_vma_interval_tree_pre_update_vma(vma);
2499 vma->vm_start = address;
2500 vma->vm_pgoff -= grow;
2501 anon_vma_interval_tree_post_update_vma(vma);
2502 vma_gap_update(vma);
2503 spin_unlock(&mm->page_table_lock);
2504
2505 perf_event_mmap(vma);
2506 }
2507 }
2508 }
2509 anon_vma_unlock_write(vma->anon_vma);
2510 khugepaged_enter_vma(vma, vma->vm_flags);
2511 validate_mm(mm);
2512 return error;
2513 }

/* enforced gap between the expanding stack and other mappings. */
2516 unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
2517
2518 static int __init cmdline_parse_stack_guard_gap(char *p)
2519 {
2520 unsigned long val;
2521 char *endptr;
2522
2523 val = simple_strtoul(p, &endptr, 10);
2524 if (!*endptr)
2525 stack_guard_gap = val << PAGE_SHIFT;
2526
2527 return 1;
2528 }
2529 __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
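
/*
 * Example of the boot parameter handled above (hypothetical value): with
 * 4 KiB pages, "stack_guard_gap=512" on the kernel command line sets the
 * gap to 512 << 12 = 2 MiB; the built-in default is 256 pages (1 MiB).
 */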
2530
2531 #ifdef CONFIG_STACK_GROWSUP
2532 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2533 {
2534 return expand_upwards(vma, address);
2535 }
2536
2537 struct vm_area_struct *
2538 find_extend_vma(struct mm_struct *mm, unsigned long addr)
2539 {
2540 struct vm_area_struct *vma, *prev;
2541
2542 addr &= PAGE_MASK;
2543 vma = find_vma_prev(mm, addr, &prev);
2544 if (vma && (vma->vm_start <= addr))
2545 return vma;
2546 if (!prev || expand_stack(prev, addr))
2547 return NULL;
2548 if (prev->vm_flags & VM_LOCKED)
2549 populate_vma_page_range(prev, addr, prev->vm_end, NULL);
2550 return prev;
2551 }
2552 #else
2553 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2554 {
2555 return expand_downwards(vma, address);
2556 }
2557
2558 struct vm_area_struct *
2559 find_extend_vma(struct mm_struct *mm, unsigned long addr)
2560 {
2561 struct vm_area_struct *vma;
2562 unsigned long start;
2563
2564 addr &= PAGE_MASK;
2565 vma = find_vma(mm, addr);
2566 if (!vma)
2567 return NULL;
2568 if (vma->vm_start <= addr)
2569 return vma;
2570 if (!(vma->vm_flags & VM_GROWSDOWN))
2571 return NULL;
2572 start = vma->vm_start;
2573 if (expand_stack(vma, addr))
2574 return NULL;
2575 if (vma->vm_flags & VM_LOCKED)
2576 populate_vma_page_range(vma, addr, start, NULL);
2577 return vma;
2578 }
2579 #endif
2580
2581 EXPORT_SYMBOL_GPL(find_extend_vma);

/*
 * Ok - we have the memory areas we should free on the vma list,
 * so release them, and do the vma updates.
 *
 * Called with the mm semaphore held.
 */
2589 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2590 {
2591 unsigned long nr_accounted = 0;
2592
2593
2594 update_hiwater_vm(mm);
2595 do {
2596 long nrpages = vma_pages(vma);
2597
2598 if (vma->vm_flags & VM_ACCOUNT)
2599 nr_accounted += nrpages;
2600 vm_stat_account(mm, vma->vm_flags, -nrpages);
2601 vma = remove_vma(vma);
2602 } while (vma);
2603 vm_unacct_memory(nr_accounted);
2604 validate_mm(mm);
2605 }

/*
 * Get rid of page table information in the indicated region.
 *
 * Called with the mm semaphore held.
 */
2612 static void unmap_region(struct mm_struct *mm,
2613 struct vm_area_struct *vma, struct vm_area_struct *prev,
2614 unsigned long start, unsigned long end)
2615 {
2616 struct vm_area_struct *next = vma_next(mm, prev);
2617 struct mmu_gather tlb;
2618
2619 lru_add_drain();
2620 tlb_gather_mmu(&tlb, mm);
2621 update_hiwater_rss(mm);
2622 unmap_vmas(&tlb, vma, start, end);
2623 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2624 next ? next->vm_start : USER_PGTABLES_CEILING);
2625 tlb_finish_mmu(&tlb);
2626 }
2627
2628 /*
2629  * Create a list of vma's touched by the unmap, removing them from the mm's
2630  * vma list as we go..
2631  */
2632 static bool
2633 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2634 struct vm_area_struct *prev, unsigned long end)
2635 {
2636 struct vm_area_struct **insertion_point;
2637 struct vm_area_struct *tail_vma = NULL;
2638
2639 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2640 vma->vm_prev = NULL;
2641 do {
2642 vma_rb_erase(vma, &mm->mm_rb);
2643 if (vma->vm_flags & VM_LOCKED)
2644 mm->locked_vm -= vma_pages(vma);
2645 mm->map_count--;
2646 tail_vma = vma;
2647 vma = vma->vm_next;
2648 } while (vma && vma->vm_start < end);
2649 *insertion_point = vma;
2650 if (vma) {
2651 vma->vm_prev = prev;
2652 vma_gap_update(vma);
2653 } else
2654 mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
2655 tail_vma->vm_next = NULL;
2656
2657 /* Kill the cache */
2658 vmacache_invalidate(mm);
2659
2660 /*
2661  * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
2662  * VM_GROWSUP VMA. Such VMAs can change their size under
2663  * down_read(mmap_lock) and collide with the VMA we are about to unmap.
2664  */
2665 if (vma && (vma->vm_flags & VM_GROWSDOWN))
2666 return false;
2667 if (prev && (prev->vm_flags & VM_GROWSUP))
2668 return false;
2669 return true;
2670 }
2671
2672 /*
2673  * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
2674  * has already been checked or doesn't make sense to fail.
2675  */
2676 int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2677 unsigned long addr, int new_below)
2678 {
2679 struct vm_area_struct *new;
2680 int err;
2681
2682 if (vma->vm_ops && vma->vm_ops->may_split) {
2683 err = vma->vm_ops->may_split(vma, addr);
2684 if (err)
2685 return err;
2686 }
2687
2688 new = vm_area_dup(vma);
2689 if (!new)
2690 return -ENOMEM;
2691
2692 if (new_below)
2693 new->vm_end = addr;
2694 else {
2695 new->vm_start = addr;
2696 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2697 }
2698
2699 err = vma_dup_policy(vma, new);
2700 if (err)
2701 goto out_free_vma;
2702
2703 err = anon_vma_clone(new, vma);
2704 if (err)
2705 goto out_free_mpol;
2706
2707 if (new->vm_file)
2708 get_file(new->vm_file);
2709
2710 if (new->vm_ops && new->vm_ops->open)
2711 new->vm_ops->open(new);
2712
2713 if (new_below)
2714 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
2715 ((addr - new->vm_start) >> PAGE_SHIFT), new);
2716 else
2717 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
2718
2719 /* Success. */
2720 if (!err)
2721 return 0;
2722
2723 /* Clean everything up if vma_adjust failed. */
2724 if (new->vm_ops && new->vm_ops->close)
2725 new->vm_ops->close(new);
2726 if (new->vm_file)
2727 fput(new->vm_file);
2728 unlink_anon_vmas(new);
2729 out_free_mpol:
2730 mpol_put(vma_policy(new));
2731 out_free_vma:
2732 vm_area_free(new);
2733 return err;
2734 }
2735
2736 /*
2737  * Split a vma into two pieces at address 'addr', a new vma is allocated
2738  * either for the first part or the tail.
2739  */
2740 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2741 unsigned long addr, int new_below)
2742 {
2743 if (mm->map_count >= sysctl_max_map_count)
2744 return -ENOMEM;
2745
2746 return __split_vma(mm, vma, addr, new_below);
2747 }
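/*
 * Editor's sketch (assumption, not part of this file): callers that modify
 * per-VMA state over an arbitrary [start, end) range typically isolate it
 * by splitting at both boundaries first, roughly:
 *
 *	if (start > vma->vm_start) {
 *		error = split_vma(mm, vma, start, 1);
 *		if (error)
 *			goto out;
 *	}
 *	if (end < vma->vm_end) {
 *		error = split_vma(mm, vma, end, 0);
 *		if (error)
 *			goto out;
 *	}
 *
 * mprotect()/mlock()-style fixup helpers follow this pattern so that only
 * the VMA covering [start, end) has its attributes changed.
 */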
2748
2749 /*
2750  * Munmap is split into 2 main parts -- this part which finds
2751  * what needs doing, and the areas themselves, which do the
2752  * work.  This now handles partial unmappings.
2753  */
2754 int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2755 struct list_head *uf, bool downgrade)
2756 {
2757 unsigned long end;
2758 struct vm_area_struct *vma, *prev, *last;
2759
2760 if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
2761 return -EINVAL;
2762
2763 len = PAGE_ALIGN(len);
2764 end = start + len;
2765 if (len == 0)
2766 return -EINVAL;
2767
2768 /*
2769  * arch_unmap() might do unmaps itself.  It must be called
2770  * and finish any rbtree manipulation before this code
2771  * runs and also starts to manipulate the rbtree.
2772  */
2773 arch_unmap(mm, start, end);
2774
2775 /* Find the first overlapping VMA where start < vma->vm_end */
2776 vma = find_vma_intersection(mm, start, end);
2777 if (!vma)
2778 return 0;
2779 prev = vma->vm_prev;
2780
2781 /*
2782  * If we need to split any vma, do it now to save pain later.
2783  *
2784  * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
2785  * unmapped vm_area_struct will remain in use: so lower split_vma
2786  * places tmp vma above, and higher split_vma places tmp vma below.
2787  */
2788 if (start > vma->vm_start) {
2789 int error;
2790
2791 /*
2792  * Make sure that map_count on return from munmap() will
2793  * not exceed its limit; but let map_count go just above
2794  * its limit temporarily, to help free resources as expected.
2795  */
2796 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2797 return -ENOMEM;
2798
2799 error = __split_vma(mm, vma, start, 0);
2800 if (error)
2801 return error;
2802 prev = vma;
2803 }
2804
2805 /* Does it split the last one? */
2806 last = find_vma(mm, end);
2807 if (last && end > last->vm_start) {
2808 int error = __split_vma(mm, last, end, 1);
2809 if (error)
2810 return error;
2811 }
2812 vma = vma_next(mm, prev);
2813
2814 if (unlikely(uf)) {
2815 /*
2816  * If userfaultfd_unmap_prep returns an error the vmas
2817  * will remain split, but userland will get a
2818  * highly unexpected error anyway. This is no
2819  * different than the case where the first of the two
2820  * __split_vma fails, but we don't undo the first
2821  * split, despite we could. This is unlikely enough
2822  * failure that it's not worth optimizing it for.
2823  */
2824 int error = userfaultfd_unmap_prep(vma, start, end, uf);
2825 if (error)
2826 return error;
2827 }
2828
2829 /* Detach vmas from the rbtree */
2830 if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
2831 downgrade = false;
2832
2833 if (downgrade)
2834 mmap_write_downgrade(mm);
2835
2836 unmap_region(mm, vma, prev, start, end);
2837
2838 /* Fix up all other VM information */
2839 remove_vma_list(mm, vma);
2840
2841 return downgrade ? 1 : 0;
2842 }
2843
2844 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2845 struct list_head *uf)
2846 {
2847 return __do_munmap(mm, start, len, uf, false);
2848 }
2849
2850 static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
2851 {
2852 int ret;
2853 struct mm_struct *mm = current->mm;
2854 LIST_HEAD(uf);
2855
2856 if (mmap_write_lock_killable(mm))
2857 return -EINTR;
2858
2859 ret = __do_munmap(mm, start, len, &uf, downgrade);
2860
2861 /*
2862  * Returning 1 indicates that the mmap_lock has been downgraded.
2863  * But 1 is not a legal return value of vm_munmap() and munmap(), so reset it to 0 before returning.
2864  */
2865 if (ret == 1) {
2866 mmap_read_unlock(mm);
2867 ret = 0;
2868 } else
2869 mmap_write_unlock(mm);
2870
2871 userfaultfd_unmap_complete(mm, &uf);
2872 return ret;
2873 }
2874
2875 int vm_munmap(unsigned long start, size_t len)
2876 {
2877 return __vm_munmap(start, len, false);
2878 }
2879 EXPORT_SYMBOL(vm_munmap);
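/*
 * Editor's sketch (assumption, not part of this file): vm_munmap() is the
 * kernel-internal counterpart of munmap(2) for mappings set up on behalf
 * of the current task, e.g. with vm_mmap().  It takes and releases the
 * mmap_lock itself:
 *
 *	unsigned long uaddr = vm_mmap(file, 0, size, PROT_READ, MAP_SHARED, 0);
 *	if (!IS_ERR_VALUE(uaddr)) {
 *		... hand uaddr to userspace ...
 *		vm_munmap(uaddr, size);
 *	}
 */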
2880
2881 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2882 {
2883 addr = untagged_addr(addr);
2884 return __vm_munmap(addr, len, true);
2885 }
2886
2887
2888 /*
2889  * Emulation of the deprecated remap_file_pages() syscall.
2890  */
2891 SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
2892 unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
2893 {
2894
2895 struct mm_struct *mm = current->mm;
2896 struct vm_area_struct *vma;
2897 unsigned long populate = 0;
2898 unsigned long ret = -EINVAL;
2899 struct file *file;
2900
2901 pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
2902 current->comm, current->pid);
2903
2904 if (prot)
2905 return ret;
2906 start = start & PAGE_MASK;
2907 size = size & PAGE_MASK;
2908
2909 if (start + size <= start)
2910 return ret;
2911
2912 /* Does pgoff wrap? */
2913 if (pgoff + (size >> PAGE_SHIFT) < pgoff)
2914 return ret;
2915
2916 if (mmap_write_lock_killable(mm))
2917 return -EINTR;
2918
2919 vma = vma_lookup(mm, start);
2920
2921 if (!vma || !(vma->vm_flags & VM_SHARED))
2922 goto out;
2923
2924 if (start + size > vma->vm_end) {
2925 struct vm_area_struct *next;
2926
2927 for (next = vma->vm_next; next; next = next->vm_next) {
2928 /* hole between vmas? */
2929 if (next->vm_start != next->vm_prev->vm_end)
2930 goto out;
2931
2932 if (next->vm_file != vma->vm_file)
2933 goto out;
2934
2935 if (next->vm_flags != vma->vm_flags)
2936 goto out;
2937
2938 if (start + size <= next->vm_end)
2939 break;
2940 }
2941
2942 if (!next)
2943 goto out;
2944 }
2945
2946 prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
2947 prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
2948 prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
2949
2950 flags &= MAP_NONBLOCK;
2951 flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
2952 if (vma->vm_flags & VM_LOCKED)
2953 flags |= MAP_LOCKED;
2954
2955 file = get_file(vma->vm_file);
2956 ret = do_mmap(vma->vm_file, start, size,
2957 prot, flags, pgoff, &populate, NULL);
2958 fput(file);
2959 out:
2960 mmap_write_unlock(mm);
2961 if (populate)
2962 mm_populate(ret, populate);
2963 if (!IS_ERR_VALUE(ret))
2964 ret = 0;
2965 return ret;
2966 }
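/*
 * Editor's note (illustrative, not from the original source): the syscall
 * above only emulates the old nonlinear behaviour by redoing an mmap() of
 * the same file at the requested page offset.  From userspace, instead of
 *
 *	remap_file_pages(addr, size, 0, pgoff, 0);
 *
 * new code is expected to simply call
 *
 *	mmap(addr, size, prot, MAP_SHARED | MAP_FIXED, fd,
 *	     (off_t)pgoff * page_size);
 *
 * where page_size is sysconf(_SC_PAGESIZE).
 */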
2967
2968 /*
2969  *  this is really a simplified "do_mmap".  it only handles
2970  *  anonymous maps.  eventually we may be able to do some
2971  *  brk-specific accounting here.
2972  */
2973 static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
2974 {
2975 struct mm_struct *mm = current->mm;
2976 struct vm_area_struct *vma, *prev;
2977 struct rb_node **rb_link, *rb_parent;
2978 pgoff_t pgoff = addr >> PAGE_SHIFT;
2979 int error;
2980 unsigned long mapped_addr;
2981
2982 /* Until we need other flags, refuse anything except VM_EXEC. */
2983 if ((flags & (~VM_EXEC)) != 0)
2984 return -EINVAL;
2985 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2986
2987 mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2988 if (IS_ERR_VALUE(mapped_addr))
2989 return mapped_addr;
2990
2991 error = mlock_future_check(mm, mm->def_flags, len);
2992 if (error)
2993 return error;
2994
2995 /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
2996 if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
2997 return -ENOMEM;
2998
2999 /* Check against address space limits *after* clearing old maps... */
3000 if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
3001 return -ENOMEM;
3002
3003 if (mm->map_count > sysctl_max_map_count)
3004 return -ENOMEM;
3005
3006 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
3007 return -ENOMEM;
3008
3009 /* Can we just expand an old private anonymous mapping? */
3010 vma = vma_merge(mm, prev, addr, addr + len, flags,
3011 NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
3012 if (vma)
3013 goto out;
3014
3015 /*
3016  * create a vma struct for an anonymous mapping
3017  */
3018 vma = vm_area_alloc(mm);
3019 if (!vma) {
3020 vm_unacct_memory(len >> PAGE_SHIFT);
3021 return -ENOMEM;
3022 }
3023
3024 vma_set_anonymous(vma);
3025 vma->vm_start = addr;
3026 vma->vm_end = addr + len;
3027 vma->vm_pgoff = pgoff;
3028 vma->vm_flags = flags;
3029 vma->vm_page_prot = vm_get_page_prot(flags);
3030 vma_link(mm, vma, prev, rb_link, rb_parent);
3031 out:
3032 perf_event_mmap(vma);
3033 mm->total_vm += len >> PAGE_SHIFT;
3034 mm->data_vm += len >> PAGE_SHIFT;
3035 if (flags & VM_LOCKED)
3036 mm->locked_vm += (len >> PAGE_SHIFT);
3037 vma->vm_flags |= VM_SOFTDIRTY;
3038 return 0;
3039 }
3040
3041 int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
3042 {
3043 struct mm_struct *mm = current->mm;
3044 unsigned long len;
3045 int ret;
3046 bool populate;
3047 LIST_HEAD(uf);
3048
3049 len = PAGE_ALIGN(request);
3050 if (len < request)
3051 return -ENOMEM;
3052 if (!len)
3053 return 0;
3054
3055 if (mmap_write_lock_killable(mm))
3056 return -EINTR;
3057
3058 ret = do_brk_flags(addr, len, flags, &uf);
3059 populate = ((mm->def_flags & VM_LOCKED) != 0);
3060 mmap_write_unlock(mm);
3061 userfaultfd_unmap_complete(mm, &uf);
3062 if (populate && !ret)
3063 mm_populate(addr, len);
3064 return ret;
3065 }
3066 EXPORT_SYMBOL(vm_brk_flags);
3067
3068 int vm_brk(unsigned long addr, unsigned long len)
3069 {
3070 return vm_brk_flags(addr, len, 0);
3071 }
3072 EXPORT_SYMBOL(vm_brk);
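/*
 * Editor's sketch (assumption, not part of this file): vm_brk_flags() is the
 * helper binary loaders use to create anonymous, brk-style mappings such as
 * a zero-filled BSS, for example:
 *
 *	error = vm_brk_flags(bss_start, bss_len,
 *			     executable ? VM_EXEC : 0);
 *	if (error)
 *		return error;
 *
 * The names bss_start, bss_len and executable are placeholders; see the
 * binfmt loaders for the real call sites.
 */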
3073
3074 /* Release all mmaps. */
3075 void exit_mmap(struct mm_struct *mm)
3076 {
3077 struct mmu_gather tlb;
3078 struct vm_area_struct *vma;
3079 unsigned long nr_accounted = 0;
3080
3081 /* mm's last user has gone, and it's about to be pulled down */
3082 mmu_notifier_release(mm);
3083
3084 if (unlikely(mm_is_oom_victim(mm))) {
3085 /*
3086  * Manually reap the mm to free as much memory as possible.
3087  * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
3088  * this mm from further consideration.  Taking mm->mmap_lock for
3089  * write after setting MMF_OOM_SKIP will guarantee that the oom
3090  * reaper will not run on this mm again after mmap_lock is
3091  * dropped.
3092  *
3093  * Nothing can be holding mm->mmap_lock here and the above call
3094  * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
3095  * __oom_reap_task_mm() will not block.
3096  */
3097 (void)__oom_reap_task_mm(mm);
3098 set_bit(MMF_OOM_SKIP, &mm->flags);
3099 }
3100
3101 mmap_write_lock(mm);
3102 arch_exit_mmap(mm);
3103
3104 vma = mm->mmap;
3105 if (!vma) {
3106 /* Can happen if dup_mmap() received an OOM */
3107 mmap_write_unlock(mm);
3108 return;
3109 }
3110
3111 lru_add_drain();
3112 flush_cache_mm(mm);
3113 tlb_gather_mmu_fullmm(&tlb, mm);
3114 /* update_hiwater_rss(mm) here? but nobody should be looking */
3115 /* Use -1 here to ensure all VMAs in the mm are unmapped */
3116 unmap_vmas(&tlb, vma, 0, -1);
3117 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
3118 tlb_finish_mmu(&tlb);
3119
3120 /* Walk the list again, actually closing and freeing it. */
3121 while (vma) {
3122 if (vma->vm_flags & VM_ACCOUNT)
3123 nr_accounted += vma_pages(vma);
3124 vma = remove_vma(vma);
3125 cond_resched();
3126 }
3127 mm->mmap = NULL;
3128 mmap_write_unlock(mm);
3129 vm_unacct_memory(nr_accounted);
3130 }
3131
3132 /* Insert vm structure into process list sorted by address
3133  * and into the inode's i_mmap tree.  If vm_file is non-NULL
3134  * then i_mmap must not be empty.
3135  */
3136 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
3137 {
3138 struct vm_area_struct *prev;
3139 struct rb_node **rb_link, *rb_parent;
3140
3141 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
3142 &prev, &rb_link, &rb_parent))
3143 return -ENOMEM;
3144 if ((vma->vm_flags & VM_ACCOUNT) &&
3145 security_vm_enough_memory_mm(mm, vma_pages(vma)))
3146 return -ENOMEM;
3147
3148 /*
3149  * The vm_pgoff of a purely anonymous vma should be irrelevant
3150  * until its first write fault, when page's anon_vma and index
3151  * are set.  But now set the vm_pgoff it will almost certainly
3152  * end up with (unless mremap moves it elsewhere before that
3153  * first wfault), so /proc/pid/maps tells a consistent story.
3154  *
3155  * By setting it to reflect the virtual start address of the
3156  * vma, merges and splits can happen in a seamless way, just
3157  * using the existing file pgoff checks and manipulations.
3158  * Similarly in do_mmap and in do_brk_flags.
3159  */
3160 if (vma_is_anonymous(vma)) {
3161 BUG_ON(vma->anon_vma);
3162 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
3163 }
3164
3165 vma_link(mm, vma, prev, rb_link, rb_parent);
3166 return 0;
3167 }
3168
3169 /*
3170  * Copy the vma structure to a new location in the same mm,
3171  * prior to moving page table entries, to effect an mremap move.
3172  */
3173 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
3174 unsigned long addr, unsigned long len, pgoff_t pgoff,
3175 bool *need_rmap_locks)
3176 {
3177 struct vm_area_struct *vma = *vmap;
3178 unsigned long vma_start = vma->vm_start;
3179 struct mm_struct *mm = vma->vm_mm;
3180 struct vm_area_struct *new_vma, *prev;
3181 struct rb_node **rb_link, *rb_parent;
3182 bool faulted_in_anon_vma = true;
3183
3184 /*
3185  * If anonymous vma has not yet been faulted, update new pgoff
3186  * to match new location, to increase its chance of merging.
3187  */
3188 if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
3189 pgoff = addr >> PAGE_SHIFT;
3190 faulted_in_anon_vma = false;
3191 }
3192
3193 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
3194 return NULL;
3195 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
3196 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
3197 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
3198 if (new_vma) {
3199 /*
3200  * Source vma may have been merged into new_vma
3201  */
3202 if (unlikely(vma_start >= new_vma->vm_start &&
3203 vma_start < new_vma->vm_end)) {
3204 /*
3205  * The only way we can get a vma_merge with
3206  * self during an mremap is if the vma hasn't
3207  * been faulted in yet and we were allowed to
3208  * reset the dst vma->vm_pgoff to the
3209  * destination address of the mremap to allow
3210  * the merge to happen. mremap must change the
3211  * vm_pgoff linearity between src and dst vmas
3212  * (in turn preventing a vma_merge) to be
3213  * safe. It is only safe to keep the vm_pgoff
3214  * linear if there are no pages mapped yet.
3215  */
3216 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
3217 *vmap = vma = new_vma;
3218 }
3219 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
3220 } else {
3221 new_vma = vm_area_dup(vma);
3222 if (!new_vma)
3223 goto out;
3224 new_vma->vm_start = addr;
3225 new_vma->vm_end = addr + len;
3226 new_vma->vm_pgoff = pgoff;
3227 if (vma_dup_policy(vma, new_vma))
3228 goto out_free_vma;
3229 if (anon_vma_clone(new_vma, vma))
3230 goto out_free_mempol;
3231 if (new_vma->vm_file)
3232 get_file(new_vma->vm_file);
3233 if (new_vma->vm_ops && new_vma->vm_ops->open)
3234 new_vma->vm_ops->open(new_vma);
3235 vma_link(mm, new_vma, prev, rb_link, rb_parent);
3236 *need_rmap_locks = false;
3237 }
3238 return new_vma;
3239
3240 out_free_mempol:
3241 mpol_put(vma_policy(new_vma));
3242 out_free_vma:
3243 vm_area_free(new_vma);
3244 out:
3245 return NULL;
3246 }
3247
3248 /*
3249  * Return true if the calling process may expand its vm space by the passed
3250  * number of pages
3251  */
3252 bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
3253 {
3254 if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
3255 return false;
3256
3257 if (is_data_mapping(flags) &&
3258 mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
3259 /* Workaround for Valgrind */
3260 if (rlimit(RLIMIT_DATA) == 0 &&
3261 mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
3262 return true;
3263
3264 pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
3265 current->comm, current->pid,
3266 (mm->data_vm + npages) << PAGE_SHIFT,
3267 rlimit(RLIMIT_DATA),
3268 ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");
3269
3270 if (!ignore_rlimit_data)
3271 return false;
3272 }
3273
3274 return true;
3275 }
3276
3277 void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
3278 {
3279 WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
3280
3281 if (is_exec_mapping(flags))
3282 mm->exec_vm += npages;
3283 else if (is_stack_mapping(flags))
3284 mm->stack_vm += npages;
3285 else if (is_data_mapping(flags))
3286 mm->data_vm += npages;
3287 }
3288
3289 static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
3290
3291 /*
3292  * Having a close hook prevents vma merging regardless of flags.
3293  */
3294 static void special_mapping_close(struct vm_area_struct *vma)
3295 {
3296 }
3297
3298 static const char *special_mapping_name(struct vm_area_struct *vma)
3299 {
3300 return ((struct vm_special_mapping *)vma->vm_private_data)->name;
3301 }
3302
3303 static int special_mapping_mremap(struct vm_area_struct *new_vma)
3304 {
3305 struct vm_special_mapping *sm = new_vma->vm_private_data;
3306
3307 if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
3308 return -EFAULT;
3309
3310 if (sm->mremap)
3311 return sm->mremap(sm, new_vma);
3312
3313 return 0;
3314 }
3315
3316 static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
3317 {
3318 /*
3319  * Forbid splitting of special mappings - kernel has expectations over
3320  * the number of pages in mapping. Together with VM_DONTEXPAND
3321  * the size of vma should stay the same over the special mapping's
3322  * lifetime.
3323  */
3324 return -EINVAL;
3325 }
3326
3327 static const struct vm_operations_struct special_mapping_vmops = {
3328 .close = special_mapping_close,
3329 .fault = special_mapping_fault,
3330 .mremap = special_mapping_mremap,
3331 .name = special_mapping_name,
3332 /* vDSO code relies that VVAR can't be accessed remotely */
3333 .access = NULL,
3334 .may_split = special_mapping_split,
3335 };
3336
3337 static const struct vm_operations_struct legacy_special_mapping_vmops = {
3338 .close = special_mapping_close,
3339 .fault = special_mapping_fault,
3340 };
3341
3342 static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
3343 {
3344 struct vm_area_struct *vma = vmf->vma;
3345 pgoff_t pgoff;
3346 struct page **pages;
3347
3348 if (vma->vm_ops == &legacy_special_mapping_vmops) {
3349 pages = vma->vm_private_data;
3350 } else {
3351 struct vm_special_mapping *sm = vma->vm_private_data;
3352
3353 if (sm->fault)
3354 return sm->fault(sm, vmf->vma, vmf);
3355
3356 pages = sm->pages;
3357 }
3358
3359 for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
3360 pgoff--;
3361
3362 if (*pages) {
3363 struct page *page = *pages;
3364 get_page(page);
3365 vmf->page = page;
3366 return 0;
3367 }
3368
3369 return VM_FAULT_SIGBUS;
3370 }
3371
3372 static struct vm_area_struct *__install_special_mapping(
3373 struct mm_struct *mm,
3374 unsigned long addr, unsigned long len,
3375 unsigned long vm_flags, void *priv,
3376 const struct vm_operations_struct *ops)
3377 {
3378 int ret;
3379 struct vm_area_struct *vma;
3380
3381 vma = vm_area_alloc(mm);
3382 if (unlikely(vma == NULL))
3383 return ERR_PTR(-ENOMEM);
3384
3385 vma->vm_start = addr;
3386 vma->vm_end = addr + len;
3387
3388 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
3389 vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
3390 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
3391
3392 vma->vm_ops = ops;
3393 vma->vm_private_data = priv;
3394
3395 ret = insert_vm_struct(mm, vma);
3396 if (ret)
3397 goto out;
3398
3399 vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
3400
3401 perf_event_mmap(vma);
3402
3403 return vma;
3404
3405 out:
3406 vm_area_free(vma);
3407 return ERR_PTR(ret);
3408 }
3409
3410 bool vma_is_special_mapping(const struct vm_area_struct *vma,
3411 const struct vm_special_mapping *sm)
3412 {
3413 return vma->vm_private_data == sm &&
3414 (vma->vm_ops == &special_mapping_vmops ||
3415 vma->vm_ops == &legacy_special_mapping_vmops);
3416 }
3417
3418 /*
3419  * Called with mm->mmap_lock held for writing.
3420  * Insert a new vma covering the given region, with the given flags.
3421  * Its pages are supplied by the given array of struct page *.
3422  * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
3423  * The region past the last page supplied will always produce SIGBUS.
3424  * The array pointer and the pages it points to are assumed to stay alive
3425  * for as long as this mapping might exist.
3426  */
3427 struct vm_area_struct *_install_special_mapping(
3428 struct mm_struct *mm,
3429 unsigned long addr, unsigned long len,
3430 unsigned long vm_flags, const struct vm_special_mapping *spec)
3431 {
3432 return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
3433 &special_mapping_vmops);
3434 }
3435
3436 int install_special_mapping(struct mm_struct *mm,
3437 unsigned long addr, unsigned long len,
3438 unsigned long vm_flags, struct page **pages)
3439 {
3440 struct vm_area_struct *vma = __install_special_mapping(
3441 mm, addr, len, vm_flags, (void *)pages,
3442 &legacy_special_mapping_vmops);
3443
3444 return PTR_ERR_OR_ZERO(vma);
3445 }
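/*
 * Editor's sketch (assumption, not part of this file): architectures
 * typically use _install_special_mapping() with a static descriptor to map
 * things like the vDSO, roughly:
 *
 *	static struct page *vdso_pages[2];
 *	static const struct vm_special_mapping vdso_mapping = {
 *		.name  = "[vdso]",
 *		.pages = vdso_pages,
 *	};
 *
 *	vma = _install_special_mapping(mm, addr, 2 * PAGE_SIZE,
 *				       VM_READ | VM_EXEC |
 *				       VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
 *				       &vdso_mapping);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);
 *
 * The descriptor and page array must stay alive for the lifetime of the
 * mapping, since special_mapping_fault() dereferences them at fault time.
 */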
3446
3447 static DEFINE_MUTEX(mm_all_locks_mutex);
3448
3449 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
3450 {
3451 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
3452 /*
3453  * The LSB of head.next can't change from under us
3454  * because we hold the mm_all_locks_mutex.
3455  */
3456 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
3457 /*
3458  * We can safely modify head.next after taking the
3459  * anon_vma->root->rwsem. If some other vma in this mm shares
3460  * the same anon_vma we won't take it again.
3461  *
3462  * No need of atomic instructions here, head.next
3463  * can't change from under us thanks to the
3464  * anon_vma->root->rwsem.
3465  */
3466 if (__test_and_set_bit(0, (unsigned long *)
3467 &anon_vma->root->rb_root.rb_root.rb_node))
3468 BUG();
3469 }
3470 }
3471
3472 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3473 {
3474 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3475 /*
3476  * AS_MM_ALL_LOCKS can't change from under us because
3477  * we hold the mm_all_locks_mutex.
3478  *
3479  * Operations on ->flags have to be atomic because
3480  * even if AS_MM_ALL_LOCKS is stable thanks to the
3481  * mm_all_locks_mutex, there may be other cpus
3482  * changing other bitflags in parallel to us.
3483  */
3484 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3485 BUG();
3486 down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
3487 }
3488 }
3489
3490 /*
3491  * This operation locks against the VM for all pte/vma/mm related
3492  * operations that could ever happen on a certain mm. This includes
3493  * vmtruncate, try_to_unmap, and all page faults.
3494  *
3495  * The caller must take the mmap_lock in write mode before calling
3496  * mm_take_all_locks(). The caller isn't allowed to release the
3497  * mmap_lock until mm_drop_all_locks() returns.
3498  *
3499  * mmap_lock in write mode is required in order to block all operations
3500  * that could modify pagetables and free pages without need of
3501  * altering the vma layout. It's also needed in write mode to avoid new
3502  * anon_vmas to be associated with existing vmas.
3503  *
3504  * A single task can't take more than one mm_take_all_locks() in a row
3505  * as the locks are nested into the struct vm_area_struct::anon_vma
3506  * list.
3507  *
3508  * The locks are taken in a fixed order to avoid deadlocks: first the
3509  * i_mmap_rwsem of every hugetlbfs mapping, then the i_mmap_rwsem of
3510  * every other file mapping, and finally every anon_vma rwsem reachable
3511  * from the mm's vma list.  Within each class the order is arbitrary,
3512  * because the VM never takes two locks of the same class nested inside
3513  * one another.
3514  *
3515  * Each lock is taken with down_write_nest_lock() against the mmap_lock,
3516  * so lockdep sees a single nesting level, and a marker bit (the rb_root
3517  * LSB for anon_vmas, AS_MM_ALL_LOCKS for address_spaces) ensures a lock
3518  * shared by several vmas is only taken once.
3519  *
3520  * mm_all_locks_mutex serializes concurrent callers and is held until
3521  * mm_drop_all_locks() releases every lock taken here; mm_drop_all_locks()
3522  * must be called with the mmap_lock still held for writing.
3523  *
3524  * mm_take_all_locks() can fail if it's interrupted by signals, in which
3525  * case it drops whatever it had already taken and returns -EINTR.
3526  */
3527 int mm_take_all_locks(struct mm_struct *mm)
3528 {
3529 struct vm_area_struct *vma;
3530 struct anon_vma_chain *avc;
3531
3532 mmap_assert_write_locked(mm);
3533
3534 mutex_lock(&mm_all_locks_mutex);
3535
3536 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3537 if (signal_pending(current))
3538 goto out_unlock;
3539 if (vma->vm_file && vma->vm_file->f_mapping &&
3540 is_vm_hugetlb_page(vma))
3541 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3542 }
3543
3544 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3545 if (signal_pending(current))
3546 goto out_unlock;
3547 if (vma->vm_file && vma->vm_file->f_mapping &&
3548 !is_vm_hugetlb_page(vma))
3549 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3550 }
3551
3552 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3553 if (signal_pending(current))
3554 goto out_unlock;
3555 if (vma->anon_vma)
3556 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3557 vm_lock_anon_vma(mm, avc->anon_vma);
3558 }
3559
3560 return 0;
3561
3562 out_unlock:
3563 mm_drop_all_locks(mm);
3564 return -EINTR;
3565 }
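/*
 * Editor's sketch (assumption, not part of this file): the usual calling
 * pattern, e.g. during mmu notifier registration, brackets the critical
 * section with the mmap_lock held for writing:
 *
 *	mmap_write_lock(mm);
 *	ret = mm_take_all_locks(mm);
 *	if (!ret) {
 *		... update state that page faults and rmap walks must not
 *		    observe half-done ...
 *		mm_drop_all_locks(mm);
 *	}
 *	mmap_write_unlock(mm);
 */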
3566
3567 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3568 {
3569 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
3570 /*
3571  * The LSB of head.next can't change to 0 from under
3572  * us because we hold the mm_all_locks_mutex.
3573  *
3574  * We must however clear the bitflag before unlocking
3575  * the vma so the users using the anon_vma->rb_root will
3576  * never see our bitflag.
3577  *
3578  * No need of atomic instructions here, head.next can't
3579  * change from under us until we release the
3580  * anon_vma->root->rwsem.
3581  */
3582 if (!__test_and_clear_bit(0, (unsigned long *)
3583 &anon_vma->root->rb_root.rb_root.rb_node))
3584 BUG();
3585 anon_vma_unlock_write(anon_vma);
3586 }
3587 }
3588
3589 static void vm_unlock_mapping(struct address_space *mapping)
3590 {
3591 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3592 /*
3593  * AS_MM_ALL_LOCKS can't change to 0 from under us
3594  * because we hold the mm_all_locks_mutex.
3595  */
3596 i_mmap_unlock_write(mapping);
3597 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3598 &mapping->flags))
3599 BUG();
3600 }
3601 }
3602
3603 /*
3604  * The mmap_lock cannot be released by the caller until
3605  * mm_drop_all_locks() returns.
3606  */
3607 void mm_drop_all_locks(struct mm_struct *mm)
3608 {
3609 struct vm_area_struct *vma;
3610 struct anon_vma_chain *avc;
3611
3612 mmap_assert_write_locked(mm);
3613 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
3614
3615 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3616 if (vma->anon_vma)
3617 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3618 vm_unlock_anon_vma(avc->anon_vma);
3619 if (vma->vm_file && vma->vm_file->f_mapping)
3620 vm_unlock_mapping(vma->vm_file->f_mapping);
3621 }
3622
3623 mutex_unlock(&mm_all_locks_mutex);
3624 }
3625
3626 /*
3627  * initialise the percpu counter for VM
3628  */
3629 void __init mmap_init(void)
3630 {
3631 int ret;
3632
3633 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
3634 VM_BUG_ON(ret);
3635 }
3636
3637
3638 /*
3639  * Initialise sysctl_user_reserve_kbytes.
3640  *
3641  * This is intended to prevent a user from starting a single memory hogging
3642  * process, such that they cannot recover (kill the hog) in OOM_DISABLE mode.
3643  *
3644  * The default value is min(3% of free memory, 128MB).
3645  * 128MB is enough to recover with sshd/login, bash, and top/kill.
3646  */
3647 static int init_user_reserve(void)
3648 {
3649 unsigned long free_kbytes;
3650
3651 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3652
3653 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3654 return 0;
3655 }
3656 subsys_initcall(init_user_reserve);
3657
3658 /*
3659  * Initialise sysctl_admin_reserve_kbytes.
3660  *
3661  * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3662  * to log in and kill a memory hogging process.
3663  *
3664  * Systems with more than 256MB will reserve 8MB, enough to recover
3665  * with sshd, bash, and top in OOM_DISABLE mode. Smaller systems will
3666  * only reserve 3% of free pages by default.
3667  */
3668 static int init_admin_reserve(void)
3669 {
3670 unsigned long free_kbytes;
3671
3672 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3673
3674 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3675 return 0;
3676 }
3677 subsys_initcall(init_admin_reserve);
3678
3679 /*
3680  * Reinitialise user and admin reserves if memory is added or removed.
3681  *
3682  * The default user reserve max is 128MB, and the default max for the
3683  * admin reserve is 8MB. These are usually, but not always, enough to
3684  * enable recovery from a memory hogging process using login/sshd, a shell,
3685  * and tools like top. It may make sense to increase or even disable the
3686  * reserve depending on the existence of swap or variations in the recovery
3687  * tools. So, the admin may have changed them.
3688  *
3689  * If memory is added and the reserves have been eliminated or increased above
3690  * the default max, then we'll trust the admin.
3691  *
3692  * If memory is removed and there isn't enough free memory, then we
3693  * need to reset the reserves.
3694  *
3695  * Otherwise keep the reserve set by the admin.
3696  */
3697 static int reserve_mem_notifier(struct notifier_block *nb,
3698 unsigned long action, void *data)
3699 {
3700 unsigned long tmp, free_kbytes;
3701
3702 switch (action) {
3703 case MEM_ONLINE:
3704 /* Default max is 128MB. Leave alone if modified by operator. */
3705 tmp = sysctl_user_reserve_kbytes;
3706 if (0 < tmp && tmp < (1UL << 17))
3707 init_user_reserve();
3708
3709 /* Default max is 8MB.  Leave alone if modified by operator. */
3710 tmp = sysctl_admin_reserve_kbytes;
3711 if (0 < tmp && tmp < (1UL << 13))
3712 init_admin_reserve();
3713
3714 break;
3715 case MEM_OFFLINE:
3716 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3717
3718 if (sysctl_user_reserve_kbytes > free_kbytes) {
3719 init_user_reserve();
3720 pr_info("vm.user_reserve_kbytes reset to %lu\n",
3721 sysctl_user_reserve_kbytes);
3722 }
3723
3724 if (sysctl_admin_reserve_kbytes > free_kbytes) {
3725 init_admin_reserve();
3726 pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3727 sysctl_admin_reserve_kbytes);
3728 }
3729 break;
3730 default:
3731 break;
3732 }
3733 return NOTIFY_OK;
3734 }
3735
3736 static struct notifier_block reserve_mem_nb = {
3737 .notifier_call = reserve_mem_notifier,
3738 };
3739
3740 static int __meminit init_reserve_notifier(void)
3741 {
3742 if (register_hotmemory_notifier(&reserve_mem_nb))
3743 pr_err("Failed registering memory add/remove notifier for admin reserve\n");
3744
3745 return 0;
3746 }
3747 subsys_initcall(init_reserve_notifier);