// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
    if (depth == 3 && PTRS_PER_PMD == 1)
        depth = 2;
    if (depth == 2 && PTRS_PER_PUD == 1)
        depth = 1;
    if (depth == 1 && PTRS_PER_P4D == 1)
        depth = 0;
    return depth;
}
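
/*
 * Worked example (illustrative, not from the original source): on a
 * configuration where both the p4d and pud levels are folded
 * (PTRS_PER_P4D == 1 and PTRS_PER_PUD == 1), a hole reported from
 * walk_pud_range() starts out at depth 2; real_depth() rewrites that to 1
 * and then to 0, i.e. the missing entry is really a pgd entry.
 */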

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
                unsigned long end, struct mm_walk *walk)
{
    const struct mm_walk_ops *ops = walk->ops;
    int err = 0;

    for (;;) {
        err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
        if (err)
            break;
        if (addr >= end - PAGE_SIZE)
            break;
        addr += PAGE_SIZE;
        pte++;
    }
    return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
              struct mm_walk *walk)
{
    pte_t *pte;
    int err = 0;
    spinlock_t *ptl;

    if (walk->no_vma) {
        pte = pte_offset_map(pmd, addr);
        err = walk_pte_range_inner(pte, addr, end, walk);
        pte_unmap(pte);
    } else {
        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        err = walk_pte_range_inner(pte, addr, end, walk);
        pte_unmap_unlock(pte, ptl);
    }

    return err;
}
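
/*
 * Illustrative sketch (not part of the original file): a minimal
 * ->pte_entry() callback as driven by walk_pte_range() above. When the
 * walk has a vma (walk->no_vma is false) the callback runs under the PTE
 * lock, so it must not sleep. The name count_present_pte() and the use of
 * walk->private as a counter are assumptions made for this example.
 */
static int count_present_pte(pte_t *pte, unsigned long addr,
                 unsigned long next, struct mm_walk *walk)
{
    unsigned long *count = walk->private;

    if (pte_present(*pte))
        (*count)++;

    return 0;    /* 0 means: keep walking */
}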

#ifdef CONFIG_ARCH_HAS_HUGEPD
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
                 unsigned long end, struct mm_walk *walk, int pdshift)
{
    int err = 0;
    const struct mm_walk_ops *ops = walk->ops;
    int shift = hugepd_shift(*phpd);
    int page_size = 1 << shift;

    if (!ops->pte_entry)
        return 0;

    if (addr & (page_size - 1))
        return 0;

    for (;;) {
        pte_t *pte;

        spin_lock(&walk->mm->page_table_lock);
        pte = hugepte_offset(*phpd, addr, pdshift);
        err = ops->pte_entry(pte, addr, addr + page_size, walk);
        spin_unlock(&walk->mm->page_table_lock);

        if (err)
            break;
        if (addr >= end - page_size)
            break;
        addr += page_size;
    }
    return err;
}
#else
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
                 unsigned long end, struct mm_walk *walk, int pdshift)
{
    return 0;
}
#endif

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
              struct mm_walk *walk)
{
    pmd_t *pmd;
    unsigned long next;
    const struct mm_walk_ops *ops = walk->ops;
    int err = 0;
    int depth = real_depth(3);

    pmd = pmd_offset(pud, addr);
    do {
again:
        next = pmd_addr_end(addr, end);
        if (pmd_none(*pmd)) {
            if (ops->pte_hole)
                err = ops->pte_hole(addr, next, depth, walk);
            if (err)
                break;
            continue;
        }

        walk->action = ACTION_SUBTREE;

        /*
         * This implies that each ->pmd_entry() handler
         * needs to know about pmd_trans_huge() pmds
         */
        if (ops->pmd_entry)
            err = ops->pmd_entry(pmd, addr, next, walk);
        if (err)
            break;

        if (walk->action == ACTION_AGAIN)
            goto again;

        /*
         * Check this here so we only break down trans_huge
         * pages when we _need_ to
         */
        if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
            walk->action == ACTION_CONTINUE ||
            !(ops->pte_entry))
            continue;

        if (walk->vma) {
            split_huge_pmd(walk->vma, pmd, addr);
            if (pmd_trans_unstable(pmd))
                goto again;
        }

        if (is_hugepd(__hugepd(pmd_val(*pmd))))
            err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
        else
            err = walk_pte_range(pmd, addr, next, walk);
        if (err)
            break;
    } while (pmd++, addr = next, addr != end);

    return err;
}
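
/*
 * Illustrative sketch (not part of the original file): a ->pmd_entry()
 * callback cooperating with walk_pmd_range() above. pmd_entry() is called
 * before any splitting, so (as the comment above notes) it has to
 * recognise huge pmds itself; setting walk->action = ACTION_CONTINUE tells
 * the walker to neither split nor descend into this pmd. The name
 * skip_huge_pmd() is an assumption made for this example.
 */
static int skip_huge_pmd(pmd_t *pmd, unsigned long addr,
             unsigned long next, struct mm_walk *walk)
{
    if (pmd_trans_huge(*pmd)) {
        /* ... handle the huge pmd as a single entry here ... */
        walk->action = ACTION_CONTINUE;    /* don't split, don't recurse */
    }

    return 0;
}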

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
              struct mm_walk *walk)
{
    pud_t *pud;
    unsigned long next;
    const struct mm_walk_ops *ops = walk->ops;
    int err = 0;
    int depth = real_depth(2);

    pud = pud_offset(p4d, addr);
    do {
 again:
        next = pud_addr_end(addr, end);
        if (pud_none(*pud)) {
            if (ops->pte_hole)
                err = ops->pte_hole(addr, next, depth, walk);
            if (err)
                break;
            continue;
        }

        walk->action = ACTION_SUBTREE;

        if (ops->pud_entry)
            err = ops->pud_entry(pud, addr, next, walk);
        if (err)
            break;

        if (walk->action == ACTION_AGAIN)
            goto again;

        if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
            walk->action == ACTION_CONTINUE ||
            !(ops->pmd_entry || ops->pte_entry))
            continue;

        if (walk->vma)
            split_huge_pud(walk->vma, pud, addr);
        if (pud_none(*pud))
            goto again;

        if (is_hugepd(__hugepd(pud_val(*pud))))
            err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
        else
            err = walk_pmd_range(pud, addr, next, walk);
        if (err)
            break;
    } while (pud++, addr = next, addr != end);

    return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
              struct mm_walk *walk)
{
    p4d_t *p4d;
    unsigned long next;
    const struct mm_walk_ops *ops = walk->ops;
    int err = 0;
    int depth = real_depth(1);

    p4d = p4d_offset(pgd, addr);
    do {
        next = p4d_addr_end(addr, end);
        if (p4d_none_or_clear_bad(p4d)) {
            if (ops->pte_hole)
                err = ops->pte_hole(addr, next, depth, walk);
            if (err)
                break;
            continue;
        }
        if (ops->p4d_entry) {
            err = ops->p4d_entry(p4d, addr, next, walk);
            if (err)
                break;
        }
        if (is_hugepd(__hugepd(p4d_val(*p4d))))
            err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
        else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
            err = walk_pud_range(p4d, addr, next, walk);
        if (err)
            break;
    } while (p4d++, addr = next, addr != end);

    return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
              struct mm_walk *walk)
{
    pgd_t *pgd;
    unsigned long next;
    const struct mm_walk_ops *ops = walk->ops;
    int err = 0;

    if (walk->pgd)
        pgd = walk->pgd + pgd_index(addr);
    else
        pgd = pgd_offset(walk->mm, addr);
    do {
        next = pgd_addr_end(addr, end);
        if (pgd_none_or_clear_bad(pgd)) {
            if (ops->pte_hole)
                err = ops->pte_hole(addr, next, 0, walk);
            if (err)
                break;
            continue;
        }
        if (ops->pgd_entry) {
            err = ops->pgd_entry(pgd, addr, next, walk);
            if (err)
                break;
        }
        if (is_hugepd(__hugepd(pgd_val(*pgd))))
            err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
        else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
            err = walk_p4d_range(pgd, addr, next, walk);
        if (err)
            break;
    } while (pgd++, addr = next, addr != end);

    return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                       unsigned long end)
{
    unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
    return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                  struct mm_walk *walk)
{
    struct vm_area_struct *vma = walk->vma;
    struct hstate *h = hstate_vma(vma);
    unsigned long next;
    unsigned long hmask = huge_page_mask(h);
    unsigned long sz = huge_page_size(h);
    pte_t *pte;
    const struct mm_walk_ops *ops = walk->ops;
    int err = 0;

    do {
        next = hugetlb_entry_end(h, addr, end);
        pte = huge_pte_offset(walk->mm, addr & hmask, sz);

        if (pte)
            err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
        else if (ops->pte_hole)
            err = ops->pte_hole(addr, next, -1, walk);

        if (err)
            break;
    } while (addr = next, addr != end);

    return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                  struct mm_walk *walk)
{
    return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean
 * error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
            struct mm_walk *walk)
{
    struct vm_area_struct *vma = walk->vma;
    const struct mm_walk_ops *ops = walk->ops;

    if (ops->test_walk)
        return ops->test_walk(start, end, walk);

    /*
     * A vma with VM_PFNMAP doesn't have any valid struct pages behind its
     * range, so we don't walk over it as we do for normal vmas. However,
     * some callers are interested in handling hole ranges and don't want
     * to just ignore any single address range. Such users certainly define
     * their ->pte_hole() callbacks, so let's delegate vma(VM_PFNMAP) to
     * them.
     */
    if (vma->vm_flags & VM_PFNMAP) {
        int err = 1;
        if (ops->pte_hole)
            err = ops->pte_hole(start, end, -1, walk);
        return err ? err : 1;
    }
    return 0;
}
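
/*
 * Illustrative sketch (not part of the original file): a ->test_walk()
 * callback as consumed by walk_page_test() above. Returning 1 skips the
 * vma, 0 walks it, and a negative value aborts the whole walk. Restricting
 * the walk to anonymous vmas is just an example policy, and the function
 * name is an assumption made for this example.
 */
static int test_walk_anon_only(unsigned long start, unsigned long end,
                   struct mm_walk *walk)
{
    if (!vma_is_anonymous(walk->vma))
        return 1;    /* skip this vma, keep walking the rest */

    return 0;        /* walk this vma */
}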

static int __walk_page_range(unsigned long start, unsigned long end,
            struct mm_walk *walk)
{
    int err = 0;
    struct vm_area_struct *vma = walk->vma;
    const struct mm_walk_ops *ops = walk->ops;

    if (ops->pre_vma) {
        err = ops->pre_vma(start, end, walk);
        if (err)
            return err;
    }

    if (is_vm_hugetlb_page(vma)) {
        if (ops->hugetlb_entry)
            err = walk_hugetlb_range(start, end, walk);
    } else
        err = walk_pgd_range(start, end, walk);

    if (ops->post_vma)
        ops->post_vma(walk);

    return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:     mm_struct representing the target process of page table walk
 * @start:  start address of the virtual address range
 * @end:    end address of the virtual address range
 * @ops:    operation to call during the walk
 * @private:    private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up one of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded to handle the current entry; if the end address has
 *         not been reached yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
        unsigned long end, const struct mm_walk_ops *ops,
        void *private)
{
    int err = 0;
    unsigned long next;
    struct vm_area_struct *vma;
    struct mm_walk walk = {
        .ops        = ops,
        .mm     = mm,
        .private    = private,
    };

    if (start >= end)
        return -EINVAL;

    if (!walk.mm)
        return -EINVAL;

    mmap_assert_locked(walk.mm);

    vma = find_vma(walk.mm, start);
    do {
        if (!vma) { /* after the last vma */
            walk.vma = NULL;
            next = end;
            if (ops->pte_hole)
                err = ops->pte_hole(start, next, -1, &walk);
        } else if (start < vma->vm_start) { /* outside vma */
            walk.vma = NULL;
            next = min(end, vma->vm_start);
            if (ops->pte_hole)
                err = ops->pte_hole(start, next, -1, &walk);
        } else { /* inside vma */
            walk.vma = vma;
            next = min(end, vma->vm_end);
            vma = vma->vm_next;

            err = walk_page_test(start, next, &walk);
            if (err > 0) {
                /*
                 * positive return values are purely for
                 * controlling the pagewalk, so should never
                 * be passed to the callers.
                 */
                err = 0;
                continue;
            }
            if (err < 0)
                break;
            err = __walk_page_range(start, next, &walk);
        }
        if (err)
            break;
    } while (start = next, start < end);
    return err;
}
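
/*
 * Illustrative usage sketch (not part of the original file): a caller
 * bundles its callbacks into a const struct mm_walk_ops, takes mmap_lock,
 * and hands private state through the last argument. count_present_pte()
 * is the hypothetical pte_entry callback sketched after walk_pte_range()
 * above; both names below exist only for this example.
 */
static const struct mm_walk_ops count_present_ops = {
    .pte_entry  = count_present_pte,
};

static unsigned long count_present_pages(struct mm_struct *mm,
                     unsigned long start, unsigned long end)
{
    unsigned long count = 0;

    mmap_read_lock(mm);    /* walk_page_range() asserts mmap_lock is held */
    walk_page_range(mm, start, end, &count_present_ops, &count);
    mmap_read_unlock(mm);

    return count;
}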

/*
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
              unsigned long end, const struct mm_walk_ops *ops,
              pgd_t *pgd,
              void *private)
{
    struct mm_walk walk = {
        .ops        = ops,
        .mm     = mm,
        .pgd        = pgd,
        .private    = private,
        .no_vma     = true
    };

    if (start >= end || !walk.mm)
        return -EINVAL;

    mmap_assert_write_locked(walk.mm);

    return walk_pgd_range(start, end, &walk);
}
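
/*
 * Illustrative usage sketch (not part of the original file): walking page
 * tables that have no vmas behind them, such as the kernel page tables of
 * init_mm. The caller must hold mmap_lock for write on the mm it passes
 * in. The ops structure, state and address range below are assumptions
 * made for this example.
 *
 *    mmap_write_lock(&init_mm);
 *    err = walk_page_range_novma(&init_mm, start, end, &dump_ops,
 *                                NULL, &state);
 *    mmap_write_unlock(&init_mm);
 */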

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
        void *private)
{
    struct mm_walk walk = {
        .ops        = ops,
        .mm     = vma->vm_mm,
        .vma        = vma,
        .private    = private,
    };
    int err;

    if (!walk.mm)
        return -EINVAL;

    mmap_assert_locked(walk.mm);

    err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
    if (err > 0)
        return 0;
    if (err < 0)
        return err;
    return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
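
/*
 * Illustrative usage sketch (not part of the original file): walk_page_vma()
 * is the single-vma variant of walk_page_range(). A caller that already
 * holds a vma (and mmap_lock) can simply do, reusing the hypothetical ops
 * from the sketch above:
 *
 *    err = walk_page_vma(vma, &count_present_ops, &count);
 */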

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops:    operation to call during the walk
 * @private:    private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
              pgoff_t nr, const struct mm_walk_ops *ops,
              void *private)
{
    struct mm_walk walk = {
        .ops        = ops,
        .private    = private,
    };
    struct vm_area_struct *vma;
    pgoff_t vba, vea, cba, cea;
    unsigned long start_addr, end_addr;
    int err = 0;

    lockdep_assert_held(&mapping->i_mmap_rwsem);
    vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
                  first_index + nr - 1) {
        /* Clip to the vma */
        vba = vma->vm_pgoff;
        vea = vba + vma_pages(vma);
        cba = first_index;
        cba = max(cba, vba);
        cea = first_index + nr;
        cea = min(cea, vea);

        start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
        end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
        if (start_addr >= end_addr)
            continue;

        walk.vma = vma;
        walk.mm = vma->vm_mm;

        err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
        if (err > 0) {
            err = 0;
            break;
        } else if (err < 0)
            break;

        err = __walk_page_range(start_addr, end_addr, &walk);
        if (err)
            break;
    }

    return err;
}
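
/*
 * Illustrative usage sketch (not part of the original file): walking every
 * mapping of a file range via its address_space. The caller holds the
 * i_mmap lock instead of any mmap_lock; the ops structure and the offsets
 * below are assumptions made for this example.
 *
 *    i_mmap_lock_read(mapping);
 *    err = walk_page_mapping(mapping, first_index, nr, &my_ops, NULL);
 *    i_mmap_unlock_read(mapping);
 */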