/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
    struct anon_vma *root;      /* Root of this anon_vma tree */
    struct rw_semaphore rwsem;  /* W: modification, R: walking the list */
    /*
     * The refcount is taken on an anon_vma when there is no
     * guarantee that the vma of page tables will exist for
     * the duration of the operation. A caller that takes
     * the reference is responsible for clearing up the
     * anon_vma if they are the last user on release
     */
    atomic_t refcount;

    /*
     * Count of child anon_vmas. Equal to the count of all anon_vmas that
     * have ->parent pointing to this one, including itself.
     *
     * This counter is used when deciding whether to reuse an anon_vma
     * instead of forking a new one. See the comments in anon_vma_clone().
     */
    unsigned long num_children;
    /* Count of VMAs whose ->anon_vma pointer points to this object. */
    unsigned long num_active_vmas;

    struct anon_vma *parent;    /* Parent of this anon_vma */

    /*
     * NOTE: the LSB of the rb_root.rb_node is set by
     * mm_take_all_locks() _after_ taking the above lock. So the
     * rb_root must only be read/written after taking the above lock
     * to be sure to see a valid next pointer. The LSB bit itself
     * is serialized by a system wide lock only visible to
     * mm_take_all_locks() (mm_all_locks_mutex).
     */

    /* Interval tree of private "related" vmas */
    struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
    struct vm_area_struct *vma;
    struct anon_vma *anon_vma;
    struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
    struct rb_node rb;          /* locked by anon_vma->rwsem */
    unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
    unsigned long cached_vma_start, cached_vma_last;
#endif
};
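
/*
 * Illustrative sketch (editorial, not part of the API above): with the
 * appropriate locks held, the chains hanging off a VMA are reached through
 * the vma->anon_vma_chain list head via the same_vma member, e.g.:
 *
 *     struct anon_vma_chain *avc;
 *
 *     list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 *         do_something_with(avc->anon_vma);
 *
 * where do_something_with() is a stand-in for the caller's per-anon_vma work.
 * The rb side is only ever traversed under anon_vma->root->rwsem.
 */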

enum ttu_flags {
    TTU_SPLIT_HUGE_PMD  = 0x4,  /* split huge PMD if any */
    TTU_IGNORE_MLOCK    = 0x8,  /* ignore mlock */
    TTU_SYNC        = 0x10, /* avoid racy checks with PVMW_SYNC */
    TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */
    TTU_BATCH_FLUSH     = 0x40, /* Batch TLB flushes where possible
                     * and caller guarantees they will
                     * do a final flush if necessary */
    TTU_RMAP_LOCKED     = 0x80, /* do not grab rmap lock:
                     * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
    atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
    if (atomic_dec_and_test(&anon_vma->refcount))
        __put_anon_vma(anon_vma);
}

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
    down_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
    up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
    down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
    return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
    up_read(&anon_vma->root->rwsem);
}
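
/*
 * Minimal usage sketch for the locking helpers above: writers modifying the
 * interval tree take the lock for writing, rmap walkers take it for reading,
 * and both pair lock/unlock on the same anon_vma:
 *
 *     anon_vma_lock_read(anon_vma);
 *     ... walk anon_vma->rb_root ...
 *     anon_vma_unlock_read(anon_vma);
 *
 * All of these operate on anon_vma->root->rwsem, so locking any anon_vma in
 * a tree serializes against the whole tree.
 */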


/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);   /* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
    if (likely(vma->anon_vma))
        return 0;

    return __anon_vma_prepare(vma);
}
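
/*
 * Typical caller pattern (illustrative): an anonymous fault path makes sure
 * the VMA has an anon_vma before installing a new anonymous page, and treats
 * failure as out-of-memory:
 *
 *     if (unlikely(anon_vma_prepare(vma)))
 *         return VM_FAULT_OOM;
 */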

static inline void anon_vma_merge(struct vm_area_struct *vma,
                  struct vm_area_struct *next)
{
    VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
    unlink_anon_vmas(next);
}

struct anon_vma *page_get_anon_vma(struct page *page);

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: if the page is a subpage of a compound page, it is
 * mapped via a PTE. The mapped (sub)page is possibly shared between processes.
 */
#define RMAP_NONE       ((__force rmap_t)0)

/* The (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE      ((__force rmap_t)BIT(0))

/*
 * The compound page is not mapped via PTEs, but instead via a single PMD and
 * should be accounted accordingly.
 */
#define RMAP_COMPOUND       ((__force rmap_t)BIT(1))

/*
 * rmap interfaces called when adding or removing pte of page
 */
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
        unsigned long address, rmap_t flags);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
        unsigned long address);
void page_add_file_rmap(struct page *, struct vm_area_struct *,
        bool compound);
void page_remove_rmap(struct page *, struct vm_area_struct *,
        bool compound);

void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
        unsigned long address, rmap_t flags);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
        unsigned long address);
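
/*
 * Illustrative pairing of the rmap_t flags with the calls above, assuming the
 * usual caller context (PT lock held, page charged and about to be mapped):
 *
 *     page_add_new_anon_rmap(page, vma, addr);
 *         for a freshly allocated anonymous page, exclusive by definition;
 *     page_add_anon_rmap(page, vma, addr, RMAP_EXCLUSIVE);
 *         for an existing anonymous page owned exclusively by this mapping;
 *     page_add_anon_rmap(page, vma, addr, RMAP_COMPOUND);
 *         for a compound page mapped by a single PMD rather than PTEs.
 */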

static inline void __page_dup_rmap(struct page *page, bool compound)
{
    atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
}

static inline void page_dup_file_rmap(struct page *page, bool compound)
{
    __page_dup_rmap(page, compound);
}

/**
 * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
 *              anonymous page
 * @page: the page to duplicate the mapping for
 * @compound: the page is mapped as compound or as a small page
 * @vma: the source vma
 *
 * The caller needs to hold the PT lock and the vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the page may be pinned; device
 * private pages cannot get pinned and consequently this function cannot fail.
 *
 * If duplicating the mapping succeeds, the page has to be mapped R/O into
 * the parent and the child. It must *not* get mapped writable after this call.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
                     struct vm_area_struct *vma)
{
    VM_BUG_ON_PAGE(!PageAnon(page), page);

    /*
     * No need to check+clear for already shared pages, including KSM
     * pages.
     */
    if (!PageAnonExclusive(page))
        goto dup;

    /*
     * If this page may have been pinned by the parent process, don't
     * allow duplicating the mapping; instead require the caller to,
     * e.g., copy the page immediately for the child so that we always
     * guarantee the pinned page won't be randomly replaced in the
     * future on write faults.
     */
    if (likely(!is_device_private_page(page) &&
        unlikely(page_needs_cow_for_dma(vma, page))))
        return -EBUSY;

    ClearPageAnonExclusive(page);
    /*
     * It's okay to share the anon page between both processes, mapping
     * the page R/O into both processes.
     */
dup:
    __page_dup_rmap(page, compound);
    return 0;
}
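
/*
 * Illustrative fork-time sketch (loosely modelled on the PTE copy path;
 * copy_page_for_child() is a stand-in for the caller's own fallback): if
 * duplication fails, the child gets its own copy instead of sharing:
 *
 *     if (unlikely(page_try_dup_anon_rmap(page, false, src_vma)))
 *         return copy_page_for_child(...);
 *     ... success: parent and child must now map the page R/O ...
 */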

/**
 * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
 *                shared to prepare for KSM or temporary unmapping
 * @page: the exclusive anonymous page to try marking possibly shared
 *
 * The caller needs to hold the PT lock and has to have the page table entry
 * cleared/invalidated+flushed, to properly sync against GUP-fast.
 *
 * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
 * to duplicate a mapping, but instead to prepare for KSM or temporarily
 * unmapping a page (swap, migration) via page_remove_rmap().
 *
 * Marking the page shared can only fail if the page may be pinned; device
 * private pages cannot get pinned and consequently this function cannot fail.
 *
 * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY
 * otherwise.
 */
static inline int page_try_share_anon_rmap(struct page *page)
{
    VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);

    /* See page_try_dup_anon_rmap(). */
    if (likely(!is_device_private_page(page) &&
        unlikely(page_maybe_dma_pinned(page))))
        return -EBUSY;

    ClearPageAnonExclusive(page);
    return 0;
}
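
/*
 * Illustrative unmap-path sketch (loosely modelled on the swap/migration
 * paths): the PTE is cleared and flushed first, and on failure the caller
 * rolls the PTE back and gives up on this mapping:
 *
 *     pteval = ptep_clear_flush(vma, address, pvmw.pte);
 *     if (anon_exclusive && page_try_share_anon_rmap(page)) {
 *         set_pte_at(mm, address, pvmw.pte, pteval);
 *         ... abort unmapping this pte ...
 *     }
 */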

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
            struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, struct page **pages,
                void *arg);
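
/*
 * Hedged usage sketch for the declarations above: reclaim-style callers hold
 * the folio lock, combine ttu_flags as needed and re-test the mapcount, since
 * try_to_unmap() itself returns void:
 *
 *     try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
 *     if (folio_mapped(folio))
 *         ... still mapped somewhere, back off ...
 */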

/* Avoid racy checks */
#define PVMW_SYNC       (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION      (1 << 1)

struct page_vma_mapped_walk {
    unsigned long pfn;
    unsigned long nr_pages;
    pgoff_t pgoff;
    struct vm_area_struct *vma;
    unsigned long address;
    pmd_t *pmd;
    pte_t *pte;
    spinlock_t *ptl;
    unsigned int flags;
};

#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)   \
    struct page_vma_mapped_walk name = {                \
        .pfn = page_to_pfn(_page),              \
        .nr_pages = compound_nr(_page),             \
        .pgoff = page_to_pgoff(_page),              \
        .vma = _vma,                        \
        .address = _address,                    \
        .flags = _flags,                    \
    }

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \
    struct page_vma_mapped_walk name = {                \
        .pfn = folio_pfn(_folio),               \
        .nr_pages = folio_nr_pages(_folio),         \
        .pgoff = folio_pgoff(_folio),               \
        .vma = _vma,                        \
        .address = _address,                    \
        .flags = _flags,                    \
    }

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
    /*
     * HugeTLB ptes are looked up directly, never via pte_offset_map(),
     * so they must not be pte_unmap()ed.
     */
    if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
        pte_unmap(pvmw->pte);
    if (pvmw->ptl)
        spin_unlock(pvmw->ptl);
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
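
/*
 * Minimal sketch of driving the walk above over a single VMA (0 means no
 * PVMW_* flags; the loop body is the caller's own per-mapping work):
 *
 *     DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 *
 *     while (page_vma_mapped_walk(&pvmw)) {
 *         ... inspect or modify *pvmw.pte (or pvmw.pmd) under pvmw.ptl ...
 *     }
 *
 * page_vma_mapped_walk() cleans up after itself when it returns false;
 * page_vma_mapped_walk_done() is for breaking out of the loop early.
 */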

/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
              struct vm_area_struct *vma);

void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);

int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
 * rmap_walk_control: allows callers to tailor the rmap traversal to their needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicates that the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where the page is mapped
 * done: checks whether the traversal should terminate early
 * anon_lock: takes the anon_vma lock in an optimized way rather than the default
 * invalid_vma: skips VMAs that are of no interest to the caller
 */
struct rmap_walk_control {
    void *arg;
    bool try_lock;
    bool contended;
    /*
     * Return false if page table scanning in rmap_walk should be stopped.
     * Otherwise, return true.
     */
    bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
                    unsigned long addr, void *arg);
    int (*done)(struct folio *folio);
    struct anon_vma *(*anon_lock)(struct folio *folio,
                      struct rmap_walk_control *rwc);
    bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
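
/*
 * Minimal setup sketch for the control structure above (my_rmap_one() and
 * my_state are hypothetical, supplied by the caller):
 *
 *     struct rmap_walk_control rwc = {
 *         .rmap_one = my_rmap_one,
 *         .arg = &my_state,
 *     };
 *
 *     rmap_walk(folio, &rwc);
 *
 * Callbacks left NULL fall back to the defaults: every VMA is considered
 * valid and the walk only stops early if rmap_one() returns false.
 */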

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);

/*
 * Called by memory-failure.c to kill processes.
 */
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
                      struct rmap_walk_control *rwc);
void page_unlock_anon_vma_read(struct anon_vma *anon_vma);

#else   /* !CONFIG_MMU */

#define anon_vma_init()     do {} while (0)
#define anon_vma_prepare(vma)   (0)
#define anon_vma_link(vma)  do {} while (0)

static inline int folio_referenced(struct folio *folio, int is_locked,
                  struct mem_cgroup *memcg,
                  unsigned long *vm_flags)
{
    *vm_flags = 0;
    return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
    return 0;
}
#endif  /* CONFIG_MMU */

static inline int page_mkclean(struct page *page)
{
    return folio_mkclean(page_folio(page));
}
#endif  /* _LINUX_RMAP_H */