#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

static bool tlb_next_batch(struct mmu_gather *tlb)
{
    struct mmu_gather_batch *batch;

    batch = tlb->active;
    if (batch->next) {
        tlb->active = batch->next;
        return true;
    }

    if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
        return false;

    batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
    if (!batch)
        return false;

    tlb->batch_count++;
    batch->next = NULL;
    batch->nr   = 0;
    batch->max  = MAX_GATHER_BATCH;

    tlb->active->next = batch;
    tlb->active = batch;

    return true;
}

static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
    struct mmu_gather_batch *batch;

    for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
        struct page **pages = batch->pages;

        do {
            /*
             * Limit the number of pages freed per chunk when
             * PAGE_SIZE > 4K, so each chunk stays bounded and we
             * can reschedule between chunks.
             */
            unsigned int nr = min(512U, batch->nr);

            free_pages_and_swap_cache(pages, nr);
            pages += nr;
            batch->nr -= nr;

            cond_resched();
        } while (batch->nr);
    }
    tlb->active = &tlb->local;
}

static void tlb_batch_list_free(struct mmu_gather *tlb)
{
    struct mmu_gather_batch *batch, *next;

    for (batch = tlb->local.next; batch; batch = next) {
        next = batch->next;
        free_pages((unsigned long)batch, 0);
    }
    tlb->local.next = NULL;
}

bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
    struct mmu_gather_batch *batch;

    VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
    VM_WARN_ON(tlb->page_size != page_size);
#endif

    batch = tlb->active;
    /*
     * Add the page and check if we are full. If so, try to open a
     * new batch; if that fails, return true so the caller flushes.
     */
    batch->pages[batch->nr++] = page;
    if (batch->nr == batch->max) {
        if (!tlb_next_batch(tlb))
            return true;
        batch = tlb->active;
    }
    VM_BUG_ON_PAGE(batch->nr > batch->max, page);

    return false;
}

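/*
 * Illustrative sketch (not part of the original file): how a caller is
 * expected to consume the return value of __tlb_remove_page_size().  A
 * true return means the batch could not be grown, so the caller must
 * flush before queueing more pages.  This roughly mirrors what the
 * generic tlb_remove_page_size() wrapper in include/asm-generic/tlb.h
 * does; the function name below is hypothetical.
 */
static inline void example_tlb_remove_page(struct mmu_gather *tlb,
                                           struct page *page)
{
    if (__tlb_remove_page_size(tlb, page, PAGE_SIZE))
        tlb_flush_mmu(tlb);     /* batch full: flush TLB, then free pages */
}
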
#endif /* MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
    int i;

    for (i = 0; i < batch->nr; i++)
        __tlb_remove_table(batch->tables[i]);

    free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, then free the page. Since the
 * disabling of IRQs delays the completion of the TLB flush, we can never
 * observe an already freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by some
 * other means, and this is that mechanism.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage; this
 * allocation is deep inside the MM code and can thus easily fail under memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 */

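/*
 * Illustrative sketch (not part of the original file): the lockless-walker
 * side that the scheme above protects, loosely modeled on gup_fast().
 * Disabling IRQs either holds off the IPI-based TLB flush or, with
 * CONFIG_MMU_GATHER_RCU_TABLE_FREE, acts as an RCU-sched read-side
 * section, so the page directories cannot be freed under the walker.
 * The walk helper below is hypothetical.
 */
static int example_lockless_walk(struct mm_struct *mm, unsigned long addr)
{
    unsigned long flags;
    int ret;

    local_irq_save(flags);                      /* hold off table freeing */
    ret = example_walk_page_tables(mm, addr);   /* hypothetical walk helper */
    local_irq_restore(flags);

    return ret;
}
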
static void tlb_remove_table_smp_sync(void *arg)
{
    /* Simply deliver the interrupt */
}

static void tlb_remove_table_sync_one(void)
{
    /*
     * This isn't an RCU grace period and hence the page-tables cannot be
     * assumed to be actually RCU-freed.
     *
     * It is however sufficient for software page-table walkers that rely on
     * IRQ disabling.
     */
    smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
    __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
}

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
    call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

static void tlb_remove_table_sync_one(void) { }

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
    __tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * Flush the TLB here if tlb_remove_table() is expected to imply a TLB
 * invalidate on this architecture.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
    if (tlb_needs_table_invalidate()) {
        /*
         * Invalidate page-table caches used by hardware walkers. Then
         * we still need to RCU-sched wait while freeing the pages
         * because software walkers can still be in-flight.
         */
        tlb_flush_mmu_tlbonly(tlb);
    }
}

static void tlb_remove_table_one(void *table)
{
    tlb_remove_table_sync_one();
    __tlb_remove_table(table);
}

static void tlb_table_flush(struct mmu_gather *tlb)
{
    struct mmu_table_batch **batch = &tlb->batch;

    if (*batch) {
        tlb_table_invalidate(tlb);
        tlb_remove_table_free(*batch);
        *batch = NULL;
    }
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
    struct mmu_table_batch **batch = &tlb->batch;

    if (*batch == NULL) {
        *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
        if (*batch == NULL) {
            tlb_table_invalidate(tlb);
            tlb_remove_table_one(table);
            return;
        }
        (*batch)->nr = 0;
    }

    (*batch)->tables[(*batch)->nr++] = table;
    if ((*batch)->nr == MAX_TABLE_BATCH)
        tlb_table_flush(tlb);
}

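/*
 * Illustrative sketch (not part of the original file): how an architecture
 * that frees page tables through this path typically hands a just-unlinked
 * page-table page to tlb_remove_table() from its pte_free_tlb()-style
 * helpers.  The name and the plain pointer argument are simplifications;
 * real architectures encode the table in arch-specific ways.
 */
static inline void example_pte_free_tlb(struct mmu_gather *tlb, void *table)
{
    tlb_remove_table(tlb, table);   /* queued; freed after IPI/RCU sync */
}
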
static inline void tlb_table_init(struct mmu_gather *tlb)
{
    tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
    tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
    tlb_batch_pages_flush(tlb);
#endif
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
    tlb_flush_mmu_tlbonly(tlb);
    tlb_flush_mmu_free(tlb);
}

static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                 bool fullmm)
{
    tlb->mm = mm;
    tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
    tlb->need_flush_all = 0;
    tlb->local.next = NULL;
    tlb->local.nr   = 0;
    tlb->local.max  = ARRAY_SIZE(tlb->__pages);
    tlb->active     = &tlb->local;
    tlb->batch_count = 0;
#endif

    tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
    tlb->page_size = 0;
#endif

    __tlb_reset_range(tlb);
    inc_tlb_flush_pending(tlb->mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
    __tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
    __tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
    /*
     * If parallel threads are doing PTE changes on the same range under a
     * non-exclusive lock (e.g., mmap_lock read-side) but defer the TLB
     * flush by batching, one thread may end up seeing inconsistent PTEs
     * and be left with stale TLB entries.  So flush the TLB forcefully
     * if we detect parallel PTE-batching threads.
     *
     * However, some syscalls, e.g. munmap(), may free page tables; this
     * requires a forced flush of everything in the given range.  Otherwise
     * stale TLB entries may remain on architectures, e.g. aarch64, that
     * can restrict the flush to a specific page-table level.
     */
    if (mm_tlb_flush_nested(tlb->mm)) {
        /*
         * On aarch64, fullmm yields better performance by avoiding
         * multiple CPUs spamming TLBI messages at the same time.
         *
         * On x86, non-fullmm doesn't yield a significant difference
         * against fullmm.
         */
        tlb->fullmm = 1;
        __tlb_reset_range(tlb);
        tlb->freed_tables = 1;
    }

    tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
    tlb_batch_list_free(tlb);
#endif
    dec_tlb_flush_pending(tlb->mm);
}
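
/*
 * Illustrative sketch (not part of the original file): the typical caller
 * lifecycle around the API above.  The unmap step stands in for whatever
 * clears the page-table entries and queues pages/tables on the gather
 * (e.g. the zap/unmap paths in mm/memory.c); the helper name below is
 * hypothetical.
 */
static void example_teardown(struct mm_struct *mm, unsigned long start,
                             unsigned long end)
{
    struct mmu_gather tlb;

    tlb_gather_mmu(&tlb, mm);                   /* initialize on-stack gather */
    example_unmap_range(&tlb, mm, start, end);  /* hypothetical: clears PTEs, queues pages */
    tlb_finish_mmu(&tlb);                       /* flush TLB, free pages and tables */
}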