misc/sgi-gru/grutlbpurge.c

0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * SN Platform GRU Driver
0004  *
0005  *      MMUOPS callbacks  + TLB flushing
0006  *
0007  * This file handles mmu notifier callbacks from the core kernel. The callbacks
0008  * are used to update the TLB in the GRU as a result of changes in the
0009  * state of a process address space. This file also handles TLB invalidates
0010  * from the GRU driver.
0011  *
0012  *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
0013  */
0014
0015 #include <linux/kernel.h>
0016 #include <linux/list.h>
0017 #include <linux/spinlock.h>
0018 #include <linux/mm.h>
0019 #include <linux/slab.h>
0020 #include <linux/device.h>
0021 #include <linux/hugetlb.h>
0022 #include <linux/delay.h>
0023 #include <linux/timex.h>
0024 #include <linux/srcu.h>
0025 #include <asm/processor.h>
0026 #include "gru.h"
0027 #include "grutables.h"
0028 #include <asm/uv/uv_hub.h>
0029
/* Cheap varying value: expands to a read of get_cycles(). */
#define gru_random()	(get_cycles())
0031
0032 /* ---------------------------------- TLB Invalidation functions --------
0033  * get_tgh_handle
0034  *
0035  * Find a TGH to use for issuing a TLB invalidate. For GRUs that are on the
0036  * local blade, use a fixed TGH that is a function of the blade-local cpu
0037  * number. Normally, this TGH is private to the cpu & no contention occurs for
0038  * the TGH. For offblade GRUs, select a random TGH in the range above the
0039  * private TGHs. A spinlock is required to access this TGH & the lock must be
0040  * released when the invalidate completes. This sucks, but it is the best we
0041  * can do.
0042  *
0043  * Note that the spinlock is IN the TGH handle so locking does not involve
0044  * additional cache lines.
0045  *
0046  */
0047 static inline int get_off_blade_tgh(struct gru_state *gru)
0048 {
0049     int n;
0050
0051     n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
0052     n = gru_random() % n;
0053     n += gru->gs_tgh_first_remote;
0054     return n;
0055 }
0056
0057 static inline int get_on_blade_tgh(struct gru_state *gru)
0058 {
0059     return uv_blade_processor_id() >> gru->gs_tgh_local_shift;
0060 }
0061
0062 static struct gru_tlb_global_handle *get_lock_tgh_handle(struct gru_state
0063                              *gru)
0064 {
0065     struct gru_tlb_global_handle *tgh;
0066     int n;
0067
0068     preempt_disable();
0069     if (uv_numa_blade_id() == gru->gs_blade_id)
0070         n = get_on_blade_tgh(gru);
0071     else
0072         n = get_off_blade_tgh(gru);
0073     tgh = get_tgh_by_index(gru, n);
0074     lock_tgh_handle(tgh);
0075
0076     return tgh;
0077 }
0078
/*
 * Release a TGH obtained from get_lock_tgh_handle(): unlock the handle,
 * then re-enable the preemption that was disabled when it was acquired.
 */
static void get_unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
{
	unlock_tgh_handle(tgh);

	/* Pairs with the preempt_disable() in get_lock_tgh_handle(). */
	preempt_enable();
}
0084
0085 /*
0086  * gru_flush_tlb_range
0087  *
0088  * General purpose TLB invalidation function. This function scans every GRU in
0089  * the ENTIRE system (partition) looking for GRUs where the specified MM has
0090  * been accessed by the GRU. For each GRU found, the TLB must be invalidated OR
0091  * the ASID invalidated. Invalidating an ASID causes a new ASID to be assigned
0092  * on the next fault. This effectively flushes the ENTIRE TLB for the MM at the
0093  * cost of (possibly) a large number of future TLBmisses.
0094  *
0095  * The current algorithm is optimized based on the following (somewhat true)
0096  * assumptions:
0097  *  - GRU contexts are not loaded into a GRU unless a reference is made to
0098  *    the data segment or control block (this is true, not an assumption).
0099  *    If a DS/CB is referenced, the user will also issue instructions that
0100  *    cause TLBmisses. It is not necessary to optimize for the case where
0101  *    contexts are loaded but no instructions cause TLB misses. (I know
0102  *    this will happen but I'm not optimizing for it).
0103  *  - GRU instructions to invalidate TLB entries are SLOOOOWWW - normally
0104  *    a few usec but in unusual cases, it could be longer. Avoid if
0105  *    possible.
0106  *  - intrablade process migration between cpus is not frequent but is
0107  *    common.
0108  *  - a GRU context is not typically migrated to a different GRU on the
0109  *    blade because of intrablade migration
0110  *  - interblade migration is rare. Processes migrate their GRU context to
0111  *    the new blade.
0112  *  - if interblade migration occurs, migration back to the original blade
0113  *    is very very rare (ie., no optimization for this case)
0114  *  - most GRU instruction operate on a subset of the user REGIONS. Code
0115  *    & shared library regions are not likely targets of GRU instructions.
0116  *
0117  * To help improve the efficiency of TLB invalidation, the GMS data
0118  * structure is maintained for EACH address space (MM struct). The GMS is
0119  * also the structure that contains the pointer to the mmu callout
0120  * functions. This structure is linked to the mm_struct for the address space
0121  * using the mmu "register" function. The mmu interfaces are used to
0122  * provide the callbacks for TLB invalidation. The GMS contains:
0123  *
0124  *  - asid[maxgrus] array. ASIDs are assigned to a GRU when a context is
0125  *    loaded into the GRU.
0126  *  - asidmap[maxgrus]. bitmap to make it easier to find non-zero asids in
0127  *    the above array
0128  *  - ctxbitmap[maxgrus]. Indicates the contexts that are currently active
0129  *    in the GRU for the address space. This bitmap must be passed to the
0130  *    GRU to do an invalidate.
0131  *
0132  * The current algorithm for invalidating TLBs is:
0133  *  - scan the asidmap for GRUs where the context has been loaded, ie,
0134  *    asid is non-zero.
0135  *  - for each gru found:
0136  *      - if the ctxtmap is non-zero, there are active contexts in the
0137  *        GRU. TLB invalidate instructions must be issued to the GRU.
0138  *      - if the ctxtmap is zero, no context is active. Set the ASID to
0139  *        zero to force a full TLB invalidation. This is fast but will
0140  *        cause a lot of TLB misses if the context is reloaded onto the
0141  *        GRU
0142  *
0143  */
0144
0145 void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
0146              unsigned long len)
0147 {
0148     struct gru_state *gru;
0149     struct gru_mm_tracker *asids;
0150     struct gru_tlb_global_handle *tgh;
0151     unsigned long num;
0152     int grupagesize, pagesize, pageshift, gid, asid;
0153
0154     /* ZZZ TODO - handle huge pages */
0155     pageshift = PAGE_SHIFT;
0156     pagesize = (1UL << pageshift);
0157     grupagesize = GRU_PAGESIZE(pageshift);
0158     num = min(((len + pagesize - 1) >> pageshift), GRUMAXINVAL);
0159
0160     STAT(flush_tlb);
0161     gru_dbg(grudev, "gms %p, start 0x%lx, len 0x%lx, asidmap 0x%lx\n", gms,
0162         start, len, gms->ms_asidmap[0]);
0163
0164     spin_lock(&gms->ms_asid_lock);
0165     for_each_gru_in_bitmap(gid, gms->ms_asidmap) {
0166         STAT(flush_tlb_gru);
0167         gru = GID_TO_GRU(gid);
0168         asids = gms->ms_asids + gid;
0169         asid = asids->mt_asid;
0170         if (asids->mt_ctxbitmap && asid) {
0171             STAT(flush_tlb_gru_tgh);
0172             asid = GRUASID(asid, start);
0173             gru_dbg(grudev,
0174     "  FLUSH gruid %d, asid 0x%x, vaddr 0x%lx, vamask 0x%x, num %ld, cbmap 0x%x\n",
0175                   gid, asid, start, grupagesize, num, asids->mt_ctxbitmap);
0176             tgh = get_lock_tgh_handle(gru);
0177             tgh_invalidate(tgh, start, ~0, asid, grupagesize, 0,
0178                        num - 1, asids->mt_ctxbitmap);
0179             get_unlock_tgh_handle(tgh);
0180         } else {
0181             STAT(flush_tlb_gru_zero_asid);
0182             asids->mt_asid = 0;
0183             __clear_bit(gru->gs_gid, gms->ms_asidmap);
0184             gru_dbg(grudev,
0185     "  CLEARASID gruid %d, asid 0x%x, cbtmap 0x%x, asidmap 0x%lx\n",
0186                 gid, asid, asids->mt_ctxbitmap,
0187                 gms->ms_asidmap[0]);
0188         }
0189     }
0190     spin_unlock(&gms->ms_asid_lock);
0191 }
0192
0193 /*
0194  * Flush the entire TLB on a chiplet.
0195  */
0196 void gru_flush_all_tlb(struct gru_state *gru)
0197 {
0198     struct gru_tlb_global_handle *tgh;
0199
0200     gru_dbg(grudev, "gid %d\n", gru->gs_gid);
0201     tgh = get_lock_tgh_handle(gru);
0202     tgh_invalidate(tgh, 0, ~0, 0, 1, 1, GRUMAXINVAL - 1, 0xffff);
0203     get_unlock_tgh_handle(tgh);
0204 }
0205
0206 /*
0207  * MMUOPS notifier callout functions
0208  */
0209 static int gru_invalidate_range_start(struct mmu_notifier *mn,
0210             const struct mmu_notifier_range *range)
0211 {
0212     struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
0213                          ms_notifier);
0214
0215     STAT(mmu_invalidate_range);
0216     atomic_inc(&gms->ms_range_active);
0217     gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
0218         range->start, range->end, atomic_read(&gms->ms_range_active));
0219     gru_flush_tlb_range(gms, range->start, range->end - range->start);
0220
0221     return 0;
0222 }
0223
0224 static void gru_invalidate_range_end(struct mmu_notifier *mn,
0225             const struct mmu_notifier_range *range)
0226 {
0227     struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
0228                          ms_notifier);
0229
0230     /* ..._and_test() provides needed barrier */
0231     (void)atomic_dec_and_test(&gms->ms_range_active);
0232
0233     wake_up_all(&gms->ms_wait_queue);
0234     gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n",
0235         gms, range->start, range->end);
0236 }
0237
0238 static struct mmu_notifier *gru_alloc_notifier(struct mm_struct *mm)
0239 {
0240     struct gru_mm_struct *gms;
0241
0242     gms = kzalloc(sizeof(*gms), GFP_KERNEL);
0243     if (!gms)
0244         return ERR_PTR(-ENOMEM);
0245     STAT(gms_alloc);
0246     spin_lock_init(&gms->ms_asid_lock);
0247     init_waitqueue_head(&gms->ms_wait_queue);
0248
0249     return &gms->ms_notifier;
0250 }
0251
0252 static void gru_free_notifier(struct mmu_notifier *mn)
0253 {
0254     kfree(container_of(mn, struct gru_mm_struct, ms_notifier));
0255     STAT(gms_free);
0256 }
0257
0258 static const struct mmu_notifier_ops gru_mmuops = {
0259     .invalidate_range_start = gru_invalidate_range_start,
0260     .invalidate_range_end   = gru_invalidate_range_end,
0261     .alloc_notifier     = gru_alloc_notifier,
0262     .free_notifier      = gru_free_notifier,
0263 };
0264
0265 struct gru_mm_struct *gru_register_mmu_notifier(void)
0266 {
0267     struct mmu_notifier *mn;
0268
0269     mn = mmu_notifier_get_locked(&gru_mmuops, current->mm);
0270     if (IS_ERR(mn))
0271         return ERR_CAST(mn);
0272
0273     return container_of(mn, struct gru_mm_struct, ms_notifier);
0274 }
0275
0276 void gru_drop_mmu_notifier(struct gru_mm_struct *gms)
0277 {
0278     mmu_notifier_put(&gms->ms_notifier);
0279 }
0280
0281 /*
0282  * Setup TGH parameters. There are:
0283  *  - 24 TGH handles per GRU chiplet
0284  *  - a portion (MAX_LOCAL_TGH) of the handles are reserved for
0285  *    use by blade-local cpus
0286  *  - the rest are used by off-blade cpus. This usage is
0287  *    less frequent than blade-local usage.
0288  *
0289  * For now, use 16 handles for local flushes, 8 for remote flushes. If the blade
0290  * has less than or equal to 16 cpus, each cpu has a unique handle that it can
0291  * use.
0292  */
0293 #define MAX_LOCAL_TGH   16
0294
0295 void gru_tgh_flush_init(struct gru_state *gru)
0296 {
0297     int cpus, shift = 0, n;
0298
0299     cpus = uv_blade_nr_possible_cpus(gru->gs_blade_id);
0300
0301     /* n = cpus rounded up to next power of 2 */
0302     if (cpus) {
0303         n = 1 << fls(cpus - 1);
0304
0305         /*
0306          * shift count for converting local cpu# to TGH index
0307          *      0 if cpus <= MAX_LOCAL_TGH,
0308          *      1 if cpus <= 2*MAX_LOCAL_TGH,
0309          *      etc
0310          */
0311         shift = max(0, fls(n - 1) - fls(MAX_LOCAL_TGH - 1));
0312     }
0313     gru->gs_tgh_local_shift = shift;
0314
0315     /* first starting TGH index to use for remote purges */
0316     gru->gs_tgh_first_remote = (cpus + (1 << shift) - 1) >> shift;
0317
0318 }