// SPDX-License-Identifier: GPL-2.0
/* Support for MMIO probes.
 * Borrows much of its code from kprobes.
 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
 *     2007 Alexander Eichner
 *     2008 Pekka Paalanen <pq@iki.fi>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <linux/errno.h>
#include <asm/debugreg.h>
#include <linux/mmiotrace.h>

#define KMMIO_PAGE_HASH_BITS 4
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
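/*
 * 1 << 4 = 16 hash buckets; armed pages are looked up by hashing their
 * page-aligned virtual address, see kmmio_page_list() below.
 */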

struct kmmio_fault_page {
    struct list_head list;
    struct kmmio_fault_page *release_next;
    unsigned long addr; /* the requested address */
    pteval_t old_presence; /* page presence prior to arming */
    bool armed;

    /*
     * Number of times this page has been registered as a part
     * of a probe. If zero, page is disarmed and this may be freed.
     * Used only by writers (RCU) and post_kmmio_handler().
     * Protected by kmmio_lock, when linked into kmmio_page_table.
     */
    int count;

    bool scheduled_for_release;
};

struct kmmio_delayed_release {
    struct rcu_head rcu;
    struct kmmio_fault_page *release_list;
};

struct kmmio_context {
    struct kmmio_fault_page *fpage;
    struct kmmio_probe *probe;
    unsigned long saved_flags;
    unsigned long addr;
    int active;
};
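
/*
 * One probe hit can be in flight per CPU: kmmio_handler() fills in the
 * per-cpu kmmio_context and post_kmmio_handler() tears it down again once
 * the single step has completed.
 */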

static DEFINE_SPINLOCK(kmmio_lock);

/* Protected by kmmio_lock */
unsigned int kmmio_count;

/* Read-protected by RCU, write-protected by kmmio_lock. */
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);

static struct list_head *kmmio_page_list(unsigned long addr)
{
    unsigned int l;
    pte_t *pte = lookup_address(addr, &l);

    if (!pte)
        return NULL;
    addr &= page_level_mask(l);

    return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
}

/* Accessed per-cpu */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);

/*
 * This is basically a dynamic stabbing problem. The existing prio tree
 * code could be used; possibly better implementations include:
 * - The Interval Skip List: A Data Structure for Finding All Intervals
 *   That Overlap a Point (might be simple)
 * - Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
 */
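/*
 * For now the probes live on a plain RCU-protected list (kmmio_probes);
 * get_kmmio_probe() below simply does a linear scan over it.
 */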
/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
{
    struct kmmio_probe *p;
    list_for_each_entry_rcu(p, &kmmio_probes, list) {
        if (addr >= p->addr && addr < (p->addr + p->len))
            return p;
    }
    return NULL;
}

/* You must be holding RCU read lock. */
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
{
    struct list_head *head;
    struct kmmio_fault_page *f;
    unsigned int l;
    pte_t *pte = lookup_address(addr, &l);

    if (!pte)
        return NULL;
    addr &= page_level_mask(l);
    head = kmmio_page_list(addr);
    list_for_each_entry_rcu(f, head, list) {
        if (f->addr == addr)
            return f;
    }
    return NULL;
}

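/*
 * Helpers that knock out or restore the Present bit at whichever level
 * backs the page (PMD for 2M pages, PTE for 4K pages). The previous value
 * is stashed via the old pointer (kmmio_fault_page::old_presence) so that
 * disarming can restore it exactly.
 */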
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
    pmd_t new_pmd;
    pmdval_t v = pmd_val(*pmd);
    if (clear) {
        *old = v;
        new_pmd = pmd_mkinvalid(*pmd);
    } else {
        /* Presume this has been called with clear==true previously */
        new_pmd = __pmd(*old);
    }
    set_pmd(pmd, new_pmd);
}

static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
    pteval_t v = pte_val(*pte);
    if (clear) {
        *old = v;
        /* Nothing should care about address */
        pte_clear(&init_mm, 0, pte);
    } else {
        /* Presume this has been called with clear==true previously */
        set_pte_atomic(pte, __pte(*old));
    }
}

static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
    unsigned int level;
    pte_t *pte = lookup_address(f->addr, &level);

    if (!pte) {
        pr_err("no pte for addr 0x%08lx\n", f->addr);
        return -1;
    }

    switch (level) {
    case PG_LEVEL_2M:
        clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
        break;
    case PG_LEVEL_4K:
        clear_pte_presence(pte, clear, &f->old_presence);
        break;
    default:
        pr_err("unexpected page level 0x%x.\n", level);
        return -1;
    }

    flush_tlb_one_kernel(f->addr);
    return 0;
}

/*
 * Mark the given page as not present. Access to it will trigger a fault.
 *
 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
 * protection is ignored here. RCU read lock is assumed held, so the struct
 * will not disappear unexpectedly. Furthermore, the caller must guarantee
 * that double arming the same virtual address (page) cannot occur.
 *
 * Double disarming on the other hand is allowed, and may occur when a fault
 * and mmiotrace shutdown happen simultaneously.
 */
static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
    int ret;
    WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
    if (f->armed) {
        pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n",
            f->addr, f->count, !!f->old_presence);
    }
    ret = clear_page_presence(f, true);
    WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
          f->addr);
    f->armed = true;
    return ret;
}

/** Restore the given page to saved presence state. */
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
    int ret = clear_page_presence(f, false);
    WARN_ONCE(ret < 0,
            KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
    f->armed = false;
}

/*
 * This is being called from do_page_fault().
 *
 * We may be in an interrupt or a critical section. Also, prefetching may
 * trigger a page fault. We may be in the middle of a process switch.
 * We cannot take any locks, because we may already be inside a kmmio
 * critical section.
 *
 * Local interrupts are disabled, so preemption cannot happen.
 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
 */
/*
 * Interrupts are disabled on entry, as the page fault is delivered through
 * an interrupt gate, and they remain disabled throughout this function.
 */
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
    struct kmmio_context *ctx;
    struct kmmio_fault_page *faultpage;
    int ret = 0; /* default to fault not handled */
    unsigned long page_base = addr;
    unsigned int l;
    pte_t *pte = lookup_address(addr, &l);
    if (!pte)
        return -EINVAL;
    page_base &= page_level_mask(l);

    /*
     * Preemption is now disabled to prevent process switch during
     * single stepping. We can only handle one active kmmio trace
     * per cpu, so ensure that we finish it before something else
     * gets to run. We also hold the RCU read lock over single
     * stepping to avoid looking up the probe and kmmio_fault_page
     * again.
     */
    preempt_disable();
    rcu_read_lock();

    faultpage = get_kmmio_fault_page(page_base);
    if (!faultpage) {
        /*
         * Either this page fault is not caused by kmmio, or
         * another CPU just pulled the kmmio probe from under
         * our feet. The latter case should not be possible.
         */
        goto no_kmmio;
    }

    ctx = this_cpu_ptr(&kmmio_ctx);
    if (ctx->active) {
        if (page_base == ctx->addr) {
            /*
             * A second fault on the same page means some other
             * condition needs handling by do_page_fault(); the
             * page really not being present is the most common.
             */
            pr_debug("secondary hit for 0x%08lx CPU %d.\n",
                 addr, smp_processor_id());

            if (!faultpage->old_presence)
                pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
                    addr, smp_processor_id());
        } else {
            /*
             * Prevent overwriting an already in-flight context.
             * This should not happen; let's hope disarming at
             * least prevents a panic.
             */
            pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
                 smp_processor_id(), addr);
            pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
            disarm_kmmio_fault_page(faultpage);
        }
        goto no_kmmio;
    }
    ctx->active++;

    ctx->fpage = faultpage;
    ctx->probe = get_kmmio_probe(page_base);
    ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
    ctx->addr = page_base;

    if (ctx->probe && ctx->probe->pre_handler)
        ctx->probe->pre_handler(ctx->probe, regs, addr);

    /*
     * Enable single-stepping and disable interrupts for the faulting
     * context. Local interrupts must not get enabled during stepping.
     */
    regs->flags |= X86_EFLAGS_TF;
    regs->flags &= ~X86_EFLAGS_IF;

    /* Now we set the present bit in the PTE and single-step. */
    disarm_kmmio_fault_page(ctx->fpage);

    /*
     * If another cpu accesses the same page while we are stepping,
     * the access will not be caught. It will simply succeed and the
     * only downside is we lose the event. If this becomes a problem,
     * the user should drop to single cpu before tracing.
     */

    return 1; /* fault handled */

no_kmmio:
    rcu_read_unlock();
    preempt_enable_no_resched();
    return ret;
}
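
/*
 * Note on the overall flow: kmmio_handler() returns before the trapped
 * instruction has executed. The page is present again and TF is set, so
 * returning to the faulting context re-executes the access (which now
 * succeeds) and immediately raises a debug trap, which is routed to
 * post_kmmio_handler() below to re-arm the page and restore the flags.
 */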

/*
 * Interrupts are disabled on entry as trap1 is an interrupt gate
 * and they remain disabled throughout this function.
 * This must always get called as the pair to kmmio_handler().
 */
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
{
    int ret = 0;
    struct kmmio_context *ctx = this_cpu_ptr(&kmmio_ctx);

    if (!ctx->active) {
        /*
         * Debug traps without an active context are caused either by
         * something external (e.g. using a debugger while mmio tracing
         * is enabled) or by erroneous behaviour.
         */
        pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id());
        goto out;
    }

    if (ctx->probe && ctx->probe->post_handler)
        ctx->probe->post_handler(ctx->probe, condition, regs);

    /* Prevent racing against release_kmmio_fault_page(). */
    spin_lock(&kmmio_lock);
    if (ctx->fpage->count)
        arm_kmmio_fault_page(ctx->fpage);
    spin_unlock(&kmmio_lock);

    regs->flags &= ~X86_EFLAGS_TF;
    regs->flags |= ctx->saved_flags;

    /* These were acquired in kmmio_handler(). */
    ctx->active--;
    BUG_ON(ctx->active);
    rcu_read_unlock();
    preempt_enable_no_resched();

    /*
     * If somebody else is single-stepping across a probe point, flags
     * will have TF set, in which case continue the remaining processing
     * of do_debug as if this were not a probe hit.
     */
    if (!(regs->flags & X86_EFLAGS_TF))
        ret = 1;
out:
    return ret;
}

/* You must be holding kmmio_lock. */
static int add_kmmio_fault_page(unsigned long addr)
{
    struct kmmio_fault_page *f;

    f = get_kmmio_fault_page(addr);
    if (f) {
        if (!f->count)
            arm_kmmio_fault_page(f);
        f->count++;
        return 0;
    }

    f = kzalloc(sizeof(*f), GFP_ATOMIC);
    if (!f)
        return -1;

    f->count = 1;
    f->addr = addr;

    if (arm_kmmio_fault_page(f)) {
        kfree(f);
        return -1;
    }

    list_add_rcu(&f->list, kmmio_page_list(f->addr));

    return 0;
}

/* You must be holding kmmio_lock. */
static void release_kmmio_fault_page(unsigned long addr,
                struct kmmio_fault_page **release_list)
{
    struct kmmio_fault_page *f;

    f = get_kmmio_fault_page(addr);
    if (!f)
        return;

    f->count--;
    BUG_ON(f->count < 0);
    if (!f->count) {
        disarm_kmmio_fault_page(f);
        if (!f->scheduled_for_release) {
            f->release_next = *release_list;
            *release_list = f;
            f->scheduled_for_release = true;
        }
    }
}

/*
 * With page-unaligned ioremaps, one or two armed pages may contain
 * addresses from outside the intended mapping. Events for these addresses
 * are currently silently dropped. Such events can only result from
 * programming mistakes that access addresses before the beginning or past
 * the end of a mapping.
 */
int register_kmmio_probe(struct kmmio_probe *p)
{
    unsigned long flags;
    int ret = 0;
    unsigned long size = 0;
    unsigned long addr = p->addr & PAGE_MASK;
    const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
    unsigned int l;
    pte_t *pte;

    spin_lock_irqsave(&kmmio_lock, flags);
    if (get_kmmio_probe(addr)) {
        ret = -EEXIST;
        goto out;
    }

    pte = lookup_address(addr, &l);
    if (!pte) {
        ret = -EINVAL;
        goto out;
    }

    kmmio_count++;
    list_add_rcu(&p->list, &kmmio_probes);
    while (size < size_lim) {
        if (add_kmmio_fault_page(addr + size))
            pr_err("Unable to set page fault.\n");
        size += page_level_size(l);
    }
out:
    spin_unlock_irqrestore(&kmmio_lock, flags);
    /*
     * XXX: What should I do here?
     * Here was a call to global_flush_tlb(), but it does not exist
     * anymore. It seems it's not needed after all.
     */
    return ret;
}
EXPORT_SYMBOL(register_kmmio_probe);
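
/*
 * Hypothetical registration sketch (mmiotrace is the in-tree user; the
 * names below are illustrative only). The callback signatures follow the
 * calls made from kmmio_handler() and post_kmmio_handler() above:
 *
 *    static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
 *                       unsigned long addr)
 *    {
 *            ... record the access about to happen at addr ...
 *    }
 *
 *    static void my_post(struct kmmio_probe *p, unsigned long condition,
 *                        struct pt_regs *regs)
 *    {
 *            ... record the result after the single step ...
 *    }
 *
 *    static struct kmmio_probe my_probe = {
 *            .addr         = (unsigned long)my_ioremapped_base,
 *            .len          = my_mapping_len,
 *            .pre_handler  = my_pre,
 *            .post_handler = my_post,
 *    };
 *
 *    register_kmmio_probe(&my_probe);
 */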

static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
{
    struct kmmio_delayed_release *dr = container_of(
                        head,
                        struct kmmio_delayed_release,
                        rcu);
    struct kmmio_fault_page *f = dr->release_list;
    while (f) {
        struct kmmio_fault_page *next = f->release_next;
        BUG_ON(f->count);
        kfree(f);
        f = next;
    }
    kfree(dr);
}

static void remove_kmmio_fault_pages(struct rcu_head *head)
{
    struct kmmio_delayed_release *dr =
        container_of(head, struct kmmio_delayed_release, rcu);
    struct kmmio_fault_page *f = dr->release_list;
    struct kmmio_fault_page **prevp = &dr->release_list;
    unsigned long flags;

    spin_lock_irqsave(&kmmio_lock, flags);
    while (f) {
        if (!f->count) {
            list_del_rcu(&f->list);
            prevp = &f->release_next;
        } else {
            *prevp = f->release_next;
            f->release_next = NULL;
            f->scheduled_for_release = false;
        }
        f = *prevp;
    }
    spin_unlock_irqrestore(&kmmio_lock, flags);

    /* This is the real RCU destroy call. */
    call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
}

/*
 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
 * sure that the callbacks will not be called anymore. Only after that
 * you may actually release your struct kmmio_probe.
 *
 * Unregistering a kmmio fault page has three steps:
 * 1. release_kmmio_fault_page()
 *    Disarm the page, wait a grace period to let all faults finish.
 * 2. remove_kmmio_fault_pages()
 *    Remove the pages from kmmio_page_table.
 * 3. rcu_free_kmmio_fault_pages()
 *    Actually free the kmmio_fault_page structs, after a further RCU
 *    grace period.
 */
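/*
 * Teardown sketch for a hypothetical caller (names are illustrative; see
 * the comment above for why synchronize_rcu() is required):
 *
 *    unregister_kmmio_probe(&my_probe);
 *    synchronize_rcu();        // no callbacks can run anymore
 *    kfree(my_probe_container);    // now safe to release the probe
 */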
void unregister_kmmio_probe(struct kmmio_probe *p)
{
    unsigned long flags;
    unsigned long size = 0;
    unsigned long addr = p->addr & PAGE_MASK;
    const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
    struct kmmio_fault_page *release_list = NULL;
    struct kmmio_delayed_release *drelease;
    unsigned int l;
    pte_t *pte;

    pte = lookup_address(addr, &l);
    if (!pte)
        return;

    spin_lock_irqsave(&kmmio_lock, flags);
    while (size < size_lim) {
        release_kmmio_fault_page(addr + size, &release_list);
        size += page_level_size(l);
    }
    list_del_rcu(&p->list);
    kmmio_count--;
    spin_unlock_irqrestore(&kmmio_lock, flags);

    if (!release_list)
        return;

    drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
    if (!drelease) {
        pr_crit("leaking kmmio_fault_page objects.\n");
        return;
    }
    drelease->release_list = release_list;

    /*
     * This is not really RCU here. We have just disarmed a set of
     * pages so that they cannot trigger page faults anymore. However,
     * we cannot remove the pages from kmmio_page_table, because a
     * probe hit might be in flight on another CPU. The pages are
     * collected into a list, and they will be removed from
     * kmmio_page_table when it is certain that no probe hit related to
     * these pages can be in flight. An RCU grace period sounds like a
     * good choice.
     *
     * If we removed the pages too early, the kmmio page fault handler
     * might not find the respective kmmio_fault_page and would conclude
     * it is not a kmmio fault, when it actually is. This would lead to
     * madness.
     */
    call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
}
EXPORT_SYMBOL(unregister_kmmio_probe);

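/*
 * arg->err in the DIE_DEBUG notification carries the address of the debug
 * status register copy (dr6) from the #DB handler; ERR_PTR() below merely
 * casts it back to a pointer. DR_STEP is the single-step status bit in dr6.
 */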
static int
kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
{
    struct die_args *arg = args;
    unsigned long *dr6_p = (unsigned long *)ERR_PTR(arg->err);

    if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
        if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
            /*
             * Reset the BS bit in dr6 (pointed to by arg->err) to
             * denote completion of processing.
             */
            *dr6_p &= ~DR_STEP;
            return NOTIFY_STOP;
        }

    return NOTIFY_DONE;
}

static struct notifier_block nb_die = {
    .notifier_call = kmmio_die_notifier
};

int kmmio_init(void)
{
    int i;

    for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
        INIT_LIST_HEAD(&kmmio_page_table[i]);

    return register_die_notifier(&nb_die);
}

void kmmio_cleanup(void)
{
    int i;

    unregister_die_notifier(&nb_die);
    for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
        WARN_ONCE(!list_empty(&kmmio_page_table[i]),
            KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
    }
}