// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Kernel Probes (KProbes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *      Probes initial implementation (includes contributions from
 *      Rusty Russell).
 * 2004-July    Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *      interface to access function arguments.
 * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
 *      <prasanna@in.ibm.com> adapted for x86_64 from i386.
 * 2005-Mar Roland McGrath <roland@redhat.com>
 *      Fixed to handle %rip-relative addressing mode correctly.
 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
 *      <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
 *      <prasanna@in.ibm.com> added function-return probes.
 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
 *      Added function return probes functionality.
 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
 *      kprobe-booster and kretprobe-booster for i386.
 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
 *      and kretprobe-booster for x86-64.
 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
 *      <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
 *      unified x86 kprobes code.
 */
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/sched/debug.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
#include <linux/kasan.h>
#include <linux/moduleloader.h>
#include <linux/objtool.h>
#include <linux/vmalloc.h>
#include <linux/pgtable.h>

#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
#include <asm/ibt.h>

#include "common.h"

DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);

#define stack_addr(regs) ((unsigned long *)regs->sp)

#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
    (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
      (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
      (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
      (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
     << (row % 32))
    /*
     * Undefined/reserved opcodes, conditional jumps, opcode extension
     * groups, and some special opcodes cannot be boosted.
     * This is non-const and volatile to keep gcc from statically
     * optimizing it out, as variable_test_bit makes gcc think only
     * *(unsigned long*) is used.
     */
static volatile u32 twobyte_is_boostable[256 / 32] = {
    /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
    /*      ----------------------------------------------          */
    W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
    W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
    W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
    W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
    W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
    W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
    W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
    W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
    W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
    W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
    W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
    W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
    W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
    W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
    W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
    W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0)   /* f0 */
    /*      -----------------------------------------------         */
    /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
};
#undef W
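
/*
 * Note on the table above: each W() row packs 16 one-bit flags, and two
 * adjacent rows are OR'd into a single u32 (hence the alternating '|'
 * and ',' separators), giving one bit per two-byte opcode 0x00-0xff.
 * can_boost() queries it with test_bit(insn->opcode.bytes[1], ...);
 * e.g. opcode 0x0f 0x44 (cmove) tests bit 4 of the 0x40 row, which is
 * set, so it is boostable.
 */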

struct kretprobe_blackpoint kretprobe_blacklist[] = {
    /* Switches only the current task; does not switch the kernel stack. */
    {"__switch_to", },
    {NULL, NULL}    /* Terminator */
};

const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);

static nokprobe_inline void
__synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
{
    struct __arch_relative_insn {
        u8 op;
        s32 raddr;
    } __packed *insn;

    insn = (struct __arch_relative_insn *)dest;
    insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
    insn->op = op;
}
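
/*
 * The displacement is relative to the end of the 5-byte instruction
 * (1-byte opcode + 4-byte rel32), hence the "+ 5" above.  For example,
 * a reljump written for from = 0x1000 and to = 0x2000 gets
 * raddr = 0x2000 - (0x1000 + 5) = 0xffb and encodes as e9 fb 0f 00 00.
 */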

/*
 * Write at 'dest' a jump instruction that, once placed at 'from',
 * jumps to 'to'.
 */
void synthesize_reljump(void *dest, void *from, void *to)
{
    __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_reljump);

/*
 * Write at 'dest' a call instruction that, once placed at 'from',
 * calls 'to'.
 */
void synthesize_relcall(void *dest, void *from, void *to)
{
    __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_relcall);

/*
 * Returns non-zero if INSN is boostable.
 * RIP-relative instructions are adjusted at copy time in 64-bit mode.
 */
int can_boost(struct insn *insn, void *addr)
{
    kprobe_opcode_t opcode;
    insn_byte_t prefix;
    int i;

    if (search_exception_tables((unsigned long)addr))
        return 0;   /* Page fault may occur on this address. */

    /* 2nd-byte opcode */
    if (insn->opcode.nbytes == 2)
        return test_bit(insn->opcode.bytes[1],
                (unsigned long *)twobyte_is_boostable);

    if (insn->opcode.nbytes != 1)
        return 0;

    for_each_insn_prefix(insn, i, prefix) {
        insn_attr_t attr;

        attr = inat_get_opcode_attribute(prefix);
        /* Can't boost address-size override and CS override prefixes */
        if (prefix == 0x2e || inat_is_address_size_prefix(attr))
            return 0;
    }

    opcode = insn->opcode.bytes[0];

    switch (opcode) {
    case 0x62:      /* bound */
    case 0x70 ... 0x7f: /* Conditional jumps */
    case 0x9a:      /* Call far */
    case 0xc0 ... 0xc1: /* Grp2 */
    case 0xcc ... 0xce: /* software exceptions */
    case 0xd0 ... 0xd3: /* Grp2 */
    case 0xd6:      /* (UD) */
    case 0xd8 ... 0xdf: /* ESC */
    case 0xe0 ... 0xe3: /* LOOP*, JCXZ */
    case 0xe8 ... 0xe9: /* near Call, JMP */
    case 0xeb:      /* Short JMP */
    case 0xf0 ... 0xf4: /* LOCK/REP, HLT */
    case 0xf6 ... 0xf7: /* Grp3 */
    case 0xfe:      /* Grp4 */
        /* ... are not boostable */
        return 0;
    case 0xff:      /* Grp5 */
        /* Only indirect jmp is boostable */
        return X86_MODRM_REG(insn->modrm.bytes[0]) == 4;
    default:
        return 1;
    }
}
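
/*
 * "Boosting" appends a relative jump after the copied instruction so the
 * probe can resume without a second int3 trap (see prepare_singlestep()
 * below).  Anything that may fault, raise an exception, or redirect
 * control flow itself is excluded above, since the appended jump could
 * then never execute correctly.
 */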

static unsigned long
__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
    struct kprobe *kp;
    bool faddr;

    kp = get_kprobe((void *)addr);
    faddr = ftrace_location(addr) == addr;
    /*
     * Use the current code if it is not modified by a kprobe
     * and cannot be modified by ftrace.
     */
    if (!kp && !faddr)
        return addr;

    /*
     * kp->ainsn.insn normally holds a copy of the original instruction.
     * However, a RIP-relative instruction cannot be single-stepped at a
     * different address, so __copy_instruction() tweaks its
     * displacement.  In that case we cannot recover the original
     * instruction from kp->ainsn.insn.
     *
     * For a normal kprobe, on the other hand, kp->opcode holds a copy
     * of the first byte of the probed instruction, which int3
     * overwrote.  Since kprobes modify nothing at kp->addr except that
     * first byte, we can recover the original instruction from it and
     * kp->opcode.
     *
     * For kprobes using ftrace we have no copy of the original
     * instruction.  The ftrace location might be modified at any time
     * and could even be in an inconsistent state.  Fortunately, we know
     * that the original code is the ideal 5-byte NOP.
     */
    if (copy_from_kernel_nofault(buf, (void *)addr,
        MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
        return 0UL;

    if (faddr)
        memcpy(buf, x86_nops[5], 5);
    else
        buf[0] = kp->opcode;
    return (unsigned long)buf;
}

/*
 * Recover the probed instruction at addr for further analysis.
 * The caller must hold kprobe_mutex, or disable preemption, to prevent
 * the kprobes it references from being released.
 * Returns zero if the instruction cannot be recovered (or the access
 * failed).
 */
unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
{
    unsigned long __addr;

    __addr = __recover_optprobed_insn(buf, addr);
    if (__addr != addr)
        return __addr;

    return __recover_probed_insn(buf, addr);
}

/* Check if paddr is at an instruction boundary */
static int can_probe(unsigned long paddr)
{
    unsigned long addr, __addr, offset = 0;
    struct insn insn;
    kprobe_opcode_t buf[MAX_INSN_SIZE];

    if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
        return 0;

    /* Decode instructions */
    addr = paddr - offset;
    while (addr < paddr) {
        int ret;

        /*
         * Check if the instruction has been modified by another
         * kprobe, in which case we replace the breakpoint with the
         * original instruction in our buffer.
         * Jump optimization also changes the breakpoint to a
         * relative jump.  Since relative jumps occur in normal code
         * as well, we simply continue when no kprobe is registered
         * at this address.
         */
        __addr = recover_probed_instruction(buf, addr);
        if (!__addr)
            return 0;

        ret = insn_decode_kernel(&insn, (void *)__addr);
        if (ret < 0)
            return 0;

        /*
         * Another debugging subsystem might have inserted this
         * breakpoint.  In that case, we can't recover it.
         */
        if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
            return 0;
        addr += insn.length;
    }

    return (addr == paddr);
}
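
/*
 * The loop above decodes forward from the symbol start, so it can only
 * stop exactly at paddr if paddr is an instruction boundary.  E.g.
 * probing one byte into a multi-byte instruction makes the walk jump
 * past paddr and the final (addr == paddr) check fail.
 */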

/* If x86 supports IBT (ENDBR), it must be skipped. */
kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
                     bool *on_func_entry)
{
    if (is_endbr(*(u32 *)addr)) {
        *on_func_entry = !offset || offset == 4;
        if (*on_func_entry)
            offset = 4;

    } else {
        *on_func_entry = !offset;
    }

    return (kprobe_opcode_t *)(addr + offset);
}
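
/*
 * ENDBR (endbr64: f3 0f 1e fa) is a 4-byte instruction, which is why an
 * entry probe on an IBT-enabled function is moved to offset 4: writing
 * int3 over the ENDBR would destroy the landing pad and break indirect
 * calls into the function.
 */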

/*
 * Copy an instruction, recovering it if it has been modified by kprobes,
 * and adjust the displacement if the instruction uses the %rip-relative
 * addressing mode.  Note that since @real will be the final home of the
 * copied instruction, the displacement must be adjusted relative to
 * @real, not @dest.
 * This returns the length of the copied instruction, or 0 on error.
 */
int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
{
    kprobe_opcode_t buf[MAX_INSN_SIZE];
    unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src);
    int ret;

    if (!recovered_insn || !insn)
        return 0;

    /* This can access kernel text if the given address was not recovered */
    if (copy_from_kernel_nofault(dest, (void *)recovered_insn,
            MAX_INSN_SIZE))
        return 0;

    ret = insn_decode_kernel(insn, dest);
    if (ret < 0)
        return 0;

    /* We cannot probe an instruction carrying an emulate prefix */
    if (insn_has_emulate_prefix(insn))
        return 0;

    /* Another subsystem put a breakpoint here; we failed to recover it */
    if (insn->opcode.bytes[0] == INT3_INSN_OPCODE)
        return 0;

    /* We should not single-step exception-masking instructions */
    if (insn_masking_exception(insn))
        return 0;

#ifdef CONFIG_X86_64
    /* Only x86_64 has RIP-relative instructions */
    if (insn_rip_relative(insn)) {
        s64 newdisp;
        u8 *disp;
        /*
         * The copied instruction uses the %rip-relative addressing
         * mode.  Adjust the displacement for the difference between
         * the original location of this instruction and the location
         * of the copy that will actually be run.  The tricky bit here
         * is making sure that the sign extension happens correctly in
         * this calculation, since we need a signed 32-bit result to
         * be sign-extended to 64 bits when it's added to the %rip
         * value and yield the same 64-bit result that the sign-
         * extension of the original signed 32-bit displacement would
         * have given.
         */
        newdisp = (u8 *) src + (s64) insn->displacement.value
              - (u8 *) real;
        if ((s64) (s32) newdisp != newdisp) {
            pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
            return 0;
        }
        disp = (u8 *) dest + insn_offset_displacement(insn);
        *(s32 *) disp = (s32) newdisp;
    }
#endif
    return insn->length;
}
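
/*
 * Fixup example: with src == 0xffffffff81000000, a displacement of
 * -0x100, and the copy slot at real == 0xffffffffa0000000, newdisp =
 * src + (-0x100) - real = -0x1f000100, which still fits in an s32.
 * Since the original and the copy have the same length, the effective
 * target (instruction end + displacement) is preserved exactly.
 */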

/* Prepare a reljump or an int3 right after the instruction */
static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
                  struct insn *insn)
{
    int len = insn->length;

    if (!IS_ENABLED(CONFIG_PREEMPTION) &&
        !p->post_handler && can_boost(insn, p->addr) &&
        MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
        /*
         * The copied instruction can be executed directly if a
         * jump back to the correct address follows it.
         */
        synthesize_reljump(buf + len, p->ainsn.insn + len,
                   p->addr + insn->length);
        len += JMP32_INSN_SIZE;
        p->ainsn.boostable = 1;
    } else {
        /* Otherwise, put an int3 to trap the single step */
        if (MAX_INSN_SIZE - len < INT3_INSN_SIZE)
            return -ENOSPC;

        buf[len] = INT3_INSN_OPCODE;
        len += INT3_INSN_SIZE;
    }

    return len;
}
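
/*
 * The slot therefore ends up as either
 *   [copied insn][jmp back to p->addr + insn->length]   (boosted)
 * or
 *   [copied insn][int3]                                 (single-stepped)
 * and the returned 'len' covers everything that must be written into it.
 */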

/* Make the page read-only when allocating it */
void *alloc_insn_page(void)
{
    void *page;

    page = module_alloc(PAGE_SIZE);
    if (!page)
        return NULL;

    set_vm_flush_reset_perms(page);
    /*
     * First make the page read-only, and only then make it executable to
     * prevent it from being W+X in between.
     */
    set_memory_ro((unsigned long)page, 1);

    /*
     * TODO: Once additional kernel code protection mechanisms are set, ensure
     * that the page was not maliciously altered and it is still zeroed.
     */
    set_memory_x((unsigned long)page, 1);

    return page;
}
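
/*
 * Because the slot page stays read-only and executable for its whole
 * lifetime, arch_copy_kprobe() below writes instructions into it with
 * text_poke() instead of a plain memcpy().
 */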

/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */

static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
{
    switch (p->ainsn.opcode) {
    case 0xfa:  /* cli */
        regs->flags &= ~(X86_EFLAGS_IF);
        break;
    case 0xfb:  /* sti */
        regs->flags |= X86_EFLAGS_IF;
        break;
    case 0x9c:  /* pushf */
        int3_emulate_push(regs, regs->flags);
        break;
    case 0x9d:  /* popf */
        regs->flags = int3_emulate_pop(regs);
        break;
    }
    regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
}
NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers);

static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs)
{
    int3_emulate_ret(regs);
}
NOKPROBE_SYMBOL(kprobe_emulate_ret);

static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
{
    unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size;

    func += p->ainsn.rel32;
    int3_emulate_call(regs, func);
}
NOKPROBE_SYMBOL(kprobe_emulate_call);

static nokprobe_inline
void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond)
{
    unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;

    if (cond)
        ip += p->ainsn.rel32;
    int3_emulate_jmp(regs, ip);
}

static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
{
    __kprobe_emulate_jmp(p, regs, true);
}
NOKPROBE_SYMBOL(kprobe_emulate_jmp);

static const unsigned long jcc_mask[6] = {
    [0] = X86_EFLAGS_OF,
    [1] = X86_EFLAGS_CF,
    [2] = X86_EFLAGS_ZF,
    [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
    [4] = X86_EFLAGS_SF,
    [5] = X86_EFLAGS_PF,
};
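
/*
 * Jcc decoding: the low nibble of the opcode is the condition code.
 * For codes below 0xc, jcc_mask[code >> 1] is the flag to test and bit
 * 0 inverts the result (e.g. 0x74 jz tests ZF, 0x75 jnz inverts it).
 * Codes 0xc-0xf (jl/jge/jle/jg) are computed from SF ^ OF, OR'ing in
 * ZF for jle/jg, as done below.
 */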

static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
{
    bool invert = p->ainsn.jcc.type & 1;
    bool match;

    if (p->ainsn.jcc.type < 0xc) {
        match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1];
    } else {
        match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
            ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
        if (p->ainsn.jcc.type >= 0xe)
            match = match || (regs->flags & X86_EFLAGS_ZF);
    }
    __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));
}
NOKPROBE_SYMBOL(kprobe_emulate_jcc);

static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
{
    bool match;

    if (p->ainsn.loop.type != 3) {  /* LOOP* */
        if (p->ainsn.loop.asize == 32)
            match = ((*(u32 *)&regs->cx)--) != 0;
#ifdef CONFIG_X86_64
        else if (p->ainsn.loop.asize == 64)
            match = ((*(u64 *)&regs->cx)--) != 0;
#endif
        else
            match = ((*(u16 *)&regs->cx)--) != 0;
    } else {            /* JCXZ */
        if (p->ainsn.loop.asize == 32)
            match = *(u32 *)(&regs->cx) == 0;
#ifdef CONFIG_X86_64
        else if (p->ainsn.loop.asize == 64)
            match = *(u64 *)(&regs->cx) == 0;
#endif
        else
            match = *(u16 *)(&regs->cx) == 0;
    }

    if (p->ainsn.loop.type == 0)    /* LOOPNE */
        match = match && !(regs->flags & X86_EFLAGS_ZF);
    else if (p->ainsn.loop.type == 1)   /* LOOPE */
        match = match && (regs->flags & X86_EFLAGS_ZF);

    __kprobe_emulate_jmp(p, regs, match);
}
NOKPROBE_SYMBOL(kprobe_emulate_loop);

static const int addrmode_regoffs[] = {
    offsetof(struct pt_regs, ax),
    offsetof(struct pt_regs, cx),
    offsetof(struct pt_regs, dx),
    offsetof(struct pt_regs, bx),
    offsetof(struct pt_regs, sp),
    offsetof(struct pt_regs, bp),
    offsetof(struct pt_regs, si),
    offsetof(struct pt_regs, di),
#ifdef CONFIG_X86_64
    offsetof(struct pt_regs, r8),
    offsetof(struct pt_regs, r9),
    offsetof(struct pt_regs, r10),
    offsetof(struct pt_regs, r11),
    offsetof(struct pt_regs, r12),
    offsetof(struct pt_regs, r13),
    offsetof(struct pt_regs, r14),
    offsetof(struct pt_regs, r15),
#endif
};
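
/*
 * The index order above follows the x86 register encoding used by the
 * ModRM r/m field: 0 = ax, 1 = cx, 2 = dx, 3 = bx, 4 = sp, 5 = bp,
 * 6 = si, 7 = di, extended to r8-r15 on 64-bit when REX.B is set (see
 * prepare_emulation() below).
 */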

static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
{
    unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];

    int3_emulate_call(regs, regs_get_register(regs, offs));
}
NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);

static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs)
{
    unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];

    int3_emulate_jmp(regs, regs_get_register(regs, offs));
}
NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect);

static int prepare_emulation(struct kprobe *p, struct insn *insn)
{
    insn_byte_t opcode = insn->opcode.bytes[0];

    switch (opcode) {
    case 0xfa:      /* cli */
    case 0xfb:      /* sti */
    case 0x9c:      /* pushfl */
    case 0x9d:      /* popf/popfd */
        /*
         * IF modifiers must be emulated because they could enable
         * interrupts during int3 single-stepping.
         */
        p->ainsn.emulate_op = kprobe_emulate_ifmodifiers;
        p->ainsn.opcode = opcode;
        break;
    case 0xc2:  /* ret/lret */
    case 0xc3:
    case 0xca:
    case 0xcb:
        p->ainsn.emulate_op = kprobe_emulate_ret;
        break;
    case 0x9a:  /* far call absolute -- segment is not supported */
    case 0xea:  /* far jmp absolute -- segment is not supported */
    case 0xcc:  /* int3 */
    case 0xcf:  /* iret -- in-kernel IRET is not supported */
        return -EOPNOTSUPP;
    case 0xe8:  /* near call relative */
        p->ainsn.emulate_op = kprobe_emulate_call;
        if (insn->immediate.nbytes == 2)
            p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
        else
            p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
        break;
    case 0xeb:  /* short jump relative */
    case 0xe9:  /* near jump relative */
        p->ainsn.emulate_op = kprobe_emulate_jmp;
        if (insn->immediate.nbytes == 1)
            p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
        else if (insn->immediate.nbytes == 2)
            p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
        else
            p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
        break;
    case 0x70 ... 0x7f:
        /* 1-byte conditional jump */
        p->ainsn.emulate_op = kprobe_emulate_jcc;
        p->ainsn.jcc.type = opcode & 0xf;
        p->ainsn.rel32 = *(char *)insn->immediate.bytes;
        break;
    case 0x0f:
        opcode = insn->opcode.bytes[1];
        if ((opcode & 0xf0) == 0x80) {
            /* 2-byte conditional jump */
            p->ainsn.emulate_op = kprobe_emulate_jcc;
            p->ainsn.jcc.type = opcode & 0xf;
            if (insn->immediate.nbytes == 2)
                p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
            else
                p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
        } else if (opcode == 0x01 &&
               X86_MODRM_REG(insn->modrm.bytes[0]) == 0 &&
               X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) {
            /* VM extensions - not supported */
            return -EOPNOTSUPP;
        }
        break;
    case 0xe0:  /* LOOPNE */
    case 0xe1:  /* LOOPE */
    case 0xe2:  /* LOOP */
    case 0xe3:  /* J*CXZ */
        p->ainsn.emulate_op = kprobe_emulate_loop;
        p->ainsn.loop.type = opcode & 0x3;
        p->ainsn.loop.asize = insn->addr_bytes * 8;
        p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
        break;
    case 0xff:
        /*
         * Since 0xff is an extended group opcode, the actual
         * instruction is determined by the ModRM byte.
         */
        opcode = insn->modrm.bytes[0];
        if ((opcode & 0x30) == 0x10) {
            if ((opcode & 0x8) == 0x8)
                return -EOPNOTSUPP; /* far call */
            /* call absolute, indirect */
            p->ainsn.emulate_op = kprobe_emulate_call_indirect;
        } else if ((opcode & 0x30) == 0x20) {
            if ((opcode & 0x8) == 0x8)
                return -EOPNOTSUPP; /* far jmp */
            /* jmp near absolute indirect */
            p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
        } else
            break;

        if (insn->addr_bytes != sizeof(unsigned long))
            return -EOPNOTSUPP; /* Don't support different size */
        if (X86_MODRM_MOD(opcode) != 3)
            return -EOPNOTSUPP; /* TODO: support memory addressing */

        p->ainsn.indirect.reg = X86_MODRM_RM(opcode);
#ifdef CONFIG_X86_64
        if (X86_REX_B(insn->rex_prefix.value))
            p->ainsn.indirect.reg += 8;
#endif
        break;
    default:
        break;
    }
    p->ainsn.size = insn->length;

    return 0;
}

static int arch_copy_kprobe(struct kprobe *p)
{
    struct insn insn;
    kprobe_opcode_t buf[MAX_INSN_SIZE];
    int ret, len;

    /* Copy the instruction, recovering it if another kprobe/optprobe modified it */
    len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
    if (!len)
        return -EINVAL;

    /* Analyze the opcode and set up the emulation functions */
    ret = prepare_emulation(p, &insn);
    if (ret < 0)
        return ret;

    /* Add an int3 for single-stepping, or a booster jmp */
    len = prepare_singlestep(buf, p, &insn);
    if (len < 0)
        return len;

    /* A displacement change never affects the first byte */
    p->opcode = buf[0];

    p->ainsn.tp_len = len;
    perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);

    /* OK, write back the instruction(s) into the ROX insn buffer */
    text_poke(p->ainsn.insn, buf, len);

    return 0;
}

int arch_prepare_kprobe(struct kprobe *p)
{
    int ret;

    if (alternatives_text_reserved(p->addr, p->addr))
        return -EINVAL;

    if (!can_probe((unsigned long)p->addr))
        return -EILSEQ;

    memset(&p->ainsn, 0, sizeof(p->ainsn));

    /* insn: must be on special executable page on x86. */
    p->ainsn.insn = get_insn_slot();
    if (!p->ainsn.insn)
        return -ENOMEM;

    ret = arch_copy_kprobe(p);
    if (ret) {
        free_insn_slot(p->ainsn.insn, 0);
        p->ainsn.insn = NULL;
    }

    return ret;
}

void arch_arm_kprobe(struct kprobe *p)
{
    u8 int3 = INT3_INSN_OPCODE;

    text_poke(p->addr, &int3, 1);
    text_poke_sync();
    perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}

void arch_disarm_kprobe(struct kprobe *p)
{
    u8 int3 = INT3_INSN_OPCODE;

    perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
    text_poke(p->addr, &p->opcode, 1);
    text_poke_sync();
}

void arch_remove_kprobe(struct kprobe *p)
{
    if (p->ainsn.insn) {
        /* Record the perf event before freeing the slot */
        perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
                     p->ainsn.tp_len, NULL, 0);
        free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
        p->ainsn.insn = NULL;
    }
}

static nokprobe_inline void
save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
    kcb->prev_kprobe.kp = kprobe_running();
    kcb->prev_kprobe.status = kcb->kprobe_status;
    kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
    kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
}

static nokprobe_inline void
restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
    __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
    kcb->kprobe_status = kcb->prev_kprobe.status;
    kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
    kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
}

static nokprobe_inline void
set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
           struct kprobe_ctlblk *kcb)
{
    __this_cpu_write(current_kprobe, p);
    kcb->kprobe_saved_flags = kcb->kprobe_old_flags
        = (regs->flags & X86_EFLAGS_IF);
}

static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs,
                   struct kprobe_ctlblk *kcb)
{
    /* Restore the original saved kprobes variables and continue. */
    if (kcb->kprobe_status == KPROBE_REENTER) {
        /* This will restore both kcb and current_kprobe */
        restore_previous_kprobe(kcb);
    } else {
        /*
         * Always update the kcb status because
         * reset_current_kprobe() doesn't update kcb.
         */
        kcb->kprobe_status = KPROBE_HIT_SSDONE;
        if (cur->post_handler)
            cur->post_handler(cur, regs, 0);
        reset_current_kprobe();
    }
}
NOKPROBE_SYMBOL(kprobe_post_process);

static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
                 struct kprobe_ctlblk *kcb, int reenter)
{
    if (setup_detour_execution(p, regs, reenter))
        return;

#if !defined(CONFIG_PREEMPTION)
    if (p->ainsn.boostable) {
        /* Boost up -- we can execute the copied instructions directly */
        if (!reenter)
            reset_current_kprobe();
        /*
         * Reentering a boosted probe neither resets nor sets
         * current_kprobe, because boosting does not use single-
         * stepping.
         */
        regs->ip = (unsigned long)p->ainsn.insn;
        return;
    }
#endif
    if (reenter) {
        save_previous_kprobe(kcb);
        set_current_kprobe(p, regs, kcb);
        kcb->kprobe_status = KPROBE_REENTER;
    } else
        kcb->kprobe_status = KPROBE_HIT_SS;

    if (p->ainsn.emulate_op) {
        p->ainsn.emulate_op(p, regs);
        kprobe_post_process(p, regs, kcb);
        return;
    }

    /* Disable interrupts and point the ip register at the trampoline */
    regs->flags &= ~X86_EFLAGS_IF;
    regs->ip = (unsigned long)p->ainsn.insn;
}
NOKPROBE_SYMBOL(setup_singlestep);

/*
 * Called after single-stepping.  p->addr is the address of the
 * instruction whose first byte has been replaced by the "int3"
 * instruction.  To avoid the SMP problems that can occur when we
 * temporarily put back the original opcode to single-step, we
 * single-stepped a copy of the instruction.  The address of this
 * copy is p->ainsn.insn.  Instead of the trap flag, we use another
 * "int3" placed right after the copied instruction.
 * Unlike trap-based single-stepping, "int3" single-stepping cannot
 * handle instructions that change the ip register (e.g. jmp, call,
 * conditional jmp) or the IF flag, because interrupts must stay
 * disabled around the single step.  Such instructions are emulated in
 * software; all others are single-stepped using "int3".
 *
 * When the second "int3" is handled, regs->ip and regs->flags need to
 * be adjusted so that we resume execution at the correct code.
 */
static void resume_singlestep(struct kprobe *p, struct pt_regs *regs,
                  struct kprobe_ctlblk *kcb)
{
    unsigned long copy_ip = (unsigned long)p->ainsn.insn;
    unsigned long orig_ip = (unsigned long)p->addr;

    /* Restore the saved interrupt flag and the ip register */
    regs->flags |= kcb->kprobe_saved_flags;
    /* regs->ip points past the executed int3, so step back by its size */
    regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE;
}
NOKPROBE_SYMBOL(resume_singlestep);
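
/*
 * Worked example for resume_singlestep(): after the second int3 traps,
 * regs->ip == copy_ip + insn_len + INT3_INSN_SIZE.  Adding
 * (orig_ip - copy_ip) and subtracting INT3_INSN_SIZE leaves
 * regs->ip == orig_ip + insn_len, i.e. the instruction following the
 * probed one.
 */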

/*
 * We have reentered the kprobe_handler(), since another probe was hit while
 * within the handler.  We save the original kprobes variables and just single
 * step on the instruction of the new probe without calling any user handlers.
 */
static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
              struct kprobe_ctlblk *kcb)
{
    switch (kcb->kprobe_status) {
    case KPROBE_HIT_SSDONE:
    case KPROBE_HIT_ACTIVE:
    case KPROBE_HIT_SS:
        kprobes_inc_nmissed_count(p);
        setup_singlestep(p, regs, kcb, 1);
        break;
    case KPROBE_REENTER:
        /*
         * A probe has been hit in the codepath leading up to, or just
         * after, single-stepping of a probed instruction.  This entire
         * codepath should strictly reside in the .kprobes.text section.
         * Raise a BUG, or we'll continue in an endless reentering loop
         * and eventually end up with a stack overflow.
         */
        pr_err("Unrecoverable kprobe detected.\n");
        dump_kprobe(p);
        BUG();
    default:
        /* impossible cases */
        WARN_ON(1);
        return 0;
    }

    return 1;
}
NOKPROBE_SYMBOL(reenter_kprobe);

static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb)
{
    return (kcb->kprobe_status == KPROBE_HIT_SS ||
        kcb->kprobe_status == KPROBE_REENTER);
}

/*
 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 * remain disabled throughout this function.
 */
int kprobe_int3_handler(struct pt_regs *regs)
{
    kprobe_opcode_t *addr;
    struct kprobe *p;
    struct kprobe_ctlblk *kcb;

    if (user_mode(regs))
        return 0;

    addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
    /*
     * We don't want to be preempted for the entire duration of kprobe
     * processing.  Since int3 and the debug trap disable irqs, and we
     * clear IF while single-stepping, this code must not be preemptible.
     */

    kcb = get_kprobe_ctlblk();
    p = get_kprobe(addr);

    if (p) {
        if (kprobe_running()) {
            if (reenter_kprobe(p, regs, kcb))
                return 1;
        } else {
            set_current_kprobe(p, regs, kcb);
            kcb->kprobe_status = KPROBE_HIT_ACTIVE;

            /*
             * If we have no pre-handler, or it returned 0, we
             * continue with normal processing.  If we have a
             * pre-handler and it returned non-zero, the user
             * handler set up the registers to resume at another
             * instruction, so we must skip the single stepping.
             */
            if (!p->pre_handler || !p->pre_handler(p, regs))
                setup_singlestep(p, regs, kcb, 0);
            else
                reset_current_kprobe();
            return 1;
        }
    } else if (kprobe_is_ss(kcb)) {
        p = kprobe_running();
        if ((unsigned long)p->ainsn.insn < regs->ip &&
            (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) {
            /* Most probably this is the second int3 of a single step */
            resume_singlestep(p, regs, kcb);
            kprobe_post_process(p, regs, kcb);
            return 1;
        }
    }

    if (*addr != INT3_INSN_OPCODE) {
        /*
         * The breakpoint instruction was removed right
         * after we hit it.  Another cpu has removed
         * either a probepoint or a debugger breakpoint
         * at this address.  In either case, no further
         * handling of this interrupt is appropriate.
         * Back up over the (now missing) int3 and run
         * the original instruction.
         */
        regs->ip = (unsigned long)addr;
        return 1;
    } /* else: not a kprobe fault; let the kernel handle it */

    return 0;
}
NOKPROBE_SYMBOL(kprobe_int3_handler);

int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
    struct kprobe *cur = kprobe_running();
    struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();

    if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
        /* This must happen during single-stepping */
        WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
            kcb->kprobe_status != KPROBE_REENTER);
        /*
         * We are here because the instruction being single-stepped
         * caused a page fault.  We reset the current kprobe, point
         * the ip back at the probe address, and allow the page
         * fault handler to continue as a normal page fault.
         */
        regs->ip = (unsigned long)cur->addr;

        /*
         * If the IF flag was set before the kprobe hit,
         * don't touch it:
         */
        regs->flags |= kcb->kprobe_old_flags;

        if (kcb->kprobe_status == KPROBE_REENTER)
            restore_previous_kprobe(kcb);
        else
            reset_current_kprobe();
    }

    return 0;
}
NOKPROBE_SYMBOL(kprobe_fault_handler);

int __init arch_populate_kprobe_blacklist(void)
{
    return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
                     (unsigned long)__entry_text_end);
}

int __init arch_init_kprobes(void)
{
    return 0;
}

int arch_trampoline_kprobe(struct kprobe *p)
{
    return 0;
}