// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Kernel Probes Jump Optimization (Optprobes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 * Copyright (C) Hitachi Ltd., 2012
 */
#include <linux/kprobes.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
#include <linux/static_call.h>

#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
#include <asm/sections.h>
#include <asm/nospec-branch.h>

#include "common.h"

unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
    struct optimized_kprobe *op;
    struct kprobe *kp;
    long offs;
    int i;

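    /* Search back up to JMP32_INSN_SIZE bytes for an optimized kprobe whose jump covers addr. */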
    for (i = 0; i < JMP32_INSN_SIZE; i++) {
        kp = get_kprobe((void *)addr - i);
        /* This function only handles jump-optimized kprobes */
        if (kp && kprobe_optimized(kp)) {
            op = container_of(kp, struct optimized_kprobe, kp);
            /* If op->list is not empty, op is being optimized */
            if (list_empty(&op->list))
                goto found;
        }
    }

    return addr;
found:
    /*
     * If the kprobe is optimized, the original bytes may have been
     * overwritten by the jump destination address. In this case, the
     * original bytes must be recovered from the op->optinsn.copied_insn
     * buffer.
     */
    if (copy_from_kernel_nofault(buf, (void *)addr,
        MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
        return 0UL;

    if (addr == (unsigned long)kp->addr) {
        buf[0] = kp->opcode;
        memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
    } else {
        offs = addr - (unsigned long)kp->addr - 1;
        memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs);
    }

    return (unsigned long)buf;
}

static void synthesize_clac(kprobe_opcode_t *addr)
{
    /*
     * Can't be static_cpu_has() due to how objtool treats this feature bit.
     * This isn't a fast path anyway.
     */
    if (!boot_cpu_has(X86_FEATURE_SMAP))
        return;

    /* Replace the NOP3 with CLAC */
    addr[0] = 0x0f;
    addr[1] = 0x01;
    addr[2] = 0xca;
}

/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
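/* 0x48 0xbf encodes 'movabsq $imm64, %rdi'; 0xb8 encodes 'movl $imm32, %eax'. */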
static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
{
#ifdef CONFIG_X86_64
    *addr++ = 0x48;
    *addr++ = 0xbf;
#else
    *addr++ = 0xb8;
#endif
    *(unsigned long *)addr = val;
}

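/*
 * Detour template. The NOP placeholders at optprobe_template_clac/val/call
 * are patched later with CLAC, a 'mov $op, %rdi/%eax' and a call to
 * optimized_callback() when a probe is prepared.
 */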
asm (
            ".pushsection .rodata\n"
            "optprobe_template_func:\n"
            ".global optprobe_template_entry\n"
            "optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
            "       pushq $" __stringify(__KERNEL_DS) "\n"
            /* Save the 'sp - 8'; this will be fixed later. */
            "   pushq %rsp\n"
            "   pushfq\n"
            ".global optprobe_template_clac\n"
            "optprobe_template_clac:\n"
            ASM_NOP3
            SAVE_REGS_STRING
            "   movq %rsp, %rsi\n"
            ".global optprobe_template_val\n"
            "optprobe_template_val:\n"
            ASM_NOP5
            ASM_NOP5
            ".global optprobe_template_call\n"
            "optprobe_template_call:\n"
            ASM_NOP5
            /* Copy 'regs->flags' into 'regs->ss'. */
            "   movq 18*8(%rsp), %rdx\n"
            "   movq %rdx, 20*8(%rsp)\n"
            RESTORE_REGS_STRING
            /* Skip 'regs->flags' and 'regs->sp'. */
            "   addq $16, %rsp\n"
            /* And pop flags register from 'regs->ss'. */
            "   popfq\n"
#else /* CONFIG_X86_32 */
            "   pushl %ss\n"
            /* Save the 'sp - 4'; this will be fixed later. */
            "   pushl %esp\n"
            "   pushfl\n"
            ".global optprobe_template_clac\n"
            "optprobe_template_clac:\n"
            ASM_NOP3
            SAVE_REGS_STRING
            "   movl %esp, %edx\n"
            ".global optprobe_template_val\n"
            "optprobe_template_val:\n"
            ASM_NOP5
            ".global optprobe_template_call\n"
            "optprobe_template_call:\n"
            ASM_NOP5
            /* Copy 'regs->flags' into 'regs->ss'. */
            "   movl 14*4(%esp), %edx\n"
            "   movl %edx, 16*4(%esp)\n"
            RESTORE_REGS_STRING
            /* Skip 'regs->flags' and 'regs->sp'. */
            "   addl $8, %esp\n"
            /* And pop flags register from 'regs->ss'. */
            "   popfl\n"
#endif
            ".global optprobe_template_end\n"
            "optprobe_template_end:\n"
            ".popsection\n");

void optprobe_template_func(void);
STACK_FRAME_NON_STANDARD(optprobe_template_func);

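/* Byte offsets of the patch sites within the detour template above. */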
#define TMPL_CLAC_IDX \
    ((long)optprobe_template_clac - (long)optprobe_template_entry)
#define TMPL_MOVE_IDX \
    ((long)optprobe_template_val - (long)optprobe_template_entry)
#define TMPL_CALL_IDX \
    ((long)optprobe_template_call - (long)optprobe_template_entry)
#define TMPL_END_IDX \
    ((long)optprobe_template_end - (long)optprobe_template_entry)

/* Optimized kprobe callback function: called from optinsn */
static void
optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
{
    /* This is possible if op is queued for delayed unoptimization */
    if (kprobe_disabled(&op->kp))
        return;

    preempt_disable();
    if (kprobe_running()) {
        kprobes_inc_nmissed_count(&op->kp);
    } else {
        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
        /* Adjust stack pointer */
        regs->sp += sizeof(long);
        /* Save skipped registers */
        regs->cs = __KERNEL_CS;
#ifdef CONFIG_X86_32
        regs->gs = 0;
#endif
        regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
        regs->orig_ax = ~0UL;

        __this_cpu_write(current_kprobe, &op->kp);
        kcb->kprobe_status = KPROBE_HIT_ACTIVE;
        opt_pre_handler(&op->kp, regs);
        __this_cpu_write(current_kprobe, NULL);
    }
    preempt_enable();
}
NOKPROBE_SYMBOL(optimized_callback);

static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
{
    struct insn insn;
    int len = 0, ret;

    while (len < JMP32_INSN_SIZE) {
        ret = __copy_instruction(dest + len, src + len, real + len, &insn);
        if (!ret || !can_boost(&insn, src + len))
            return -EINVAL;
        len += ret;
    }
    /* Check whether the address range is reserved */
    if (ftrace_text_reserved(src, src + len - 1) ||
        alternatives_text_reserved(src, src + len - 1) ||
        jump_label_text_reserved(src, src + len - 1) ||
        static_call_text_reserved(src, src + len - 1))
        return -EBUSY;

    return len;
}

/* Check whether insn is an indirect jump */
static int __insn_is_indirect_jump(struct insn *insn)
{
    return ((insn->opcode.bytes[0] == 0xff &&
        (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
        insn->opcode.bytes[0] == 0xea); /* Segment based jump */
}

/* Check whether insn jumps into the specified address range */
static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
{
    unsigned long target = 0;

    switch (insn->opcode.bytes[0]) {
    case 0xe0:  /* loopne */
    case 0xe1:  /* loope */
    case 0xe2:  /* loop */
    case 0xe3:  /* jcxz */
    case 0xe9:  /* near relative jump */
    case 0xeb:  /* short relative jump */
        break;
    case 0x0f:
        if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
            break;
        return 0;
    default:
        if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
            break;
        return 0;
    }
    target = (unsigned long)insn->next_byte + insn->immediate.value;

    return (start <= target && target <= start + len);
}

static int insn_is_indirect_jump(struct insn *insn)
{
    int ret = __insn_is_indirect_jump(insn);

#ifdef CONFIG_RETPOLINE
    /*
     * A jump to x86_indirect_thunk_* is treated as an indirect jump.
     * Note that even with CONFIG_RETPOLINE=y, a kernel compiled with an
     * older gcc may still use indirect jumps. So we add this check in
     * addition to the indirect-jump check above rather than replacing it.
     */
    if (!ret)
        ret = insn_jump_into_range(insn,
                (unsigned long)__indirect_thunk_start,
                (unsigned long)__indirect_thunk_end -
                (unsigned long)__indirect_thunk_start);
#endif
    return ret;
}

static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
{
    unsigned char ops;

    for (; addr < eaddr; addr++) {
        if (get_kernel_nofault(ops, (void *)addr) < 0 ||
            ops != INT3_INSN_OPCODE)
            return false;
    }

    return true;
}

/* Decode the whole function to ensure no instruction jumps into the target */
static int can_optimize(unsigned long paddr)
{
    unsigned long addr, size = 0, offset = 0;
    struct insn insn;
    kprobe_opcode_t buf[MAX_INSN_SIZE];

    /* Look up the symbol containing addr */
    if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
        return 0;

    /*
     * Do not optimize in the entry code due to the unstable
     * stack handling and register setup.
     */
    if (((paddr >= (unsigned long)__entry_text_start) &&
         (paddr <  (unsigned long)__entry_text_end)))
        return 0;

    /* Check there is enough space for a relative jump. */
    if (size - offset < JMP32_INSN_SIZE)
        return 0;

    /* Decode instructions */
    addr = paddr - offset;
    while (addr < paddr - offset + size) { /* Decode until function end */
        unsigned long recovered_insn;
        int ret;

        if (search_exception_tables(addr))
            /*
             * Since some fixup code may jump into this function,
             * we can't optimize kprobes in this function.
             */
            return 0;
        recovered_insn = recover_probed_instruction(buf, addr);
        if (!recovered_insn)
            return 0;

        ret = insn_decode_kernel(&insn, (void *)recovered_insn);
        if (ret < 0)
            return 0;

        /*
         * If an unknown breakpoint is detected, it could be a padding
         * INT3 between functions. Check that all of the remaining
         * bytes are also INT3.
         */
        if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
            return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;

        /* Recover address */
        insn.kaddr = (void *)addr;
        insn.next_byte = (void *)(addr + insn.length);
        /* Check that no instruction jumps into the target */
        if (insn_is_indirect_jump(&insn) ||
            insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
                     DISP32_SIZE))
            return 0;
        addr += insn.length;
    }

    return 1;
}

/* Check whether the optimized_kprobe can actually be optimized. */
int arch_check_optimized_kprobe(struct optimized_kprobe *op)
{
    int i;
    struct kprobe *p;

    for (i = 1; i < op->optinsn.size; i++) {
        p = get_kprobe(op->kp.addr + i);
        if (p && !kprobe_disabled(p))
            return -EEXIST;
    }

    return 0;
}

/* Check whether addr is within the optimized instructions. */
int arch_within_optimized_kprobe(struct optimized_kprobe *op,
                 kprobe_opcode_t *addr)
{
    return (op->kp.addr <= addr &&
        op->kp.addr + op->optinsn.size > addr);
}

/* Free the optimized instruction slot */
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
    u8 *slot = op->optinsn.insn;
    if (slot) {
        int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;

        /* Record the perf event before freeing the slot */
        if (dirty)
            perf_event_text_poke(slot, slot, len, NULL, 0);

        free_optinsn_slot(slot, dirty);
        op->optinsn.insn = NULL;
        op->optinsn.size = 0;
    }
}

void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
{
    __arch_remove_optimized_kprobe(op, 1);
}

/*
 * Copy the instructions being replaced at the target.
 * The target instructions MUST be relocatable (checked inside).
 * This is called when a new aggr(opt)probe is allocated or reused.
 */
int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
                  struct kprobe *__unused)
{
    u8 *buf = NULL, *slot;
    int ret, len;
    long rel;

    if (!can_optimize((unsigned long)op->kp.addr))
        return -EILSEQ;

    buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
    if (!buf)
        return -ENOMEM;

    op->optinsn.insn = slot = get_optinsn_slot();
    if (!slot) {
        ret = -ENOMEM;
        goto out;
    }

    /*
     * Verify that the address gap is within the 2GB range, because
     * this uses a relative jump.
     */
    rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
    if (abs(rel) > 0x7fffffff) {
        ret = -ERANGE;
        goto err;
    }

    /* Copy arch-dep-instance from template */
    memcpy(buf, optprobe_template_entry, TMPL_END_IDX);

    /* Copy instructions into the out-of-line buffer */
    ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
                      slot + TMPL_END_IDX);
    if (ret < 0)
        goto err;
    op->optinsn.size = ret;
    len = TMPL_END_IDX + op->optinsn.size;

    synthesize_clac(buf + TMPL_CLAC_IDX);

    /* Set probe information */
    synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);

    /* Set probe function call */
    synthesize_relcall(buf + TMPL_CALL_IDX,
               slot + TMPL_CALL_IDX, optimized_callback);

    /* Set returning jmp instruction at the tail of out-of-line buffer */
    synthesize_reljump(buf + len, slot + len,
               (u8 *)op->kp.addr + op->optinsn.size);
    len += JMP32_INSN_SIZE;

    /*
     * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
     * used in __arch_remove_optimized_kprobe().
     */

    /* We have to use text_poke() for the instruction buffer because it is RO */
    perf_event_text_poke(slot, NULL, 0, buf, len);
    text_poke(slot, buf, len);

    ret = 0;
out:
    kfree(buf);
    return ret;

err:
    __arch_remove_optimized_kprobe(op, 0);
    goto out;
}

/*
 * Replace breakpoints (INT3) with relative jumps (JMP.d32).
 * Caller must hold kprobe_mutex and text_mutex.
 *
 * The caller will have installed a regular kprobe and after that issued
 * synchronize_rcu_tasks(); this ensures that the instruction(s) that live in
 * the 4 bytes after the INT3 are unused and can now be overwritten.
 */
void arch_optimize_kprobes(struct list_head *oplist)
{
    struct optimized_kprobe *op, *tmp;
    u8 insn_buff[JMP32_INSN_SIZE];

    list_for_each_entry_safe(op, tmp, oplist, list) {
        s32 rel = (s32)((long)op->optinsn.insn -
            ((long)op->kp.addr + JMP32_INSN_SIZE));

        WARN_ON(kprobe_disabled(&op->kp));

        /* Back up the instructions that will be replaced by the jump address */
        memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
               DISP32_SIZE);

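        /* Assemble a JMP.d32 to the out-of-line detour buffer. */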
        insn_buff[0] = JMP32_INSN_OPCODE;
        *(s32 *)(&insn_buff[1]) = rel;

        text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);

        list_del_init(&op->list);
    }
}

/*
 * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
 *
 * After that, we can restore the 4 bytes after the INT3 to undo what
 * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
 * unused once the INT3 lands.
 */
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
    u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
    u8 old[JMP32_INSN_SIZE];
    u8 *addr = op->kp.addr;

    memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
    memcpy(new + INT3_INSN_SIZE,
           op->optinsn.copied_insn,
           JMP32_INSN_SIZE - INT3_INSN_SIZE);

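    /* Land the INT3 first, sync, then restore the displacement bytes behind it. */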
    text_poke(addr, new, INT3_INSN_SIZE);
    text_poke_sync();
    text_poke(addr + INT3_INSN_SIZE,
          new + INT3_INSN_SIZE,
          JMP32_INSN_SIZE - INT3_INSN_SIZE);
    text_poke_sync();

    perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}

/*
 * Recover original instructions and breakpoints from relative jumps.
 * Caller must hold kprobe_mutex.
 */
extern void arch_unoptimize_kprobes(struct list_head *oplist,
                    struct list_head *done_list)
{
    struct optimized_kprobe *op, *tmp;

    list_for_each_entry_safe(op, tmp, oplist, list) {
        arch_unoptimize_kprobe(op);
        list_move(&op->list, done_list);
    }
}

int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
{
    struct optimized_kprobe *op;

    if (p->flags & KPROBE_FLAG_OPTIMIZED) {
        /* This kprobe is really able to run the optimized path. */
        op = container_of(p, struct optimized_kprobe, kp);
        /* Detour through copied instructions */
        regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
        if (!reenter)
            reset_current_kprobe();
        return 1;
    }
    return 0;
}
NOKPROBE_SYMBOL(setup_detour_execution);