0001 // SPDX-License-Identifier: GPL-2.0-only
0002 #define pr_fmt(fmt) "SMP alternatives: " fmt
0003 
0004 #include <linux/module.h>
0005 #include <linux/sched.h>
0006 #include <linux/perf_event.h>
0007 #include <linux/mutex.h>
0008 #include <linux/list.h>
0009 #include <linux/stringify.h>
0010 #include <linux/highmem.h>
0011 #include <linux/mm.h>
0012 #include <linux/vmalloc.h>
0013 #include <linux/memory.h>
0014 #include <linux/stop_machine.h>
0015 #include <linux/slab.h>
0016 #include <linux/kdebug.h>
0017 #include <linux/kprobes.h>
0018 #include <linux/mmu_context.h>
0019 #include <linux/bsearch.h>
0020 #include <linux/sync_core.h>
0021 #include <asm/text-patching.h>
0022 #include <asm/alternative.h>
0023 #include <asm/sections.h>
0024 #include <asm/mce.h>
0025 #include <asm/nmi.h>
0026 #include <asm/cacheflush.h>
0027 #include <asm/tlbflush.h>
0028 #include <asm/insn.h>
0029 #include <asm/io.h>
0030 #include <asm/fixmap.h>
0031 #include <asm/paravirt.h>
0032 #include <asm/asm-prototypes.h>
0033 
0034 int __read_mostly alternatives_patched;
0035 
0036 EXPORT_SYMBOL_GPL(alternatives_patched);
0037 
0038 #define MAX_PATCH_LEN (255-1)
0039 
0040 static int __initdata_or_module debug_alternative;
0041 
0042 static int __init debug_alt(char *str)
0043 {
0044     debug_alternative = 1;
0045     return 1;
0046 }
0047 __setup("debug-alternative", debug_alt);
0048 
0049 static int noreplace_smp;
0050 
0051 static int __init setup_noreplace_smp(char *str)
0052 {
0053     noreplace_smp = 1;
0054     return 1;
0055 }
0056 __setup("noreplace-smp", setup_noreplace_smp);
0057 
0058 #define DPRINTK(fmt, args...)                       \
0059 do {                                    \
0060     if (debug_alternative)                      \
0061         printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);        \
0062 } while (0)
0063 
0064 #define DUMP_BYTES(buf, len, fmt, args...)              \
0065 do {                                    \
0066     if (unlikely(debug_alternative)) {              \
0067         int j;                          \
0068                                     \
0069         if (!(len))                     \
0070             break;                      \
0071                                     \
0072         printk(KERN_DEBUG pr_fmt(fmt), ##args);         \
0073         for (j = 0; j < (len) - 1; j++)             \
0074             printk(KERN_CONT "%02hhx ", buf[j]);        \
0075         printk(KERN_CONT "%02hhx\n", buf[j]);           \
0076     }                               \
0077 } while (0)
0078 
0079 static const unsigned char x86nops[] =
0080 {
0081     BYTES_NOP1,
0082     BYTES_NOP2,
0083     BYTES_NOP3,
0084     BYTES_NOP4,
0085     BYTES_NOP5,
0086     BYTES_NOP6,
0087     BYTES_NOP7,
0088     BYTES_NOP8,
0089 };
0090 
0091 const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
0092 {
0093     NULL,
0094     x86nops,
0095     x86nops + 1,
0096     x86nops + 1 + 2,
0097     x86nops + 1 + 2 + 3,
0098     x86nops + 1 + 2 + 3 + 4,
0099     x86nops + 1 + 2 + 3 + 4 + 5,
0100     x86nops + 1 + 2 + 3 + 4 + 5 + 6,
0101     x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
0102 };
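
/*
 * Layout note: x86nops[] above is simply the 1- to 8-byte NOP encodings
 * concatenated, so x86_nops[n] points at the start of the n-byte NOP; the
 * offset of each entry is the sum of the lengths of all shorter NOPs
 * (1 + 2 + ... + (n - 1)).
 */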
0103 
0104 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
0105 static void __init_or_module add_nops(void *insns, unsigned int len)
0106 {
0107     while (len > 0) {
0108         unsigned int noplen = len;
0109         if (noplen > ASM_NOP_MAX)
0110             noplen = ASM_NOP_MAX;
0111         memcpy(insns, x86_nops[noplen], noplen);
0112         insns += noplen;
0113         len -= noplen;
0114     }
0115 }
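
/*
 * For illustration: with ASM_NOP_MAX == 8 (as on x86), add_nops(buf, 11)
 * emits the 8-byte NOP from x86_nops[8] followed by the 3-byte NOP from
 * x86_nops[3], rather than a run of single-byte NOPs.
 */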
0116 
0117 extern s32 __retpoline_sites[], __retpoline_sites_end[];
0118 extern s32 __return_sites[], __return_sites_end[];
0119 extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
0120 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
0121 extern s32 __smp_locks[], __smp_locks_end[];
0122 void text_poke_early(void *addr, const void *opcode, size_t len);
0123 
0124 /*
0125  * Are we looking at a near JMP with a 1- or 4-byte displacement?
0126  */
0127 static inline bool is_jmp(const u8 opcode)
0128 {
0129     return opcode == 0xeb || opcode == 0xe9;
0130 }
0131 
0132 static void __init_or_module
0133 recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
0134 {
0135     u8 *next_rip, *tgt_rip;
0136     s32 n_dspl, o_dspl;
0137     int repl_len;
0138 
0139     if (a->replacementlen != 5)
0140         return;
0141 
0142     o_dspl = *(s32 *)(insn_buff + 1);
0143 
0144     /* next_rip of the replacement JMP */
0145     next_rip = repl_insn + a->replacementlen;
0146     /* target rip of the replacement JMP */
0147     tgt_rip  = next_rip + o_dspl;
0148     n_dspl = tgt_rip - orig_insn;
0149 
0150     DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
0151 
0152     if (tgt_rip - orig_insn >= 0) {
0153         if (n_dspl - 2 <= 127)
0154             goto two_byte_jmp;
0155         else
0156             goto five_byte_jmp;
0157     /* negative offset */
0158     } else {
0159         if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
0160             goto two_byte_jmp;
0161         else
0162             goto five_byte_jmp;
0163     }
0164 
0165 two_byte_jmp:
0166     n_dspl -= 2;
0167 
0168     insn_buff[0] = 0xeb;
0169     insn_buff[1] = (s8)n_dspl;
0170     add_nops(insn_buff + 2, 3);
0171 
0172     repl_len = 2;
0173     goto done;
0174 
0175 five_byte_jmp:
0176     n_dspl -= 5;
0177 
0178     insn_buff[0] = 0xe9;
0179     *(s32 *)&insn_buff[1] = n_dspl;
0180 
0181     repl_len = 5;
0182 
0183 done:
0184 
0185     DPRINTK("final displ: 0x%08x, JMP 0x%lx",
0186         n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
0187 }
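
/*
 * Worked example for recompute_jump(): a 5-byte "JMP target" in the
 * replacement section encodes target - (repl_insn + 5) as its displacement.
 * Once the bytes are copied over orig_insn the same target needs
 * target - (orig_insn + size), which is what n_dspl holds after the size of
 * the emitted JMP (2 or 5 bytes) is subtracted at the labels above. If the
 * result fits in a signed byte, the JMP is shrunk to the 2-byte 0xeb form
 * and the remaining three bytes are filled with NOPs.
 */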
0188 
0189 /*
0190  * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
0191  *
0192  * @instr: instruction byte stream
0193  * @instrlen: length of the above
0194  * @off: offset within @instr where the first NOP has been detected
0195  *
0196  * Return: number of NOPs found (and replaced).
0197  */
0198 static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
0199 {
0200     unsigned long flags;
0201     int i = off, nnops;
0202 
0203     while (i < instrlen) {
0204         if (instr[i] != 0x90)
0205             break;
0206 
0207         i++;
0208     }
0209 
0210     nnops = i - off;
0211 
0212     if (nnops <= 1)
0213         return nnops;
0214 
0215     local_irq_save(flags);
0216     add_nops(instr + off, nnops);
0217     local_irq_restore(flags);
0218 
0219     DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
0220 
0221     return nnops;
0222 }
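
/*
 * Example: if four consecutive 0x90 bytes are found at @off, they are
 * rewritten in place (with IRQs disabled) into the single 4-byte NOP from
 * x86_nops[4], and the function returns 4 so the caller can skip past them.
 */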
0223 
0224 /*
0225  * "noinline" to cause control flow change and thus invalidate I$ and
0226  * cause refetch after modification.
0227  */
0228 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
0229 {
0230     struct insn insn;
0231     int i = 0;
0232 
0233     /*
0234      * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
0235      * ones.
0236      */
0237     for (;;) {
0238         if (insn_decode_kernel(&insn, &instr[i]))
0239             return;
0240 
0241         /*
0242          * See if this and any potentially following NOPs can be
0243          * optimized.
0244          */
0245         if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
0246             i += optimize_nops_range(instr, len, i);
0247         else
0248             i += insn.length;
0249 
0250         if (i >= len)
0251             return;
0252     }
0253 }
0254 
0255 /*
0256  * Replace instructions with better alternatives for this CPU type. This runs
0257  * before SMP is initialized to avoid SMP problems with self modifying code.
0258  * This implies that asymmetric systems where APs have fewer capabilities than
0259  * the boot processor are not handled. Tough. Make sure you disable such
0260  * features by hand.
0261  *
0262  * Marked "noinline" to cause control flow change and thus insn cache
0263  * to refetch changed I$ lines.
0264  */
0265 void __init_or_module noinline apply_alternatives(struct alt_instr *start,
0266                           struct alt_instr *end)
0267 {
0268     struct alt_instr *a;
0269     u8 *instr, *replacement;
0270     u8 insn_buff[MAX_PATCH_LEN];
0271 
0272     DPRINTK("alt table %px, -> %px", start, end);
0273     /*
0274      * The scan order should be from start to end. A later scanned
0275      * alternative code can overwrite previously scanned alternative code.
0276      * Some kernel functions (e.g. memcpy, memset, etc) use this order to
0277      * patch code.
0278      *
0279      * So be careful if you want to change the scan order to any other
0280      * order.
0281      */
0282     for (a = start; a < end; a++) {
0283         int insn_buff_sz = 0;
0284         /* Mask away "NOT" flag bit for feature to test. */
0285         u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
0286 
0287         instr = (u8 *)&a->instr_offset + a->instr_offset;
0288         replacement = (u8 *)&a->repl_offset + a->repl_offset;
0289         BUG_ON(a->instrlen > sizeof(insn_buff));
0290         BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
0291 
0292         /*
0293          * Patch if either:
0294          * - feature is present
0295      * - feature is not present but ALTINSTR_FLAG_INV is set, meaning:
0296      *   patch if the feature is *NOT* present.
0297          */
0298         if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
0299             goto next;
0300 
0301         DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
0302             (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
0303             feature >> 5,
0304             feature & 0x1f,
0305             instr, instr, a->instrlen,
0306             replacement, a->replacementlen);
0307 
0308         DUMP_BYTES(instr, a->instrlen, "%px:   old_insn: ", instr);
0309         DUMP_BYTES(replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
0310 
0311         memcpy(insn_buff, replacement, a->replacementlen);
0312         insn_buff_sz = a->replacementlen;
0313 
0314         /*
0315          * 0xe8 is a relative jump; fix the offset.
0316          *
0317          * Instruction length is checked before the opcode to avoid
0318          * accessing uninitialized bytes for zero-length replacements.
0319          */
0320         if (a->replacementlen == 5 && *insn_buff == 0xe8) {
0321             *(s32 *)(insn_buff + 1) += replacement - instr;
0322             DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
0323                 *(s32 *)(insn_buff + 1),
0324                 (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
0325         }
0326 
0327         if (a->replacementlen && is_jmp(replacement[0]))
0328             recompute_jump(a, instr, replacement, insn_buff);
0329 
0330         for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
0331             insn_buff[insn_buff_sz] = 0x90;
0332 
0333         DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
0334 
0335         text_poke_early(instr, insn_buff, insn_buff_sz);
0336 
0337 next:
0338         optimize_nops(instr, a->instrlen);
0339     }
0340 }
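
/*
 * For reference, each struct alt_instr emitted by the ALTERNATIVE() macros
 * stores the original instruction and its replacement as relative offsets
 * (instr_offset, repl_offset) plus the feature bit and the two lengths. The
 * loop above resolves those offsets to absolute addresses, copies the
 * replacement into insn_buff, fixes up relative CALL/JMP displacements, pads
 * with 0x90 up to instrlen, and finally pokes the buffer over the original
 * instruction with text_poke_early().
 */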
0341 
0342 #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
0343 
0344 /*
0345  * CALL/JMP *%\reg
0346  */
0347 static int emit_indirect(int op, int reg, u8 *bytes)
0348 {
0349     int i = 0;
0350     u8 modrm;
0351 
0352     switch (op) {
0353     case CALL_INSN_OPCODE:
0354         modrm = 0x10; /* Reg = 2; CALL r/m */
0355         break;
0356 
0357     case JMP32_INSN_OPCODE:
0358         modrm = 0x20; /* Reg = 4; JMP r/m */
0359         break;
0360 
0361     default:
0362         WARN_ON_ONCE(1);
0363         return -1;
0364     }
0365 
0366     if (reg >= 8) {
0367         bytes[i++] = 0x41; /* REX.B prefix */
0368         reg -= 8;
0369     }
0370 
0371     modrm |= 0xc0; /* Mod = 3 */
0372     modrm += reg;
0373 
0374     bytes[i++] = 0xff; /* opcode */
0375     bytes[i++] = modrm;
0376 
0377     return i;
0378 }
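
/*
 * Encoding examples: emit_indirect(CALL_INSN_OPCODE, 0, buf) emits "ff d0"
 * (CALL *%rax), emit_indirect(CALL_INSN_OPCODE, 11, buf) emits "41 ff d3"
 * (REX.B prefix + CALL *%r11), and emit_indirect(JMP32_INSN_OPCODE, 0, buf)
 * emits "ff e0" (JMP *%rax).
 */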
0379 
0380 /*
0381  * Rewrite the compiler generated retpoline thunk calls.
0382  *
0383  * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
0384  * indirect instructions, avoiding the extra indirection.
0385  *
0386  * For example, convert:
0387  *
0388  *   CALL __x86_indirect_thunk_\reg
0389  *
0390  * into:
0391  *
0392  *   CALL *%\reg
0393  *
0394  * It also tries to inline spectre_v2=retpoline,lfence when size permits.
0395  */
0396 static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
0397 {
0398     retpoline_thunk_t *target;
0399     int reg, ret, i = 0;
0400     u8 op, cc;
0401 
0402     target = addr + insn->length + insn->immediate.value;
0403     reg = target - __x86_indirect_thunk_array;
0404 
0405     if (WARN_ON_ONCE(reg & ~0xf))
0406         return -1;
0407 
0408     /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
0409     BUG_ON(reg == 4);
0410 
0411     if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
0412         !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE))
0413         return -1;
0414 
0415     op = insn->opcode.bytes[0];
0416 
0417     /*
0418      * Convert:
0419      *
0420      *   Jcc.d32 __x86_indirect_thunk_\reg
0421      *
0422      * into:
0423      *
0424      *   Jncc.d8 1f
0425      *   [ LFENCE ]
0426      *   JMP *%\reg
0427      *   [ NOP ]
0428      * 1:
0429      */
0430     /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
0431     if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
0432         cc = insn->opcode.bytes[1] & 0xf;
0433         cc ^= 1; /* invert condition */
0434 
0435         bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
0436         bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
0437 
0438         /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
0439         op = JMP32_INSN_OPCODE;
0440     }
0441 
0442     /*
0443      * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
0444      */
0445     if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
0446         bytes[i++] = 0x0f;
0447         bytes[i++] = 0xae;
0448         bytes[i++] = 0xe8; /* LFENCE */
0449     }
0450 
0451     ret = emit_indirect(op, reg, bytes + i);
0452     if (ret < 0)
0453         return ret;
0454     i += ret;
0455 
0456     for (; i < insn->length;)
0457         bytes[i++] = BYTES_NOP1;
0458 
0459     return i;
0460 }
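
/*
 * Worked example with retpolines disabled: the 5-byte
 * "call __x86_indirect_thunk_r11" (e8 <rel32>) becomes "41 ff d3"
 * (CALL *%r11) padded with two single-byte NOPs, and the 6-byte
 * "jne __x86_indirect_thunk_rax" (0f 85 <rel32>) becomes "74 04" (JE over
 * the next four bytes), "ff e0" (JMP *%rax) and two NOPs.
 */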
0461 
0462 /*
0463  * Generated by 'objtool --retpoline'.
0464  */
0465 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
0466 {
0467     s32 *s;
0468 
0469     for (s = start; s < end; s++) {
0470         void *addr = (void *)s + *s;
0471         struct insn insn;
0472         int len, ret;
0473         u8 bytes[16];
0474         u8 op1, op2;
0475 
0476         ret = insn_decode_kernel(&insn, addr);
0477         if (WARN_ON_ONCE(ret < 0))
0478             continue;
0479 
0480         op1 = insn.opcode.bytes[0];
0481         op2 = insn.opcode.bytes[1];
0482 
0483         switch (op1) {
0484         case CALL_INSN_OPCODE:
0485         case JMP32_INSN_OPCODE:
0486             break;
0487 
0488         case 0x0f: /* escape */
0489             if (op2 >= 0x80 && op2 <= 0x8f)
0490                 break;
0491             fallthrough;
0492         default:
0493             WARN_ON_ONCE(1);
0494             continue;
0495         }
0496 
0497         DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
0498             addr, addr, insn.length,
0499             addr + insn.length + insn.immediate.value);
0500 
0501         len = patch_retpoline(addr, &insn, bytes);
0502         if (len == insn.length) {
0503             optimize_nops(bytes, len);
0504             DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
0505             DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
0506             text_poke_early(addr, bytes, len);
0507         }
0508     }
0509 }
0510 
0511 #ifdef CONFIG_RETHUNK
0512 /*
0513  * Rewrite the compiler generated return thunk tail-calls.
0514  *
0515  * For example, convert:
0516  *
0517  *   JMP __x86_return_thunk
0518  *
0519  * into:
0520  *
0521  *   RET
0522  */
0523 static int patch_return(void *addr, struct insn *insn, u8 *bytes)
0524 {
0525     int i = 0;
0526 
0527     if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
0528         return -1;
0529 
0530     bytes[i++] = RET_INSN_OPCODE;
0531 
0532     for (; i < insn->length;)
0533         bytes[i++] = INT3_INSN_OPCODE;
0534 
0535     return i;
0536 }
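
/*
 * Example: with X86_FEATURE_RETHUNK disabled, the 5-byte
 * "jmp __x86_return_thunk" (e9 <rel32>) is rewritten to "c3 cc cc cc cc",
 * i.e. a bare RET followed by INT3 padding.
 */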
0537 
0538 void __init_or_module noinline apply_returns(s32 *start, s32 *end)
0539 {
0540     s32 *s;
0541 
0542     for (s = start; s < end; s++) {
0543         void *dest = NULL, *addr = (void *)s + *s;
0544         struct insn insn;
0545         int len, ret;
0546         u8 bytes[16];
0547         u8 op;
0548 
0549         ret = insn_decode_kernel(&insn, addr);
0550         if (WARN_ON_ONCE(ret < 0))
0551             continue;
0552 
0553         op = insn.opcode.bytes[0];
0554         if (op == JMP32_INSN_OPCODE)
0555             dest = addr + insn.length + insn.immediate.value;
0556 
0557         if (__static_call_fixup(addr, op, dest) ||
0558             WARN_ONCE(dest != &__x86_return_thunk,
0559                   "missing return thunk: %pS-%pS: %*ph",
0560                   addr, dest, 5, addr))
0561             continue;
0562 
0563         DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
0564             addr, addr, insn.length,
0565             addr + insn.length + insn.immediate.value);
0566 
0567         len = patch_return(addr, &insn, bytes);
0568         if (len == insn.length) {
0569             DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
0570             DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
0571             text_poke_early(addr, bytes, len);
0572         }
0573     }
0574 }
0575 #else
0576 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
0577 #endif /* CONFIG_RETHUNK */
0578 
0579 #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
0580 
0581 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
0582 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
0583 
0584 #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
0585 
0586 #ifdef CONFIG_X86_KERNEL_IBT
0587 
0588 /*
0589  * Generated by: objtool --ibt
0590  */
0591 void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
0592 {
0593     s32 *s;
0594 
0595     for (s = start; s < end; s++) {
0596         u32 endbr, poison = gen_endbr_poison();
0597         void *addr = (void *)s + *s;
0598 
0599         if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
0600             continue;
0601 
0602         if (WARN_ON_ONCE(!is_endbr(endbr)))
0603             continue;
0604 
0605         DPRINTK("ENDBR at: %pS (%px)", addr, addr);
0606 
0607         /*
0608          * When we have IBT, the lack of ENDBR will trigger #CP
0609          */
0610         DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
0611         DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
0612         text_poke_early(addr, &poison, 4);
0613     }
0614 }
0615 
0616 #else
0617 
0618 void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { }
0619 
0620 #endif /* CONFIG_X86_KERNEL_IBT */
0621 
0622 #ifdef CONFIG_SMP
0623 static void alternatives_smp_lock(const s32 *start, const s32 *end,
0624                   u8 *text, u8 *text_end)
0625 {
0626     const s32 *poff;
0627 
0628     for (poff = start; poff < end; poff++) {
0629         u8 *ptr = (u8 *)poff + *poff;
0630 
0631         if (!*poff || ptr < text || ptr >= text_end)
0632             continue;
0633         /* turn DS segment override prefix into lock prefix */
0634         if (*ptr == 0x3e)
0635             text_poke(ptr, ((unsigned char []){0xf0}), 1);
0636     }
0637 }
0638 
0639 static void alternatives_smp_unlock(const s32 *start, const s32 *end,
0640                     u8 *text, u8 *text_end)
0641 {
0642     const s32 *poff;
0643 
0644     for (poff = start; poff < end; poff++) {
0645         u8 *ptr = (u8 *)poff + *poff;
0646 
0647         if (!*poff || ptr < text || ptr >= text_end)
0648             continue;
0649         /* turn lock prefix into DS segment override prefix */
0650         if (*ptr == 0xf0)
0651             text_poke(ptr, ((unsigned char []){0x3E}), 1);
0652     }
0653 }
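
/*
 * Example: "lock; incl (%rax)" is assembled as "f0 ff 00" and the address of
 * the f0 byte is recorded in __smp_locks. On a uniprocessor boot,
 * alternatives_smp_unlock() turns the 0xf0 (LOCK) byte into 0x3e (a DS
 * segment override that is a no-op here), avoiding the cost of the bus lock;
 * alternatives_enable_smp() turns it back into 0xf0 via
 * alternatives_smp_lock() once a second CPU is brought up.
 */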
0654 
0655 struct smp_alt_module {
0656     /* the module these lock sites belong to; NULL for the core kernel */
0657     struct module   *mod;
0658     char        *name;
0659 
0660     /* ptrs to lock prefixes */
0661     const s32   *locks;
0662     const s32   *locks_end;
0663 
0664     /* .text segment, needed to avoid patching init code ;) */
0665     u8      *text;
0666     u8      *text_end;
0667 
0668     struct list_head next;
0669 };
0670 static LIST_HEAD(smp_alt_modules);
0671 static bool uniproc_patched = false;    /* protected by text_mutex */
0672 
0673 void __init_or_module alternatives_smp_module_add(struct module *mod,
0674                           char *name,
0675                           void *locks, void *locks_end,
0676                           void *text,  void *text_end)
0677 {
0678     struct smp_alt_module *smp;
0679 
0680     mutex_lock(&text_mutex);
0681     if (!uniproc_patched)
0682         goto unlock;
0683 
0684     if (num_possible_cpus() == 1)
0685         /* Don't bother remembering, we'll never have to undo it. */
0686         goto smp_unlock;
0687 
0688     smp = kzalloc(sizeof(*smp), GFP_KERNEL);
0689     if (!smp)
0690         /* we'll run the (safe but slow) SMP code then ... */
0691         goto unlock;
0692 
0693     smp->mod    = mod;
0694     smp->name   = name;
0695     smp->locks  = locks;
0696     smp->locks_end  = locks_end;
0697     smp->text   = text;
0698     smp->text_end   = text_end;
0699     DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
0700         smp->locks, smp->locks_end,
0701         smp->text, smp->text_end, smp->name);
0702 
0703     list_add_tail(&smp->next, &smp_alt_modules);
0704 smp_unlock:
0705     alternatives_smp_unlock(locks, locks_end, text, text_end);
0706 unlock:
0707     mutex_unlock(&text_mutex);
0708 }
0709 
0710 void __init_or_module alternatives_smp_module_del(struct module *mod)
0711 {
0712     struct smp_alt_module *item;
0713 
0714     mutex_lock(&text_mutex);
0715     list_for_each_entry(item, &smp_alt_modules, next) {
0716         if (mod != item->mod)
0717             continue;
0718         list_del(&item->next);
0719         kfree(item);
0720         break;
0721     }
0722     mutex_unlock(&text_mutex);
0723 }
0724 
0725 void alternatives_enable_smp(void)
0726 {
0727     struct smp_alt_module *mod;
0728 
0729     /* Why bother if there are no other CPUs? */
0730     BUG_ON(num_possible_cpus() == 1);
0731 
0732     mutex_lock(&text_mutex);
0733 
0734     if (uniproc_patched) {
0735         pr_info("switching to SMP code\n");
0736         BUG_ON(num_online_cpus() != 1);
0737         clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
0738         clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
0739         list_for_each_entry(mod, &smp_alt_modules, next)
0740             alternatives_smp_lock(mod->locks, mod->locks_end,
0741                           mod->text, mod->text_end);
0742         uniproc_patched = false;
0743     }
0744     mutex_unlock(&text_mutex);
0745 }
0746 
0747 /*
0748  * Return 1 if the address range is reserved for SMP-alternatives.
0749  * Must hold text_mutex.
0750  */
0751 int alternatives_text_reserved(void *start, void *end)
0752 {
0753     struct smp_alt_module *mod;
0754     const s32 *poff;
0755     u8 *text_start = start;
0756     u8 *text_end = end;
0757 
0758     lockdep_assert_held(&text_mutex);
0759 
0760     list_for_each_entry(mod, &smp_alt_modules, next) {
0761         if (mod->text > text_end || mod->text_end < text_start)
0762             continue;
0763         for (poff = mod->locks; poff < mod->locks_end; poff++) {
0764             const u8 *ptr = (const u8 *)poff + *poff;
0765 
0766             if (text_start <= ptr && text_end > ptr)
0767                 return 1;
0768         }
0769     }
0770 
0771     return 0;
0772 }
0773 #endif /* CONFIG_SMP */
0774 
0775 #ifdef CONFIG_PARAVIRT
0776 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
0777                      struct paravirt_patch_site *end)
0778 {
0779     struct paravirt_patch_site *p;
0780     char insn_buff[MAX_PATCH_LEN];
0781 
0782     for (p = start; p < end; p++) {
0783         unsigned int used;
0784 
0785         BUG_ON(p->len > MAX_PATCH_LEN);
0786         /* prep the buffer with the original instructions */
0787         memcpy(insn_buff, p->instr, p->len);
0788         used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
0789 
0790         BUG_ON(used > p->len);
0791 
0792         /* Pad the rest with nops */
0793         add_nops(insn_buff + used, p->len - used);
0794         text_poke_early(p->instr, insn_buff, p->len);
0795     }
0796 }
0797 extern struct paravirt_patch_site __start_parainstructions[],
0798     __stop_parainstructions[];
0799 #endif  /* CONFIG_PARAVIRT */
0800 
0801 /*
0802  * Self-test for the INT3 based CALL emulation code.
0803  *
0804  * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
0805  * properly and that there is a stack gap between the INT3 frame and the
0806  * previous context. Without this gap doing a virtual PUSH on the interrupted
0807  * stack would corrupt the INT3 IRET frame.
0808  *
0809  * See entry_{32,64}.S for more details.
0810  */
0811 
0812 /*
0813  * We define the int3_magic() function in assembly to control the calling
0814  * convention such that we can 'call' it from assembly.
0815  */
0816 
0817 extern void int3_magic(unsigned int *ptr); /* defined in asm */
0818 
0819 asm (
0820 "   .pushsection    .init.text, \"ax\", @progbits\n"
0821 "   .type       int3_magic, @function\n"
0822 "int3_magic:\n"
0823     ANNOTATE_NOENDBR
0824 "   movl    $1, (%" _ASM_ARG1 ")\n"
0825     ASM_RET
0826 "   .size       int3_magic, .-int3_magic\n"
0827 "   .popsection\n"
0828 );
0829 
0830 extern void int3_selftest_ip(void); /* defined in asm below */
0831 
0832 static int __init
0833 int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
0834 {
0835     unsigned long selftest = (unsigned long)&int3_selftest_ip;
0836     struct die_args *args = data;
0837     struct pt_regs *regs = args->regs;
0838 
0839     OPTIMIZER_HIDE_VAR(selftest);
0840 
0841     if (!regs || user_mode(regs))
0842         return NOTIFY_DONE;
0843 
0844     if (val != DIE_INT3)
0845         return NOTIFY_DONE;
0846 
0847     if (regs->ip - INT3_INSN_SIZE != selftest)
0848         return NOTIFY_DONE;
0849 
0850     int3_emulate_call(regs, (unsigned long)&int3_magic);
0851     return NOTIFY_STOP;
0852 }
0853 
0854 /* Must be noinline to ensure uniqueness of int3_selftest_ip. */
0855 static noinline void __init int3_selftest(void)
0856 {
0857     static __initdata struct notifier_block int3_exception_nb = {
0858         .notifier_call  = int3_exception_notify,
0859         .priority   = INT_MAX-1, /* last */
0860     };
0861     unsigned int val = 0;
0862 
0863     BUG_ON(register_die_notifier(&int3_exception_nb));
0864 
0865     /*
0866      * Basically: int3_magic(&val); but really complicated :-)
0867      *
0868      * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
0869      * notifier above will emulate CALL for us.
0870      */
0871     asm volatile ("int3_selftest_ip:\n\t"
0872               ANNOTATE_NOENDBR
0873               "    int3; nop; nop; nop; nop\n\t"
0874               : ASM_CALL_CONSTRAINT
0875               : __ASM_SEL_RAW(a, D) (&val)
0876               : "memory");
0877 
0878     BUG_ON(val != 1);
0879 
0880     unregister_die_notifier(&int3_exception_nb);
0881 }
0882 
0883 void __init alternative_instructions(void)
0884 {
0885     int3_selftest();
0886 
0887     /*
0888      * The patching is not fully atomic, so try to avoid local
0889      * interruptions that might execute the to-be-patched code.
0890      * Other CPUs are not running.
0891      */
0892     stop_nmi();
0893 
0894     /*
0895      * Don't stop machine check exceptions while patching.
0896      * MCEs only happen when something got corrupted and in this
0897      * case we must do something about the corruption.
0898      * Ignoring it is worse than an unlikely patching race.
0899      * Also machine checks tend to be broadcast and if one CPU
0900      * goes into machine check the others follow quickly, so we don't
0901      * expect a machine check to cause undue problems during code
0902      * patching.
0903      */
0904 
0905     /*
0906      * Paravirt patching and alternative patching can be combined to
0907      * replace a function call with a short direct code sequence (e.g.
0908      * by setting a constant return value instead of doing that in an
0909      * external function).
0910      * In order to make this work the following sequence is required:
0911      * 1. set (artificial) features depending on used paravirt
0912      *    functions which can later influence alternative patching
0913      * 2. apply paravirt patching (generally replacing an indirect
0914      *    function call with a direct one)
0915      * 3. apply alternative patching (e.g. replacing a direct function
0916      *    call with a custom code sequence)
0917      * Doing paravirt patching after alternative patching would clobber
0918      * the optimization of the custom code with a function call again.
0919      */
0920     paravirt_set_cap();
0921 
0922     /*
0923      * First patch paravirt functions, such that we overwrite the indirect
0924      * call with the direct call.
0925      */
0926     apply_paravirt(__parainstructions, __parainstructions_end);
0927 
0928     /*
0929      * Rewrite the retpolines, must be done before alternatives since
0930      * those can rewrite the retpoline thunks.
0931      */
0932     apply_retpolines(__retpoline_sites, __retpoline_sites_end);
0933     apply_returns(__return_sites, __return_sites_end);
0934 
0935     /*
0936      * Then patch alternatives, such that those paravirt calls that are in
0937      * alternatives can be overwritten by their immediate fragments.
0938      */
0939     apply_alternatives(__alt_instructions, __alt_instructions_end);
0940 
0941     apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
0942 
0943 #ifdef CONFIG_SMP
0944     /* Patch to UP if other cpus not imminent. */
0945     if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
0946         uniproc_patched = true;
0947         alternatives_smp_module_add(NULL, "core kernel",
0948                         __smp_locks, __smp_locks_end,
0949                         _text, _etext);
0950     }
0951 
0952     if (!uniproc_patched || num_possible_cpus() == 1) {
0953         free_init_pages("SMP alternatives",
0954                 (unsigned long)__smp_locks,
0955                 (unsigned long)__smp_locks_end);
0956     }
0957 #endif
0958 
0959     restart_nmi();
0960     alternatives_patched = 1;
0961 }
0962 
0963 /**
0964  * text_poke_early - Update instructions on a live kernel at boot time
0965  * @addr: address to modify
0966  * @opcode: source of the copy
0967  * @len: length to copy
0968  *
0969  * When you use this code to patch more than one byte of an instruction
0970  * you need to make sure that other CPUs cannot execute this code in parallel.
0971  * Also, no thread may be preempted in the middle of these
0972  * instructions. And on the local CPU you need to be protected against NMI or
0973  * MCE handlers seeing an inconsistent instruction while you patch.
0974  */
0975 void __init_or_module text_poke_early(void *addr, const void *opcode,
0976                       size_t len)
0977 {
0978     unsigned long flags;
0979 
0980     if (boot_cpu_has(X86_FEATURE_NX) &&
0981         is_module_text_address((unsigned long)addr)) {
0982         /*
0983          * Modules text is marked initially as non-executable, so the
0984          * code cannot be running and speculative code-fetches are
0985          * prevented. Just change the code.
0986          */
0987         memcpy(addr, opcode, len);
0988     } else {
0989         local_irq_save(flags);
0990         memcpy(addr, opcode, len);
0991         local_irq_restore(flags);
0992         sync_core();
0993 
0994         /*
0995          * Could also do a CLFLUSH here to speed up CPU recovery; but
0996          * that causes hangs on some VIA CPUs.
0997          */
0998     }
0999 }
1000 
1001 typedef struct {
1002     struct mm_struct *mm;
1003 } temp_mm_state_t;
1004 
1005 /*
1006  * Using a temporary mm makes it possible to set up temporary mappings that
1007  * are not accessible by other CPUs. Such mappings are needed to perform
1008  * sensitive memory writes that override the kernel memory protections (e.g.,
1009  * W^X) without exposing the temporary page-table mappings that are required
1010  * for these write operations to other CPUs. Using a temporary mm also avoids
1011  * the need for TLB shootdowns when the mapping is torn down.
1012  *
1013  * Context: The temporary mm needs to be used exclusively by a single core. To
1014  *          harden security, IRQs must be disabled while the temporary mm is
1015  *          loaded, thereby preventing interrupt handler bugs from overriding
1016  *          the kernel memory protection.
1017  */
1018 static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1019 {
1020     temp_mm_state_t temp_state;
1021 
1022     lockdep_assert_irqs_disabled();
1023 
1024     /*
1025      * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1026      * with a stale address space WITHOUT being in lazy mode after
1027      * restoring the previous mm.
1028      */
1029     if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1030         leave_mm(smp_processor_id());
1031 
1032     temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1033     switch_mm_irqs_off(NULL, mm, current);
1034 
1035     /*
1036      * If breakpoints are enabled, disable them while the temporary mm is
1037      * used. Userspace might set up watchpoints on addresses that are used
1038      * in the temporary mm, which would lead to wrong signals being sent or
1039      * crashes.
1040      *
1041      * Note that breakpoints are not disabled selectively, which also causes
1042      * kernel breakpoints (e.g., perf's) to be disabled. This might be
1043      * undesirable, but still seems reasonable as the code that runs in the
1044      * temporary mm should be short.
1045      */
1046     if (hw_breakpoint_active())
1047         hw_breakpoint_disable();
1048 
1049     return temp_state;
1050 }
1051 
1052 static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
1053 {
1054     lockdep_assert_irqs_disabled();
1055     switch_mm_irqs_off(NULL, prev_state.mm, current);
1056 
1057     /*
1058      * Restore the breakpoints if they were disabled before the temporary mm
1059      * was loaded.
1060      */
1061     if (hw_breakpoint_active())
1062         hw_breakpoint_restore();
1063 }
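
/*
 * Usage pattern (sketch), as done by __text_poke() below: with IRQs already
 * disabled,
 *
 *	prev = use_temporary_mm(poking_mm);
 *	... write through the poking_addr mapping ...
 *	unuse_temporary_mm(prev);
 *
 * so the writable alias is only ever visible to the patching CPU.
 */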
1064 
1065 __ro_after_init struct mm_struct *poking_mm;
1066 __ro_after_init unsigned long poking_addr;
1067 
1068 static void text_poke_memcpy(void *dst, const void *src, size_t len)
1069 {
1070     memcpy(dst, src, len);
1071 }
1072 
1073 static void text_poke_memset(void *dst, const void *src, size_t len)
1074 {
1075     int c = *(const int *)src;
1076 
1077     memset(dst, c, len);
1078 }
1079 
1080 typedef void text_poke_f(void *dst, const void *src, size_t len);
1081 
1082 static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
1083 {
1084     bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
1085     struct page *pages[2] = {NULL};
1086     temp_mm_state_t prev;
1087     unsigned long flags;
1088     pte_t pte, *ptep;
1089     spinlock_t *ptl;
1090     pgprot_t pgprot;
1091 
1092     /*
1093      * While the boot memory allocator is running we cannot use struct pages, as
1094      * they are not yet initialized. There is no way to recover.
1095      */
1096     BUG_ON(!after_bootmem);
1097 
1098     if (!core_kernel_text((unsigned long)addr)) {
1099         pages[0] = vmalloc_to_page(addr);
1100         if (cross_page_boundary)
1101             pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
1102     } else {
1103         pages[0] = virt_to_page(addr);
1104         WARN_ON(!PageReserved(pages[0]));
1105         if (cross_page_boundary)
1106             pages[1] = virt_to_page(addr + PAGE_SIZE);
1107     }
1108     /*
1109      * If something went wrong, crash and burn since recovery paths are not
1110      * implemented.
1111      */
1112     BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
1113 
1114     /*
1115      * Map the page without the global bit, as TLB flushing is done with
1116      * flush_tlb_mm_range(), which is intended for non-global PTEs.
1117      */
1118     pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
1119 
1120     /*
1121      * The lock is not really needed, but using it avoids open-coding the PTE lookup.
1122      */
1123     ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
1124 
1125     /*
1126      * This must not fail; preallocated in poking_init().
1127      */
1128     VM_BUG_ON(!ptep);
1129 
1130     local_irq_save(flags);
1131 
1132     pte = mk_pte(pages[0], pgprot);
1133     set_pte_at(poking_mm, poking_addr, ptep, pte);
1134 
1135     if (cross_page_boundary) {
1136         pte = mk_pte(pages[1], pgprot);
1137         set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
1138     }
1139 
1140     /*
1141      * Loading the temporary mm behaves as a compiler barrier, which
1142      * guarantees that the PTE will be set at the time memcpy() is done.
1143      */
1144     prev = use_temporary_mm(poking_mm);
1145 
1146     kasan_disable_current();
1147     func((u8 *)poking_addr + offset_in_page(addr), src, len);
1148     kasan_enable_current();
1149 
1150     /*
1151      * Ensure that the PTE is only cleared after the instructions of memcpy
1152      * were issued by using a compiler barrier.
1153      */
1154     barrier();
1155 
1156     pte_clear(poking_mm, poking_addr, ptep);
1157     if (cross_page_boundary)
1158         pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
1159 
1160     /*
1161      * Loading the previous page-table hierarchy requires a serializing
1162      * instruction that already allows the core to see the updated version.
1163      * Xen-PV is assumed to serialize execution in a similar manner.
1164      */
1165     unuse_temporary_mm(prev);
1166 
1167     /*
1168      * Flushing the TLB might involve IPIs, which would require enabled
1169      * IRQs, but not if the mm is not used, as is the case at this point.
1170      */
1171     flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
1172                (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
1173                PAGE_SHIFT, false);
1174 
1175     if (func == text_poke_memcpy) {
1176         /*
1177          * If the text does not match what we just wrote then something is
1178          * fundamentally screwy; there's nothing we can really do about that.
1179          */
1180         BUG_ON(memcmp(addr, src, len));
1181     }
1182 
1183     local_irq_restore(flags);
1184     pte_unmap_unlock(ptep, ptl);
1185     return addr;
1186 }
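
/*
 * In short, __text_poke() maps the target page(s) writable at poking_addr in
 * the dedicated poking_mm, switches to that mm with IRQs disabled, performs
 * the write through the alias, tears the mapping down again and flushes the
 * TLB. The kernel's own mapping of the text is never made writable.
 */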
1187 
1188 /**
1189  * text_poke - Update instructions on a live kernel
1190  * @addr: address to modify
1191  * @opcode: source of the copy
1192  * @len: length to copy
1193  *
1194  * Only atomic text poke/set should be allowed when not doing early patching.
1195  * It means the size must be writable atomically and the address must be aligned
1196  * in a way that permits an atomic write. It also makes sure we fit on a single
1197  * page.
1198  *
1199  * Note that the caller must ensure that if the modified code is part of a
1200  * module, the module would not be removed during poking. This can be achieved
1201  * by registering a module notifier, and ordering module removal and patching
1202  * through a mutex.
1203  */
1204 void *text_poke(void *addr, const void *opcode, size_t len)
1205 {
1206     lockdep_assert_held(&text_mutex);
1207 
1208     return __text_poke(text_poke_memcpy, addr, opcode, len);
1209 }
1210 
1211 /**
1212  * text_poke_kgdb - Update instructions on a live kernel by kgdb
1213  * @addr: address to modify
1214  * @opcode: source of the copy
1215  * @len: length to copy
1216  *
1217  * Only atomic text poke/set should be allowed when not doing early patching.
1218  * It means the size must be writable atomically and the address must be aligned
1219  * in a way that permits an atomic write. It also makes sure we fit on a single
1220  * page.
1221  *
1222  * Context: should only be used by kgdb, which ensures no other core is running,
1223  *      even though it does not hold the text_mutex.
1224  */
1225 void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
1226 {
1227     return __text_poke(text_poke_memcpy, addr, opcode, len);
1228 }
1229 
1230 /**
1231  * text_poke_copy - Copy instructions into (an unused part of) RX memory
1232  * @addr: address to modify
1233  * @opcode: source of the copy
1234  * @len: length to copy, could be more than 2x PAGE_SIZE
1235  *
1236  * Not safe against concurrent execution; useful for JITs to dump
1237  * new code blocks into unused regions of RX memory. Can be used in
1238  * conjunction with synchronize_rcu_tasks() to wait for existing
1239  * execution to quiesce after having made sure no existing function
1240  * pointers are live.
1241  */
1242 void *text_poke_copy(void *addr, const void *opcode, size_t len)
1243 {
1244     unsigned long start = (unsigned long)addr;
1245     size_t patched = 0;
1246 
1247     if (WARN_ON_ONCE(core_kernel_text(start)))
1248         return NULL;
1249 
1250     mutex_lock(&text_mutex);
1251     while (patched < len) {
1252         unsigned long ptr = start + patched;
1253         size_t s;
1254 
1255         s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1256 
1257         __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
1258         patched += s;
1259     }
1260     mutex_unlock(&text_mutex);
1261     return addr;
1262 }
1263 
1264 /**
1265  * text_poke_set - memset into (an unused part of) RX memory
1266  * @addr: address to modify
1267  * @c: the byte to fill the area with
1268  * @len: length to copy, could be more than 2x PAGE_SIZE
1269  *
1270  * This is useful to overwrite unused regions of RX memory with illegal
1271  * instructions.
1272  */
1273 void *text_poke_set(void *addr, int c, size_t len)
1274 {
1275     unsigned long start = (unsigned long)addr;
1276     size_t patched = 0;
1277 
1278     if (WARN_ON_ONCE(core_kernel_text(start)))
1279         return NULL;
1280 
1281     mutex_lock(&text_mutex);
1282     while (patched < len) {
1283         unsigned long ptr = start + patched;
1284         size_t s;
1285 
1286         s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1287 
1288         __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
1289         patched += s;
1290     }
1291     mutex_unlock(&text_mutex);
1292     return addr;
1293 }
1294 
1295 static void do_sync_core(void *info)
1296 {
1297     sync_core();
1298 }
1299 
1300 void text_poke_sync(void)
1301 {
1302     on_each_cpu(do_sync_core, NULL, 1);
1303 }
1304 
1305 struct text_poke_loc {
1306     /* addr := _stext + rel_addr */
1307     s32 rel_addr;
1308     s32 disp;
1309     u8 len;
1310     u8 opcode;
1311     const u8 text[POKE_MAX_OPCODE_SIZE];
1312     /* see text_poke_bp_batch() */
1313     u8 old;
1314 };
1315 
1316 struct bp_patching_desc {
1317     struct text_poke_loc *vec;
1318     int nr_entries;
1319     atomic_t refs;
1320 };
1321 
1322 static struct bp_patching_desc bp_desc;
1323 
1324 static __always_inline
1325 struct bp_patching_desc *try_get_desc(void)
1326 {
1327     struct bp_patching_desc *desc = &bp_desc;
1328 
1329     if (!arch_atomic_inc_not_zero(&desc->refs))
1330         return NULL;
1331 
1332     return desc;
1333 }
1334 
1335 static __always_inline void put_desc(void)
1336 {
1337     struct bp_patching_desc *desc = &bp_desc;
1338 
1339     smp_mb__before_atomic();
1340     arch_atomic_dec(&desc->refs);
1341 }
1342 
1343 static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
1344 {
1345     return _stext + tp->rel_addr;
1346 }
1347 
1348 static __always_inline int patch_cmp(const void *key, const void *elt)
1349 {
1350     struct text_poke_loc *tp = (struct text_poke_loc *) elt;
1351 
1352     if (key < text_poke_addr(tp))
1353         return -1;
1354     if (key > text_poke_addr(tp))
1355         return 1;
1356     return 0;
1357 }
1358 
1359 noinstr int poke_int3_handler(struct pt_regs *regs)
1360 {
1361     struct bp_patching_desc *desc;
1362     struct text_poke_loc *tp;
1363     int ret = 0;
1364     void *ip;
1365 
1366     if (user_mode(regs))
1367         return 0;
1368 
1369     /*
1370      * Having observed our INT3 instruction, we now must observe
1371      * bp_desc with non-zero refcount:
1372      *
1373      *  bp_desc.refs = 1        INT3
1374      *  WMB             RMB
1375      *  write INT3          if (bp_desc.refs != 0)
1376      */
1377     smp_rmb();
1378 
1379     desc = try_get_desc();
1380     if (!desc)
1381         return 0;
1382 
1383     /*
1384      * Discount the INT3. See text_poke_bp_batch().
1385      */
1386     ip = (void *) regs->ip - INT3_INSN_SIZE;
1387 
1388     /*
1389      * Skip the binary search if there is a single member in the vector.
1390      */
1391     if (unlikely(desc->nr_entries > 1)) {
1392         tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
1393                       sizeof(struct text_poke_loc),
1394                       patch_cmp);
1395         if (!tp)
1396             goto out_put;
1397     } else {
1398         tp = desc->vec;
1399         if (text_poke_addr(tp) != ip)
1400             goto out_put;
1401     }
1402 
1403     ip += tp->len;
1404 
1405     switch (tp->opcode) {
1406     case INT3_INSN_OPCODE:
1407         /*
1408          * Someone poked an explicit INT3, they'll want to handle it,
1409          * do not consume.
1410          */
1411         goto out_put;
1412 
1413     case RET_INSN_OPCODE:
1414         int3_emulate_ret(regs);
1415         break;
1416 
1417     case CALL_INSN_OPCODE:
1418         int3_emulate_call(regs, (long)ip + tp->disp);
1419         break;
1420 
1421     case JMP32_INSN_OPCODE:
1422     case JMP8_INSN_OPCODE:
1423         int3_emulate_jmp(regs, (long)ip + tp->disp);
1424         break;
1425 
1426     default:
1427         BUG();
1428     }
1429 
1430     ret = 1;
1431 
1432 out_put:
1433     put_desc();
1434     return ret;
1435 }
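
/*
 * Example: while a 5-byte CALL is being patched, a CPU that executes the
 * temporary INT3 enters here with regs->ip one byte past the poked address.
 * After looking that address up in bp_desc, the handler emulates the *new*
 * instruction: for a CALL it pushes the appropriate return address and
 * redirects regs->ip to addr + tp->len + tp->disp, so no CPU ever executes
 * the half-written instruction bytes.
 */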
1436 
1437 #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
1438 static struct text_poke_loc tp_vec[TP_VEC_MAX];
1439 static int tp_vec_nr;
1440 
1441 /**
1442  * text_poke_bp_batch() -- update instructions on live kernel on SMP
1443  * @tp:         vector of instructions to patch
1444  * @nr_entries:     number of entries in the vector
1445  *
1446  * Modify multi-byte instruction by using int3 breakpoint on SMP.
1447  * We completely avoid stop_machine() here, and achieve the
1448  * synchronization using int3 breakpoint.
1449  *
1450  * The way it is done:
1451  *  - For each entry in the vector:
1452  *      - add a int3 trap to the address that will be patched
1453  *  - sync cores
1454  *  - For each entry in the vector:
1455  *      - update all but the first byte of the patched range
1456  *  - sync cores
1457  *  - For each entry in the vector:
1458  *      - replace the first byte (int3) by the first byte of
1459  *        replacing opcode
1460  *  - sync cores
1461  */
1462 static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
1463 {
1464     unsigned char int3 = INT3_INSN_OPCODE;
1465     unsigned int i;
1466     int do_sync;
1467 
1468     lockdep_assert_held(&text_mutex);
1469 
1470     bp_desc.vec = tp;
1471     bp_desc.nr_entries = nr_entries;
1472 
1473     /*
1474      * Corresponds to the implicit memory barrier in try_get_desc() to
1475      * ensure reading a non-zero refcount provides up to date bp_desc data.
1476      */
1477     atomic_set_release(&bp_desc.refs, 1);
1478 
1479     /*
1480      * Corresponding read barrier in int3 notifier for making sure the
1481      * nr_entries and handler are correctly ordered wrt. patching.
1482      */
1483     smp_wmb();
1484 
1485     /*
1486      * First step: add a int3 trap to the address that will be patched.
1487      */
1488     for (i = 0; i < nr_entries; i++) {
1489         tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
1490         text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
1491     }
1492 
1493     text_poke_sync();
1494 
1495     /*
1496      * Second step: update all but the first byte of the patched range.
1497      */
1498     for (do_sync = 0, i = 0; i < nr_entries; i++) {
1499         u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
1500         int len = tp[i].len;
1501 
1502         if (len - INT3_INSN_SIZE > 0) {
1503             memcpy(old + INT3_INSN_SIZE,
1504                    text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
1505                    len - INT3_INSN_SIZE);
1506             text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
1507                   (const char *)tp[i].text + INT3_INSN_SIZE,
1508                   len - INT3_INSN_SIZE);
1509             do_sync++;
1510         }
1511 
1512         /*
1513          * Emit a perf event to record the text poke, primarily to
1514          * support Intel PT decoding which must walk the executable code
1515          * to reconstruct the trace. The flow up to here is:
1516          *   - write INT3 byte
1517          *   - IPI-SYNC
1518          *   - write instruction tail
1519          * At this point the actual control flow will be through the
1520          * INT3 and handler and not hit the old or new instruction.
1521          * Intel PT outputs FUP/TIP packets for the INT3, so the flow
1522          * can still be decoded. Subsequently:
1523          *   - emit RECORD_TEXT_POKE with the new instruction
1524          *   - IPI-SYNC
1525          *   - write first byte
1526          *   - IPI-SYNC
1527          * So before the text poke event timestamp, the decoder will see
1528          * either the old instruction flow or FUP/TIP of INT3. After the
1529          * text poke event timestamp, the decoder will see either the
1530          * new instruction flow or FUP/TIP of INT3. Thus decoders can
1531          * use the timestamp as the point at which to modify the
1532          * executable code.
1533          * The old instruction is recorded so that the event can be
1534          * processed forwards or backwards.
1535          */
1536         perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
1537                      tp[i].text, len);
1538     }
1539 
1540     if (do_sync) {
1541         /*
1542          * According to Intel, this core syncing is very likely
1543          * not necessary and we'd be safe even without it. But
1544          * better safe than sorry (plus there's not only Intel).
1545          */
1546         text_poke_sync();
1547     }
1548 
1549     /*
1550      * Third step: replace the first byte (int3) by the first byte of
1551      * replacing opcode.
1552      */
1553     for (do_sync = 0, i = 0; i < nr_entries; i++) {
1554         if (tp[i].text[0] == INT3_INSN_OPCODE)
1555             continue;
1556 
1557         text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
1558         do_sync++;
1559     }
1560 
1561     if (do_sync)
1562         text_poke_sync();
1563 
1564     /*
1565      * Remove and wait for refs to be zero.
1566      */
1567     if (!atomic_dec_and_test(&bp_desc.refs))
1568         atomic_cond_read_acquire(&bp_desc.refs, !VAL);
1569 }
1570 
1571 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
1572                    const void *opcode, size_t len, const void *emulate)
1573 {
1574     struct insn insn;
1575     int ret, i;
1576 
1577     memcpy((void *)tp->text, opcode, len);
1578     if (!emulate)
1579         emulate = opcode;
1580 
1581     ret = insn_decode_kernel(&insn, emulate);
1582     BUG_ON(ret < 0);
1583 
1584     tp->rel_addr = addr - (void *)_stext;
1585     tp->len = len;
1586     tp->opcode = insn.opcode.bytes[0];
1587 
1588     switch (tp->opcode) {
1589     case RET_INSN_OPCODE:
1590     case JMP32_INSN_OPCODE:
1591     case JMP8_INSN_OPCODE:
1592         /*
1593          * Control flow instructions without implied execution of the
1594          * next instruction can be padded with INT3.
1595          */
1596         for (i = insn.length; i < len; i++)
1597             BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
1598         break;
1599 
1600     default:
1601         BUG_ON(len != insn.length);
1602     }
1603 
1604 
1605     switch (tp->opcode) {
1606     case INT3_INSN_OPCODE:
1607     case RET_INSN_OPCODE:
1608         break;
1609 
1610     case CALL_INSN_OPCODE:
1611     case JMP32_INSN_OPCODE:
1612     case JMP8_INSN_OPCODE:
1613         tp->disp = insn.immediate.value;
1614         break;
1615 
1616     default: /* assume NOP */
1617         switch (len) {
1618         case 2: /* NOP2 -- emulate as JMP8+0 */
1619             BUG_ON(memcmp(emulate, x86_nops[len], len));
1620             tp->opcode = JMP8_INSN_OPCODE;
1621             tp->disp = 0;
1622             break;
1623 
1624         case 5: /* NOP5 -- emulate as JMP32+0 */
1625             BUG_ON(memcmp(emulate, x86_nops[len], len));
1626             tp->opcode = JMP32_INSN_OPCODE;
1627             tp->disp = 0;
1628             break;
1629 
1630         default: /* unknown instruction */
1631             BUG();
1632         }
1633         break;
1634     }
1635 }
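
/*
 * Note on the NOP cases above: a 2- or 5-byte NOP is emulated as a JMP with
 * displacement 0, i.e. a jump to the next instruction, which is behaviorally
 * identical to the NOP while the site is being patched.
 */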
1636 
1637 /*
1638  * We hard rely on the tp_vec being ordered; ensure this is so by flushing
1639  * early if needed.
1640  */
1641 static bool tp_order_fail(void *addr)
1642 {
1643     struct text_poke_loc *tp;
1644 
1645     if (!tp_vec_nr)
1646         return false;
1647 
1648     if (!addr) /* force */
1649         return true;
1650 
1651     tp = &tp_vec[tp_vec_nr - 1];
1652     if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
1653         return true;
1654 
1655     return false;
1656 }
1657 
1658 static void text_poke_flush(void *addr)
1659 {
1660     if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
1661         text_poke_bp_batch(tp_vec, tp_vec_nr);
1662         tp_vec_nr = 0;
1663     }
1664 }
1665 
1666 void text_poke_finish(void)
1667 {
1668     text_poke_flush(NULL);
1669 }
1670 
1671 void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
1672 {
1673     struct text_poke_loc *tp;
1674 
1675     if (unlikely(system_state == SYSTEM_BOOTING)) {
1676         text_poke_early(addr, opcode, len);
1677         return;
1678     }
1679 
1680     text_poke_flush(addr);
1681 
1682     tp = &tp_vec[tp_vec_nr++];
1683     text_poke_loc_init(tp, addr, opcode, len, emulate);
1684 }
1685 
1686 /**
1687  * text_poke_bp() -- update instructions on live kernel on SMP
1688  * @addr:   address to patch
1689  * @opcode: opcode of new instruction
1690  * @len:    length to copy
1691  * @emulate:    instruction to be emulated
1692  *
1693  * Update a single instruction with the vector in the stack, avoiding
1694  * dynamically allocated memory. This function should be used when it is
1695  * not possible to allocate memory.
1696  */
1697 void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
1698 {
1699     struct text_poke_loc tp;
1700 
1701     if (unlikely(system_state == SYSTEM_BOOTING)) {
1702         text_poke_early(addr, opcode, len);
1703         return;
1704     }
1705 
1706     text_poke_loc_init(&tp, addr, opcode, len, emulate);
1707     text_poke_bp_batch(&tp, 1);
1708 }
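
/*
 * Usage sketch (hypothetical caller): turning a 5-byte NOP at @addr into a
 * direct call to @func while other CPUs may be executing it could look like:
 *
 *	u8 insn[5] = { CALL_INSN_OPCODE, };
 *
 *	*(s32 *)&insn[1] = (long)func - ((long)addr + 5);
 *	mutex_lock(&text_mutex);
 *	text_poke_bp(addr, insn, 5, NULL);
 *	mutex_unlock(&text_mutex);
 *
 * which is roughly what the jump_label and static_call code do, using
 * text_poke_queue()/text_poke_finish() when batching many sites.
 */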