// SPDX-License-Identifier: GPL-2.0
/*
 *  S390 version
 *    Copyright IBM Corp. 1999
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/compat.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/extable.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/kfence.h>
#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
#include <asm/uv.h>
#include "../kernel/entry.h"

#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL

#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000)
#define VM_FAULT_BADMAP     ((__force vm_fault_t) 0x020000)
#define VM_FAULT_BADACCESS  ((__force vm_fault_t) 0x040000)
#define VM_FAULT_SIGNAL     ((__force vm_fault_t) 0x080000)
#define VM_FAULT_PFAULT     ((__force vm_fault_t) 0x100000)

enum fault_type {
    KERNEL_FAULT,
    USER_FAULT,
    GMAP_FAULT,
};

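/*
 * store_indication masks the fetch/store indication bits of the
 * translation-exception identification (TEID). It is only set when
 * facility 75 (access-exception fetch/store indication) is installed;
 * do_exception() then treats a masked value of 0x400 as a store access.
 */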
static unsigned long store_indication __read_mostly;

static int __init fault_init(void)
{
    if (test_facility(75))
        store_indication = 0xc00;
    return 0;
}
early_initcall(fault_init);

/*
 * Find out which address space caused the exception.
 */
static enum fault_type get_fault_type(struct pt_regs *regs)
{
    unsigned long trans_exc_code;

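    /* The two lowest TEID bits encode the address space of the access. */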
    trans_exc_code = regs->int_parm_long & 3;
    if (likely(trans_exc_code == 0)) {
        /* primary space exception */
        if (user_mode(regs))
            return USER_FAULT;
        if (!IS_ENABLED(CONFIG_PGSTE))
            return KERNEL_FAULT;
        if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
            return GMAP_FAULT;
        return KERNEL_FAULT;
    }
    if (trans_exc_code == 2)
        return USER_FAULT;
    if (trans_exc_code == 1) {
        /* access register mode, not used in the kernel */
        return USER_FAULT;
    }
    /* home space exception -> access via kernel ASCE */
    return KERNEL_FAULT;
}

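/* Return nonzero if the page table entry at @p cannot be read safely. */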
static int bad_address(void *p)
{
    unsigned long dummy;

    return get_kernel_nofault(dummy, (unsigned long *)p);
}

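/*
 * Walk the page table selected by @asce and print the entries that map
 * @address, stopping at the first invalid or large entry.
 */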
static void dump_pagetable(unsigned long asce, unsigned long address)
{
    unsigned long *table = __va(asce & _ASCE_ORIGIN);

    pr_alert("AS:%016lx ", asce);
    switch (asce & _ASCE_TYPE_MASK) {
    case _ASCE_TYPE_REGION1:
        table += (address & _REGION1_INDEX) >> _REGION1_SHIFT;
        if (bad_address(table))
            goto bad;
        pr_cont("R1:%016lx ", *table);
        if (*table & _REGION_ENTRY_INVALID)
            goto out;
        table = __va(*table & _REGION_ENTRY_ORIGIN);
        fallthrough;
    case _ASCE_TYPE_REGION2:
        table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
        if (bad_address(table))
            goto bad;
        pr_cont("R2:%016lx ", *table);
        if (*table & _REGION_ENTRY_INVALID)
            goto out;
        table = __va(*table & _REGION_ENTRY_ORIGIN);
        fallthrough;
    case _ASCE_TYPE_REGION3:
        table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
        if (bad_address(table))
            goto bad;
        pr_cont("R3:%016lx ", *table);
        if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
            goto out;
        table = __va(*table & _REGION_ENTRY_ORIGIN);
        fallthrough;
    case _ASCE_TYPE_SEGMENT:
        table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
        if (bad_address(table))
            goto bad;
        pr_cont("S:%016lx ", *table);
        if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
            goto out;
        table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
    }
    table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
    if (bad_address(table))
        goto bad;
    pr_cont("P:%016lx ", *table);
out:
    pr_cont("\n");
    return;
bad:
    pr_cont("BAD\n");
}

static void dump_fault_info(struct pt_regs *regs)
{
    unsigned long asce;

    pr_alert("Failing address: %016lx TEID: %016lx\n",
         regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
    pr_alert("Fault in ");
    switch (regs->int_parm_long & 3) {
    case 3:
        pr_cont("home space ");
        break;
    case 2:
        pr_cont("secondary space ");
        break;
    case 1:
        pr_cont("access register ");
        break;
    case 0:
        pr_cont("primary space ");
        break;
    }
    pr_cont("mode while using ");
    switch (get_fault_type(regs)) {
    case USER_FAULT:
        asce = S390_lowcore.user_asce;
        pr_cont("user ");
        break;
    case GMAP_FAULT:
        asce = ((struct gmap *) S390_lowcore.gmap)->asce;
        pr_cont("gmap ");
        break;
    case KERNEL_FAULT:
        asce = S390_lowcore.kernel_asce;
        pr_cont("kernel ");
        break;
    default:
        unreachable();
    }
    pr_cont("ASCE.\n");
    dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
}

int show_unhandled_signals = 1;

void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
{
    if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
        return;
    if (!unhandled_signal(current, signr))
        return;
    if (!printk_ratelimit())
        return;
    printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
           regs->int_code & 0xffff, regs->int_code >> 17);
    print_vma_addr(KERN_CONT "in ", regs->psw.addr);
    printk(KERN_CONT "\n");
    if (is_mm_fault)
        dump_fault_info(regs);
    show_regs(regs);
}

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
    report_user_fault(regs, SIGSEGV, 1);
    force_sig_fault(SIGSEGV, si_code,
            (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

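/*
 * The fault cannot be resolved in the context of the faulting code: give
 * the exception tables a chance to fix up the access, otherwise print the
 * fault information and die.
 */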
static noinline void do_no_context(struct pt_regs *regs)
{
    if (fixup_exception(regs))
        return;
    /*
     * Oops. The kernel tried to access some bad page. We'll have to
     * terminate things with extreme prejudice.
     */
    if (get_fault_type(regs) == KERNEL_FAULT)
        printk(KERN_ALERT "Unable to handle kernel pointer dereference"
               " in virtual kernel address space\n");
    else
        printk(KERN_ALERT "Unable to handle kernel paging request"
               " in virtual user address space\n");
    dump_fault_info(regs);
    die(regs, "Oops");
}

static noinline void do_low_address(struct pt_regs *regs)
{
    /* A low-address protection hit in kernel mode means a
       NULL pointer write access by the kernel. */
    if (regs->psw.mask & PSW_MASK_PSTATE) {
        /* Low-address protection hit in user mode 'cannot happen'. */
        die(regs, "Low-address protection");
    }

    do_no_context(regs);
}

static noinline void do_sigbus(struct pt_regs *regs)
{
    /*
     * Send a sigbus, regardless of whether we were in kernel
     * or user mode.
     */
    force_sig_fault(SIGBUS, BUS_ADRERR,
            (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

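/*
 * Translate the fault value returned by do_exception() into a signal for
 * user space or into an exception fixup / oops for kernel mode.
 */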
static noinline void do_fault_error(struct pt_regs *regs, int access,
                    vm_fault_t fault)
{
    int si_code;

    switch (fault) {
    case VM_FAULT_BADACCESS:
    case VM_FAULT_BADMAP:
        /* Bad memory access. Check if it is kernel or user space. */
        if (user_mode(regs)) {
            /* User mode accesses just cause a SIGSEGV */
            si_code = (fault == VM_FAULT_BADMAP) ?
                SEGV_MAPERR : SEGV_ACCERR;
            do_sigsegv(regs, si_code);
            break;
        }
        fallthrough;
    case VM_FAULT_BADCONTEXT:
    case VM_FAULT_PFAULT:
        do_no_context(regs);
        break;
    case VM_FAULT_SIGNAL:
        if (!user_mode(regs))
            do_no_context(regs);
        break;
    default: /* fault & VM_FAULT_ERROR */
        if (fault & VM_FAULT_OOM) {
            if (!user_mode(regs))
                do_no_context(regs);
            else
                pagefault_out_of_memory();
        } else if (fault & VM_FAULT_SIGSEGV) {
            /* Kernel mode? Handle exceptions or die */
            if (!user_mode(regs))
                do_no_context(regs);
            else
                do_sigsegv(regs, SEGV_MAPERR);
        } else if (fault & VM_FAULT_SIGBUS) {
            /* Kernel mode? Handle exceptions or die */
            if (!user_mode(regs))
                do_no_context(regs);
            else
                do_sigbus(regs);
        } else
            BUG();
        break;
    }
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
{
    struct gmap *gmap;
    struct task_struct *tsk;
    struct mm_struct *mm;
    struct vm_area_struct *vma;
    enum fault_type type;
    unsigned long trans_exc_code;
    unsigned long address;
    unsigned int flags;
    vm_fault_t fault;
    bool is_write;

    tsk = current;
    /*
     * The instruction that caused the program check has
     * been nullified. Don't signal single step via SIGTRAP.
     */
    clear_thread_flag(TIF_PER_TRAP);

    if (kprobe_page_fault(regs, 14))
        return 0;

    mm = tsk->mm;
    trans_exc_code = regs->int_parm_long;
    address = trans_exc_code & __FAIL_ADDR_MASK;
    is_write = (trans_exc_code & store_indication) == 0x400;

    /*
     * Verify that the fault happened in user space, that
     * we are not in an interrupt and that there is a
     * user context.
     */
    fault = VM_FAULT_BADCONTEXT;
    type = get_fault_type(regs);
    switch (type) {
    case KERNEL_FAULT:
        if (kfence_handle_page_fault(address, is_write, regs))
            return 0;
        goto out;
    case USER_FAULT:
    case GMAP_FAULT:
        if (faulthandler_disabled() || !mm)
            goto out;
        break;
    }

    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
    flags = FAULT_FLAG_DEFAULT;
    if (user_mode(regs))
        flags |= FAULT_FLAG_USER;
    if (is_write)
        access = VM_WRITE;
    if (access == VM_WRITE)
        flags |= FAULT_FLAG_WRITE;
    mmap_read_lock(mm);

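    /*
     * For guest (gmap) faults translate the guest address into the
     * corresponding address in the host address space before the vma
     * lookup below.
     */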
    gmap = NULL;
    if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
        gmap = (struct gmap *) S390_lowcore.gmap;
        current->thread.gmap_addr = address;
        current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
        current->thread.gmap_int_code = regs->int_code & 0xffff;
        address = __gmap_translate(gmap, address);
        if (address == -EFAULT) {
            fault = VM_FAULT_BADMAP;
            goto out_up;
        }
        if (gmap->pfault_enabled)
            flags |= FAULT_FLAG_RETRY_NOWAIT;
    }

retry:
    fault = VM_FAULT_BADMAP;
    vma = find_vma(mm, address);
    if (!vma)
        goto out_up;

    if (unlikely(vma->vm_start > address)) {
        if (!(vma->vm_flags & VM_GROWSDOWN))
            goto out_up;
        if (expand_stack(vma, address))
            goto out_up;
    }

    /*
     * Ok, we have a good vm_area for this memory access, so
     * we can handle it..
     */
    fault = VM_FAULT_BADACCESS;
    if (unlikely(!(vma->vm_flags & access)))
        goto out_up;

    /*
     * If for any reason at all we couldn't handle the fault,
     * make sure we exit gracefully rather than endlessly redo
     * the fault.
     */
    fault = handle_mm_fault(vma, address, flags, regs);
    if (fault_signal_pending(fault, regs)) {
        fault = VM_FAULT_SIGNAL;
        if (flags & FAULT_FLAG_RETRY_NOWAIT)
            goto out_up;
        goto out;
    }

    /* The fault is fully completed (including releasing mmap lock) */
    if (fault & VM_FAULT_COMPLETED) {
        if (gmap) {
            mmap_read_lock(mm);
            goto out_gmap;
        }
        fault = 0;
        goto out;
    }

    if (unlikely(fault & VM_FAULT_ERROR))
        goto out_up;

    if (fault & VM_FAULT_RETRY) {
        if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
            (flags & FAULT_FLAG_RETRY_NOWAIT)) {
            /*
             * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
             * not been released
             */
            current->thread.gmap_pfault = 1;
            fault = VM_FAULT_PFAULT;
            goto out_up;
        }
        flags &= ~FAULT_FLAG_RETRY_NOWAIT;
        flags |= FAULT_FLAG_TRIED;
        mmap_read_lock(mm);
        goto retry;
    }
out_gmap:
    if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
        address = __gmap_link(gmap, current->thread.gmap_addr,
                       address);
        if (address == -EFAULT) {
            fault = VM_FAULT_BADMAP;
            goto out_up;
        }
        if (address == -ENOMEM) {
            fault = VM_FAULT_OOM;
            goto out_up;
        }
    }
    fault = 0;
out_up:
    mmap_read_unlock(mm);
out:
    return fault;
}

void do_protection_exception(struct pt_regs *regs)
{
    unsigned long trans_exc_code;
    int access;
    vm_fault_t fault;

    trans_exc_code = regs->int_parm_long;
    /*
     * Protection exceptions are suppressing, decrement psw address.
     * The exceptions to this rule are aborted transactions, for these
     * the PSW already points to the correct location.
     */
    if (!(regs->int_code & 0x200))
        regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
    /*
     * Check for low-address protection.  This needs to be treated
     * as a special case because the translation exception code
     * field is not guaranteed to contain valid data in this case.
     */
    if (unlikely(!(trans_exc_code & 4))) {
        do_low_address(regs);
        return;
    }
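    /*
     * A protection exception caused by execution protection (no-execute):
     * take the failing page from the psw address and report the fault as
     * a bad access right away instead of calling do_exception().
     */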
    if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
        regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
                    (regs->psw.addr & PAGE_MASK);
        access = VM_EXEC;
        fault = VM_FAULT_BADACCESS;
    } else {
        access = VM_WRITE;
        fault = do_exception(regs, access);
    }
    if (unlikely(fault))
        do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);

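/*
 * Handler for segment, page and region third translation exceptions:
 * resolve the fault with access type VM_ACCESS_FLAGS (read/write/exec).
 */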
void do_dat_exception(struct pt_regs *regs)
{
    int access;
    vm_fault_t fault;

    access = VM_ACCESS_FLAGS;
    fault = do_exception(regs, access);
    if (unlikely(fault))
        do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page faults routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
    pfault_disable = 1;
    return 1;
}

__setup("nopfault", nopfault);

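/*
 * Parameter block ("reference block") for the DIAGNOSE 0x258 calls below;
 * the layout is defined by the hypervisor's pfault handshaking interface.
 */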
struct pfault_refbk {
    u16 refdiagc;
    u16 reffcode;
    u16 refdwlen;
    u16 refversn;
    u64 refgaddr;
    u64 refselmk;
    u64 refcmpmk;
    u64 reserved;
} __attribute__ ((packed, aligned(8)));

static struct pfault_refbk pfault_init_refbk = {
    .refdiagc = 0x258,
    .reffcode = 0,
    .refdwlen = 5,
    .refversn = 2,
    .refgaddr = __LC_LPP,
    .refselmk = 1ULL << 48,
    .refcmpmk = 1ULL << 48,
    .reserved = __PF_RES_FIELD
};

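/*
 * Enable pfault pseudo page faults for this cpu by issuing DIAGNOSE 0x258
 * with the init reference block. A nonzero return code means pfault could
 * not be enabled, e.g. because the hypervisor does not support it.
 */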
int pfault_init(void)
{
    int rc;

    if (pfault_disable)
        return -1;
    diag_stat_inc(DIAG_STAT_X258);
    asm volatile(
        "   diag    %1,%0,0x258\n"
        "0: j   2f\n"
        "1: la  %0,8\n"
        "2:\n"
        EX_TABLE(0b,1b)
        : "=d" (rc)
        : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
    return rc;
}

static struct pfault_refbk pfault_fini_refbk = {
    .refdiagc = 0x258,
    .reffcode = 1,
    .refdwlen = 5,
    .refversn = 2,
};

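/* Disable pfault pseudo page faults for this cpu (DIAGNOSE 0x258, cancel). */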
void pfault_fini(void)
{
    if (pfault_disable)
        return;
    diag_stat_inc(DIAG_STAT_X258);
    asm volatile(
        "   diag    %0,0,0x258\n"
        "0: nopr    %%r7\n"
        EX_TABLE(0b,0b)
        : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

#define PF_COMPLETE 0x0080

/*
 * The mechanism of our pfault code: if Linux is running as a guest, runs a
 * user space process, and the user space process accesses a page that the
 * host has paged out, we get a pfault interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt we set the state of the current task
 * to uninterruptible and also set the need_resched flag. Both happen within
 * interrupt context(!). If we later on want to return to user space we
 * recognize the need_resched flag and then call schedule().  It's not very
 * obvious how this works...
 *
 * Of course we have a lot of additional fun with the completion interrupt (->
 * host signals that a page of a process has been paged in and the process can
 * continue to run). This interrupt can arrive on any cpu and, since we have
 * virtual cpus, actually appear before the interrupt that signals that a page
 * is missing.
 */
static void pfault_interrupt(struct ext_code ext_code,
                 unsigned int param32, unsigned long param64)
{
    struct task_struct *tsk;
    __u16 subcode;
    pid_t pid;

    /*
     * Get the external interruption subcode & pfault initial/completion
     * signal bit. VM stores this in the 'cpu address' field associated
     * with the external interrupt.
     */
    subcode = ext_code.subcode;
    if ((subcode & 0xff00) != __SUBCODE_MASK)
        return;
    inc_irq_stat(IRQEXT_PFL);
    /* Get the token (= pid of the affected task). */
    pid = param64 & LPP_PID_MASK;
    rcu_read_lock();
    tsk = find_task_by_pid_ns(pid, &init_pid_ns);
    if (tsk)
        get_task_struct(tsk);
    rcu_read_unlock();
    if (!tsk)
        return;
    spin_lock(&pfault_lock);
    if (subcode & PF_COMPLETE) {
        /* signal bit is set -> a page has been swapped in by VM */
        if (tsk->thread.pfault_wait == 1) {
            /* Initial interrupt was faster than the completion
             * interrupt. pfault_wait is valid. Set pfault_wait
             * back to zero and wake up the process. This can
             * safely be done because the task is still sleeping
             * and can't produce new pfaults. */
            tsk->thread.pfault_wait = 0;
            list_del(&tsk->thread.list);
            wake_up_process(tsk);
            put_task_struct(tsk);
        } else {
            /* Completion interrupt was faster than initial
             * interrupt. Set pfault_wait to -1 so the initial
             * interrupt doesn't put the task to sleep.
             * If the task is not running, ignore the completion
             * interrupt since it must be a leftover of a PFAULT
             * CANCEL operation which didn't remove all pending
             * completion interrupts. */
            if (task_is_running(tsk))
                tsk->thread.pfault_wait = -1;
        }
    } else {
        /* signal bit not set -> a real page is missing. */
        if (WARN_ON_ONCE(tsk != current))
            goto out;
        if (tsk->thread.pfault_wait == 1) {
            /* Already on the list with a reference: put to sleep */
            goto block;
        } else if (tsk->thread.pfault_wait == -1) {
            /* Completion interrupt was faster than the initial
             * interrupt (pfault_wait == -1). Set pfault_wait
             * back to zero and exit. */
            tsk->thread.pfault_wait = 0;
        } else {
            /* Initial interrupt arrived before completion
             * interrupt. Let the task sleep.
             * An extra task reference is needed since a different
             * cpu may set the task state to TASK_RUNNING again
             * before the scheduler is reached. */
            get_task_struct(tsk);
            tsk->thread.pfault_wait = 1;
            list_add(&tsk->thread.list, &pfault_list);
block:
            /* Since this must be a userspace fault, there
             * is no kernel task state to trample. Rely on the
             * return to userspace schedule() to block. */
            __set_current_state(TASK_UNINTERRUPTIBLE);
            set_tsk_need_resched(tsk);
            set_preempt_need_resched();
        }
    }
out:
    spin_unlock(&pfault_lock);
    put_task_struct(tsk);
}

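/*
 * A cpu went offline: wake up all tasks that are still waiting for a
 * pfault completion interrupt so they are not left sleeping forever.
 */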
static int pfault_cpu_dead(unsigned int cpu)
{
    struct thread_struct *thread, *next;
    struct task_struct *tsk;

    spin_lock_irq(&pfault_lock);
    list_for_each_entry_safe(thread, next, &pfault_list, list) {
        thread->pfault_wait = 0;
        list_del(&thread->list);
        tsk = container_of(thread, struct task_struct, thread);
        wake_up_process(tsk);
        put_task_struct(tsk);
    }
    spin_unlock_irq(&pfault_lock);
    return 0;
}

static int __init pfault_irq_init(void)
{
    int rc;

    rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
    if (rc)
        goto out_extint;
    rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
    if (rc)
        goto out_pfault;
    irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
    cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
                  NULL, pfault_cpu_dead);
    return 0;

out_pfault:
    unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
out_extint:
    pfault_disable = 1;
    return rc;
}
early_initcall(pfault_irq_init);

#endif /* CONFIG_PFAULT */

#if IS_ENABLED(CONFIG_PGSTE)

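/*
 * Handler for program check 0x3d (secure storage access): a page belonging
 * to a protected (secure) guest was touched from non-secure context. Try to
 * make the page accessible again, otherwise report an error.
 */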
void do_secure_storage_access(struct pt_regs *regs)
{
    unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
    struct vm_area_struct *vma;
    struct mm_struct *mm;
    struct page *page;
    struct gmap *gmap;
    int rc;

    /*
     * Bit 61 tells us if the address is valid; if it is not, we
     * have a major problem and should stop the kernel or send a
     * SIGSEGV to the process. Unfortunately bit 61 is not
     * reliable without the misc UV feature, so we need to check
     * for that as well.
     */
    if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
        !test_bit_inv(61, &regs->int_parm_long)) {
        /*
         * When this happens, userspace did something that it
         * was not supposed to do, e.g. branching into secure
         * memory. Trigger a segmentation fault.
         */
        if (user_mode(regs)) {
            send_sig(SIGSEGV, current, 0);
            return;
        }

        /*
         * The kernel should never run into this case and we
         * have no way out of this situation.
         */
        panic("Unexpected PGM 0x3d with TEID bit 61=0");
    }

    switch (get_fault_type(regs)) {
    case GMAP_FAULT:
        mm = current->mm;
        gmap = (struct gmap *)S390_lowcore.gmap;
        mmap_read_lock(mm);
        addr = __gmap_translate(gmap, addr);
        mmap_read_unlock(mm);
        if (IS_ERR_VALUE(addr)) {
            do_fault_error(regs, VM_ACCESS_FLAGS, VM_FAULT_BADMAP);
            break;
        }
        fallthrough;
    case USER_FAULT:
        mm = current->mm;
        mmap_read_lock(mm);
        vma = find_vma(mm, addr);
        if (!vma) {
            mmap_read_unlock(mm);
            do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
            break;
        }
        page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
        if (IS_ERR_OR_NULL(page)) {
            mmap_read_unlock(mm);
            break;
        }
        if (arch_make_page_accessible(page))
            send_sig(SIGSEGV, current, 0);
        put_page(page);
        mmap_read_unlock(mm);
        break;
    case KERNEL_FAULT:
        page = phys_to_page(addr);
        if (unlikely(!try_get_page(page)))
            break;
        rc = arch_make_page_accessible(page);
        put_page(page);
        if (rc)
            BUG();
        break;
    default:
        do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
        WARN_ON_ONCE(1);
    }
}
NOKPROBE_SYMBOL(do_secure_storage_access);

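/*
 * A secure guest touched a page that is currently not mapped secure. Import
 * the page into the secure guest; if the conversion fails with -EINVAL,
 * send a SIGSEGV.
 */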
void do_non_secure_storage_access(struct pt_regs *regs)
{
    unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
    struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;

    if (get_fault_type(regs) != GMAP_FAULT) {
        do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
        WARN_ON_ONCE(1);
        return;
    }

    if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
        send_sig(SIGSEGV, current, 0);
}
NOKPROBE_SYMBOL(do_non_secure_storage_access);

void do_secure_storage_violation(struct pt_regs *regs)
{
    unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
    struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;

    /*
     * If the VM has been rebooted, its address space might still contain
     * secure pages from the previous boot.
     * Clear the page so it can be reused.
     */
    if (!gmap_destroy_page(gmap, gaddr))
        return;
    /*
     * Either KVM messed up the secure guest mapping or the same
     * page is mapped into multiple secure guests.
     *
     * This exception is only triggered when a guest 2 is running
     * and can therefore never occur in kernel context.
     */
    printk_ratelimited(KERN_WARNING
               "Secure storage violation in task: %s, pid %d\n",
               current->comm, current->pid);
    send_sig(SIGSEGV, current, 0);
}

#endif /* CONFIG_PGSTE */