0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * pSeries_lpar.c
0004  * Copyright (C) 2001 Todd Inglett, IBM Corporation
0005  *
0006  * pSeries LPAR support.
0007  */
0008 
0009 /* Enables debugging of low-level hash table routines - careful! */
0010 #undef DEBUG
0011 #define pr_fmt(fmt) "lpar: " fmt
0012 
0013 #include <linux/kernel.h>
0014 #include <linux/dma-mapping.h>
0015 #include <linux/console.h>
0016 #include <linux/export.h>
0017 #include <linux/jump_label.h>
0018 #include <linux/delay.h>
0019 #include <linux/stop_machine.h>
0020 #include <linux/spinlock.h>
0021 #include <linux/cpuhotplug.h>
0022 #include <linux/workqueue.h>
0023 #include <linux/proc_fs.h>
0024 #include <linux/pgtable.h>
0025 #include <linux/debugfs.h>
0026 
0027 #include <asm/processor.h>
0028 #include <asm/mmu.h>
0029 #include <asm/page.h>
0030 #include <asm/setup.h>
0031 #include <asm/mmu_context.h>
0032 #include <asm/iommu.h>
0033 #include <asm/tlb.h>
0034 #include <asm/cputable.h>
0035 #include <asm/udbg.h>
0036 #include <asm/smp.h>
0037 #include <asm/trace.h>
0038 #include <asm/firmware.h>
0039 #include <asm/plpar_wrappers.h>
0040 #include <asm/kexec.h>
0041 #include <asm/fadump.h>
0042 #include <asm/dtl.h>
0043 
0044 #include "pseries.h"
0045 
0046 /* Flag bits for H_BULK_REMOVE */
0047 #define HBR_REQUEST 0x4000000000000000UL
0048 #define HBR_RESPONSE    0x8000000000000000UL
0049 #define HBR_END     0xc000000000000000UL
0050 #define HBR_AVPN    0x0200000000000000UL
0051 #define HBR_ANDCOND 0x0100000000000000UL
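/*
 * For reference, the H_BULK_REMOVE users below compose each request as a
 * doubleword pair and terminate a partially filled buffer with HBR_END:
 *
 *    param[pix]   = HBR_REQUEST | HBR_AVPN | slot;
 *    param[pix+1] = hpte_encode_avpn(vpn, psize, ssize);
 *
 * See pSeries_lpar_flush_hash_range() and hugepage_bulk_invalidate().
 */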
0052 
0053 
0054 /* in hvCall.S */
0055 EXPORT_SYMBOL(plpar_hcall);
0056 EXPORT_SYMBOL(plpar_hcall9);
0057 EXPORT_SYMBOL(plpar_hcall_norets);
0058 
0059 #ifdef CONFIG_PPC_64S_HASH_MMU
0060 /*
0061  * H_BLOCK_REMOVE supported block size for this page size in a segment whose
0062  * base page size is that page size.
0063  *
0064  * The first index is the segment base page size, the second one is the actual
0065  * page size.
0066  */
0067 static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
0068 #endif
0069 
0070 /*
0071  * Due to the complexity involved, and because the current hypervisor only
0072  * returns this value or 0, we limit H_BLOCK_REMOVE support to a block
0073  * size of 8.
0074  */
0075 #define HBLKRM_SUPPORTED_BLOCK_SIZE 8
0076 
0077 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
0078 static u8 dtl_mask = DTL_LOG_PREEMPT;
0079 #else
0080 static u8 dtl_mask;
0081 #endif
0082 
0083 void alloc_dtl_buffers(unsigned long *time_limit)
0084 {
0085     int cpu;
0086     struct paca_struct *pp;
0087     struct dtl_entry *dtl;
0088 
0089     for_each_possible_cpu(cpu) {
0090         pp = paca_ptrs[cpu];
0091         if (pp->dispatch_log)
0092             continue;
0093         dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
0094         if (!dtl) {
0095             pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
0096                 cpu);
0097 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
0098             pr_warn("Stolen time statistics will be unreliable\n");
0099 #endif
0100             break;
0101         }
0102 
0103         pp->dtl_ridx = 0;
0104         pp->dispatch_log = dtl;
0105         pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
0106         pp->dtl_curr = dtl;
0107 
0108         if (time_limit && time_after(jiffies, *time_limit)) {
0109             cond_resched();
0110             *time_limit = jiffies + HZ;
0111         }
0112     }
0113 }
0114 
0115 void register_dtl_buffer(int cpu)
0116 {
0117     long ret;
0118     struct paca_struct *pp;
0119     struct dtl_entry *dtl;
0120     int hwcpu = get_hard_smp_processor_id(cpu);
0121 
0122     pp = paca_ptrs[cpu];
0123     dtl = pp->dispatch_log;
0124     if (dtl && dtl_mask) {
0125         pp->dtl_ridx = 0;
0126         pp->dtl_curr = dtl;
0127         lppaca_of(cpu).dtl_idx = 0;
0128 
0129         /* hypervisor reads buffer length from this field */
0130         dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
0131         ret = register_dtl(hwcpu, __pa(dtl));
0132         if (ret)
0133             pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n",
0134                    cpu, hwcpu, ret);
0135 
0136         lppaca_of(cpu).dtl_enable_mask = dtl_mask;
0137     }
0138 }
0139 
0140 #ifdef CONFIG_PPC_SPLPAR
0141 struct dtl_worker {
0142     struct delayed_work work;
0143     int cpu;
0144 };
0145 
0146 struct vcpu_dispatch_data {
0147     int last_disp_cpu;
0148 
0149     int total_disp;
0150 
0151     int same_cpu_disp;
0152     int same_chip_disp;
0153     int diff_chip_disp;
0154     int far_chip_disp;
0155 
0156     int numa_home_disp;
0157     int numa_remote_disp;
0158     int numa_far_disp;
0159 };
0160 
0161 /*
0162  * This represents the number of cpus in the hypervisor. Since there is no
0163  * architected way to discover the number of processors in the host, we
0164  * provision for dealing with NR_CPUS. This is currently 2048 by default, and
0165  * is sufficient for our purposes. This will need to be tweaked if
0166  * CONFIG_NR_CPUS is changed.
0167  */
0168 #define NR_CPUS_H   NR_CPUS
0169 
0170 DEFINE_RWLOCK(dtl_access_lock);
0171 static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data);
0172 static DEFINE_PER_CPU(u64, dtl_entry_ridx);
0173 static DEFINE_PER_CPU(struct dtl_worker, dtl_workers);
0174 static enum cpuhp_state dtl_worker_state;
0175 static DEFINE_MUTEX(dtl_enable_mutex);
0176 static int vcpudispatch_stats_on __read_mostly;
0177 static int vcpudispatch_stats_freq = 50;
0178 static __be32 *vcpu_associativity, *pcpu_associativity;
0179 
0180 
0181 static void free_dtl_buffers(unsigned long *time_limit)
0182 {
0183 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
0184     int cpu;
0185     struct paca_struct *pp;
0186 
0187     for_each_possible_cpu(cpu) {
0188         pp = paca_ptrs[cpu];
0189         if (!pp->dispatch_log)
0190             continue;
0191         kmem_cache_free(dtl_cache, pp->dispatch_log);
0192         pp->dtl_ridx = 0;
0193         pp->dispatch_log = NULL;
0194         pp->dispatch_log_end = NULL;
0195         pp->dtl_curr = NULL;
0196 
0197         if (time_limit && time_after(jiffies, *time_limit)) {
0198             cond_resched();
0199             *time_limit = jiffies + HZ;
0200         }
0201     }
0202 #endif
0203 }
0204 
0205 static int init_cpu_associativity(void)
0206 {
0207     vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core,
0208             VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
0209     pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core,
0210             VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
0211 
0212     if (!vcpu_associativity || !pcpu_associativity) {
0213         pr_err("error allocating memory for associativity information\n");
0214         return -ENOMEM;
0215     }
0216 
0217     return 0;
0218 }
0219 
0220 static void destroy_cpu_associativity(void)
0221 {
0222     kfree(vcpu_associativity);
0223     kfree(pcpu_associativity);
0224     vcpu_associativity = pcpu_associativity = NULL;
0225 }
0226 
0227 static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag)
0228 {
0229     __be32 *assoc;
0230     int rc = 0;
0231 
0232     assoc = &cpu_assoc[(int)(cpu / threads_per_core) * VPHN_ASSOC_BUFSIZE];
0233     if (!assoc[0]) {
0234         rc = hcall_vphn(cpu, flag, &assoc[0]);
0235         if (rc)
0236             return NULL;
0237     }
0238 
0239     return assoc;
0240 }
0241 
0242 static __be32 *get_pcpu_associativity(int cpu)
0243 {
0244     return __get_cpu_associativity(cpu, pcpu_associativity, VPHN_FLAG_PCPU);
0245 }
0246 
0247 static __be32 *get_vcpu_associativity(int cpu)
0248 {
0249     return __get_cpu_associativity(cpu, vcpu_associativity, VPHN_FLAG_VCPU);
0250 }
0251 
0252 static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu)
0253 {
0254     __be32 *last_disp_cpu_assoc, *cur_disp_cpu_assoc;
0255 
0256     if (last_disp_cpu >= NR_CPUS_H || cur_disp_cpu >= NR_CPUS_H)
0257         return -EINVAL;
0258 
0259     last_disp_cpu_assoc = get_pcpu_associativity(last_disp_cpu);
0260     cur_disp_cpu_assoc = get_pcpu_associativity(cur_disp_cpu);
0261 
0262     if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
0263         return -EIO;
0264 
0265     return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
0266 }
0267 
0268 static int cpu_home_node_dispatch_distance(int disp_cpu)
0269 {
0270     __be32 *disp_cpu_assoc, *vcpu_assoc;
0271     int vcpu_id = smp_processor_id();
0272 
0273     if (disp_cpu >= NR_CPUS_H) {
0274         pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n",
0275                         disp_cpu, NR_CPUS_H);
0276         return -EINVAL;
0277     }
0278 
0279     disp_cpu_assoc = get_pcpu_associativity(disp_cpu);
0280     vcpu_assoc = get_vcpu_associativity(vcpu_id);
0281 
0282     if (!disp_cpu_assoc || !vcpu_assoc)
0283         return -EIO;
0284 
0285     return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc);
0286 }
0287 
0288 static void update_vcpu_disp_stat(int disp_cpu)
0289 {
0290     struct vcpu_dispatch_data *disp;
0291     int distance;
0292 
0293     disp = this_cpu_ptr(&vcpu_disp_data);
0294     if (disp->last_disp_cpu == -1) {
0295         disp->last_disp_cpu = disp_cpu;
0296         return;
0297     }
0298 
0299     disp->total_disp++;
0300 
0301     if (disp->last_disp_cpu == disp_cpu ||
0302         (cpu_first_thread_sibling(disp->last_disp_cpu) ==
0303                     cpu_first_thread_sibling(disp_cpu)))
0304         disp->same_cpu_disp++;
0305     else {
0306         distance = cpu_relative_dispatch_distance(disp->last_disp_cpu,
0307                                 disp_cpu);
0308         if (distance < 0)
0309             pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
0310                     smp_processor_id());
0311         else {
0312             switch (distance) {
0313             case 0:
0314                 disp->same_chip_disp++;
0315                 break;
0316             case 1:
0317                 disp->diff_chip_disp++;
0318                 break;
0319             case 2:
0320                 disp->far_chip_disp++;
0321                 break;
0322             default:
0323                 pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n",
0324                          smp_processor_id(),
0325                          disp->last_disp_cpu,
0326                          disp_cpu,
0327                          distance);
0328             }
0329         }
0330     }
0331 
0332     distance = cpu_home_node_dispatch_distance(disp_cpu);
0333     if (distance < 0)
0334         pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
0335                 smp_processor_id());
0336     else {
0337         switch (distance) {
0338         case 0:
0339             disp->numa_home_disp++;
0340             break;
0341         case 1:
0342             disp->numa_remote_disp++;
0343             break;
0344         case 2:
0345             disp->numa_far_disp++;
0346             break;
0347         default:
0348             pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n",
0349                          smp_processor_id(),
0350                          disp_cpu,
0351                          distance);
0352         }
0353     }
0354 
0355     disp->last_disp_cpu = disp_cpu;
0356 }
0357 
0358 static void process_dtl_buffer(struct work_struct *work)
0359 {
0360     struct dtl_entry dtle;
0361     u64 i = __this_cpu_read(dtl_entry_ridx);
0362     struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
0363     struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
0364     struct lppaca *vpa = local_paca->lppaca_ptr;
0365     struct dtl_worker *d = container_of(work, struct dtl_worker, work.work);
0366 
0367     if (!local_paca->dispatch_log)
0368         return;
0369 
0370     /* if we have been migrated away, we cancel ourselves */
0371     if (d->cpu != smp_processor_id()) {
0372         pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n",
0373                         smp_processor_id());
0374         return;
0375     }
0376 
0377     if (i == be64_to_cpu(vpa->dtl_idx))
0378         goto out;
0379 
0380     while (i < be64_to_cpu(vpa->dtl_idx)) {
0381         dtle = *dtl;
0382         barrier();
0383         if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
0384             /* buffer has overflowed */
0385             pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n",
0386                 d->cpu,
0387                 be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i);
0388             i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
0389             dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
0390             continue;
0391         }
0392         update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id));
0393         ++i;
0394         ++dtl;
0395         if (dtl == dtl_end)
0396             dtl = local_paca->dispatch_log;
0397     }
0398 
0399     __this_cpu_write(dtl_entry_ridx, i);
0400 
0401 out:
0402     schedule_delayed_work_on(d->cpu, to_delayed_work(work),
0403                     HZ / vcpudispatch_stats_freq);
0404 }
0405 
0406 static int dtl_worker_online(unsigned int cpu)
0407 {
0408     struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
0409 
0410     memset(d, 0, sizeof(*d));
0411     INIT_DELAYED_WORK(&d->work, process_dtl_buffer);
0412     d->cpu = cpu;
0413 
0414 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
0415     per_cpu(dtl_entry_ridx, cpu) = 0;
0416     register_dtl_buffer(cpu);
0417 #else
0418     per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx);
0419 #endif
0420 
0421     schedule_delayed_work_on(cpu, &d->work, HZ / vcpudispatch_stats_freq);
0422     return 0;
0423 }
0424 
0425 static int dtl_worker_offline(unsigned int cpu)
0426 {
0427     struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
0428 
0429     cancel_delayed_work_sync(&d->work);
0430 
0431 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
0432     unregister_dtl(get_hard_smp_processor_id(cpu));
0433 #endif
0434 
0435     return 0;
0436 }
0437 
0438 static void set_global_dtl_mask(u8 mask)
0439 {
0440     int cpu;
0441 
0442     dtl_mask = mask;
0443     for_each_present_cpu(cpu)
0444         lppaca_of(cpu).dtl_enable_mask = dtl_mask;
0445 }
0446 
0447 static void reset_global_dtl_mask(void)
0448 {
0449     int cpu;
0450 
0451 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
0452     dtl_mask = DTL_LOG_PREEMPT;
0453 #else
0454     dtl_mask = 0;
0455 #endif
0456     for_each_present_cpu(cpu)
0457         lppaca_of(cpu).dtl_enable_mask = dtl_mask;
0458 }
0459 
0460 static int dtl_worker_enable(unsigned long *time_limit)
0461 {
0462     int rc = 0, state;
0463 
0464     if (!write_trylock(&dtl_access_lock)) {
0465         rc = -EBUSY;
0466         goto out;
0467     }
0468 
0469     set_global_dtl_mask(DTL_LOG_ALL);
0470 
0471     /* Setup dtl buffers and register those */
0472     alloc_dtl_buffers(time_limit);
0473 
0474     state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online",
0475                     dtl_worker_online, dtl_worker_offline);
0476     if (state < 0) {
0477         pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n");
0478         free_dtl_buffers(time_limit);
0479         reset_global_dtl_mask();
0480         write_unlock(&dtl_access_lock);
0481         rc = -EINVAL;
0482         goto out;
0483     }
0484     dtl_worker_state = state;
0485 
0486 out:
0487     return rc;
0488 }
0489 
0490 static void dtl_worker_disable(unsigned long *time_limit)
0491 {
0492     cpuhp_remove_state(dtl_worker_state);
0493     free_dtl_buffers(time_limit);
0494     reset_global_dtl_mask();
0495     write_unlock(&dtl_access_lock);
0496 }
0497 
0498 static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p,
0499         size_t count, loff_t *ppos)
0500 {
0501     unsigned long time_limit = jiffies + HZ;
0502     struct vcpu_dispatch_data *disp;
0503     int rc, cmd, cpu;
0504     char buf[16];
0505 
0506     if (count > 15)
0507         return -EINVAL;
0508 
0509     if (copy_from_user(buf, p, count))
0510         return -EFAULT;
0511 
0512     buf[count] = 0;
0513     rc = kstrtoint(buf, 0, &cmd);
0514     if (rc || cmd < 0 || cmd > 1) {
0515         pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n");
0516         return rc ? rc : -EINVAL;
0517     }
0518 
0519     mutex_lock(&dtl_enable_mutex);
0520 
0521     if ((cmd == 0 && !vcpudispatch_stats_on) ||
0522             (cmd == 1 && vcpudispatch_stats_on))
0523         goto out;
0524 
0525     if (cmd) {
0526         rc = init_cpu_associativity();
0527         if (rc)
0528             goto out;
0529 
0530         for_each_possible_cpu(cpu) {
0531             disp = per_cpu_ptr(&vcpu_disp_data, cpu);
0532             memset(disp, 0, sizeof(*disp));
0533             disp->last_disp_cpu = -1;
0534         }
0535 
0536         rc = dtl_worker_enable(&time_limit);
0537         if (rc) {
0538             destroy_cpu_associativity();
0539             goto out;
0540         }
0541     } else {
0542         dtl_worker_disable(&time_limit);
0543         destroy_cpu_associativity();
0544     }
0545 
0546     vcpudispatch_stats_on = cmd;
0547 
0548 out:
0549     mutex_unlock(&dtl_enable_mutex);
0550     if (rc)
0551         return rc;
0552     return count;
0553 }
0554 
0555 static int vcpudispatch_stats_display(struct seq_file *p, void *v)
0556 {
0557     int cpu;
0558     struct vcpu_dispatch_data *disp;
0559 
0560     if (!vcpudispatch_stats_on) {
0561         seq_puts(p, "off\n");
0562         return 0;
0563     }
0564 
0565     for_each_online_cpu(cpu) {
0566         disp = per_cpu_ptr(&vcpu_disp_data, cpu);
0567         seq_printf(p, "cpu%d", cpu);
0568         seq_put_decimal_ull(p, " ", disp->total_disp);
0569         seq_put_decimal_ull(p, " ", disp->same_cpu_disp);
0570         seq_put_decimal_ull(p, " ", disp->same_chip_disp);
0571         seq_put_decimal_ull(p, " ", disp->diff_chip_disp);
0572         seq_put_decimal_ull(p, " ", disp->far_chip_disp);
0573         seq_put_decimal_ull(p, " ", disp->numa_home_disp);
0574         seq_put_decimal_ull(p, " ", disp->numa_remote_disp);
0575         seq_put_decimal_ull(p, " ", disp->numa_far_disp);
0576         seq_puts(p, "\n");
0577     }
0578 
0579     return 0;
0580 }
0581 
0582 static int vcpudispatch_stats_open(struct inode *inode, struct file *file)
0583 {
0584     return single_open(file, vcpudispatch_stats_display, NULL);
0585 }
0586 
0587 static const struct proc_ops vcpudispatch_stats_proc_ops = {
0588     .proc_open  = vcpudispatch_stats_open,
0589     .proc_read  = seq_read,
0590     .proc_write = vcpudispatch_stats_write,
0591     .proc_lseek = seq_lseek,
0592     .proc_release   = single_release,
0593 };
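/*
 * Example usage, assuming a shared-processor LPAR where the procfs file is
 * created below:
 *
 *    # echo 1 > /proc/powerpc/vcpudispatch_stats
 *    # cat /proc/powerpc/vcpudispatch_stats
 *    cpu0 <total> <same_cpu> <same_chip> <diff_chip> <far_chip> <numa_home> <numa_remote> <numa_far>
 *    ...
 *    # echo 0 > /proc/powerpc/vcpudispatch_stats
 *
 * Each line reports, per online cpu, the counters of struct vcpu_dispatch_data
 * in the order printed by vcpudispatch_stats_display().
 */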
0594 
0595 static ssize_t vcpudispatch_stats_freq_write(struct file *file,
0596         const char __user *p, size_t count, loff_t *ppos)
0597 {
0598     int rc, freq;
0599     char buf[16];
0600 
0601     if (count > 15)
0602         return -EINVAL;
0603 
0604     if (copy_from_user(buf, p, count))
0605         return -EFAULT;
0606 
0607     buf[count] = 0;
0608     rc = kstrtoint(buf, 0, &freq);
0609     if (rc || freq < 1 || freq > HZ) {
0610         pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n",
0611                 HZ);
0612         return rc ? rc : -EINVAL;
0613     }
0614 
0615     vcpudispatch_stats_freq = freq;
0616 
0617     return count;
0618 }
0619 
0620 static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v)
0621 {
0622     seq_printf(p, "%d\n", vcpudispatch_stats_freq);
0623     return 0;
0624 }
0625 
0626 static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file)
0627 {
0628     return single_open(file, vcpudispatch_stats_freq_display, NULL);
0629 }
0630 
0631 static const struct proc_ops vcpudispatch_stats_freq_proc_ops = {
0632     .proc_open  = vcpudispatch_stats_freq_open,
0633     .proc_read  = seq_read,
0634     .proc_write = vcpudispatch_stats_freq_write,
0635     .proc_lseek = seq_lseek,
0636     .proc_release   = single_release,
0637 };
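/*
 * Example usage: the rate at which the per-cpu DTL workers process the
 * dispatch trace log (in times per second, between 1 and HZ) defaults to 50
 * and can be changed at run time, e.g.
 *
 *    # cat /proc/powerpc/vcpudispatch_stats_freq
 *    50
 *    # echo 100 > /proc/powerpc/vcpudispatch_stats_freq
 */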
0638 
0639 static int __init vcpudispatch_stats_procfs_init(void)
0640 {
0641     /*
0642      * Avoid smp_processor_id while preemptible. All CPUs should have
0643      * the same value for lppaca_shared_proc.
0644      */
0645     preempt_disable();
0646     if (!lppaca_shared_proc(get_lppaca())) {
0647         preempt_enable();
0648         return 0;
0649     }
0650     preempt_enable();
0651 
0652     if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
0653                     &vcpudispatch_stats_proc_ops))
0654         pr_err("vcpudispatch_stats: error creating procfs file\n");
0655     else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL,
0656                     &vcpudispatch_stats_freq_proc_ops))
0657         pr_err("vcpudispatch_stats_freq: error creating procfs file\n");
0658 
0659     return 0;
0660 }
0661 
0662 machine_device_initcall(pseries, vcpudispatch_stats_procfs_init);
0663 #endif /* CONFIG_PPC_SPLPAR */
0664 
0665 void vpa_init(int cpu)
0666 {
0667     int hwcpu = get_hard_smp_processor_id(cpu);
0668     unsigned long addr;
0669     long ret;
0670 
0671     /*
0672      * The spec says it "may be problematic" if CPU x registers the VPA of
0673  * CPU y. We should never do that, but warn if we ever do.
0674      */
0675     WARN_ON(cpu != smp_processor_id());
0676 
0677     if (cpu_has_feature(CPU_FTR_ALTIVEC))
0678         lppaca_of(cpu).vmxregs_in_use = 1;
0679 
0680     if (cpu_has_feature(CPU_FTR_ARCH_207S))
0681         lppaca_of(cpu).ebb_regs_in_use = 1;
0682 
0683     addr = __pa(&lppaca_of(cpu));
0684     ret = register_vpa(hwcpu, addr);
0685 
0686     if (ret) {
0687         pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
0688                "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
0689         return;
0690     }
0691 
0692 #ifdef CONFIG_PPC_64S_HASH_MMU
0693     /*
0694      * PAPR says this feature is SLB-Buffer but firmware never
0695      * reports that.  All SPLPARs support the SLB shadow buffer.
0696      */
0697     if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
0698         addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr);
0699         ret = register_slb_shadow(hwcpu, addr);
0700         if (ret)
0701             pr_err("WARNING: SLB shadow buffer registration for "
0702                    "cpu %d (hw %d) of area %lx failed with %ld\n",
0703                    cpu, hwcpu, addr, ret);
0704     }
0705 #endif /* CONFIG_PPC_64S_HASH_MMU */
0706 
0707     /*
0708      * Register dispatch trace log, if one has been allocated.
0709      */
0710     register_dtl_buffer(cpu);
0711 }
0712 
0713 #ifdef CONFIG_PPC_BOOK3S_64
0714 
0715 static int __init pseries_lpar_register_process_table(unsigned long base,
0716             unsigned long page_size, unsigned long table_size)
0717 {
0718     long rc;
0719     unsigned long flags = 0;
0720 
0721     if (table_size)
0722         flags |= PROC_TABLE_NEW;
0723     if (radix_enabled()) {
0724         flags |= PROC_TABLE_RADIX;
0725         if (mmu_has_feature(MMU_FTR_GTSE))
0726             flags |= PROC_TABLE_GTSE;
0727     } else
0728         flags |= PROC_TABLE_HPT_SLB;
0729     for (;;) {
0730         rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
0731                     page_size, table_size);
0732         if (!H_IS_LONG_BUSY(rc))
0733             break;
0734         mdelay(get_longbusy_msecs(rc));
0735     }
0736     if (rc != H_SUCCESS) {
0737         pr_err("Failed to register process table (rc=%ld)\n", rc);
0738         BUG();
0739     }
0740     return rc;
0741 }
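/*
 * Both MMU modes register their translation mode through this hcall: with the
 * hash MMU, hpte_init_pseries() calls it as (0, 0, 0) when CPU_FTR_ARCH_300 is
 * set to tell the hypervisor the HPT is in use, while radix_init_pseries()
 * passes __pa(process_tb), 0 and PRTB_SIZE_SHIFT - 12 to register the radix
 * process table.
 */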
0742 
0743 #ifdef CONFIG_PPC_64S_HASH_MMU
0744 
0745 static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
0746                      unsigned long vpn, unsigned long pa,
0747                      unsigned long rflags, unsigned long vflags,
0748                      int psize, int apsize, int ssize)
0749 {
0750     unsigned long lpar_rc;
0751     unsigned long flags;
0752     unsigned long slot;
0753     unsigned long hpte_v, hpte_r;
0754 
0755     if (!(vflags & HPTE_V_BOLTED))
0756         pr_devel("hpte_insert(group=%lx, vpn=%016lx, "
0757              "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
0758              hpte_group, vpn,  pa, rflags, vflags, psize);
0759 
0760     hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
0761     hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
0762 
0763     if (!(vflags & HPTE_V_BOLTED))
0764         pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
0765 
0766     /* Now fill in the actual HPTE */
0767     /* Set CEC cookie to 0         */
0768     /* Zero page = 0               */
0769     /* I-cache Invalidate = 0      */
0770     /* I-cache synchronize = 0     */
0771     /* Exact = 0                   */
0772     flags = 0;
0773 
0774     if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
0775         flags |= H_COALESCE_CAND;
0776 
0777     lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
0778     if (unlikely(lpar_rc == H_PTEG_FULL)) {
0779         pr_devel("Hash table group is full\n");
0780         return -1;
0781     }
0782 
0783     /*
0784      * Since we try and ioremap PHBs we don't own, the pte insert
0785      * will fail. However we must catch the failure in hash_page
0786      * or we will loop forever, so return -2 in this case.
0787      */
0788     if (unlikely(lpar_rc != H_SUCCESS)) {
0789         pr_err("Failed hash pte insert with error %ld\n", lpar_rc);
0790         return -2;
0791     }
0792     if (!(vflags & HPTE_V_BOLTED))
0793         pr_devel(" -> slot: %lu\n", slot & 7);
0794 
0795     /* Because of iSeries, we have to pass down the secondary
0796      * bucket bit here as well
0797      */
0798     return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
0799 }
0800 
0801 static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
0802 
0803 static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
0804 {
0805     unsigned long slot_offset;
0806     unsigned long lpar_rc;
0807     int i;
0808     unsigned long dummy1, dummy2;
0809 
0810     /* pick a random slot to start at */
0811     slot_offset = mftb() & 0x7;
0812 
0813     for (i = 0; i < HPTES_PER_GROUP; i++) {
0814 
0815         /* don't remove a bolted entry */
0816         lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
0817                        HPTE_V_BOLTED, &dummy1, &dummy2);
0818         if (lpar_rc == H_SUCCESS)
0819             return i;
0820 
0821         /*
0822          * The test for adjunct partition is performed before the
0823          * ANDCOND test.  H_RESOURCE may be returned, so we need to
0824          * check for that as well.
0825          */
0826         BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE);
0827 
0828         slot_offset++;
0829         slot_offset &= 0x7;
0830     }
0831 
0832     return -1;
0833 }
0834 
0835 /* Called during kexec sequence with MMU off */
0836 static notrace void manual_hpte_clear_all(void)
0837 {
0838     unsigned long size_bytes = 1UL << ppc64_pft_size;
0839     unsigned long hpte_count = size_bytes >> 4;
0840     struct {
0841         unsigned long pteh;
0842         unsigned long ptel;
0843     } ptes[4];
0844     long lpar_rc;
0845     unsigned long i, j;
0846 
0847     /*
0848      * Read in batches of 4; invalidate only valid entries not in the VRMA.
0849      * hpte_count will be a multiple of 4.
0850      */
0851     for (i = 0; i < hpte_count; i += 4) {
0852         lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
0853         if (lpar_rc != H_SUCCESS) {
0854             pr_info("Failed to read hash page table at %ld err %ld\n",
0855                 i, lpar_rc);
0856             continue;
0857         }
0858         for (j = 0; j < 4; j++){
0859             if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
0860                 HPTE_V_VRMA_MASK)
0861                 continue;
0862             if (ptes[j].pteh & HPTE_V_VALID)
0863                 plpar_pte_remove_raw(0, i + j, 0,
0864                     &(ptes[j].pteh), &(ptes[j].ptel));
0865         }
0866     }
0867 }
0868 
0869 /* Called during kexec sequence with MMU off */
0870 static notrace int hcall_hpte_clear_all(void)
0871 {
0872     int rc;
0873 
0874     do {
0875         rc = plpar_hcall_norets(H_CLEAR_HPT);
0876     } while (rc == H_CONTINUE);
0877 
0878     return rc;
0879 }
0880 
0881 /* Called during kexec sequence with MMU off */
0882 static notrace void pseries_hpte_clear_all(void)
0883 {
0884     int rc;
0885 
0886     rc = hcall_hpte_clear_all();
0887     if (rc != H_SUCCESS)
0888         manual_hpte_clear_all();
0889 
0890 #ifdef __LITTLE_ENDIAN__
0891     /*
0892      * Reset exceptions to big endian.
0893      *
0894      * FIXME this is a hack for kexec, we need to reset the exception
0895      * endian before starting the new kernel and this is a convenient place
0896      * to do it.
0897      *
0898      * This is also called on boot when a fadump happens. In that case we
0899      * must not change the exception endian mode.
0900      */
0901     if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active())
0902         pseries_big_endian_exceptions();
0903 #endif
0904 }
0905 
0906 /*
0907  * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
0908  * the low 3 bits of flags happen to line up.  So no transform is needed.
0909  * We can probably optimize here and assume the high bits of newpp are
0910  * already zero.  For now I am paranoid.
0911  */
0912 static long pSeries_lpar_hpte_updatepp(unsigned long slot,
0913                        unsigned long newpp,
0914                        unsigned long vpn,
0915                        int psize, int apsize,
0916                        int ssize, unsigned long inv_flags)
0917 {
0918     unsigned long lpar_rc;
0919     unsigned long flags;
0920     unsigned long want_v;
0921 
0922     want_v = hpte_encode_avpn(vpn, psize, ssize);
0923 
0924     flags = (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO)) | H_AVPN;
0925     flags |= (newpp & HPTE_R_KEY_HI) >> 48;
0926     if (mmu_has_feature(MMU_FTR_KERNEL_RO))
0927         /* Move pp0 into bit 8 (IBM 55) */
0928         flags |= (newpp & HPTE_R_PP0) >> 55;
0929 
0930     pr_devel("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
0931          want_v, slot, flags, psize);
0932 
0933     lpar_rc = plpar_pte_protect(flags, slot, want_v);
0934 
0935     if (lpar_rc == H_NOT_FOUND) {
0936         pr_devel("not found !\n");
0937         return -1;
0938     }
0939 
0940     pr_devel("ok\n");
0941 
0942     BUG_ON(lpar_rc != H_SUCCESS);
0943 
0944     return 0;
0945 }
0946 
0947 static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group)
0948 {
0949     long lpar_rc;
0950     unsigned long i, j;
0951     struct {
0952         unsigned long pteh;
0953         unsigned long ptel;
0954     } ptes[4];
0955 
0956     for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
0957 
0958         lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
0959         if (lpar_rc != H_SUCCESS) {
0960             pr_info("Failed to read hash page table at %ld err %ld\n",
0961                 hpte_group, lpar_rc);
0962             continue;
0963         }
0964 
0965         for (j = 0; j < 4; j++) {
0966             if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
0967                 (ptes[j].pteh & HPTE_V_VALID))
0968                 return i + j;
0969         }
0970     }
0971 
0972     return -1;
0973 }
0974 
0975 static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
0976 {
0977     long slot;
0978     unsigned long hash;
0979     unsigned long want_v;
0980     unsigned long hpte_group;
0981 
0982     hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
0983     want_v = hpte_encode_avpn(vpn, psize, ssize);
0984 
0985     /*
0986      * We try to keep bolted entries always in the primary hash.
0987      * But in some cases we can find them in the secondary too.
0988      */
0989     hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
0990     slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
0991     if (slot < 0) {
0992         /* Try in secondary */
0993         hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
0994         slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
0995         if (slot < 0)
0996             return -1;
0997     }
0998     return hpte_group + slot;
0999 }
1000 
1001 static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
1002                          unsigned long ea,
1003                          int psize, int ssize)
1004 {
1005     unsigned long vpn;
1006     unsigned long lpar_rc, slot, vsid, flags;
1007 
1008     vsid = get_kernel_vsid(ea, ssize);
1009     vpn = hpt_vpn(ea, vsid, ssize);
1010 
1011     slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
1012     BUG_ON(slot == -1);
1013 
1014     flags = newpp & (HPTE_R_PP | HPTE_R_N);
1015     if (mmu_has_feature(MMU_FTR_KERNEL_RO))
1016         /* Move pp0 into bit 8 (IBM 55) */
1017         flags |= (newpp & HPTE_R_PP0) >> 55;
1018 
1019     flags |= ((newpp & HPTE_R_KEY_HI) >> 48) | (newpp & HPTE_R_KEY_LO);
1020 
1021     lpar_rc = plpar_pte_protect(flags, slot, 0);
1022 
1023     BUG_ON(lpar_rc != H_SUCCESS);
1024 }
1025 
1026 static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
1027                      int psize, int apsize,
1028                      int ssize, int local)
1029 {
1030     unsigned long want_v;
1031     unsigned long lpar_rc;
1032     unsigned long dummy1, dummy2;
1033 
1034     pr_devel("    inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
1035          slot, vpn, psize, local);
1036 
1037     want_v = hpte_encode_avpn(vpn, psize, ssize);
1038     lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
1039     if (lpar_rc == H_NOT_FOUND)
1040         return;
1041 
1042     BUG_ON(lpar_rc != H_SUCCESS);
1043 }
1044 
1045 
1046 /*
1047  * As defined in PAPR section 14.5.4.1.8, the control mask doesn't
1048  * include the reference and change bits returned from the processed
1049  * PTE.
1050  */
1051 #define HBLKR_AVPN      0x0100000000000000UL
1052 #define HBLKR_CTRL_MASK     0xf800000000000000UL
1053 #define HBLKR_CTRL_SUCCESS  0x8000000000000000UL
1054 #define HBLKR_CTRL_ERRNOTFOUND  0x8800000000000000UL
1055 #define HBLKR_CTRL_ERRBUSY  0xa000000000000000UL
1056 
1057 /*
1058  * Returns true if this block size is supported for the specified segment
1059  * base page size and actual page size.
1060  *
1061  * Currently, only a block size of 8 is supported.
1062  */
1063 static inline bool is_supported_hlbkrm(int bpsize, int psize)
1064 {
1065     return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
1066 }
1067 
1068 /**
1069  * H_BLOCK_REMOVE caller.
1070  * @idx should point to the latest @param entry set with a PTEX.
1071  * If a PTE cannot be processed because another CPU has already locked that
1072  * group, those entries are put back in @param starting at index 1.
1073  * If entries have to be retried and @retry_busy is set to true, they are
1074  * retried until they succeed. If @retry_busy is set to false, the return
1075  * value is the number of entries yet to be processed.
1076  */
1077 static unsigned long call_block_remove(unsigned long idx, unsigned long *param,
1078                        bool retry_busy)
1079 {
1080     unsigned long i, rc, new_idx;
1081     unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
1082 
1083     if (idx < 2) {
1084         pr_warn("Unexpected empty call to H_BLOCK_REMOVE");
1085         return 0;
1086     }
1087 again:
1088     new_idx = 0;
1089     if (idx > PLPAR_HCALL9_BUFSIZE) {
1090         pr_err("Too many PTEs (%lu) for H_BLOCK_REMOVE", idx);
1091         idx = PLPAR_HCALL9_BUFSIZE;
1092     } else if (idx < PLPAR_HCALL9_BUFSIZE)
1093         param[idx] = HBR_END;
1094 
1095     rc = plpar_hcall9(H_BLOCK_REMOVE, retbuf,
1096               param[0], /* AVA */
1097               param[1],  param[2],  param[3],  param[4], /* TS0-7 */
1098               param[5],  param[6],  param[7],  param[8]);
1099     if (rc == H_SUCCESS)
1100         return 0;
1101 
1102     BUG_ON(rc != H_PARTIAL);
1103 
1104     /* Check that the unprocessed entries were 'not found' or 'busy' */
1105     for (i = 0; i < idx-1; i++) {
1106         unsigned long ctrl = retbuf[i] & HBLKR_CTRL_MASK;
1107 
1108         if (ctrl == HBLKR_CTRL_ERRBUSY) {
1109             param[++new_idx] = param[i+1];
1110             continue;
1111         }
1112 
1113         BUG_ON(ctrl != HBLKR_CTRL_SUCCESS
1114                && ctrl != HBLKR_CTRL_ERRNOTFOUND);
1115     }
1116 
1117     /*
1118      * If there were entries found busy, retry these entries if requested,
1119      * or if all the entries have to be retried.
1120      */
1121     if (new_idx && (retry_busy || new_idx == (PLPAR_HCALL9_BUFSIZE-1))) {
1122         idx = new_idx + 1;
1123         goto again;
1124     }
1125 
1126     return new_idx;
1127 }
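/*
 * A minimal usage sketch, mirroring do_block_remove() below: the caller puts
 * the AVPN of the naturally aligned 8-page block in param[0], appends up to
 * eight PTEX entries, and lets busy entries be retried:
 *
 *    param[0] = hpte_encode_avpn(vpn, psize, ssize);
 *    param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot;
 *    ...
 *    (void)call_block_remove(pix, param, true);
 */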
1128 
1129 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1130 /*
1131  * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
1132  * to make sure that we avoid bouncing the hypervisor tlbie lock.
1133  */
1134 #define PPC64_HUGE_HPTE_BATCH 12
1135 
1136 static void hugepage_block_invalidate(unsigned long *slot, unsigned long *vpn,
1137                       int count, int psize, int ssize)
1138 {
1139     unsigned long param[PLPAR_HCALL9_BUFSIZE];
1140     unsigned long shift, current_vpgb, vpgb;
1141     int i, pix = 0;
1142 
1143     shift = mmu_psize_defs[psize].shift;
1144 
1145     for (i = 0; i < count; i++) {
1146         /*
1147          * Shift 3 more bits to the right to get an
1148          * 8-page aligned virtual address.
1149          */
1150         vpgb = (vpn[i] >> (shift - VPN_SHIFT + 3));
1151         if (!pix || vpgb != current_vpgb) {
1152             /*
1153              * Need to start a new 8-page block; flush
1154              * the current one if needed.
1155              */
1156             if (pix)
1157                 (void)call_block_remove(pix, param, true);
1158             current_vpgb = vpgb;
1159             param[0] = hpte_encode_avpn(vpn[i], psize, ssize);
1160             pix = 1;
1161         }
1162 
1163         param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot[i];
1164         if (pix == PLPAR_HCALL9_BUFSIZE) {
1165             pix = call_block_remove(pix, param, false);
1166             /*
1167              * pix = 0 means that all the entries were
1168              * removed; we can start a new block.
1169              * Otherwise, this means that there are entries
1170              * to retry, and pix points to the latest one, so
1171              * we should increment it and try to continue
1172              * the same block.
1173              */
1174             if (pix)
1175                 pix++;
1176         }
1177     }
1178     if (pix)
1179         (void)call_block_remove(pix, param, true);
1180 }
1181 
1182 static void hugepage_bulk_invalidate(unsigned long *slot, unsigned long *vpn,
1183                      int count, int psize, int ssize)
1184 {
1185     unsigned long param[PLPAR_HCALL9_BUFSIZE];
1186     int i = 0, pix = 0, rc;
1187 
1188     for (i = 0; i < count; i++) {
1189 
1190         if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1191             pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
1192                              ssize, 0);
1193         } else {
1194             param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
1195             param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
1196             pix += 2;
1197             if (pix == 8) {
1198                 rc = plpar_hcall9(H_BULK_REMOVE, param,
1199                           param[0], param[1], param[2],
1200                           param[3], param[4], param[5],
1201                           param[6], param[7]);
1202                 BUG_ON(rc != H_SUCCESS);
1203                 pix = 0;
1204             }
1205         }
1206     }
1207     if (pix) {
1208         param[pix] = HBR_END;
1209         rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
1210                   param[2], param[3], param[4], param[5],
1211                   param[6], param[7]);
1212         BUG_ON(rc != H_SUCCESS);
1213     }
1214 }
1215 
1216 static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
1217                               unsigned long *vpn,
1218                               int count, int psize,
1219                               int ssize)
1220 {
1221     unsigned long flags = 0;
1222     int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
1223 
1224     if (lock_tlbie)
1225         spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
1226 
1227     /* Assuming THP size is 16M */
1228     if (is_supported_hlbkrm(psize, MMU_PAGE_16M))
1229         hugepage_block_invalidate(slot, vpn, count, psize, ssize);
1230     else
1231         hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
1232 
1233     if (lock_tlbie)
1234         spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
1235 }
1236 
1237 static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
1238                          unsigned long addr,
1239                          unsigned char *hpte_slot_array,
1240                          int psize, int ssize, int local)
1241 {
1242     int i, index = 0;
1243     unsigned long s_addr = addr;
1244     unsigned int max_hpte_count, valid;
1245     unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
1246     unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
1247     unsigned long shift, hidx, vpn = 0, hash, slot;
1248 
1249     shift = mmu_psize_defs[psize].shift;
1250     max_hpte_count = 1U << (PMD_SHIFT - shift);
1251 
1252     for (i = 0; i < max_hpte_count; i++) {
1253         valid = hpte_valid(hpte_slot_array, i);
1254         if (!valid)
1255             continue;
1256         hidx =  hpte_hash_index(hpte_slot_array, i);
1257 
1258         /* get the vpn */
1259         addr = s_addr + (i * (1ul << shift));
1260         vpn = hpt_vpn(addr, vsid, ssize);
1261         hash = hpt_hash(vpn, shift, ssize);
1262         if (hidx & _PTEIDX_SECONDARY)
1263             hash = ~hash;
1264 
1265         slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1266         slot += hidx & _PTEIDX_GROUP_IX;
1267 
1268         slot_array[index] = slot;
1269         vpn_array[index] = vpn;
1270         if (index == PPC64_HUGE_HPTE_BATCH - 1) {
1271             /*
1272              * Now do a bulk invalidate
1273              */
1274             __pSeries_lpar_hugepage_invalidate(slot_array,
1275                                vpn_array,
1276                                PPC64_HUGE_HPTE_BATCH,
1277                                psize, ssize);
1278             index = 0;
1279         } else
1280             index++;
1281     }
1282     if (index)
1283         __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
1284                            index, psize, ssize);
1285 }
1286 #else
1287 static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
1288                          unsigned long addr,
1289                          unsigned char *hpte_slot_array,
1290                          int psize, int ssize, int local)
1291 {
1292     WARN(1, "%s called without THP support\n", __func__);
1293 }
1294 #endif
1295 
1296 static int pSeries_lpar_hpte_removebolted(unsigned long ea,
1297                       int psize, int ssize)
1298 {
1299     unsigned long vpn;
1300     unsigned long slot, vsid;
1301 
1302     vsid = get_kernel_vsid(ea, ssize);
1303     vpn = hpt_vpn(ea, vsid, ssize);
1304 
1305     slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
1306     if (slot == -1)
1307         return -ENOENT;
1308 
1309     /*
1310      * lpar doesn't use the passed actual page size
1311      */
1312     pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
1313     return 0;
1314 }
1315 
1316 
1317 static inline unsigned long compute_slot(real_pte_t pte,
1318                      unsigned long vpn,
1319                      unsigned long index,
1320                      unsigned long shift,
1321                      int ssize)
1322 {
1323     unsigned long slot, hash, hidx;
1324 
1325     hash = hpt_hash(vpn, shift, ssize);
1326     hidx = __rpte_to_hidx(pte, index);
1327     if (hidx & _PTEIDX_SECONDARY)
1328         hash = ~hash;
1329     slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1330     slot += hidx & _PTEIDX_GROUP_IX;
1331     return slot;
1332 }
1333 
1334 /**
1335  * The hcall H_BLOCK_REMOVE implies that the virtual pages to be processed are
1336  * "all within the same naturally aligned 8 page virtual address block".
1337  */
1338 static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
1339                 unsigned long *param)
1340 {
1341     unsigned long vpn;
1342     unsigned long i, pix = 0;
1343     unsigned long index, shift, slot, current_vpgb, vpgb;
1344     real_pte_t pte;
1345     int psize, ssize;
1346 
1347     psize = batch->psize;
1348     ssize = batch->ssize;
1349 
1350     for (i = 0; i < number; i++) {
1351         vpn = batch->vpn[i];
1352         pte = batch->pte[i];
1353         pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
1354             /*
1355              * Shift 3 more bits to the right to get an
1356              * 8-page aligned virtual address.
1357              */
1358             vpgb = (vpn >> (shift - VPN_SHIFT + 3));
1359             if (!pix || vpgb != current_vpgb) {
1360                 /*
1361                  * Need to start a new 8-page block; flush
1362                  * the current one if needed.
1363                  */
1364                 if (pix)
1365                     (void)call_block_remove(pix, param,
1366                                 true);
1367                 current_vpgb = vpgb;
1368                 param[0] = hpte_encode_avpn(vpn, psize,
1369                                 ssize);
1370                 pix = 1;
1371             }
1372 
1373             slot = compute_slot(pte, vpn, index, shift, ssize);
1374             param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot;
1375 
1376             if (pix == PLPAR_HCALL9_BUFSIZE) {
1377                 pix = call_block_remove(pix, param, false);
1378                 /*
1379                  * pix = 0 means that all the entries were
1380                  * removed; we can start a new block.
1381                  * Otherwise, this means that there are entries
1382                  * to retry, and pix points to the latest one, so
1383                  * we should increment it and try to continue
1384                  * the same block.
1385                  */
1386                 if (pix)
1387                     pix++;
1388             }
1389         } pte_iterate_hashed_end();
1390     }
1391 
1392     if (pix)
1393         (void)call_block_remove(pix, param, true);
1394 }
1395 
1396 /*
1397  * TLB Block Invalidate Characteristics
1398  *
1399  * These characteristics define the size of the block the hcall H_BLOCK_REMOVE
1400  * is able to process for each pair of segment base page size and actual page size.
1401  *
1402  * The ibm,get-system-parameter RTAS call returns a buffer with the
1403  * following layout:
1404  *
1405  * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ]
1406  * -----------------
1407  * TLB Block Invalidate Specifiers:
1408  * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ]
1409  * [ 1 byte Number of page sizes (N) that are supported for the specified
1410  *          TLB invalidate block size ]
1411  * [ 1 byte Encoded segment base page size and actual page size
1412  *          MSB=0 means 4k segment base page size and actual page size
1413  *          MSB=1 the penc value in mmu_psize_def ]
1414  * ...
1415  * -----------------
1416  * Next TLB Block Invalidate Specifiers...
1417  * -----------------
1418  * [ 0 ]
1419  */
1420 static inline void set_hblkrm_bloc_size(int bpsize, int psize,
1421                     unsigned int block_size)
1422 {
1423     if (block_size > hblkrm_size[bpsize][psize])
1424         hblkrm_size[bpsize][psize] = block_size;
1425 }
1426 
1427 /*
1428  * Decode the Encoded segment base page size and actual page size.
1429  * PAPR specifies:
1430  *   - bit 7 is the L bit
1431  *   - bits 0-5 are the penc value
1432  * If the L bit is 0, this means 4K segment base page size and actual page size;
1433  * otherwise the penc value should be read.
1434  */
1435 #define HBLKRM_L_MASK       0x80
1436 #define HBLKRM_PENC_MASK    0x3f
1437 static inline void __init check_lp_set_hblkrm(unsigned int lp,
1438                           unsigned int block_size)
1439 {
1440     unsigned int bpsize, psize;
1441 
1442     /* First, check the L bit, if not set, this means 4K */
1443     if ((lp & HBLKRM_L_MASK) == 0) {
1444         set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size);
1445         return;
1446     }
1447 
1448     lp &= HBLKRM_PENC_MASK;
1449     for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) {
1450         struct mmu_psize_def *def = &mmu_psize_defs[bpsize];
1451 
1452         for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
1453             if (def->penc[psize] == lp) {
1454                 set_hblkrm_bloc_size(bpsize, psize, block_size);
1455                 return;
1456             }
1457         }
1458     }
1459 }
1460 
1461 #define SPLPAR_TLB_BIC_TOKEN        50
1462 
1463 /*
1464  * The size of the TLB Block Invalidate Characteristics is variable. But at the
1465  * maximum it will be the number of possible page sizes *2 + 10 bytes.
1466  * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size
1467  * (128 bytes) for the buffer to get plenty of space.
1468  */
1469 #define SPLPAR_TLB_BIC_MAXLENGTH    128
1470 
1471 void __init pseries_lpar_read_hblkrm_characteristics(void)
1472 {
1473     unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
1474     int call_status, len, idx, bpsize;
1475 
1476     if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
1477         return;
1478 
1479     spin_lock(&rtas_data_buf_lock);
1480     memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
1481     call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
1482                 NULL,
1483                 SPLPAR_TLB_BIC_TOKEN,
1484                 __pa(rtas_data_buf),
1485                 RTAS_DATA_BUF_SIZE);
1486     memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
1487     local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
1488     spin_unlock(&rtas_data_buf_lock);
1489 
1490     if (call_status != 0) {
1491         pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
1492             __FILE__, __func__, call_status);
1493         return;
1494     }
1495 
1496     /*
1497      * The first two (2) bytes of the data in the buffer are the length of
1498      * the returned data, not counting these first two (2) bytes.
1499      */
1500     len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
1501     if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
1502         pr_warn("%s too large returned buffer %d", __func__, len);
1503         return;
1504     }
1505 
1506     idx = 2;
1507     while (idx < len) {
1508         u8 block_shift = local_buffer[idx++];
1509         u32 block_size;
1510         unsigned int npsize;
1511 
1512         if (!block_shift)
1513             break;
1514 
1515         block_size = 1 << block_shift;
1516 
1517         for (npsize = local_buffer[idx++];
1518              npsize > 0 && idx < len; npsize--)
1519             check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
1520                         block_size);
1521     }
1522 
1523     for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
1524         for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
1525             if (hblkrm_size[bpsize][idx])
1526                 pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d",
1527                     bpsize, idx, hblkrm_size[bpsize][idx]);
1528 }
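/*
 * As a purely hypothetical illustration of the parsing above, a returned
 * buffer beginning with the bytes
 *
 *    00 04  03 01 80  00
 *
 * would describe 4 bytes of data: one specifier with block_shift = 3
 * (a block size of 8), one supported page size, and an encoded value of 0x80
 * whose L bit is set, so its penc value (0) is looked up in mmu_psize_defs[].
 */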
1529 
1530 /*
1531  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
1532  * lock.
1533  */
1534 static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
1535 {
1536     unsigned long vpn;
1537     unsigned long i, pix, rc;
1538     unsigned long flags = 0;
1539     struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
1540     int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
1541     unsigned long param[PLPAR_HCALL9_BUFSIZE];
1542     unsigned long index, shift, slot;
1543     real_pte_t pte;
1544     int psize, ssize;
1545 
1546     if (lock_tlbie)
1547         spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
1548 
1549     if (is_supported_hlbkrm(batch->psize, batch->psize)) {
1550         do_block_remove(number, batch, param);
1551         goto out;
1552     }
1553 
1554     psize = batch->psize;
1555     ssize = batch->ssize;
1556     pix = 0;
1557     for (i = 0; i < number; i++) {
1558         vpn = batch->vpn[i];
1559         pte = batch->pte[i];
1560         pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
1561             slot = compute_slot(pte, vpn, index, shift, ssize);
1562             if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1563                 /*
1564                  * lpar doesn't use the passed actual page size
1565                  */
1566                 pSeries_lpar_hpte_invalidate(slot, vpn, psize,
1567                                  0, ssize, local);
1568             } else {
1569                 param[pix] = HBR_REQUEST | HBR_AVPN | slot;
1570                 param[pix+1] = hpte_encode_avpn(vpn, psize,
1571                                 ssize);
1572                 pix += 2;
1573                 if (pix == 8) {
1574                     rc = plpar_hcall9(H_BULK_REMOVE, param,
1575                         param[0], param[1], param[2],
1576                         param[3], param[4], param[5],
1577                         param[6], param[7]);
1578                     BUG_ON(rc != H_SUCCESS);
1579                     pix = 0;
1580                 }
1581             }
1582         } pte_iterate_hashed_end();
1583     }
1584     if (pix) {
1585         param[pix] = HBR_END;
1586         rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
1587                   param[2], param[3], param[4], param[5],
1588                   param[6], param[7]);
1589         BUG_ON(rc != H_SUCCESS);
1590     }
1591 
1592 out:
1593     if (lock_tlbie)
1594         spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
1595 }
1596 
1597 static int __init disable_bulk_remove(char *str)
1598 {
1599     if (strcmp(str, "off") == 0 &&
1600         firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1601         pr_info("Disabling BULK_REMOVE firmware feature");
1602         powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
1603     }
1604     return 1;
1605 }
1606 
1607 __setup("bulk_remove=", disable_bulk_remove);
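/*
 * For example, booting with "bulk_remove=off" on the kernel command line
 * clears FW_FEATURE_BULK_REMOVE, so the flush paths above fall back to
 * removing HPTEs one at a time through pSeries_lpar_hpte_invalidate().
 */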
1608 
1609 #define HPT_RESIZE_TIMEOUT  10000 /* ms */
1610 
1611 struct hpt_resize_state {
1612     unsigned long shift;
1613     int commit_rc;
1614 };
1615 
1616 static int pseries_lpar_resize_hpt_commit(void *data)
1617 {
1618     struct hpt_resize_state *state = data;
1619 
1620     state->commit_rc = plpar_resize_hpt_commit(0, state->shift);
1621     if (state->commit_rc != H_SUCCESS)
1622         return -EIO;
1623 
1624     /* Hypervisor has transitioned the HTAB, update our globals */
1625     ppc64_pft_size = state->shift;
1626     htab_size_bytes = 1UL << ppc64_pft_size;
1627     htab_hash_mask = (htab_size_bytes >> 7) - 1;
1628 
1629     return 0;
1630 }
1631 
1632 /*
1633  * Must be called in process context. The caller must hold the
1634  * cpus_lock.
1635  */
1636 static int pseries_lpar_resize_hpt(unsigned long shift)
1637 {
1638     struct hpt_resize_state state = {
1639         .shift = shift,
1640         .commit_rc = H_FUNCTION,
1641     };
1642     unsigned int delay, total_delay = 0;
1643     int rc;
1644     ktime_t t0, t1, t2;
1645 
1646     might_sleep();
1647 
1648     if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE))
1649         return -ENODEV;
1650 
1651     pr_info("Attempting to resize HPT to shift %lu\n", shift);
1652 
1653     t0 = ktime_get();
1654 
1655     rc = plpar_resize_hpt_prepare(0, shift);
1656     while (H_IS_LONG_BUSY(rc)) {
1657         delay = get_longbusy_msecs(rc);
1658         total_delay += delay;
1659         if (total_delay > HPT_RESIZE_TIMEOUT) {
1660             /* prepare with shift==0 cancels an in-progress resize */
1661             rc = plpar_resize_hpt_prepare(0, 0);
1662             if (rc != H_SUCCESS)
1663                 pr_warn("Unexpected error %d cancelling timed out HPT resize\n",
1664                        rc);
1665             return -ETIMEDOUT;
1666         }
1667         msleep(delay);
1668         rc = plpar_resize_hpt_prepare(0, shift);
1669     }
1670 
1671     switch (rc) {
1672     case H_SUCCESS:
1673         /* Continue on */
1674         break;
1675 
1676     case H_PARAMETER:
1677         pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n");
1678         return -EINVAL;
1679     case H_RESOURCE:
1680         pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n");
1681         return -EPERM;
1682     default:
1683         pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc);
1684         return -EIO;
1685     }
1686 
1687     t1 = ktime_get();
1688 
1689     rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
1690                      &state, NULL);
1691 
1692     t2 = ktime_get();
1693 
1694     if (rc != 0) {
1695         switch (state.commit_rc) {
1696         case H_PTEG_FULL:
1697             return -ENOSPC;
1698 
1699         default:
1700             pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n",
1701                 state.commit_rc);
1702             return -EIO;
1703         }
1704     }
1705 
1706     pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n",
1707         shift, (long long) ktime_ms_delta(t1, t0),
1708         (long long) ktime_ms_delta(t2, t1));
1709 
1710     return 0;
1711 }
1712 
1713 void __init hpte_init_pseries(void)
1714 {
1715     mmu_hash_ops.hpte_invalidate     = pSeries_lpar_hpte_invalidate;
1716     mmu_hash_ops.hpte_updatepp   = pSeries_lpar_hpte_updatepp;
1717     mmu_hash_ops.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
1718     mmu_hash_ops.hpte_insert     = pSeries_lpar_hpte_insert;
1719     mmu_hash_ops.hpte_remove     = pSeries_lpar_hpte_remove;
1720     mmu_hash_ops.hpte_removebolted   = pSeries_lpar_hpte_removebolted;
1721     mmu_hash_ops.flush_hash_range    = pSeries_lpar_flush_hash_range;
1722     mmu_hash_ops.hpte_clear_all      = pseries_hpte_clear_all;
1723     mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
1724 
1725     if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
1726         mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
1727 
1728     /*
1729      * On POWER9, we need to do an H_REGISTER_PROC_TBL hcall
1730      * to inform the hypervisor that we wish to use the HPT.
1731      */
1732     if (cpu_has_feature(CPU_FTR_ARCH_300))
1733         pseries_lpar_register_process_table(0, 0, 0);
1734 }
1735 #endif /* CONFIG_PPC_64S_HASH_MMU */
1736 
1737 #ifdef CONFIG_PPC_RADIX_MMU
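/*
 * Radix guests hand the hypervisor the physical address of their process
 * table instead of using the hash MMU setup above; the size argument is
 * passed as PRTB_SIZE_SHIFT - 12, i.e. log2 of the table size minus 12.
 */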
1738 void __init radix_init_pseries(void)
1739 {
1740     pr_info("Using radix MMU under hypervisor\n");
1741 
1742     pseries_lpar_register_process_table(__pa(process_tb),
1743                         0, PRTB_SIZE_SHIFT - 12);
1744 }
1745 #endif
1746 
1747 #ifdef CONFIG_PPC_SMLPAR
1748 #define CMO_FREE_HINT_DEFAULT 1
1749 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
1750 
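/*
 * "cmo_free_hint=" kernel parameter: "no"/"off" (case-insensitive) disables
 * free-page hinting to the hypervisor, while "yes"/"on" (or the default)
 * leaves it enabled. For example, booting with "cmo_free_hint=no" suppresses
 * the H_PAGE_INIT hints issued from arch_free_page() below.
 */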
1751 static int __init cmo_free_hint(char *str)
1752 {
1753     char *parm;
1754     parm = strstrip(str);
1755 
1756     if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
1757         pr_info("%s: CMO free page hinting is not active.\n", __func__);
1758         cmo_free_hint_flag = 0;
1759         return 1;
1760     }
1761 
1762     cmo_free_hint_flag = 1;
1763     pr_info("%s: CMO free page hinting is active.\n", __func__);
1764 
1765     if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
1766         return 1;
1767 
1768     return 0;
1769 }
1770 
1771 __setup("cmo_free_hint=", cmo_free_hint);
1772 
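/*
 * Tell the hypervisor about the state of a (possibly compound) page: the
 * 2^order kernel pages are walked PAGE_SIZE at a time, and each one is
 * reported in cmo_page_sz-sized chunks via H_PAGE_INIT, since the CMO page
 * size may be smaller than the kernel page size.
 */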
1773 static void pSeries_set_page_state(struct page *page, int order,
1774                    unsigned long state)
1775 {
1776     int i, j;
1777     unsigned long cmo_page_sz, addr;
1778 
1779     cmo_page_sz = cmo_get_page_size();
1780     addr = __pa((unsigned long)page_address(page));
1781 
1782     for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
1783         for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
1784             plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
1785     }
1786 }
1787 
1788 void arch_free_page(struct page *page, int order)
1789 {
1790     if (radix_enabled())
1791         return;
1792     if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
1793         return;
1794 
1795     pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
1796 }
1797 EXPORT_SYMBOL(arch_free_page);
1798 
1799 #endif /* CONFIG_PPC_SMLPAR */
1800 #endif /* CONFIG_PPC_BOOK3S_64 */
1801 
1802 #ifdef CONFIG_TRACEPOINTS
1803 #ifdef CONFIG_JUMP_LABEL
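/*
 * With jump labels, the hcall tracing hook is a static key: registering the
 * first tracepoint probe flips a static branch consulted on the hcall path,
 * so untraced hcalls pay essentially nothing for the check.
 */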
1804 struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;
1805 
1806 int hcall_tracepoint_regfunc(void)
1807 {
1808     static_key_slow_inc(&hcall_tracepoint_key);
1809     return 0;
1810 }
1811 
1812 void hcall_tracepoint_unregfunc(void)
1813 {
1814     static_key_slow_dec(&hcall_tracepoint_key);
1815 }
1816 #else
1817 /*
1818  * We optimise our hcall path by placing hcall_tracepoint_refcount
1819  * directly in the TOC so we can check if the hcall tracepoints are
1820  * enabled via a single load.
1821  */
1822 
1823 /* NB: reg/unreg are called while guarded with the tracepoints_mutex */
1824 extern long hcall_tracepoint_refcount;
1825 
1826 int hcall_tracepoint_regfunc(void)
1827 {
1828     hcall_tracepoint_refcount++;
1829     return 0;
1830 }
1831 
1832 void hcall_tracepoint_unregfunc(void)
1833 {
1834     hcall_tracepoint_refcount--;
1835 }
1836 #endif
1837 
1838 /*
1839  * Keep track of hcall tracing depth and prevent recursion. Warn if any
1840  * recursion is detected, because it may indicate a problem. This will not
1841  * catch all problems with tracing code making hcalls, because the tracing
1842  * might have been invoked from a non-hcall, so the first hcall could recurse
1843  * into it without warning here, but this is better than nothing.
1844  *
1845  * Hcalls with specific problems being traced should use the _notrace
1846  * plpar_hcall variants.
1847  */
1848 static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
1849 
1850 
1851 notrace void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
1852 {
1853     unsigned long flags;
1854     unsigned int *depth;
1855 
1856     local_irq_save(flags);
1857 
1858     depth = this_cpu_ptr(&hcall_trace_depth);
1859 
1860     if (WARN_ON_ONCE(*depth))
1861         goto out;
1862 
1863     (*depth)++;
1864     preempt_disable();
1865     trace_hcall_entry(opcode, args);
1866     (*depth)--;
1867 
1868 out:
1869     local_irq_restore(flags);
1870 }
1871 
1872 notrace void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf)
1873 {
1874     unsigned long flags;
1875     unsigned int *depth;
1876 
1877     local_irq_save(flags);
1878 
1879     depth = this_cpu_ptr(&hcall_trace_depth);
1880 
1881     if (*depth) /* Don't warn again on the way out */
1882         goto out;
1883 
1884     (*depth)++;
1885     trace_hcall_exit(opcode, retval, retbuf);
1886     preempt_enable();
1887     (*depth)--;
1888 
1889 out:
1890     local_irq_restore(flags);
1891 }
1892 #endif
1893 
1894 /**
1895  * h_get_mpp() - issue the H_GET_MPP hcall
1896  * @mpp_data: filled in from the seven return values of the hcall
1897  */
1898 int h_get_mpp(struct hvcall_mpp_data *mpp_data)
1899 {
1900     int rc;
1901     unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
1902 
1903     rc = plpar_hcall9(H_GET_MPP, retbuf);
1904 
1905     mpp_data->entitled_mem = retbuf[0];
1906     mpp_data->mapped_mem = retbuf[1];
1907 
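    /*
     * retbuf[2] packs the group number in bits 16-31 and the pool number in
     * the low 16 bits; retbuf[3] carries the memory weight in its top byte,
     * the unallocated weight in the next byte and the unallocated
     * entitlement in the low 48 bits.
     */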
1908     mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
1909     mpp_data->pool_num = retbuf[2] & 0xffff;
1910 
1911     mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
1912     mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
1913     mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;
1914 
1915     mpp_data->pool_size = retbuf[4];
1916     mpp_data->loan_request = retbuf[5];
1917     mpp_data->backing_mem = retbuf[6];
1918 
1919     return rc;
1920 }
1921 EXPORT_SYMBOL(h_get_mpp);
1922 
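/*
 * H_GET_MPP_X variant: only the first four return values are consumed,
 * giving the coalesced-memory byte counts and the pool's PURR/SPURR cycle
 * counts.
 */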
1923 int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
1924 {
1925     int rc;
1926     unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
1927 
1928     rc = plpar_hcall9(H_GET_MPP_X, retbuf);
1929 
1930     mpp_x_data->coalesced_bytes = retbuf[0];
1931     mpp_x_data->pool_coalesced_bytes = retbuf[1];
1932     mpp_x_data->pool_purr_cycles = retbuf[2];
1933     mpp_x_data->pool_spurr_cycles = retbuf[3];
1934 
1935     return rc;
1936 }
1937 
1938 #ifdef CONFIG_PPC_64S_HASH_MMU
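/*
 * Invert the protovsid -> vsid scrambling: given a vsid, recover the
 * protovsid by multiplying with the precomputed modular inverse of the VSID
 * multiplier, modulo the VSID modulus. Used below to work out which context
 * id backs the VRMA VSID so that it can be reserved.
 */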
1939 static unsigned long __init vsid_unscramble(unsigned long vsid, int ssize)
1940 {
1941     unsigned long protovsid;
1942     unsigned long va_bits = VA_BITS;
1943     unsigned long modinv, vsid_modulus;
1944     unsigned long max_mod_inv, tmp_modinv;
1945 
1946     if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
1947         va_bits = 65;
1948 
1949     if (ssize == MMU_SEGSIZE_256M) {
1950         modinv = VSID_MULINV_256M;
1951         vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
1952     } else {
1953         modinv = VSID_MULINV_1T;
1954         vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
1955     }
1956 
1957     /*
1958      * Bail out if the vsid is outside the range we can unscramble.
1959      */
1960     if (vsid >= vsid_modulus)
1961         return 0;
1962 
1963     /*
1964      * If modinv is the modular multiplicative inverse of (x % vsid_modulus)
1965      * and vsid = (protovsid * x) % vsid_modulus, then we say:
1966      *   protovsid = (vsid * modinv) % vsid_modulus
1967      */
1968 
1969     /* Check if (vsid * modinv) would overflow (63 bits) */
1970     max_mod_inv = 0x7fffffffffffffffull / vsid;
1971     if (modinv < max_mod_inv)
1972         return (vsid * modinv) % vsid_modulus;
1973 
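    /*
     * The product vsid * modinv would overflow 63 bits, so split modinv as
     * tmp_modinv * max_mod_inv + (modinv % max_mod_inv); each partial
     * product vsid * max_mod_inv and vsid * (modinv % max_mod_inv) then
     * fits, and the two halves are recombined modulo vsid_modulus below.
     */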
1974     tmp_modinv = modinv/max_mod_inv;
1975     modinv %= max_mod_inv;
1976 
1977     protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
1978     protovsid = (protovsid + vsid * modinv) % vsid_modulus;
1979 
1980     return protovsid;
1981 }
1982 
1983 static int __init reserve_vrma_context_id(void)
1984 {
1985     unsigned long protovsid;
1986 
1987     /*
1988      * Reserve context ids which map to reserved virtual addresses. For now
1989      * we only reserve the context id which maps to the VRMA VSID. We ignore
1990      * the addresses in "ibm,adjunct-virtual-addresses" because we don't
1991      * enable adjunct support via the "ibm,client-architecture-support"
1992      * interface.
1993      */
1994     protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
1995     hash__reserve_context_id(protovsid >> ESID_BITS_1T);
1996     return 0;
1997 }
1998 machine_device_initcall(pseries, reserve_vrma_context_id);
1999 #endif
2000 
2001 #ifdef CONFIG_DEBUG_FS
2002 /* debugfs file interface for vpa data */
2003 static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len,
2004                   loff_t *pos)
2005 {
2006     int cpu = (long)filp->private_data;
2007     struct lppaca *lppaca = &lppaca_of(cpu);
2008 
2009     return simple_read_from_buffer(buf, len, pos, lppaca,
2010                 sizeof(struct lppaca));
2011 }
2012 
2013 static const struct file_operations vpa_fops = {
2014     .open       = simple_open,
2015     .read       = vpa_file_read,
2016     .llseek     = default_llseek,
2017 };
2018 
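/*
 * Expose the raw per-CPU lppaca through debugfs: one read-only "cpu-N" file
 * per possible CPU is created under the "vpa" directory in the powerpc
 * debugfs tree (typically /sys/kernel/debug/powerpc/vpa/).
 */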
2019 static int __init vpa_debugfs_init(void)
2020 {
2021     char name[16];
2022     long i;
2023     struct dentry *vpa_dir;
2024 
2025     if (!firmware_has_feature(FW_FEATURE_SPLPAR))
2026         return 0;
2027 
2028     vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir);
2029 
2030     /* Set up the per-CPU VPA files */
2031     for_each_possible_cpu(i) {
2032         sprintf(name, "cpu-%ld", i);
2033         debugfs_create_file(name, 0400, vpa_dir, (void *)i, &vpa_fops);
2034     }
2035 
2036     return 0;
2037 }
2038 machine_arch_initcall(pseries, vpa_debugfs_init);
2039 #endif /* CONFIG_DEBUG_FS */