0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Performance event support for s390x - CPU-measurement Counter Facility
0004  *
0005  *  Copyright IBM Corp. 2012, 2021
0006  *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
0007  *         Thomas Richter <tmricht@linux.ibm.com>
0008  */
0009 #define KMSG_COMPONENT  "cpum_cf"
0010 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
0011 
0012 #include <linux/kernel.h>
0013 #include <linux/kernel_stat.h>
0014 #include <linux/percpu.h>
0015 #include <linux/notifier.h>
0016 #include <linux/init.h>
0017 #include <linux/export.h>
0018 #include <linux/miscdevice.h>
0019 
0020 #include <asm/cpu_mcf.h>
0021 #include <asm/hwctrset.h>
0022 #include <asm/debug.h>
0023 
0024 static unsigned int cfdiag_cpu_speed;   /* CPU speed for CF_DIAG trailer */
0025 static debug_info_t *cf_dbg;
0026 
0027 #define CF_DIAG_CTRSET_DEF      0xfeef  /* Counter set header mark */
0028
0029 
0030 /* Counter sets are stored as data stream in a page sized memory buffer and
0031  * exported to user space via raw data attached to the event sample data.
0032  * Each counter set starts with an eight byte header consisting of:
0033  * - a two byte eye catcher (0xfeef)
0034  * - a two byte counter set number
0035  * - a two byte counter set size (indicates the number of counters in this set)
0036  * - a two byte reserved value (must be zero) to make the header the same
0037  *   size as a counter value.
0038  * All counter values are eight bytes in size.
0039  *
0040  * All counter sets are followed by a 64 byte trailer.
0041  * The trailer consists of a:
0042  * - flag field indicating valid fields when corresponding bit set
0043  * - the counter facility first and second version number
0044  * - the CPU speed if nonzero
0045  * - the time stamp the counter sets have been collected
0046  * - the time of day (TOD) base value
0047  * - the machine type.
0048  *
0049  * The counter sets are saved when the process is prepared to be executed on a
0050  * CPU and saved again when the process is going to be removed from a CPU.
0051  * The difference of both counter sets is calculated and stored in the event
0052  * sample data area.
0053  */
0054 struct cf_ctrset_entry {    /* CPU-M CF counter set entry (8 byte) */
0055     unsigned int def:16;    /* 0-15  Data Entry Format */
0056     unsigned int set:16;    /* 16-31 Counter set identifier */
0057     unsigned int ctr:16;    /* 32-47 Number of stored counters */
0058     unsigned int res1:16;   /* 48-63 Reserved */
0059 };
0060 
0061 struct cf_trailer_entry {   /* CPU-M CF_DIAG trailer (64 byte) */
0062     /* 0 - 7 */
0063     union {
0064         struct {
0065             unsigned int clock_base:1;  /* TOD clock base set */
0066             unsigned int speed:1;       /* CPU speed set */
0067             /* Measurement alerts */
0068             unsigned int mtda:1;    /* Loss of MT ctr. data alert */
0069             unsigned int caca:1;    /* Counter auth. change alert */
0070             unsigned int lcda:1;    /* Loss of counter data alert */
0071         };
0072         unsigned long flags;    /* 0-63    All indicators */
0073     };
0074     /* 8 - 15 */
0075     unsigned int cfvn:16;           /* 64-79   Ctr First Version */
0076     unsigned int csvn:16;           /* 80-95   Ctr Second Version */
0077     unsigned int cpu_speed:32;      /* 96-127  CPU speed */
0078     /* 16 - 23 */
0079     unsigned long timestamp;        /* 128-191 Timestamp (TOD) */
0080     /* 24 - 55 */
0081     union {
0082         struct {
0083             unsigned long progusage1;
0084             unsigned long progusage2;
0085             unsigned long progusage3;
0086             unsigned long tod_base;
0087         };
0088         unsigned long progusage[4];
0089     };
0090     /* 56 - 63 */
0091     unsigned int mach_type:16;      /* Machine type */
0092     unsigned int res1:16;           /* Reserved */
0093     unsigned int res2:32;           /* Reserved */
0094 };
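
/* Editor's note: a minimal user-space sketch (not part of this file) showing
 * how the counter set data stream described above could be walked, e.g. when
 * it arrives as PERF_SAMPLE_RAW data of a CF_DIAG sample.  It assumes a
 * big-endian (s390) consumer, so the eight byte header is decoded with
 * plain shifts instead of relying on C bitfield layout.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CTRSET_EYE_CATCHER	0xfeef		/* See CF_DIAG_CTRSET_DEF */

static void walk_ctrset_stream(const unsigned char *buf, size_t len)
{
	size_t offset = 0;
	uint64_t head;

	while (offset + sizeof(head) <= len) {
		memcpy(&head, buf + offset, sizeof(head));
		if (head >> 48 != CTRSET_EYE_CATCHER)
			break;			/* Trailer reached */
		unsigned int set = (head >> 32) & 0xffff;
		unsigned int cnt = (head >> 16) & 0xffff;

		printf("counter set %u holds %u counters\n", set, cnt);
		/* Skip the header and cnt eight byte counter values */
		offset += sizeof(uint64_t) * (cnt + 1);
	}
	/* The final 64 bytes at buf + offset hold the cf_trailer_entry */
}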
0095 
0096 /* Create the trailer data at the end of a page. */
0097 static void cfdiag_trailer(struct cf_trailer_entry *te)
0098 {
0099     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0100     struct cpuid cpuid;
0101 
0102     te->cfvn = cpuhw->info.cfvn;        /* Counter version numbers */
0103     te->csvn = cpuhw->info.csvn;
0104 
0105     get_cpu_id(&cpuid);         /* Machine type */
0106     te->mach_type = cpuid.machine;
0107     te->cpu_speed = cfdiag_cpu_speed;
0108     if (te->cpu_speed)
0109         te->speed = 1;
0110     te->clock_base = 1;         /* Save clock base */
0111     te->tod_base = tod_clock_base.tod;
0112     te->timestamp = get_tod_clock_fast();
0113 }
0114 
0115 /* Read a counter set. The counter set number determines the counter set and
0116  * the CPUM-CF first and second version numbers determine the number of
0117  * available counters in each counter set.
0118  * Each counter set starts with a header containing the counter set number
0119  * and the number of eight byte counters.
0120  *
0121  * The function returns the number of bytes occupied by this counter set
0122  * including the header.
0123  * If there is no counter in the counter set, this counter set is useless
0124  * and zero is returned in this case.
0125  *
0126  * Note that the counter sets may not be enabled or active and the stcctm
0127  * instruction might return error 3. Depending on the error_ok value this is
0128  * ok, for example when called from the cpumf_pmu_start() callback function.
0129  */
0130 static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
0131                    size_t room, bool error_ok)
0132 {
0133     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0134     size_t ctrset_size, need = 0;
0135     int rc = 3;             /* Assume write failure */
0136 
0137     ctrdata->def = CF_DIAG_CTRSET_DEF;
0138     ctrdata->set = ctrset;
0139     ctrdata->res1 = 0;
0140     ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
0141 
0142     if (ctrset_size) {          /* Save data */
0143         need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
0144         if (need <= room) {
0145             rc = ctr_stcctm(ctrset, ctrset_size,
0146                     (u64 *)(ctrdata + 1));
0147         }
0148         if (rc != 3 || error_ok)
0149             ctrdata->ctr = ctrset_size;
0150         else
0151             need = 0;
0152     }
0153 
0154     debug_sprintf_event(cf_dbg, 3,
0155                 "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
0156                 " need %zd rc %d\n", __func__, ctrset, ctrset_size,
0157                 cpuhw->info.cfvn, cpuhw->info.csvn, need, rc);
0158     return need;
0159 }
0160 
0161 static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
0162     [CPUMF_CTR_SET_BASIC]   = 0x02,
0163     [CPUMF_CTR_SET_USER]    = 0x04,
0164     [CPUMF_CTR_SET_CRYPTO]  = 0x08,
0165     [CPUMF_CTR_SET_EXT] = 0x01,
0166     [CPUMF_CTR_SET_MT_DIAG] = 0x20,
0167 };
0168 
0169 /* Read out all counter sets and save them in the provided data buffer.
0170  * The last 64 bytes host an artificial trailer entry.
0171  */
0172 static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth,
0173                 bool error_ok)
0174 {
0175     struct cf_trailer_entry *trailer;
0176     size_t offset = 0, done;
0177     int i;
0178 
0179     memset(data, 0, sz);
0180     sz -= sizeof(*trailer);     /* Always room for trailer */
0181     for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
0182         struct cf_ctrset_entry *ctrdata = data + offset;
0183 
0184         if (!(auth & cpumf_ctr_ctl[i]))
0185             continue;   /* Counter set not authorized */
0186 
0187         done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok);
0188         offset += done;
0189     }
0190     trailer = data + offset;
0191     cfdiag_trailer(trailer);
0192     return offset + sizeof(*trailer);
0193 }
0194 
0195 /* Calculate the difference for each counter in a counter set. */
0196 static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters)
0197 {
0198     for (; --counters >= 0; ++pstart, ++pstop)
0199         if (*pstop >= *pstart)
0200             *pstop -= *pstart;
0201         else
0202             *pstop = *pstart - *pstop + 1;
0203 }
0204 
0205 /* Scan the counter sets and calculate the difference of each counter
0206  * in each set. The result is the increment of each counter during the
0207  * period the counter set has been activated.
0208  *
0209  * Return true on success.
0210  */
0211 static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
0212 {
0213     struct cf_trailer_entry *trailer_start, *trailer_stop;
0214     struct cf_ctrset_entry *ctrstart, *ctrstop;
0215     size_t offset = 0;
0216 
0217     auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
0218     do {
0219         ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
0220         ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);
0221 
0222         if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
0223             pr_err_once("cpum_cf_diag counter set compare error "
0224                     "in set %i\n", ctrstart->set);
0225             return 0;
0226         }
0227         auth &= ~cpumf_ctr_ctl[ctrstart->set];
0228         if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
0229             cfdiag_diffctrset((u64 *)(ctrstart + 1),
0230                       (u64 *)(ctrstop + 1), ctrstart->ctr);
0231             offset += ctrstart->ctr * sizeof(u64) +
0232                             sizeof(*ctrstart);
0233         }
0234     } while (ctrstart->def && auth);
0235 
0236     /* Save time_stamp from start of event in stop's trailer */
0237     trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);
0238     trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset);
0239     trailer_stop->progusage[0] = trailer_start->timestamp;
0240 
0241     return 1;
0242 }
0243 
0244 static enum cpumf_ctr_set get_counter_set(u64 event)
0245 {
0246     int set = CPUMF_CTR_SET_MAX;
0247 
0248     if (event < 32)
0249         set = CPUMF_CTR_SET_BASIC;
0250     else if (event < 64)
0251         set = CPUMF_CTR_SET_USER;
0252     else if (event < 128)
0253         set = CPUMF_CTR_SET_CRYPTO;
0254     else if (event < 288)
0255         set = CPUMF_CTR_SET_EXT;
0256     else if (event >= 448 && event < 496)
0257         set = CPUMF_CTR_SET_MT_DIAG;
0258 
0259     return set;
0260 }
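
/* Editor's note: for example, counter number 32 (problem-state CPU cycles,
 * see cpumf_generic_events_user[] below) falls into CPUMF_CTR_SET_USER,
 * while a number such as 300 matches none of the ranges above and yields
 * CPUMF_CTR_SET_MAX, i.e. no counter set.
 */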
0261 
0262 static int validate_ctr_version(const struct hw_perf_event *hwc,
0263                 enum cpumf_ctr_set set)
0264 {
0265     struct cpu_cf_events *cpuhw;
0266     int err = 0;
0267     u16 mtdiag_ctl;
0268 
0269     cpuhw = &get_cpu_var(cpu_cf_events);
0270 
0271     /* check required version for counter sets */
0272     switch (set) {
0273     case CPUMF_CTR_SET_BASIC:
0274     case CPUMF_CTR_SET_USER:
0275         if (cpuhw->info.cfvn < 1)
0276             err = -EOPNOTSUPP;
0277         break;
0278     case CPUMF_CTR_SET_CRYPTO:
0279         if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 &&
0280              hwc->config > 79) ||
0281             (cpuhw->info.csvn >= 6 && hwc->config > 83))
0282             err = -EOPNOTSUPP;
0283         break;
0284     case CPUMF_CTR_SET_EXT:
0285         if (cpuhw->info.csvn < 1)
0286             err = -EOPNOTSUPP;
0287         if ((cpuhw->info.csvn == 1 && hwc->config > 159) ||
0288             (cpuhw->info.csvn == 2 && hwc->config > 175) ||
0289             (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5
0290              && hwc->config > 255) ||
0291             (cpuhw->info.csvn >= 6 && hwc->config > 287))
0292             err = -EOPNOTSUPP;
0293         break;
0294     case CPUMF_CTR_SET_MT_DIAG:
0295         if (cpuhw->info.csvn <= 3)
0296             err = -EOPNOTSUPP;
0297         /*
0298          * MT-diagnostic counters are read-only.  The counter set
0299          * is automatically enabled and activated on all CPUs with
0300          * multithreading (SMT).  Deactivation of multithreading
0301          * also disables the counter set.  State changes are ignored
0302          * by lcctl().  Because Linux controls SMT enablement through
0303          * a kernel parameter only, the counter set is either disabled
0304          * or enabled and active.
0305          *
0306          * Thus, the counters can only be used if SMT is on and the
0307          * counter set is enabled and active.
0308          */
0309         mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG];
0310         if (!((cpuhw->info.auth_ctl & mtdiag_ctl) &&
0311               (cpuhw->info.enable_ctl & mtdiag_ctl) &&
0312               (cpuhw->info.act_ctl & mtdiag_ctl)))
0313             err = -EOPNOTSUPP;
0314         break;
0315     case CPUMF_CTR_SET_MAX:
0316         err = -EOPNOTSUPP;
0317     }
0318 
0319     put_cpu_var(cpu_cf_events);
0320     return err;
0321 }
0322 
0323 static int validate_ctr_auth(const struct hw_perf_event *hwc)
0324 {
0325     struct cpu_cf_events *cpuhw;
0326     int err = 0;
0327 
0328     cpuhw = &get_cpu_var(cpu_cf_events);
0329 
0330     /* Check authorization for cpu counter sets.
0331      * If the particular CPU counter set is not authorized,
0332      * return with -ENOENT in order to fall back to other
0333      * PMUs that might satisfy the event request.
0334      */
0335     if (!(hwc->config_base & cpuhw->info.auth_ctl))
0336         err = -ENOENT;
0337 
0338     put_cpu_var(cpu_cf_events);
0339     return err;
0340 }
0341 
0342 /*
0343  * Change the CPUMF state to active.
0344  * Enable and activate the CPU-counter sets according
0345  * to the per-cpu control state.
0346  */
0347 static void cpumf_pmu_enable(struct pmu *pmu)
0348 {
0349     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0350     int err;
0351 
0352     if (cpuhw->flags & PMU_F_ENABLED)
0353         return;
0354 
0355     err = lcctl(cpuhw->state | cpuhw->dev_state);
0356     if (err) {
0357         pr_err("Enabling the performance measuring unit "
0358                "failed with rc=%x\n", err);
0359         return;
0360     }
0361 
0362     cpuhw->flags |= PMU_F_ENABLED;
0363 }
0364 
0365 /*
0366  * Change the CPUMF state to inactive.
0367  * Stop the CPU-counter sets (keep them enabled, but inactive) according
0368  * to the per-cpu control state.
0369  */
0370 static void cpumf_pmu_disable(struct pmu *pmu)
0371 {
0372     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0373     int err;
0374     u64 inactive;
0375 
0376     if (!(cpuhw->flags & PMU_F_ENABLED))
0377         return;
0378 
0379     inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
0380     inactive |= cpuhw->dev_state;
0381     err = lcctl(inactive);
0382     if (err) {
0383         pr_err("Disabling the performance measuring unit "
0384                "failed with rc=%x\n", err);
0385         return;
0386     }
0387 
0388     cpuhw->flags &= ~PMU_F_ENABLED;
0389 }
0390 
0391 
0392 /* Number of perf events counting hardware events */
0393 static atomic_t num_events = ATOMIC_INIT(0);
0394 /* Used to avoid races in calling reserve/release_cpumf_hardware */
0395 static DEFINE_MUTEX(pmc_reserve_mutex);
0396 
0397 /* Release the PMU if event is the last perf event */
0398 static void hw_perf_event_destroy(struct perf_event *event)
0399 {
0400     if (!atomic_add_unless(&num_events, -1, 1)) {
0401         mutex_lock(&pmc_reserve_mutex);
0402         if (atomic_dec_return(&num_events) == 0)
0403             __kernel_cpumcf_end();
0404         mutex_unlock(&pmc_reserve_mutex);
0405     }
0406 }
0407 
0408 /* CPUMF <-> perf event mappings for kernel+userspace (basic set) */
0409 static const int cpumf_generic_events_basic[] = {
0410     [PERF_COUNT_HW_CPU_CYCLES]      = 0,
0411     [PERF_COUNT_HW_INSTRUCTIONS]        = 1,
0412     [PERF_COUNT_HW_CACHE_REFERENCES]    = -1,
0413     [PERF_COUNT_HW_CACHE_MISSES]        = -1,
0414     [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1,
0415     [PERF_COUNT_HW_BRANCH_MISSES]       = -1,
0416     [PERF_COUNT_HW_BUS_CYCLES]      = -1,
0417 };
0418 /* CPUMF <-> perf event mappings for userspace (problem-state set) */
0419 static const int cpumf_generic_events_user[] = {
0420     [PERF_COUNT_HW_CPU_CYCLES]      = 32,
0421     [PERF_COUNT_HW_INSTRUCTIONS]        = 33,
0422     [PERF_COUNT_HW_CACHE_REFERENCES]    = -1,
0423     [PERF_COUNT_HW_CACHE_MISSES]        = -1,
0424     [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1,
0425     [PERF_COUNT_HW_BRANCH_MISSES]       = -1,
0426     [PERF_COUNT_HW_BUS_CYCLES]      = -1,
0427 };
0428 
0429 static void cpumf_hw_inuse(void)
0430 {
0431     mutex_lock(&pmc_reserve_mutex);
0432     if (atomic_inc_return(&num_events) == 1)
0433         __kernel_cpumcf_begin();
0434     mutex_unlock(&pmc_reserve_mutex);
0435 }
0436 
0437 static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
0438 {
0439     struct perf_event_attr *attr = &event->attr;
0440     struct hw_perf_event *hwc = &event->hw;
0441     enum cpumf_ctr_set set;
0442     int err = 0;
0443     u64 ev;
0444 
0445     switch (type) {
0446     case PERF_TYPE_RAW:
0447         /* Raw events are used to access counters directly,
0448          * hence do not permit excludes */
0449         if (attr->exclude_kernel || attr->exclude_user ||
0450             attr->exclude_hv)
0451             return -EOPNOTSUPP;
0452         ev = attr->config;
0453         break;
0454 
0455     case PERF_TYPE_HARDWARE:
0456         if (is_sampling_event(event))   /* No sampling support */
0457             return -ENOENT;
0458         ev = attr->config;
0459         /* Count user space (problem-state) only */
0460         if (!attr->exclude_user && attr->exclude_kernel) {
0461             if (ev >= ARRAY_SIZE(cpumf_generic_events_user))
0462                 return -EOPNOTSUPP;
0463             ev = cpumf_generic_events_user[ev];
0464 
0465         /* No support for kernel space counters only */
0466         } else if (!attr->exclude_kernel && attr->exclude_user) {
0467             return -EOPNOTSUPP;
0468         } else {    /* Count user and kernel space */
0469             if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
0470                 return -EOPNOTSUPP;
0471             ev = cpumf_generic_events_basic[ev];
0472         }
0473         break;
0474 
0475     default:
0476         return -ENOENT;
0477     }
0478 
0479     if (ev == -1)
0480         return -ENOENT;
0481 
0482     if (ev > PERF_CPUM_CF_MAX_CTR)
0483         return -ENOENT;
0484 
0485     /* Obtain the counter set to which the specified counter belongs */
0486     set = get_counter_set(ev);
0487     switch (set) {
0488     case CPUMF_CTR_SET_BASIC:
0489     case CPUMF_CTR_SET_USER:
0490     case CPUMF_CTR_SET_CRYPTO:
0491     case CPUMF_CTR_SET_EXT:
0492     case CPUMF_CTR_SET_MT_DIAG:
0493         /*
0494          * Use the hardware perf event structure to store the
0495          * counter number in the 'config' member and the counter
0496          * set number in the 'config_base' as bit mask.
0497          * It is later used to enable/disable the counter(s).
0498          */
0499         hwc->config = ev;
0500         hwc->config_base = cpumf_ctr_ctl[set];
0501         break;
0502     case CPUMF_CTR_SET_MAX:
0503         /* The counter could not be associated to a counter set */
0504         return -EINVAL;
0505     }
0506 
0507     /* Initialize for using the CPU-measurement counter facility */
0508     cpumf_hw_inuse();
0509     event->destroy = hw_perf_event_destroy;
0510 
0511     /* Finally, validate version and authorization of the counter set */
0512     err = validate_ctr_auth(hwc);
0513     if (!err)
0514         err = validate_ctr_version(hwc, set);
0515 
0516     return err;
0517 }
0518 
0519 /* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different
0520  * attribute::type values:
0521  * - PERF_TYPE_HARDWARE: using the generic hardware event number
0522  * - pmu->type: using the raw counter number of this PMU
0523  * Handle both types of invocation identically. They address the same
0524  * hardware.  The result is different when event modifiers exclude_kernel
0525  * and/or exclude_user are also set.
0526  */
0527 static int cpumf_pmu_event_type(struct perf_event *event)
0528 {
0529     u64 ev = event->attr.config;
0530 
0531     if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev ||
0532         cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev ||
0533         cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev ||
0534         cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev)
0535         return PERF_TYPE_HARDWARE;
0536     return PERF_TYPE_RAW;
0537 }
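
/* Editor's note: an illustrative user-space sketch (not part of this file)
 * of the two submission variants handled above.  Both open basic counter 0
 * (CPU cycles); the dynamic PMU type for variant 2 is assumed to have been
 * read from /sys/bus/event_source/devices/cpum_cf/type.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_cycles(unsigned int pmu_type)
{
	struct perf_event_attr attr = {
		.size = sizeof(attr),
		.type = PERF_TYPE_HARDWARE,	/* Variant 1: generic event */
		.config = PERF_COUNT_HW_CPU_CYCLES,
	};

	if (pmu_type) {			/* Variant 2: dynamic PMU type */
		attr.type = pmu_type;
		attr.config = 0;	/* Raw basic counter number 0 */
	}
	/* Count the calling task on any CPU; no group leader, no flags */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}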
0538 
0539 static int cpumf_pmu_event_init(struct perf_event *event)
0540 {
0541     unsigned int type = event->attr.type;
0542     int err;
0543 
0544     if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW)
0545         err = __hw_perf_event_init(event, type);
0546     else if (event->pmu->type == type)
0547         /* Registered as unknown PMU */
0548         err = __hw_perf_event_init(event, cpumf_pmu_event_type(event));
0549     else
0550         return -ENOENT;
0551 
0552     if (unlikely(err) && event->destroy)
0553         event->destroy(event);
0554 
0555     return err;
0556 }
0557 
0558 static int hw_perf_event_reset(struct perf_event *event)
0559 {
0560     u64 prev, new;
0561     int err;
0562 
0563     do {
0564         prev = local64_read(&event->hw.prev_count);
0565         err = ecctr(event->hw.config, &new);
0566         if (err) {
0567             if (err != 3)
0568                 break;
0569             /* The counter is not (yet) available. This
0570              * might happen if the counter set to which
0571              * this counter belongs is in the disabled
0572              * state.
0573              */
0574             new = 0;
0575         }
0576     } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev);
0577 
0578     return err;
0579 }
0580 
0581 static void hw_perf_event_update(struct perf_event *event)
0582 {
0583     u64 prev, new, delta;
0584     int err;
0585 
0586     do {
0587         prev = local64_read(&event->hw.prev_count);
0588         err = ecctr(event->hw.config, &new);
0589         if (err)
0590             return;
0591     } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev);
0592 
0593     delta = (prev <= new) ? new - prev
0594                   : (-1ULL - prev) + new + 1;    /* overflow */
0595     local64_add(delta, &event->count);
0596 }
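
/* Editor's note: a worked example of the wraparound handling above for a
 * 64-bit counter: with prev = 0xfffffffffffffffe and new = 1, prev > new,
 * so delta = (-1ULL - prev) + new + 1 = 1 + 1 + 1 = 3, i.e. the counter
 * advanced three ticks across the wrap.
 */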
0597 
0598 static void cpumf_pmu_read(struct perf_event *event)
0599 {
0600     if (event->hw.state & PERF_HES_STOPPED)
0601         return;
0602 
0603     hw_perf_event_update(event);
0604 }
0605 
0606 static void cpumf_pmu_start(struct perf_event *event, int flags)
0607 {
0608     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0609     struct hw_perf_event *hwc = &event->hw;
0610     int i;
0611 
0612     if (!(hwc->state & PERF_HES_STOPPED))
0613         return;
0614 
0615     hwc->state = 0;
0616 
0617     /* (Re-)enable and activate the counter set */
0618     ctr_set_enable(&cpuhw->state, hwc->config_base);
0619     ctr_set_start(&cpuhw->state, hwc->config_base);
0620 
0621     /* The counter set to which this counter belongs may already be active.
0622      * Because all counters in a set are active, the event->hw.prev_count
0623      * needs to be synchronized.  At this point, the counter set can be in
0624      * the inactive or disabled state.
0625      */
0626     if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
0627         cpuhw->usedss = cfdiag_getctr(cpuhw->start,
0628                           sizeof(cpuhw->start),
0629                           hwc->config_base, true);
0630     } else {
0631         hw_perf_event_reset(event);
0632     }
0633 
0634     /* Increment refcount for counter sets */
0635     for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
0636         if ((hwc->config_base & cpumf_ctr_ctl[i]))
0637             atomic_inc(&cpuhw->ctr_set[i]);
0638 }
0639 
0640 /* Create perf event sample with the counter sets as raw data.  The sample
0641  * is then pushed to the event subsystem and the function checks for
0642  * possible event overflows. If an event overflow occurs, the PMU is
0643  * stopped.
0644  *
0645  * Return non-zero if an event overflow occurred.
0646  */
0647 static int cfdiag_push_sample(struct perf_event *event,
0648                   struct cpu_cf_events *cpuhw)
0649 {
0650     struct perf_sample_data data;
0651     struct perf_raw_record raw;
0652     struct pt_regs regs;
0653     int overflow;
0654 
0655     /* Setup perf sample */
0656     perf_sample_data_init(&data, 0, event->hw.last_period);
0657     memset(&regs, 0, sizeof(regs));
0658     memset(&raw, 0, sizeof(raw));
0659 
0660     if (event->attr.sample_type & PERF_SAMPLE_CPU)
0661         data.cpu_entry.cpu = event->cpu;
0662     if (event->attr.sample_type & PERF_SAMPLE_RAW) {
0663         raw.frag.size = cpuhw->usedss;
0664         raw.frag.data = cpuhw->stop;
0665         raw.size = raw.frag.size;
0666         data.raw = &raw;
0667     }
0668 
0669     overflow = perf_event_overflow(event, &data, &regs);
0670     debug_sprintf_event(cf_dbg, 3,
0671                 "%s event %#llx sample_type %#llx raw %d ov %d\n",
0672                 __func__, event->hw.config,
0673                 event->attr.sample_type, raw.size, overflow);
0674     if (overflow)
0675         event->pmu->stop(event, 0);
0676 
0677     perf_event_update_userpage(event);
0678     return overflow;
0679 }
0680 
0681 static void cpumf_pmu_stop(struct perf_event *event, int flags)
0682 {
0683     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0684     struct hw_perf_event *hwc = &event->hw;
0685     int i;
0686 
0687     if (!(hwc->state & PERF_HES_STOPPED)) {
0688         /* Decrement reference count for this counter set and if this
0689          * is the last used counter in the set, clear activation
0690          * control and set the counter set state to inactive.
0691          */
0692         for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
0693             if (!(hwc->config_base & cpumf_ctr_ctl[i]))
0694                 continue;
0695             if (!atomic_dec_return(&cpuhw->ctr_set[i]))
0696                 ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]);
0697         }
0698         hwc->state |= PERF_HES_STOPPED;
0699     }
0700 
0701     if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
0702         if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
0703             local64_inc(&event->count);
0704             cpuhw->usedss = cfdiag_getctr(cpuhw->stop,
0705                               sizeof(cpuhw->stop),
0706                               event->hw.config_base,
0707                               false);
0708             if (cfdiag_diffctr(cpuhw, event->hw.config_base))
0709                 cfdiag_push_sample(event, cpuhw);
0710         } else if (cpuhw->flags & PMU_F_RESERVED) {
0711             /* Only update when PMU not hotplugged off */
0712             hw_perf_event_update(event);
0713         }
0714         hwc->state |= PERF_HES_UPTODATE;
0715     }
0716 }
0717 
0718 static int cpumf_pmu_add(struct perf_event *event, int flags)
0719 {
0720     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0721 
0722     ctr_set_enable(&cpuhw->state, event->hw.config_base);
0723     event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
0724 
0725     if (flags & PERF_EF_START)
0726         cpumf_pmu_start(event, PERF_EF_RELOAD);
0727 
0728     return 0;
0729 }
0730 
0731 static void cpumf_pmu_del(struct perf_event *event, int flags)
0732 {
0733     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0734     int i;
0735 
0736     cpumf_pmu_stop(event, PERF_EF_UPDATE);
0737 
0738     /* Check if any counter in the counter set is still used.  If not used,
0739      * change the counter set to the disabled state.  This also clears the
0740      * content of all counters in the set.
0741      *
0742      * When a new perf event has been added but not yet started, this can
0743      * clear enable control and reset all counters in a set.  Therefore,
0744      * cpumf_pmu_start() always has to reenable a counter set.
0745      */
0746     for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
0747         if (!atomic_read(&cpuhw->ctr_set[i]))
0748             ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]);
0749 }
0750 
0751 /* Performance monitoring unit for s390x */
0752 static struct pmu cpumf_pmu = {
0753     .task_ctx_nr  = perf_sw_context,
0754     .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
0755     .pmu_enable   = cpumf_pmu_enable,
0756     .pmu_disable  = cpumf_pmu_disable,
0757     .event_init   = cpumf_pmu_event_init,
0758     .add          = cpumf_pmu_add,
0759     .del          = cpumf_pmu_del,
0760     .start        = cpumf_pmu_start,
0761     .stop         = cpumf_pmu_stop,
0762     .read         = cpumf_pmu_read,
0763 };
0764 
0765 static int cfset_init(void);
0766 static int __init cpumf_pmu_init(void)
0767 {
0768     int rc;
0769 
0770     if (!kernel_cpumcf_avail())
0771         return -ENODEV;
0772 
0773     /* Setup s390dbf facility */
0774     cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
0775     if (!cf_dbg) {
0776         pr_err("Registration of s390dbf(cpum_cf) failed\n");
0777         return -ENOMEM;
0778     }
0779     debug_register_view(cf_dbg, &debug_sprintf_view);
0780 
0781     cpumf_pmu.attr_groups = cpumf_cf_event_group();
0782     rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
0783     if (rc) {
0784         debug_unregister_view(cf_dbg, &debug_sprintf_view);
0785         debug_unregister(cf_dbg);
0786         pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
0787     } else if (stccm_avail()) { /* Setup counter set device */
0788         cfset_init();
0789     }
0790     return rc;
0791 }
0792 
0793 /* Support for the CPU Measurement Facility counter set extraction using
0794  * device /dev/hwctr. This allows user space programs to extract complete
0795  * counter sets via normal file operations.
0796  */
0797 
0798 static atomic_t cfset_opencnt = ATOMIC_INIT(0);     /* Access count */
0799 static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */
0800 struct cfset_call_on_cpu_parm {     /* Parm struct for smp_call_on_cpu */
0801     unsigned int sets;      /* Counter set bit mask */
0802     atomic_t cpus_ack;      /* # CPUs successfully executed func */
0803 };
0804 
0805 static struct cfset_session {       /* CPUs and counter set bit mask */
0806     struct list_head head;      /* Head of list of active processes */
0807 } cfset_session = {
0808     .head = LIST_HEAD_INIT(cfset_session.head)
0809 };
0810 
0811 struct cfset_request {          /* CPUs and counter set bit mask */
0812     unsigned long ctrset;       /* Bit mask of counter set to read */
0813     cpumask_t mask;         /* CPU mask to read from */
0814     struct list_head node;      /* Chain to cfset_session.head */
0815 };
0816 
0817 static void cfset_session_init(void)
0818 {
0819     INIT_LIST_HEAD(&cfset_session.head);
0820 }
0821 
0822 /* Remove current request from global bookkeeping. Maintain a counter set bit
0823  * mask on a per CPU basis.
0824  * Done in process context under mutex protection.
0825  */
0826 static void cfset_session_del(struct cfset_request *p)
0827 {
0828     list_del(&p->node);
0829 }
0830 
0831 /* Add current request to global bookkeeping. Maintain a counter set bit mask
0832  * on a per CPU basis.
0833  * Done in process context under mutex protection.
0834  */
0835 static void cfset_session_add(struct cfset_request *p)
0836 {
0837     list_add(&p->node, &cfset_session.head);
0838 }
0839 
0840 /* The /dev/hwctr device access uses PMU_F_IN_USE to mark that the device
0841  * access path is currently in use.
0842  * The cpu_cf_events::dev_state is used to denote counter sets in use by this
0843  * interface. It is always or'ed in. If this interface is not active, its
0844  * value is zero and no additional counter sets will be included.
0845  *
0846  * The cpu_cf_events::state is used by the perf_event_open SVC and remains
0847  * unchanged.
0848  *
0849  * perf_pmu_enable() and perf_pmu_disable() and their callbacks
0850  * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
0851  * performance measurement subsystem to enable the per process
0852  * CPU Measurement counter facility.
0853  * The XXX_enable() and XXX_disable() functions are used to turn off
0854  * the x86 performance monitoring interrupt (PMI) during scheduling.
0855  * s390 uses these calls to temporarily stop and resume the active CPU
0856  * counter sets during scheduling.
0857  *
0858  * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr
0859  * device access.  The perf_event_open() SVC interface makes a lot of effort
0860  * to only run the counters while the calling process is actively scheduled
0861  * to run.
0862  * When /dev/hwctr interface is also used at the same time, the counter sets
0863  * will keep running, even when the process is scheduled off a CPU.
0864  * However this is not a problem and does not lead to wrong counter values
0865  * for the perf_event_open() SVC. The current counter value will be recorded
0866  * during schedule-in. At schedule-out time the current counter value is
0867  * extracted again and the delta is calculated and added to the event.
0868  */
0869 /* Stop all counter sets via ioctl interface */
0870 static void cfset_ioctl_off(void *parm)
0871 {
0872     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0873     struct cfset_call_on_cpu_parm *p = parm;
0874     int rc;
0875 
0876     /* Check if any counter set is used by /dev/hwctr */
0877     for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
0878         if ((p->sets & cpumf_ctr_ctl[rc])) {
0879             if (!atomic_dec_return(&cpuhw->ctr_set[rc])) {
0880                 ctr_set_disable(&cpuhw->dev_state,
0881                         cpumf_ctr_ctl[rc]);
0882                 ctr_set_stop(&cpuhw->dev_state,
0883                          cpumf_ctr_ctl[rc]);
0884             }
0885         }
0886     /* Keep perf_event_open counter sets */
0887     rc = lcctl(cpuhw->dev_state | cpuhw->state);
0888     if (rc)
0889         pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
0890                cpuhw->state, S390_HWCTR_DEVICE, rc);
0891     if (!cpuhw->dev_state)
0892         cpuhw->flags &= ~PMU_F_IN_USE;
0893     debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
0894                 __func__, rc, cpuhw->state, cpuhw->dev_state);
0895 }
0896 
0897 /* Start counter sets on particular CPU */
0898 static void cfset_ioctl_on(void *parm)
0899 {
0900     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0901     struct cfset_call_on_cpu_parm *p = parm;
0902     int rc;
0903 
0904     cpuhw->flags |= PMU_F_IN_USE;
0905     ctr_set_enable(&cpuhw->dev_state, p->sets);
0906     ctr_set_start(&cpuhw->dev_state, p->sets);
0907     for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
0908         if ((p->sets & cpumf_ctr_ctl[rc]))
0909             atomic_inc(&cpuhw->ctr_set[rc]);
0910     rc = lcctl(cpuhw->dev_state | cpuhw->state);    /* Start counter sets */
0911     if (!rc)
0912         atomic_inc(&p->cpus_ack);
0913     else
0914         pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n",
0915                cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc);
0916     debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
0917                 __func__, rc, cpuhw->state, cpuhw->dev_state);
0918 }
0919 
0920 static void cfset_release_cpu(void *p)
0921 {
0922     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
0923     int rc;
0924 
0925     debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n",
0926                 __func__, cpuhw->state, cpuhw->dev_state);
0927     cpuhw->dev_state = 0;
0928     rc = lcctl(cpuhw->state);   /* Keep perf_event_open counter sets */
0929     if (rc)
0930         pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
0931                cpuhw->state, S390_HWCTR_DEVICE, rc);
0932 }
0933 
0934 /* This modifies the process CPU mask to adapt it to the currently online
0935  * CPUs. Offline CPUs can not be addressed. This call terminates the access
0936  * and is usually followed by close() or a new ioctl(..., START, ...) which
0937  * creates a new request structure.
0938  */
0939 static void cfset_all_stop(struct cfset_request *req)
0940 {
0941     struct cfset_call_on_cpu_parm p = {
0942         .sets = req->ctrset,
0943     };
0944 
0945     cpumask_and(&req->mask, &req->mask, cpu_online_mask);
0946     on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1);
0947 }
0948 
0949 /* The release function is also called when the application is terminated
0950  * without doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
0951  */
0952 static int cfset_release(struct inode *inode, struct file *file)
0953 {
0954     mutex_lock(&cfset_ctrset_mutex);
0955     /* Open followed by close/exit has no private_data */
0956     if (file->private_data) {
0957         cfset_all_stop(file->private_data);
0958         cfset_session_del(file->private_data);
0959         kfree(file->private_data);
0960         file->private_data = NULL;
0961     }
0962     if (!atomic_dec_return(&cfset_opencnt))
0963         on_each_cpu(cfset_release_cpu, NULL, 1);
0964     mutex_unlock(&cfset_ctrset_mutex);
0965 
0966     hw_perf_event_destroy(NULL);
0967     return 0;
0968 }
0969 
0970 static int cfset_open(struct inode *inode, struct file *file)
0971 {
0972     if (!capable(CAP_SYS_ADMIN))
0973         return -EPERM;
0974     mutex_lock(&cfset_ctrset_mutex);
0975     if (atomic_inc_return(&cfset_opencnt) == 1)
0976         cfset_session_init();
0977     mutex_unlock(&cfset_ctrset_mutex);
0978 
0979     cpumf_hw_inuse();
0980     file->private_data = NULL;
0981     /* nonseekable_open() never fails */
0982     return nonseekable_open(inode, file);
0983 }
0984 
0985 static int cfset_all_start(struct cfset_request *req)
0986 {
0987     struct cfset_call_on_cpu_parm p = {
0988         .sets = req->ctrset,
0989         .cpus_ack = ATOMIC_INIT(0),
0990     };
0991     cpumask_var_t mask;
0992     int rc = 0;
0993 
0994     if (!alloc_cpumask_var(&mask, GFP_KERNEL))
0995         return -ENOMEM;
0996     cpumask_and(mask, &req->mask, cpu_online_mask);
0997     on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1);
0998     if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
0999         on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
1000         rc = -EIO;
1001         debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__);
1002     }
1003     free_cpumask_var(mask);
1004     return rc;
1005 }
1006 
1007 
1008 /* Return the maximum required space for all possible CPUs in case one
1009  * CPU will be onlined during the START, READ, STOP cycles.
1010  * To find out the size of the counter sets, any one CPU will do. They
1011  * all have the same counter sets.
1012  */
1013 static size_t cfset_needspace(unsigned int sets)
1014 {
1015     struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
1016     size_t bytes = 0;
1017     int i;
1018 
1019     for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
1020         if (!(sets & cpumf_ctr_ctl[i]))
1021             continue;
1022         bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
1023              sizeof(((struct s390_ctrset_setdata *)0)->set) +
1024              sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
1025     }
1026     bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
1027         (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
1028              sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
1029     put_cpu_ptr(&cpu_cf_events);
1030     return bytes;
1031 }
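
/* Editor's note: a worked example of the calculation above, assuming a
 * hypothetical machine where the basic counter set holds six counters and
 * nr_cpu_ids == 8.  Requesting only the basic set needs per CPU:
 * 6 * 8 bytes of counter values + 8 bytes (set/no_cnts) + 8 bytes
 * (cpu_nr/no_sets) = 64 bytes; in total 8 (no_cpus) + 8 * 64 = 520 bytes.
 */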
1032 
1033 static int cfset_all_copy(unsigned long arg, cpumask_t *mask)
1034 {
1035     struct s390_ctrset_read __user *ctrset_read;
1036     unsigned int cpu, cpus, rc;
1037     void __user *uptr;
1038 
1039     ctrset_read = (struct s390_ctrset_read __user *)arg;
1040     uptr = ctrset_read->data;
1041     for_each_cpu(cpu, mask) {
1042         struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu);
1043         struct s390_ctrset_cpudata __user *ctrset_cpudata;
1044 
1045         ctrset_cpudata = uptr;
1046         rc  = put_user(cpu, &ctrset_cpudata->cpu_nr);
1047         rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets);
1048         rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data,
1049                    cpuhw->used);
1050         if (rc)
1051             return -EFAULT;
1052         uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used;
1053         cond_resched();
1054     }
1055     cpus = cpumask_weight(mask);
1056     if (put_user(cpus, &ctrset_read->no_cpus))
1057         return -EFAULT;
1058     debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__,
1059                 uptr - (void __user *)ctrset_read->data);
1060     return 0;
1061 }
1062 
1063 static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
1064                 int ctrset_size, size_t room)
1065 {
1066     size_t need = 0;
1067     int rc = -1;
1068 
1069     need = sizeof(*p) + sizeof(u64) * ctrset_size;
1070     if (need <= room) {
1071         p->set = cpumf_ctr_ctl[ctrset];
1072         p->no_cnts = ctrset_size;
1073         rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
1074         if (rc == 3)        /* Nothing stored */
1075             need = 0;
1076     }
1077     return need;
1078 }
1079 
1080 /* Read all counter sets. */
1081 static void cfset_cpu_read(void *parm)
1082 {
1083     struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
1084     struct cfset_call_on_cpu_parm *p = parm;
1085     int set, set_size;
1086     size_t space;
1087 
1088     /* No data saved yet */
1089     cpuhw->used = 0;
1090     cpuhw->sets = 0;
1091     memset(cpuhw->data, 0, sizeof(cpuhw->data));
1092 
1093     /* Scan the counter sets */
1094     for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
1095         struct s390_ctrset_setdata *sp = (void *)cpuhw->data +
1096                          cpuhw->used;
1097 
1098         if (!(p->sets & cpumf_ctr_ctl[set]))
1099             continue;   /* Counter set not in list */
1100         set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
1101         space = sizeof(cpuhw->data) - cpuhw->used;
1102         space = cfset_cpuset_read(sp, set, set_size, space);
1103         if (space) {
1104             cpuhw->used += space;
1105             cpuhw->sets += 1;
1106         }
1107     }
1108     debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__,
1109                 cpuhw->sets, cpuhw->used);
1110 }
1111 
1112 static int cfset_all_read(unsigned long arg, struct cfset_request *req)
1113 {
1114     struct cfset_call_on_cpu_parm p;
1115     cpumask_var_t mask;
1116     int rc;
1117 
1118     if (!alloc_cpumask_var(&mask, GFP_KERNEL))
1119         return -ENOMEM;
1120 
1121     p.sets = req->ctrset;
1122     cpumask_and(mask, &req->mask, cpu_online_mask);
1123     on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
1124     rc = cfset_all_copy(arg, mask);
1125     free_cpumask_var(mask);
1126     return rc;
1127 }
1128 
1129 static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req)
1130 {
1131     struct s390_ctrset_read read;
1132     int ret = -ENODATA;
1133 
1134     if (req && req->ctrset) {
1135         if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
1136             return -EFAULT;
1137         ret = cfset_all_read(arg, req);
1138     }
1139     return ret;
1140 }
1141 
1142 static long cfset_ioctl_stop(struct file *file)
1143 {
1144     struct cfset_request *req = file->private_data;
1145     int ret = -ENXIO;
1146 
1147     if (req) {
1148         cfset_all_stop(req);
1149         cfset_session_del(req);
1150         kfree(req);
1151         file->private_data = NULL;
1152         ret = 0;
1153     }
1154     return ret;
1155 }
1156 
1157 static long cfset_ioctl_start(unsigned long arg, struct file *file)
1158 {
1159     struct s390_ctrset_start __user *ustart;
1160     struct s390_ctrset_start start;
1161     struct cfset_request *preq;
1162     void __user *umask;
1163     unsigned int len;
1164     int ret = 0;
1165     size_t need;
1166 
1167     if (file->private_data)
1168         return -EBUSY;
1169     ustart = (struct s390_ctrset_start __user *)arg;
1170     if (copy_from_user(&start, ustart, sizeof(start)))
1171         return -EFAULT;
1172     if (start.version != S390_HWCTR_START_VERSION)
1173         return -EINVAL;
1174     if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
1175                    cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
1176                    cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
1177                    cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
1178                    cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
1179         return -EINVAL;     /* Invalid counter set */
1180     if (!start.counter_sets)
1181         return -EINVAL;     /* No counter set at all? */
1182 
1183     preq = kzalloc(sizeof(*preq), GFP_KERNEL);
1184     if (!preq)
1185         return -ENOMEM;
1186     cpumask_clear(&preq->mask);
1187     len = min_t(u64, start.cpumask_len, cpumask_size());
1188     umask = (void __user *)start.cpumask;
1189     if (copy_from_user(&preq->mask, umask, len)) {
1190         kfree(preq);
1191         return -EFAULT;
1192     }
1193     if (cpumask_empty(&preq->mask)) {
1194         kfree(preq);
1195         return -EINVAL;
1196     }
1197     need = cfset_needspace(start.counter_sets);
1198     if (put_user(need, &ustart->data_bytes)) {
1199         kfree(preq);
1200         return -EFAULT;
1201     }
1202     preq->ctrset = start.counter_sets;
1203     ret = cfset_all_start(preq);
1204     if (!ret) {
1205         cfset_session_add(preq);
1206         file->private_data = preq;
1207         debug_sprintf_event(cf_dbg, 4, "%s set %#lx need %ld ret %d\n",
1208                     __func__, preq->ctrset, need, ret);
1209     } else {
1210         kfree(preq);
1211     }
1212     return ret;
1213 }
1214 
1215 /* Entry point to the /dev/hwctr device interface.
1216  * The ioctl system call supports three subcommands:
1217  * S390_HWCTR_START: Start the specified counter sets on a CPU list. The
1218  *    counter set keeps running until explicitly stopped. Returns the number
1219  *    of bytes needed to store the counter values. If another S390_HWCTR_START
1220  *    ioctl subcommand is called without a previous S390_HWCTR_STOP stop
1221  *    command on the same file descriptor, -EBUSY is returned.
1222  * S390_HWCTR_READ: Read the counter set values from the CPU list given
1223  *    with the S390_HWCTR_START command.
1224  * S390_HWCTR_STOP: Stop the counter sets on the CPU list given with the
1225  *    previous S390_HWCTR_START subcommand.
1226  */
1227 static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1228 {
1229     int ret;
1230 
1231     cpus_read_lock();
1232     mutex_lock(&cfset_ctrset_mutex);
1233     switch (cmd) {
1234     case S390_HWCTR_START:
1235         ret = cfset_ioctl_start(arg, file);
1236         break;
1237     case S390_HWCTR_STOP:
1238         ret = cfset_ioctl_stop(file);
1239         break;
1240     case S390_HWCTR_READ:
1241         ret = cfset_ioctl_read(arg, file->private_data);
1242         break;
1243     default:
1244         ret = -ENOTTY;
1245         break;
1246     }
1247     mutex_unlock(&cfset_ctrset_mutex);
1248     cpus_read_unlock();
1249     return ret;
1250 }
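
/* Editor's note: an illustrative user-space sketch (not part of this file)
 * of the START/READ/STOP cycle described above.  It relies on the uapi
 * definitions from asm/hwctrset.h and reads only the basic counter set
 * (bit 0x02, see cpumf_ctr_ctl[]) on CPU 0; error handling is minimal.
 */
#include <asm/hwctrset.h>
#include <sys/ioctl.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

static int read_basic_set(void)
{
	__u64 cpu0_mask = 1;			/* Bit 0: CPU 0 */
	struct s390_ctrset_start start = {
		.version = S390_HWCTR_START_VERSION,
		.cpumask_len = sizeof(cpu0_mask),
		.cpumask = &cpu0_mask,
		.counter_sets = 0x02,		/* Basic counter set */
	};
	struct s390_ctrset_read *buf;
	int fd, rc;

	fd = open("/dev/" S390_HWCTR_DEVICE, O_RDWR);
	if (fd < 0)
		return -1;
	rc = ioctl(fd, S390_HWCTR_START, &start);
	if (rc)
		goto out;
	buf = malloc(start.data_bytes);		/* Size returned by START */
	if (buf) {
		rc = ioctl(fd, S390_HWCTR_READ, buf);
		/* ... walk buf->no_cpus blocks of s390_ctrset_cpudata ... */
		free(buf);
	}
	ioctl(fd, S390_HWCTR_STOP);		/* Terminate the session */
out:
	close(fd);
	return rc;
}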
1251 
1252 static const struct file_operations cfset_fops = {
1253     .owner = THIS_MODULE,
1254     .open = cfset_open,
1255     .release = cfset_release,
1256     .unlocked_ioctl = cfset_ioctl,
1257     .compat_ioctl = cfset_ioctl,
1258     .llseek = no_llseek
1259 };
1260 
1261 static struct miscdevice cfset_dev = {
1262     .name   = S390_HWCTR_DEVICE,
1263     .minor  = MISC_DYNAMIC_MINOR,
1264     .fops   = &cfset_fops,
1265 };
1266 
1267 /* Hotplug add of a CPU. Scan through all active processes and add
1268  * that CPU to the list of CPUs supplied with ioctl(..., START, ...).
1269  */
1270 int cfset_online_cpu(unsigned int cpu)
1271 {
1272     struct cfset_call_on_cpu_parm p;
1273     struct cfset_request *rp;
1274 
1275     mutex_lock(&cfset_ctrset_mutex);
1276     if (!list_empty(&cfset_session.head)) {
1277         list_for_each_entry(rp, &cfset_session.head, node) {
1278             p.sets = rp->ctrset;
1279             cfset_ioctl_on(&p);
1280             cpumask_set_cpu(cpu, &rp->mask);
1281         }
1282     }
1283     mutex_unlock(&cfset_ctrset_mutex);
1284     return 0;
1285 }
1286 
1287 /* Hotplug remove of a CPU. Scan through all active processes and clear
1288  * that CPU from the list of CPUs supplied with ioctl(..., START, ...).
1289  */
1290 int cfset_offline_cpu(unsigned int cpu)
1291 {
1292     struct cfset_call_on_cpu_parm p;
1293     struct cfset_request *rp;
1294 
1295     mutex_lock(&cfset_ctrset_mutex);
1296     if (!list_empty(&cfset_session.head)) {
1297         list_for_each_entry(rp, &cfset_session.head, node) {
1298             p.sets = rp->ctrset;
1299             cfset_ioctl_off(&p);
1300             cpumask_clear_cpu(cpu, &rp->mask);
1301         }
1302     }
1303     mutex_unlock(&cfset_ctrset_mutex);
1304     return 0;
1305 }
1306 
1307 static void cfdiag_read(struct perf_event *event)
1308 {
1309     debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__,
1310                 event->attr.config, local64_read(&event->count));
1311 }
1312 
1313 static int get_authctrsets(void)
1314 {
1315     struct cpu_cf_events *cpuhw;
1316     unsigned long auth = 0;
1317     enum cpumf_ctr_set i;
1318 
1319     cpuhw = &get_cpu_var(cpu_cf_events);
1320     for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
1321         if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
1322             auth |= cpumf_ctr_ctl[i];
1323     }
1324     put_cpu_var(cpu_cf_events);
1325     return auth;
1326 }
1327 
1328 /* Setup the event. Test for authorized counter sets and only include counter
1329  * sets which are authorized at the time of the setup. Including unauthorized
1330  * counter sets results in a specification exception (and panic).
1331  */
1332 static int cfdiag_event_init2(struct perf_event *event)
1333 {
1334     struct perf_event_attr *attr = &event->attr;
1335     int err = 0;
1336 
1337     /* Set sample_period to indicate sampling */
1338     event->hw.config = attr->config;
1339     event->hw.sample_period = attr->sample_period;
1340     local64_set(&event->hw.period_left, event->hw.sample_period);
1341     local64_set(&event->count, 0);
1342     event->hw.last_period = event->hw.sample_period;
1343 
1344     /* Add all authorized counter sets to config_base. The
1345      * hardware init function is either called per-cpu or just once
1346      * for all CPUs (event->cpu == -1).  This depends on whether
1347      * counting is started for all CPUs or on a per-workload basis where
1348      * the perf event moves from one CPU to another CPU.
1349      * Checking the authorization on any CPU is fine as the hardware
1350      * applies the same authorization settings to all CPUs.
1351      */
1352     event->hw.config_base = get_authctrsets();
1353 
1354     /* No authorized counter sets, nothing to count/sample */
1355     if (!event->hw.config_base)
1356         err = -EINVAL;
1357 
1358     debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n",
1359                 __func__, err, event->hw.config_base);
1360     return err;
1361 }
1362 
1363 static int cfdiag_event_init(struct perf_event *event)
1364 {
1365     struct perf_event_attr *attr = &event->attr;
1366     int err = -ENOENT;
1367 
1368     if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
1369         event->attr.type != event->pmu->type)
1370         goto out;
1371 
1372     /* Raw events are used to access counters directly,
1373      * hence do not permit excludes.
1374      * This event is useless without PERF_SAMPLE_RAW to return counter set
1375      * values as raw data.
1376      */
1377     if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
1378         !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
1379         err = -EOPNOTSUPP;
1380         goto out;
1381     }
1382 
1383     /* Initialize for using the CPU-measurement counter facility */
1384     cpumf_hw_inuse();
1385     event->destroy = hw_perf_event_destroy;
1386 
1387     err = cfdiag_event_init2(event);
1388     if (unlikely(err))
1389         event->destroy(event);
1390 out:
1391     return err;
1392 }
1393 
1394 /* Create the cf_diag/events/CF_DIAG event sysfs file. This counter is used
1395  * to collect the complete counter sets for a scheduled process. The
1396  * complete counter sets are attached as raw data to the artificial event.
1397  * This makes the complete counter sets available when a process is
1398  * scheduled. The raw data contains the delta of every counter while the
1399  * process was running.
1400  */
1401 CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
1402 
1403 static struct attribute *cfdiag_events_attr[] = {
1404     CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
1405     NULL,
1406 };
1407 
1408 PMU_FORMAT_ATTR(event, "config:0-63");
1409 
1410 static struct attribute *cfdiag_format_attr[] = {
1411     &format_attr_event.attr,
1412     NULL,
1413 };
1414 
1415 static struct attribute_group cfdiag_events_group = {
1416     .name = "events",
1417     .attrs = cfdiag_events_attr,
1418 };
1419 static struct attribute_group cfdiag_format_group = {
1420     .name = "format",
1421     .attrs = cfdiag_format_attr,
1422 };
1423 static const struct attribute_group *cfdiag_attr_groups[] = {
1424     &cfdiag_events_group,
1425     &cfdiag_format_group,
1426     NULL,
1427 };
1428 
1429 /* Performance monitoring unit for event CF_DIAG. Since this event
1430  * is also started and stopped via the perf_event_open() system call, use
1431  * the same event enable/disable callback functions. They do not
1432  * have a pointer to the perf_event structure as first parameter.
1433  *
1434  * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
1435  * Reuse them and distinguish the event (always first parameter) via
1436  * 'config' member.
1437  */
1438 static struct pmu cf_diag = {
1439     .task_ctx_nr  = perf_sw_context,
1440     .event_init   = cfdiag_event_init,
1441     .pmu_enable   = cpumf_pmu_enable,
1442     .pmu_disable  = cpumf_pmu_disable,
1443     .add          = cpumf_pmu_add,
1444     .del          = cpumf_pmu_del,
1445     .start        = cpumf_pmu_start,
1446     .stop         = cpumf_pmu_stop,
1447     .read         = cfdiag_read,
1448 
1449     .attr_groups  = cfdiag_attr_groups
1450 };
1451 
1452 /* Calculate memory needed to store all counter sets together with header and
1453  * trailer data. This is independent of the counter set authorization which
1454  * can vary depending on the configuration.
1455  */
1456 static size_t cfdiag_maxsize(struct cpumf_ctr_info *info)
1457 {
1458     size_t max_size = sizeof(struct cf_trailer_entry);
1459     enum cpumf_ctr_set i;
1460 
1461     for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
1462         size_t size = cpum_cf_ctrset_size(i, info);
1463 
1464         if (size)
1465             max_size += size * sizeof(u64) +
1466                     sizeof(struct cf_ctrset_entry);
1467     }
1468     return max_size;
1469 }
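
/* Editor's note: a worked example with hypothetical counter set sizes of
 * 6 (basic), 2 (user), 8 (crypto), 128 (extended) and 48 (MT-diag.):
 * 192 counters * 8 bytes plus 5 * 8 bytes of cf_ctrset_entry headers plus
 * the 64 byte trailer yield 1640 bytes, which fits the page sized start/stop
 * buffers checked in cfset_init() below.
 */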
1470 
1471 /* Get the CPU speed, try sampling facility first and CPU attributes second. */
1472 static void cfdiag_get_cpu_speed(void)
1473 {
1474     unsigned long mhz;
1475 
1476     if (cpum_sf_avail()) {          /* Sampling facility first */
1477         struct hws_qsi_info_block si;
1478 
1479         memset(&si, 0, sizeof(si));
1480         if (!qsi(&si)) {
1481             cfdiag_cpu_speed = si.cpu_speed;
1482             return;
1483         }
1484     }
1485 
1486     /* Fallback: extract the CPU speed from the static CPU attributes.
1487      * Used in case the CPU Measurement Sampling Facility is turned off.
1488      */
1489     mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
1490     if (mhz != -1UL)
1491         cfdiag_cpu_speed = mhz & 0xffffffff;
1492 }
1493 
1494 static int cfset_init(void)
1495 {
1496     struct cpumf_ctr_info info;
1497     size_t need;
1498     int rc;
1499 
1500     if (qctri(&info))
1501         return -ENODEV;
1502 
1503     cfdiag_get_cpu_speed();
1504     /* Make sure the counter set data fits into predefined buffer. */
1505     need = cfdiag_maxsize(&info);
1506     if (need > sizeof(((struct cpu_cf_events *)0)->start)) {
1507         pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
1508                need);
1509         return -ENOMEM;
1510     }
1511 
1512     rc = misc_register(&cfset_dev);
1513     if (rc) {
1514         pr_err("Registration of /dev/%s failed rc=%i\n",
1515                cfset_dev.name, rc);
1516         goto out;
1517     }
1518 
1519     rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
1520     if (rc) {
1521         misc_deregister(&cfset_dev);
1522         pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
1523                rc);
1524     }
1525 out:
1526     return rc;
1527 }
1528 
1529 device_initcall(cpumf_pmu_init);