0001 /*
0002  * Cell Broadband Engine OProfile Support
0003  *
0004  * (C) Copyright IBM Corporation 2006
0005  *
0006  * Author: David Erb (djerb@us.ibm.com)
0007  * Modifications:
0008  *     Carl Love <carll@us.ibm.com>
0009  *     Maynard Johnson <maynardj@us.ibm.com>
0010  *
0011  * This program is free software; you can redistribute it and/or
0012  * modify it under the terms of the GNU General Public License
0013  * as published by the Free Software Foundation; either version
0014  * 2 of the License, or (at your option) any later version.
0015  */
0016 
0017 #include <linux/cpufreq.h>
0018 #include <linux/delay.h>
0019 #include <linux/jiffies.h>
0020 #include <linux/kthread.h>
0021 #include <linux/oprofile.h>
0022 #include <linux/percpu.h>
0023 #include <linux/smp.h>
0024 #include <linux/spinlock.h>
0025 #include <linux/timer.h>
0026 #include <asm/cell-pmu.h>
0027 #include <asm/cputable.h>
0028 #include <asm/firmware.h>
0029 #include <asm/io.h>
0030 #include <asm/oprofile_impl.h>
0031 #include <asm/processor.h>
0032 #include <asm/prom.h>
0033 #include <asm/ptrace.h>
0034 #include <asm/reg.h>
0035 #include <asm/rtas.h>
0036 #include <asm/cell-regs.h>
0037 
0038 #include "../platforms/cell/interrupt.h"
0039 #include "cell/pr_util.h"
0040 
0041 #define PPU_PROFILING            0
0042 #define SPU_PROFILING_CYCLES     1
0043 #define SPU_PROFILING_EVENTS     2
0044 
0045 #define SPU_EVENT_NUM_START      4100
0046 #define SPU_EVENT_NUM_STOP       4399
0047 #define SPU_PROFILE_EVENT_ADDR          4363  /* spu, address trace, decimal */
0048 #define SPU_PROFILE_EVENT_ADDR_MASK_A   0x146 /* sub unit set to zero */
0049 #define SPU_PROFILE_EVENT_ADDR_MASK_B   0x186 /* sub unit set to zero */
0050 
0051 #define NUM_SPUS_PER_NODE    8
0052 #define SPU_CYCLES_EVENT_NUM 2  /*  event number for SPU_CYCLES */
0053 
0054 #define PPU_CYCLES_EVENT_NUM 1  /*  event number for CYCLES */
0055 #define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
0056                  * PPU_CYCLES event
0057                  */
0058 #define CBE_COUNT_ALL_CYCLES 0x42800000 /* PPU cycle event specifier */
0059 
0060 #define NUM_THREADS 2         /* number of physical threads in
0061                    * physical processor
0062                    */
0063 #define NUM_DEBUG_BUS_WORDS 4
0064 #define NUM_INPUT_BUS_WORDS 2
0065 
0066 #define MAX_SPU_COUNT 0xFFFFFF  /* maximum 24 bit LFSR value */
0067 
0068 /* Minimum HW interval timer setting to send value to trace buffer is 10 cycles.
0069  * To configure the counter to send a value every N cycles, set the counter to
0070  * 2^32 - 1 - N.
0071  */
0072 #define NUM_INTERVAL_CYC  0xFFFFFFFF - 10
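/* For example, with N = 10 the value programmed is 2^32 - 1 - 10 = 0xFFFFFFF5,
 * which is exactly the value of NUM_INTERVAL_CYC above.
 */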
0073 
0074 /*
0075  * spu_cycle_reset is the number of cycles between samples.
0076  * This variable is used for SPU profiling and should ONLY be set
0077  * at the beginning of cell_reg_setup; otherwise, it's read-only.
0078  */
0079 static unsigned int spu_cycle_reset;
0080 static unsigned int profiling_mode;
0081 static int spu_evnt_phys_spu_indx;
0082 
0083 struct pmc_cntrl_data {
0084     unsigned long vcntr;
0085     unsigned long evnts;
0086     unsigned long masks;
0087     unsigned long enabled;
0088 };
0089 
0090 /*
0091  * ibm,cbe-perftools rtas parameters
0092  */
0093 struct pm_signal {
0094     u16 cpu;        /* Processor to modify */
0095     u16 sub_unit;       /* hw subunit this applies to (if applicable)*/
0096     short int signal_group; /* Signal Group to Enable/Disable */
0097     u8 bus_word;        /* Enable/Disable on this Trace/Trigger/Event
0098                  * Bus Word(s) (bitmask)
0099                  */
0100     u8 bit;         /* Trigger/Event bit (if applicable) */
0101 };
0102 
0103 /*
0104  * rtas call arguments
0105  */
0106 enum {
0107     SUBFUNC_RESET = 1,
0108     SUBFUNC_ACTIVATE = 2,
0109     SUBFUNC_DEACTIVATE = 3,
0110 
0111     PASSTHRU_IGNORE = 0,
0112     PASSTHRU_ENABLE = 1,
0113     PASSTHRU_DISABLE = 2,
0114 };
0115 
0116 struct pm_cntrl {
0117     u16 enable;
0118     u16 stop_at_max;
0119     u16 trace_mode;
0120     u16 freeze;
0121     u16 count_mode;
0122     u16 spu_addr_trace;
0123     u8  trace_buf_ovflw;
0124 };
0125 
0126 static struct {
0127     u32 group_control;
0128     u32 debug_bus_control;
0129     struct pm_cntrl pm_cntrl;
0130     u32 pm07_cntrl[NR_PHYS_CTRS];
0131 } pm_regs;
0132 
0133 #define GET_SUB_UNIT(x) ((x & 0x0000f000) >> 12)
0134 #define GET_BUS_WORD(x) ((x & 0x000000f0) >> 4)
0135 #define GET_BUS_TYPE(x) ((x & 0x00000300) >> 8)
0136 #define GET_POLARITY(x) ((x & 0x00000002) >> 1)
0137 #define GET_COUNT_CYCLES(x) (x & 0x00000001)
0138 #define GET_INPUT_CONTROL(x) ((x & 0x00000004) >> 2)
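/* As a worked example, decoding SPU_PROFILE_EVENT_ADDR_MASK_A (0x146) with the
 * macros above gives: sub_unit = 0, bus_word = 0x4, bus_type = 1, polarity = 1,
 * count_cycles = 0, input_control = 1.
 */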
0139 
0140 static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values);
0141 static unsigned long spu_pm_cnt[MAX_NUMNODES * NUM_SPUS_PER_NODE];
0142 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
0143 
0144 /*
0145  * The CELL profiling code makes rtas calls to setup the debug bus to
0146  * route the performance signals.  Additionally, SPU profiling requires
0147  * a second rtas call to setup the hardware to capture the SPU PCs.
0148  * The EIO error value is returned if the token lookups or the rtas
0149  * call fail.  The EIO error number is the best choice of the existing
0150  * error numbers.  The probability of an rtas-related error is very low, but
0151  * by returning EIO and printing additional information to dmesg, the user
0152  * will know that OProfile did not start and dmesg will tell them why.
0153  * OProfile does not support returning errors on Stop.  This is not a huge
0154  * issue since failure to reset the debug bus or stop the SPU PC collection
0155  * is not a fatal issue.  Chances are that if Stop failed, Start doesn't work
0156  * either.
0157  */
0158 
0159 /*
0160  * Interpretation of hdw_thread:
0161  * 0 - even virtual cpus 0, 2, 4,...
0162  * 1 - odd virtual cpus 1, 3, 5, ...
0163  *
0164  * FIXME: this is strictly wrong, we need to clean this up in a number
0165  * of places. It works for now. -arnd
0166  */
0167 static u32 hdw_thread;
0168 
0169 static u32 virt_cntr_inter_mask;
0170 static struct timer_list timer_virt_cntr;
0171 static struct timer_list timer_spu_event_swap;
0172 
0173 /*
0174  * pm_signal needs to be global since it is initialized in
0175  * cell_reg_setup at the time when the necessary information
0176  * is available.
0177  */
0178 static struct pm_signal pm_signal[NR_PHYS_CTRS];
0179 static int pm_rtas_token;    /* token for debug bus setup call */
0180 static int spu_rtas_token;   /* token for SPU cycle profiling */
0181 
0182 static u32 reset_value[NR_PHYS_CTRS];
0183 static int num_counters;
0184 static int oprofile_running;
0185 static DEFINE_SPINLOCK(cntr_lock);
0186 
0187 static u32 ctr_enabled;
0188 
0189 static unsigned char input_bus[NUM_INPUT_BUS_WORDS];
0190 
0191 /*
0192  * Firmware interface functions
0193  */
0194 static int
0195 rtas_ibm_cbe_perftools(int subfunc, int passthru,
0196                void *address, unsigned long length)
0197 {
0198     u64 paddr = __pa(address);
0199 
0200     return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc,
0201              passthru, paddr >> 32, paddr & 0xffffffff, length);
0202 }
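/* The 64-bit physical address of the argument buffer is passed to RTAS as two
 * 32-bit words; for example, paddr = 0x0000000123456780 is passed as high word
 * 0x1 and low word 0x23456780.
 */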
0203 
0204 static void pm_rtas_reset_signals(u32 node)
0205 {
0206     int ret;
0207     struct pm_signal pm_signal_local;
0208 
0209     /*
0210      * The debug bus is being set to the passthru disable state.
0211      * However, the FW still expects at least one legal signal routing
0212      * entry or it will return an error on the arguments.   If we don't
0213      * supply a valid entry, we must ignore all return values.  Ignoring
0214      * all return values means we might miss an error we should be
0215      * concerned about.
0216      */
0217 
0218     /*  fw expects physical cpu #. */
0219     pm_signal_local.cpu = node;
0220     pm_signal_local.signal_group = 21;
0221     pm_signal_local.bus_word = 1;
0222     pm_signal_local.sub_unit = 0;
0223     pm_signal_local.bit = 0;
0224 
0225     ret = rtas_ibm_cbe_perftools(SUBFUNC_RESET, PASSTHRU_DISABLE,
0226                      &pm_signal_local,
0227                      sizeof(struct pm_signal));
0228 
0229     if (unlikely(ret))
0230         /*
0231          * Not a fatal error. For Oprofile stop, the oprofile
0232          * functions do not support returning an error for
0233          * failure to stop OProfile.
0234          */
0235         printk(KERN_WARNING "%s: rtas returned: %d\n",
0236                __func__, ret);
0237 }
0238 
0239 static int pm_rtas_activate_signals(u32 node, u32 count)
0240 {
0241     int ret;
0242     int i, j;
0243     struct pm_signal pm_signal_local[NR_PHYS_CTRS];
0244 
0245     /*
0246      * There is no debug setup required for the cycles event.
0247      * Note that only events in the same group can be used.
0248      * Otherwise, there will be conflicts in correctly routing
0249      * the signals on the debug bus.  It is the responsibility
0250      * of the OProfile user tool to check the events are in
0251      * the same group.
0252      */
0253     i = 0;
0254     for (j = 0; j < count; j++) {
0255         if (pm_signal[j].signal_group != PPU_CYCLES_GRP_NUM) {
0256 
0257             /* fw expects physical cpu # */
0258             pm_signal_local[i].cpu = node;
0259             pm_signal_local[i].signal_group
0260                 = pm_signal[j].signal_group;
0261             pm_signal_local[i].bus_word = pm_signal[j].bus_word;
0262             pm_signal_local[i].sub_unit = pm_signal[j].sub_unit;
0263             pm_signal_local[i].bit = pm_signal[j].bit;
0264             i++;
0265         }
0266     }
0267 
0268     if (i != 0) {
0269         ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE, PASSTHRU_ENABLE,
0270                          pm_signal_local,
0271                          i * sizeof(struct pm_signal));
0272 
0273         if (unlikely(ret)) {
0274             printk(KERN_WARNING "%s: rtas returned: %d\n",
0275                    __func__, ret);
0276             return -EIO;
0277         }
0278     }
0279 
0280     return 0;
0281 }
0282 
0283 /*
0284  * PM Signal functions
0285  */
0286 static void set_pm_event(u32 ctr, int event, u32 unit_mask)
0287 {
0288     struct pm_signal *p;
0289     u32 signal_bit;
0290     u32 bus_word, bus_type, count_cycles, polarity, input_control;
0291     int j, i;
0292 
0293     if (event == PPU_CYCLES_EVENT_NUM) {
0294         /* Special Event: Count all cpu cycles */
0295         pm_regs.pm07_cntrl[ctr] = CBE_COUNT_ALL_CYCLES;
0296         p = &(pm_signal[ctr]);
0297         p->signal_group = PPU_CYCLES_GRP_NUM;
0298         p->bus_word = 1;
0299         p->sub_unit = 0;
0300         p->bit = 0;
0301         goto out;
0302     } else {
0303         pm_regs.pm07_cntrl[ctr] = 0;
0304     }
0305 
0306     bus_word = GET_BUS_WORD(unit_mask);
0307     bus_type = GET_BUS_TYPE(unit_mask);
0308     count_cycles = GET_COUNT_CYCLES(unit_mask);
0309     polarity = GET_POLARITY(unit_mask);
0310     input_control = GET_INPUT_CONTROL(unit_mask);
0311     signal_bit = (event % 100);
0312 
0313     p = &(pm_signal[ctr]);
0314 
0315     p->signal_group = event / 100;
0316     p->bus_word = bus_word;
0317     p->sub_unit = GET_SUB_UNIT(unit_mask);
0318 
0319     pm_regs.pm07_cntrl[ctr] = 0;
0320     pm_regs.pm07_cntrl[ctr] |= PM07_CTR_COUNT_CYCLES(count_cycles);
0321     pm_regs.pm07_cntrl[ctr] |= PM07_CTR_POLARITY(polarity);
0322     pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_CONTROL(input_control);
0323 
0324     /*
0325      * Some of the islands signal selection is based on 64 bit words.
0326      * The debug bus words are 32 bits, the input words to the performance
0327      * counters are defined as 32 bits.  Need to convert the 64 bit island
0328      * specification to the appropriate 32 input bit and bus word for the
0329      * performance counter event selection.  See the CELL Performance
0330      * monitoring signals manual and the Perf cntr hardware descriptions
0331      * for the details.
0332      */
0333     if (input_control == 0) {
0334         if (signal_bit > 31) {
0335             signal_bit -= 32;
0336             if (bus_word == 0x3)
0337                 bus_word = 0x2;
0338             else if (bus_word == 0xc)
0339                 bus_word = 0x8;
0340         }
0341 
0342         if ((bus_type == 0) && p->signal_group >= 60)
0343             bus_type = 2;
0344         if ((bus_type == 1) && p->signal_group >= 50)
0345             bus_type = 0;
0346 
0347         pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_MUX(signal_bit);
0348     } else {
0349         pm_regs.pm07_cntrl[ctr] = 0;
0350         p->bit = signal_bit;
0351     }
0352 
0353     for (i = 0; i < NUM_DEBUG_BUS_WORDS; i++) {
0354         if (bus_word & (1 << i)) {
0355             pm_regs.debug_bus_control |=
0356                 (bus_type << (30 - (2 * i)));
0357 
0358             for (j = 0; j < NUM_INPUT_BUS_WORDS; j++) {
0359                 if (input_bus[j] == 0xff) {
0360                     input_bus[j] = i;
0361                     pm_regs.group_control |=
0362                         (i << (30 - (2 * j)));
0363 
0364                     break;
0365                 }
0366             }
0367         }
0368     }
0369 out:
0370     ;
0371 }
0372 
0373 static void write_pm_cntrl(int cpu)
0374 {
0375     /*
0376      * Oprofile will use 32 bit counters, set bits 7:10 to 0
0377      * pm_regs.pm_cntrl is a global
0378      */
0379 
0380     u32 val = 0;
0381     if (pm_regs.pm_cntrl.enable == 1)
0382         val |= CBE_PM_ENABLE_PERF_MON;
0383 
0384     if (pm_regs.pm_cntrl.stop_at_max == 1)
0385         val |= CBE_PM_STOP_AT_MAX;
0386 
0387     if (pm_regs.pm_cntrl.trace_mode != 0)
0388         val |= CBE_PM_TRACE_MODE_SET(pm_regs.pm_cntrl.trace_mode);
0389 
0390     if (pm_regs.pm_cntrl.trace_buf_ovflw == 1)
0391         val |= CBE_PM_TRACE_BUF_OVFLW(pm_regs.pm_cntrl.trace_buf_ovflw);
0392     if (pm_regs.pm_cntrl.freeze == 1)
0393         val |= CBE_PM_FREEZE_ALL_CTRS;
0394 
0395     val |= CBE_PM_SPU_ADDR_TRACE_SET(pm_regs.pm_cntrl.spu_addr_trace);
0396 
0397     /*
0398      * Routine set_count_mode must be called previously to set
0399      * the count mode based on the user selection of user and kernel.
0400      */
0401     val |= CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode);
0402     cbe_write_pm(cpu, pm_control, val);
0403 }
0404 
0405 static inline void
0406 set_count_mode(u32 kernel, u32 user)
0407 {
0408     /*
0409      * The user must specify user and kernel if they want them. If
0410      *  neither is specified, OProfile will count in hypervisor mode.
0411      *  pm_regs.pm_cntrl is a global
0412      */
0413     if (kernel) {
0414         if (user)
0415             pm_regs.pm_cntrl.count_mode = CBE_COUNT_ALL_MODES;
0416         else
0417             pm_regs.pm_cntrl.count_mode =
0418                 CBE_COUNT_SUPERVISOR_MODE;
0419     } else {
0420         if (user)
0421             pm_regs.pm_cntrl.count_mode = CBE_COUNT_PROBLEM_MODE;
0422         else
0423             pm_regs.pm_cntrl.count_mode =
0424                 CBE_COUNT_HYPERVISOR_MODE;
0425     }
0426 }
0427 
0428 static inline void enable_ctr(u32 cpu, u32 ctr, u32 *pm07_cntrl)
0429 {
0430 
0431     pm07_cntrl[ctr] |= CBE_PM_CTR_ENABLE;
0432     cbe_write_pm07_control(cpu, ctr, pm07_cntrl[ctr]);
0433 }
0434 
0435 /*
0436  * Oprofile is expected to collect data on all CPUs simultaneously.
0437  * However, there is one set of performance counters per node.  There are
0438  * two hardware threads or virtual CPUs on each node.  Hence, OProfile must
0439  * multiplex in time the performance counter collection on the two virtual
0440  * CPUs.  The multiplexing of the performance counters is done by this
0441  * virtual counter routine.
0442  *
0443  * The pmc_values used below is defined as 'per-cpu' but its use is
0444  * more akin to 'per-node'.  We need to store two sets of counter
0445  * values per node -- one for the previous run and one for the next.
0446  * The per-cpu[NR_PHYS_CTRS] gives us the storage we need.  Each odd/even
0447  * pair of per-cpu arrays is used for storing the previous and next
0448  * pmc values for a given node.
0449  * NOTE: We use the per-cpu variable to improve cache performance.
0450  *
0451  * This routine will alternate loading the virtual counters for
0452  * virtual CPUs
0453  */
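/* For example, on a node whose two hardware threads are cpus N and N+1, the
 * loop below (which only runs for the even thread, cpu N) saves the outgoing
 * thread's counts into per_cpu(pmc_values, N + prev_hdw_thread) and reloads
 * the incoming thread's counts from per_cpu(pmc_values, N + next_hdw_thread).
 */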
0454 static void cell_virtual_cntr(unsigned long data)
0455 {
0456     int i, prev_hdw_thread, next_hdw_thread;
0457     u32 cpu;
0458     unsigned long flags;
0459 
0460     /*
0461      * Make sure that the interrupt_handler and the virt counter are
0462      * not both playing with the counters on the same node.
0463      */
0464 
0465     spin_lock_irqsave(&cntr_lock, flags);
0466 
0467     prev_hdw_thread = hdw_thread;
0468 
0469     /* switch the cpu handling the interrupts */
0470     hdw_thread = 1 ^ hdw_thread;
0471     next_hdw_thread = hdw_thread;
0472 
0473     pm_regs.group_control = 0;
0474     pm_regs.debug_bus_control = 0;
0475 
0476     for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
0477         input_bus[i] = 0xff;
0478 
0479     /*
0480      * There are some per thread events.  Must do the
0481      * set event for the thread that is being started
0482      */
0483     for (i = 0; i < num_counters; i++)
0484         set_pm_event(i,
0485             pmc_cntrl[next_hdw_thread][i].evnts,
0486             pmc_cntrl[next_hdw_thread][i].masks);
0487 
0488     /*
0489      * The following is done only once per each node, but
0490      * we need cpu #, not node #, to pass to the cbe_xxx functions.
0491      */
0492     for_each_online_cpu(cpu) {
0493         if (cbe_get_hw_thread_id(cpu))
0494             continue;
0495 
0496         /*
0497          * stop counters, save counter values, restore counts
0498          * for previous thread
0499          */
0500         cbe_disable_pm(cpu);
0501         cbe_disable_pm_interrupts(cpu);
0502         for (i = 0; i < num_counters; i++) {
0503             per_cpu(pmc_values, cpu + prev_hdw_thread)[i]
0504                 = cbe_read_ctr(cpu, i);
0505 
0506             if (per_cpu(pmc_values, cpu + next_hdw_thread)[i]
0507                 == 0xFFFFFFFF)
0508                 /* If the cntr value is 0xffffffff, we must
0509                  * reset that to 0xfffffff0 when the current
0510                  * thread is restarted.  This will generate a
0511                  * new interrupt and make sure that we never
0512                  * restore the counters to the max value.  If
0513                  * the counters were restored to the max value,
0514                  * they do not increment and no interrupts are
0515                  * generated.  Hence no more samples will be
0516                  * collected on that cpu.
0517                  */
0518                 cbe_write_ctr(cpu, i, 0xFFFFFFF0);
0519             else
0520                 cbe_write_ctr(cpu, i,
0521                           per_cpu(pmc_values,
0522                               cpu +
0523                               next_hdw_thread)[i]);
0524         }
0525 
0526         /*
0527          * Switch to the other thread. Change the interrupt
0528          * and control regs to be scheduled on the CPU
0529          * corresponding to the thread to execute.
0530          */
0531         for (i = 0; i < num_counters; i++) {
0532             if (pmc_cntrl[next_hdw_thread][i].enabled) {
0533                 /*
0534                  * There are some per thread events.
0535                  * Must do the set event, enable_cntr
0536                  * for each cpu.
0537                  */
0538                 enable_ctr(cpu, i,
0539                        pm_regs.pm07_cntrl);
0540             } else {
0541                 cbe_write_pm07_control(cpu, i, 0);
0542             }
0543         }
0544 
0545         /* Enable interrupts on the CPU thread that is starting */
0546         cbe_enable_pm_interrupts(cpu, next_hdw_thread,
0547                      virt_cntr_inter_mask);
0548         cbe_enable_pm(cpu);
0549     }
0550 
0551     spin_unlock_irqrestore(&cntr_lock, flags);
0552 
0553     mod_timer(&timer_virt_cntr, jiffies + HZ / 10);
0554 }
0555 
0556 static void start_virt_cntrs(void)
0557 {
0558     init_timer(&timer_virt_cntr);
0559     timer_virt_cntr.function = cell_virtual_cntr;
0560     timer_virt_cntr.data = 0UL;
0561     timer_virt_cntr.expires = jiffies + HZ / 10;
0562     add_timer(&timer_virt_cntr);
0563 }
0564 
0565 static int cell_reg_setup_spu_cycles(struct op_counter_config *ctr,
0566             struct op_system_config *sys, int num_ctrs)
0567 {
0568     spu_cycle_reset = ctr[0].count;
0569 
0570     /*
0571      * Each node will need to make the rtas call to start
0572      * and stop SPU profiling.  Get the token once and store it.
0573      */
0574     spu_rtas_token = rtas_token("ibm,cbe-spu-perftools");
0575 
0576     if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) {
0577         printk(KERN_ERR
0578                "%s: rtas token ibm,cbe-spu-perftools unknown\n",
0579                __func__);
0580         return -EIO;
0581     }
0582     return 0;
0583 }
0584 
0585 /* Unfortunately, the hardware will only support event profiling
0586  * on one SPU per node at a time.  Therefore, we must time slice
0587  * the profiling across all SPUs in the node.  Note, we do this
0588  * in parallel for each node.  The following routine is called
0589  * periodically based on kernel timer to switch which SPU is
0590  * being monitored in a round robin fashion.
0591  */
0592 static void spu_evnt_swap(unsigned long data)
0593 {
0594     int node;
0595     int cur_phys_spu, nxt_phys_spu, cur_spu_evnt_phys_spu_indx;
0596     unsigned long flags;
0597     int cpu;
0598     int ret;
0599     u32 interrupt_mask;
0600 
0601 
0602     /* enable interrupts on cntr 0 */
0603     interrupt_mask = CBE_PM_CTR_OVERFLOW_INTR(0);
0604 
0605     hdw_thread = 0;
0606 
0607     /* Make sure spu event interrupt handler and spu event swap
0608      * don't access the counters simultaneously.
0609      */
0610     spin_lock_irqsave(&cntr_lock, flags);
0611 
0612     cur_spu_evnt_phys_spu_indx = spu_evnt_phys_spu_indx;
0613 
0614     if (++(spu_evnt_phys_spu_indx) == NUM_SPUS_PER_NODE)
0615         spu_evnt_phys_spu_indx = 0;
0616 
0617     pm_signal[0].sub_unit = spu_evnt_phys_spu_indx;
0618     pm_signal[1].sub_unit = spu_evnt_phys_spu_indx;
0619     pm_signal[2].sub_unit = spu_evnt_phys_spu_indx;
0620 
0621     /* switch the SPU being profiled on each node */
0622     for_each_online_cpu(cpu) {
0623         if (cbe_get_hw_thread_id(cpu))
0624             continue;
0625 
0626         node = cbe_cpu_to_node(cpu);
0627         cur_phys_spu = (node * NUM_SPUS_PER_NODE)
0628             + cur_spu_evnt_phys_spu_indx;
0629         nxt_phys_spu = (node * NUM_SPUS_PER_NODE)
0630             + spu_evnt_phys_spu_indx;
0631 
0632         /*
0633          * stop counters, save counter values, restore counts
0634          * for previous physical SPU
0635          */
0636         cbe_disable_pm(cpu);
0637         cbe_disable_pm_interrupts(cpu);
0638 
0639         spu_pm_cnt[cur_phys_spu]
0640             = cbe_read_ctr(cpu, 0);
0641 
0642         /* restore previous count for the next spu to sample */
0643         /* NOTE, hardware issue, counter will not start if the
0644          * counter value is at max (0xFFFFFFFF).
0645          */
0646         if (spu_pm_cnt[nxt_phys_spu] >= 0xFFFFFFFF)
0647             cbe_write_ctr(cpu, 0, 0xFFFFFFF0);
0648          else
0649              cbe_write_ctr(cpu, 0, spu_pm_cnt[nxt_phys_spu]);
0650 
0651         pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
0652 
0653         /* set up the debug bus to measure the one event and
0654          * the two events that route the next SPU's PC onto
0655          * the debug bus
0656          */
0657         ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu), 3);
0658         if (ret)
0659             printk(KERN_ERR "%s: pm_rtas_activate_signals failed, "
0660                    "SPU event swap\n", __func__);
0661 
0662         /* clear the trace buffer; don't want to take the PC for
0663          * the previous SPU */
0664         cbe_write_pm(cpu, trace_address, 0);
0665 
0666         enable_ctr(cpu, 0, pm_regs.pm07_cntrl);
0667 
0668         /* Enable interrupts on the CPU thread that is starting */
0669         cbe_enable_pm_interrupts(cpu, hdw_thread,
0670                      interrupt_mask);
0671         cbe_enable_pm(cpu);
0672     }
0673 
0674     spin_unlock_irqrestore(&cntr_lock, flags);
0675 
0676     /* swap approximately every 1/25 of a second (HZ/25 jiffies) */
0677     mod_timer(&timer_spu_event_swap, jiffies + HZ / 25);
0678 }
0679 
0680 static void start_spu_event_swap(void)
0681 {
0682     init_timer(&timer_spu_event_swap);
0683     timer_spu_event_swap.function = spu_evnt_swap;
0684     timer_spu_event_swap.data = 0UL;
0685     timer_spu_event_swap.expires = jiffies + HZ / 25;
0686     add_timer(&timer_spu_event_swap);
0687 }
0688 
0689 static int cell_reg_setup_spu_events(struct op_counter_config *ctr,
0690             struct op_system_config *sys, int num_ctrs)
0691 {
0692     int i;
0693 
0694     /* routine is called once for all nodes */
0695 
0696     spu_evnt_phys_spu_indx = 0;
0697     /*
0698      * For all events except PPU CYCLEs, each node will need to make
0699      * the rtas cbe-perftools call to setup and reset the debug bus.
0700      * Make the token lookup call once and store it in the global
0701      * variable pm_rtas_token.
0702      */
0703     pm_rtas_token = rtas_token("ibm,cbe-perftools");
0704 
0705     if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
0706         printk(KERN_ERR
0707                "%s: rtas token ibm,cbe-perftools unknown\n",
0708                __func__);
0709         return -EIO;
0710     }
0711 
0712     /* Set up the pm_control register settings; the
0713      * settings will be written per node by the
0714      * cell_cpu_setup() function.
0715      */
0716     pm_regs.pm_cntrl.trace_buf_ovflw = 1;
0717 
0718     /* Use the occurrence trace mode to have SPU PC saved
0719      * to the trace buffer.  Occurrence data in trace buffer
0720      * is not used.  Bit 2 must be set to store SPU addresses.
0721      */
0722     pm_regs.pm_cntrl.trace_mode = 2;
0723 
0724     pm_regs.pm_cntrl.spu_addr_trace = 0x1;  /* using debug bus
0725                            event 2 & 3 */
0726 
0727     /* Set up the debug bus event array with the SPU PC routing events.
0728      * Note, pm_signal[0] will be filled in by the set_pm_event() call below.
0729      */
0730     pm_signal[1].signal_group = SPU_PROFILE_EVENT_ADDR / 100;
0731     pm_signal[1].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_A);
0732     pm_signal[1].bit = SPU_PROFILE_EVENT_ADDR % 100;
0733     pm_signal[1].sub_unit = spu_evnt_phys_spu_indx;
0734 
0735     pm_signal[2].signal_group = SPU_PROFILE_EVENT_ADDR / 100;
0736     pm_signal[2].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_B);
0737     pm_signal[2].bit = SPU_PROFILE_EVENT_ADDR % 100;
0738     pm_signal[2].sub_unit = spu_evnt_phys_spu_indx;
0739 
0740     /* Set the user selected spu event to profile on,
0741      * note, only one SPU profiling event is supported
0742      */
0743     num_counters = 1;  /* Only support one SPU event at a time */
0744     set_pm_event(0, ctr[0].event, ctr[0].unit_mask);
0745 
0746     reset_value[0] = 0xFFFFFFFF - ctr[0].count;
0747 
0748     /* global, used by cell_cpu_setup */
0749     ctr_enabled |= 1;
0750 
0751     /* Initialize the count for each SPU to the reset value */
0752     for (i=0; i < MAX_NUMNODES * NUM_SPUS_PER_NODE; i++)
0753         spu_pm_cnt[i] = reset_value[0];
0754 
0755     return 0;
0756 }
0757 
0758 static int cell_reg_setup_ppu(struct op_counter_config *ctr,
0759             struct op_system_config *sys, int num_ctrs)
0760 {
0761     /* routine is called once for all nodes */
0762     int i, j, cpu;
0763 
0764     num_counters = num_ctrs;
0765 
0766     if (unlikely(num_ctrs > NR_PHYS_CTRS)) {
0767         printk(KERN_ERR
0768                "%s: Oprofile, number of specified events " \
0769                "exceeds number of physical counters\n",
0770                __func__);
0771         return -EIO;
0772     }
0773 
0774     set_count_mode(sys->enable_kernel, sys->enable_user);
0775 
0776     /* Setup the thread 0 events */
0777     for (i = 0; i < num_ctrs; ++i) {
0778 
0779         pmc_cntrl[0][i].evnts = ctr[i].event;
0780         pmc_cntrl[0][i].masks = ctr[i].unit_mask;
0781         pmc_cntrl[0][i].enabled = ctr[i].enabled;
0782         pmc_cntrl[0][i].vcntr = i;
0783 
0784         for_each_possible_cpu(j)
0785             per_cpu(pmc_values, j)[i] = 0;
0786     }
0787 
0788     /*
0789      * Setup the thread 1 events, map the thread 0 event to the
0790      * equivalent thread 1 event.
0791      */
0792     for (i = 0; i < num_ctrs; ++i) {
0793         if ((ctr[i].event >= 2100) && (ctr[i].event <= 2111))
0794             pmc_cntrl[1][i].evnts = ctr[i].event + 19;
0795         else if (ctr[i].event == 2203)
0796             pmc_cntrl[1][i].evnts = ctr[i].event;
0797         else if ((ctr[i].event >= 2200) && (ctr[i].event <= 2215))
0798             pmc_cntrl[1][i].evnts = ctr[i].event + 16;
0799         else
0800             pmc_cntrl[1][i].evnts = ctr[i].event;
0801 
0802         pmc_cntrl[1][i].masks = ctr[i].unit_mask;
0803         pmc_cntrl[1][i].enabled = ctr[i].enabled;
0804         pmc_cntrl[1][i].vcntr = i;
0805     }
0806 
0807     for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
0808         input_bus[i] = 0xff;
0809 
0810     /*
0811      * Our counters count up, and "count" refers to
0812      * how much before the next interrupt, and we interrupt
0813      * on overflow.  So we calculate the starting value
0814      * which will give us "count" until overflow.
0815      * Then we set the events on the enabled counters.
0816      */
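    /* For example, a requested count of 100000 events gives a starting value
     * of 0xFFFFFFFF - 100000 = 0xFFFE795F, so the counter overflows (and
     * interrupts) after roughly 100000 events.
     */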
0817     for (i = 0; i < num_counters; ++i) {
0818         /* start with virtual counter set 0 */
0819         if (pmc_cntrl[0][i].enabled) {
0820             /* Using 32bit counters, reset max - count */
0821             reset_value[i] = 0xFFFFFFFF - ctr[i].count;
0822             set_pm_event(i,
0823                      pmc_cntrl[0][i].evnts,
0824                      pmc_cntrl[0][i].masks);
0825 
0826             /* global, used by cell_cpu_setup */
0827             ctr_enabled |= (1 << i);
0828         }
0829     }
0830 
0831     /* initialize the previous counts for the virtual cntrs */
0832     for_each_online_cpu(cpu)
0833         for (i = 0; i < num_counters; ++i) {
0834             per_cpu(pmc_values, cpu)[i] = reset_value[i];
0835         }
0836 
0837     return 0;
0838 }
0839 
0840 
0841 /* This function is called once for all cpus combined */
0842 static int cell_reg_setup(struct op_counter_config *ctr,
0843             struct op_system_config *sys, int num_ctrs)
0844 {
0845     int ret=0;
0846     spu_cycle_reset = 0;
0847 
0848     /* initialize the spu_addr_trace value; it will be reset if
0849      * doing spu event profiling.
0850      */
0851     pm_regs.group_control = 0;
0852     pm_regs.debug_bus_control = 0;
0853     pm_regs.pm_cntrl.stop_at_max = 1;
0854     pm_regs.pm_cntrl.trace_mode = 0;
0855     pm_regs.pm_cntrl.freeze = 1;
0856     pm_regs.pm_cntrl.trace_buf_ovflw = 0;
0857     pm_regs.pm_cntrl.spu_addr_trace = 0;
0858 
0859     /*
0860      * For all events except PPU CYCLEs, each node will need to make
0861      * the rtas cbe-perftools call to setup and reset the debug bus.
0862      * Make the token lookup call once and store it in the global
0863      * variable pm_rtas_token.
0864      */
0865     pm_rtas_token = rtas_token("ibm,cbe-perftools");
0866 
0867     if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
0868         printk(KERN_ERR
0869                "%s: rtas token ibm,cbe-perftools unknown\n",
0870                __func__);
0871         return -EIO;
0872     }
0873 
0874     if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
0875         profiling_mode = SPU_PROFILING_CYCLES;
0876         ret = cell_reg_setup_spu_cycles(ctr, sys, num_ctrs);
0877     } else if ((ctr[0].event >= SPU_EVENT_NUM_START) &&
0878            (ctr[0].event <= SPU_EVENT_NUM_STOP)) {
0879         profiling_mode = SPU_PROFILING_EVENTS;
0880         spu_cycle_reset = ctr[0].count;
0881 
0882         /* for SPU event profiling, need to setup the
0883          * pm_signal array with the events to route the
0884          * SPU PC before making the FW call.  Note, only
0885          * one SPU event for profiling can be specified
0886          * at a time.
0887          */
0888         cell_reg_setup_spu_events(ctr, sys, num_ctrs);
0889     } else {
0890         profiling_mode = PPU_PROFILING;
0891         ret = cell_reg_setup_ppu(ctr, sys, num_ctrs);
0892     }
0893 
0894     return ret;
0895 }
0896 
0897 
0898 
0899 /* This function is called once for each cpu */
0900 static int cell_cpu_setup(struct op_counter_config *cntr)
0901 {
0902     u32 cpu = smp_processor_id();
0903     u32 num_enabled = 0;
0904     int i;
0905     int ret;
0906 
0907     /* Cycle based SPU profiling does not use the performance
0908      * counters.  The trace array is configured to collect
0909      * the data.
0910      */
0911     if (profiling_mode == SPU_PROFILING_CYCLES)
0912         return 0;
0913 
0914     /* There is one performance monitor per processor chip (i.e. node),
0915      * so we only need to perform this function once per node.
0916      */
0917     if (cbe_get_hw_thread_id(cpu))
0918         return 0;
0919 
0920     /* Stop all counters */
0921     cbe_disable_pm(cpu);
0922     cbe_disable_pm_interrupts(cpu);
0923 
0924     cbe_write_pm(cpu, pm_start_stop, 0);
0925     cbe_write_pm(cpu, group_control, pm_regs.group_control);
0926     cbe_write_pm(cpu, debug_bus_control, pm_regs.debug_bus_control);
0927     write_pm_cntrl(cpu);
0928 
0929     for (i = 0; i < num_counters; ++i) {
0930         if (ctr_enabled & (1 << i)) {
0931             pm_signal[num_enabled].cpu = cbe_cpu_to_node(cpu);
0932             num_enabled++;
0933         }
0934     }
0935 
0936     /*
0937      * The pm_rtas_activate_signals will return -EIO if the FW
0938      * call failed.
0939      */
0940     if (profiling_mode == SPU_PROFILING_EVENTS) {
0941         /* For SPU event profiling, we also need to set up the
0942          * pm interval timer
0943          */
0944         ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu),
0945                            num_enabled+2);
0946         /* store PC from debug bus to Trace buffer as often
0947          * as possible (every 10 cycles)
0948          */
0949         cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC);
0950         return ret;
0951     } else
0952         return pm_rtas_activate_signals(cbe_cpu_to_node(cpu),
0953                         num_enabled);
0954 }
0955 
0956 #define ENTRIES  303
0957 #define MAXLFSR  0xFFFFFF
0958 
0959 /* precomputed table of 24 bit LFSR values */
0960 static int initial_lfsr[] = {
0961  8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
0962  15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
0963  4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
0964  3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
0965  9341783, 1526887, 3985002, 1439429, 13923762, 7010104, 11969769, 4547026,
0966  2040072, 4025602, 3437678, 7939992, 11444177, 4496094, 9803157, 10745556,
0967  3671780, 4257846, 5662259, 13196905, 3237343, 12077182, 16222879, 7587769,
0968  14706824, 2184640, 12591135, 10420257, 7406075, 3648978, 11042541, 15906893,
0969  11914928, 4732944, 10695697, 12928164, 11980531, 4430912, 11939291, 2917017,
0970  6119256, 4172004, 9373765, 8410071, 14788383, 5047459, 5474428, 1737756,
0971  15967514, 13351758, 6691285, 8034329, 2856544, 14394753, 11310160, 12149558,
0972  7487528, 7542781, 15668898, 12525138, 12790975, 3707933, 9106617, 1965401,
0973  16219109, 12801644, 2443203, 4909502, 8762329, 3120803, 6360315, 9309720,
0974  15164599, 10844842, 4456529, 6667610, 14924259, 884312, 6234963, 3326042,
0975  15973422, 13919464, 5272099, 6414643, 3909029, 2764324, 5237926, 4774955,
0976  10445906, 4955302, 5203726, 10798229, 11443419, 2303395, 333836, 9646934,
0977  3464726, 4159182, 568492, 995747, 10318756, 13299332, 4836017, 8237783,
0978  3878992, 2581665, 11394667, 5672745, 14412947, 3159169, 9094251, 16467278,
0979  8671392, 15230076, 4843545, 7009238, 15504095, 1494895, 9627886, 14485051,
0980  8304291, 252817, 12421642, 16085736, 4774072, 2456177, 4160695, 15409741,
0981  4902868, 5793091, 13162925, 16039714, 782255, 11347835, 14884586, 366972,
0982  16308990, 11913488, 13390465, 2958444, 10340278, 1177858, 1319431, 10426302,
0983  2868597, 126119, 5784857, 5245324, 10903900, 16436004, 3389013, 1742384,
0984  14674502, 10279218, 8536112, 10364279, 6877778, 14051163, 1025130, 6072469,
0985  1988305, 8354440, 8216060, 16342977, 13112639, 3976679, 5913576, 8816697,
0986  6879995, 14043764, 3339515, 9364420, 15808858, 12261651, 2141560, 5636398,
0987  10345425, 10414756, 781725, 6155650, 4746914, 5078683, 7469001, 6799140,
0988  10156444, 9667150, 10116470, 4133858, 2121972, 1124204, 1003577, 1611214,
0989  14304602, 16221850, 13878465, 13577744, 3629235, 8772583, 10881308, 2410386,
0990  7300044, 5378855, 9301235, 12755149, 4977682, 8083074, 10327581, 6395087,
0991  9155434, 15501696, 7514362, 14520507, 15808945, 3244584, 4741962, 9658130,
0992  14336147, 8654727, 7969093, 15759799, 14029445, 5038459, 9894848, 8659300,
0993  13699287, 8834306, 10712885, 14753895, 10410465, 3373251, 309501, 9561475,
0994  5526688, 14647426, 14209836, 5339224, 207299, 14069911, 8722990, 2290950,
0995  3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003,
0996  6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
0997  7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
0998  6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607
0999 };
1000 
1001 /*
1002  * The hardware uses an LFSR counting sequence to determine when to capture
1003  * the SPU PCs.  An LFSR sequence is like a pseudo-random number sequence
1004  * where each number occurs once in the sequence but the sequence is not in
1005  * numerical order. The SPU PC capture is done when the LFSR sequence reaches
1006  * the last value in the sequence.  Hence the user specified value N
1007  * corresponds to the LFSR number that is N from the end of the sequence.
1008  *
1009  * To avoid the time to compute the LFSR, a lookup table is used.  The 24 bit
1010  * LFSR sequence is broken into four ranges.  The spacing of the precomputed
1011  * values is adjusted in each range so the error between the user specified
1012  * number (N) of events between samples and the actual number of events based
1013  * on the precomputed value will be less than about 6.2%.  Note, if the user
1014  * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used.
1015  * This is to prevent the loss of samples because the trace buffer is full.
1016  *
1017  *     User specified N            Step between         Index in
1018  *                                 precomputed values   precomputed table
1019  *
1020  * 0               to 2^16-1             ----                   0
1021  * 2^16            to 2^16+2^19-1        2^12              1 to 128
1022  * 2^16+2^19       to 2^16+2^19+2^22-1   2^15            129 to 256
1023  * 2^16+2^19+2^22  to 2^24-1             2^18            257 to 302
1024  *
1025  *
1026  * For example, the LFSR values in the second range are computed for 2^16,
1027  * 2^16+2^12, ... , 2^19-2^16, 2^19 and stored in the table at indices
1028  * 1, 2,..., 127, 128.
1029  *
1030  * The 24 bit LFSR value for the nth number in the sequence can be
1031  * calculated using the following code:
1032  *
1033  * #define size 24
1034  * int calculate_lfsr(int n)
1035  * {
1036  *  int i;
1037  *  unsigned int newlfsr0;
1038  *  unsigned int lfsr = 0xFFFFFF;
1039  *  unsigned int howmany = n;
1040  *
1041  *  for (i = 2; i < howmany + 2; i++) {
1042  *      newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
1043  *      ((lfsr >> (size - 1 - 1)) & 1) ^
1044  *      (((lfsr >> (size - 1 - 6)) & 1) ^
1045  *      ((lfsr >> (size - 1 - 23)) & 1)));
1046  *
1047  *      lfsr >>= 1;
1048  *      lfsr = lfsr | (newlfsr0 << (size - 1));
1049  *  }
1050  *  return lfsr;
1051  * }
1052  */
1053 
1054 #define V2_16  (0x1 << 16)
1055 #define V2_19  (0x1 << 19)
1056 #define V2_22  (0x1 << 22)
1057 
1058 static int calculate_lfsr(int n)
1059 {
1060     /*
1061      * The ranges and steps are in powers of 2 so the calculations
1062      * can be done using shifts rather than divide.
1063      */
1064     int index;
1065 
1066     if ((n >> 16) == 0)
1067         index = 0;
1068     else if (((n - V2_16) >> 19) == 0)
1069         index = ((n - V2_16) >> 12) + 1;
1070     else if (((n - V2_16 - V2_19) >> 22) == 0)
1071         index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
1072     else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0)
1073         index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256;
1074     else
1075         index = ENTRIES-1;
1076 
1077     /* make sure index is valid */
1078     if ((index >= ENTRIES) || (index < 0))
1079         index = ENTRIES-1;
1080 
1081     return initial_lfsr[index];
1082 }
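/* For example, calculate_lfsr(100000): (100000 >> 16) != 0 but
 * ((100000 - 2^16) >> 19) == 0, so index = ((100000 - 65536) >> 12) + 1 = 9
 * and the precomputed value initial_lfsr[9] is returned.
 */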
1083 
1084 static int pm_rtas_activate_spu_profiling(u32 node)
1085 {
1086     int ret, i;
1087     struct pm_signal pm_signal_local[NUM_SPUS_PER_NODE];
1088 
1089     /*
1090      * Set up the rtas call to configure the debug bus to
1091      * route the SPU PCs.  Setup the pm_signal for each SPU
1092      */
1093     for (i = 0; i < ARRAY_SIZE(pm_signal_local); i++) {
1094         pm_signal_local[i].cpu = node;
1095         pm_signal_local[i].signal_group = 41;
1096         /* spu i on word (i/2) */
1097         pm_signal_local[i].bus_word = 1 << i / 2;
1098         /* spu i */
1099         pm_signal_local[i].sub_unit = i;
1100         pm_signal_local[i].bit = 63;
1101     }
1102 
1103     ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE,
1104                      PASSTHRU_ENABLE, pm_signal_local,
1105                      (ARRAY_SIZE(pm_signal_local)
1106                       * sizeof(struct pm_signal)));
1107 
1108     if (unlikely(ret)) {
1109         printk(KERN_WARNING "%s: rtas returned: %d\n",
1110                __func__, ret);
1111         return -EIO;
1112     }
1113 
1114     return 0;
1115 }
1116 
1117 #ifdef CONFIG_CPU_FREQ
1118 static int
1119 oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
1120 {
1121     int ret = 0;
1122     struct cpufreq_freqs *frq = data;
1123     if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) ||
1124         (val == CPUFREQ_POSTCHANGE && frq->old > frq->new))
1125         set_spu_profiling_frequency(frq->new, spu_cycle_reset);
1126     return ret;
1127 }
1128 
1129 static struct notifier_block cpu_freq_notifier_block = {
1130     .notifier_call  = oprof_cpufreq_notify
1131 };
1132 #endif
1133 
1134 /*
1135  * Note the generic OProfile stop calls do not support returning
1136  * an error on stop.  Hence, we will not return an error if the FW
1137  * calls fail on stop.  Failure to reset the debug bus is not an issue.
1138  * Failure to disable the SPU profiling is not an issue.  The FW calls
1139  * to enable the performance counters and debug bus will work even if
1140  * the hardware was not cleanly reset.
1141  */
1142 static void cell_global_stop_spu_cycles(void)
1143 {
1144     int subfunc, rtn_value;
1145     unsigned int lfsr_value;
1146     int cpu;
1147 
1148     oprofile_running = 0;
1149     smp_wmb();
1150 
1151 #ifdef CONFIG_CPU_FREQ
1152     cpufreq_unregister_notifier(&cpu_freq_notifier_block,
1153                     CPUFREQ_TRANSITION_NOTIFIER);
1154 #endif
1155 
1156     for_each_online_cpu(cpu) {
1157         if (cbe_get_hw_thread_id(cpu))
1158             continue;
1159 
1160         subfunc = 3;    /*
1161                  * 2 - activate SPU tracing,
1162                  * 3 - deactivate
1163                  */
1164         lfsr_value = 0x8f100000;
1165 
1166         rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL,
1167                       subfunc, cbe_cpu_to_node(cpu),
1168                       lfsr_value);
1169 
1170         if (unlikely(rtn_value != 0)) {
1171             printk(KERN_ERR
1172                    "%s: rtas call ibm,cbe-spu-perftools " \
1173                    "failed, return = %d\n",
1174                    __func__, rtn_value);
1175         }
1176 
1177         /* Deactivate the signals */
1178         pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
1179     }
1180 
1181     stop_spu_profiling_cycles();
1182 }
1183 
1184 static void cell_global_stop_spu_events(void)
1185 {
1186     int cpu;
1187     oprofile_running = 0;
1188 
1189     stop_spu_profiling_events();
1190     smp_wmb();
1191 
1192     for_each_online_cpu(cpu) {
1193         if (cbe_get_hw_thread_id(cpu))
1194             continue;
1195 
1196         cbe_sync_irq(cbe_cpu_to_node(cpu));
1197         /* Stop the counters */
1198         cbe_disable_pm(cpu);
1199         cbe_write_pm07_control(cpu, 0, 0);
1200 
1201         /* Deactivate the signals */
1202         pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
1203 
1204         /* Deactivate interrupts */
1205         cbe_disable_pm_interrupts(cpu);
1206     }
1207     del_timer_sync(&timer_spu_event_swap);
1208 }
1209 
1210 static void cell_global_stop_ppu(void)
1211 {
1212     int cpu;
1213 
1214     /*
1215      * This routine will be called once for the system.
1216      * There is one performance monitor per node, so we
1217      * only need to perform this function once per node.
1218      */
1219     del_timer_sync(&timer_virt_cntr);
1220     oprofile_running = 0;
1221     smp_wmb();
1222 
1223     for_each_online_cpu(cpu) {
1224         if (cbe_get_hw_thread_id(cpu))
1225             continue;
1226 
1227         cbe_sync_irq(cbe_cpu_to_node(cpu));
1228         /* Stop the counters */
1229         cbe_disable_pm(cpu);
1230 
1231         /* Deactivate the signals */
1232         pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
1233 
1234         /* Deactivate interrupts */
1235         cbe_disable_pm_interrupts(cpu);
1236     }
1237 }
1238 
1239 static void cell_global_stop(void)
1240 {
1241     if (profiling_mode == PPU_PROFILING)
1242         cell_global_stop_ppu();
1243     else if (profiling_mode == SPU_PROFILING_EVENTS)
1244         cell_global_stop_spu_events();
1245     else
1246         cell_global_stop_spu_cycles();
1247 }
1248 
1249 static int cell_global_start_spu_cycles(struct op_counter_config *ctr)
1250 {
1251     int subfunc;
1252     unsigned int lfsr_value;
1253     int cpu;
1254     int ret;
1255     int rtas_error;
1256     unsigned int cpu_khzfreq = 0;
1257 
1258     /* The SPU profiling uses time-based profiling based on
1259      * cpu frequency, so if configured with the CPU_FREQ
1260      * option, we should detect frequency changes and react
1261      * accordingly.
1262      */
1263 #ifdef CONFIG_CPU_FREQ
1264     ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
1265                     CPUFREQ_TRANSITION_NOTIFIER);
1266     if (ret < 0)
1267         /* this is not a fatal error */
1268         printk(KERN_ERR "CPU freq change registration failed: %d\n",
1269                ret);
1270 
1271     else
1272         cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
1273 #endif
1274 
1275     set_spu_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
1276 
1277     for_each_online_cpu(cpu) {
1278         if (cbe_get_hw_thread_id(cpu))
1279             continue;
1280 
1281         /*
1282          * Setup SPU cycle-based profiling.
1283          * Set perf_mon_control bit 0 to a zero before
1284          * enabling spu collection hardware.
1285          */
1286         cbe_write_pm(cpu, pm_control, 0);
1287 
1288         if (spu_cycle_reset > MAX_SPU_COUNT)
1289             /* use largest possible value */
1290             lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1);
1291         else
1292             lfsr_value = calculate_lfsr(spu_cycle_reset);
1293 
1294         /* must use a non zero value. Zero disables data collection. */
1295         if (lfsr_value == 0)
1296             lfsr_value = calculate_lfsr(1);
1297 
1298         lfsr_value = lfsr_value << 8; /* shift lfsr to correct
1299                         * register location
1300                         */
1301 
1302         /* debug bus setup */
1303         ret = pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
1304 
1305         if (unlikely(ret)) {
1306             rtas_error = ret;
1307             goto out;
1308         }
1309 
1310 
1311         subfunc = 2;    /* 2 - activate SPU tracing, 3 - deactivate */
1312 
1313         /* start profiling */
1314         ret = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
1315                 cbe_cpu_to_node(cpu), lfsr_value);
1316 
1317         if (unlikely(ret != 0)) {
1318             printk(KERN_ERR
1319                    "%s: rtas call ibm,cbe-spu-perftools failed, " \
1320                    "return = %d\n", __func__, ret);
1321             rtas_error = -EIO;
1322             goto out;
1323         }
1324     }
1325 
1326     rtas_error = start_spu_profiling_cycles(spu_cycle_reset);
1327     if (rtas_error)
1328         goto out_stop;
1329 
1330     oprofile_running = 1;
1331     return 0;
1332 
1333 out_stop:
1334     cell_global_stop_spu_cycles();  /* clean up the PMU/debug bus */
1335 out:
1336     return rtas_error;
1337 }
1338 
1339 static int cell_global_start_spu_events(struct op_counter_config *ctr)
1340 {
1341     int cpu;
1342     u32 interrupt_mask = 0;
1343     int rtn = 0;
1344 
1345     hdw_thread = 0;
1346 
1347     /* SPU event profiling uses the performance counters to generate
1348      * an interrupt.  The hardware is set up to store the SPU program
1349      * counter into the trace array.  The occurrence mode is used to
1350      * enable storing data to the trace buffer.  The bits are set
1351      * to send/store the SPU address in the trace buffer.  The debug
1352      * bus must be setup to route the SPU program counter onto the
1353      * debug bus.  The occurrence data in the trace buffer is not used.
1354      */
1355 
1356     /* This routine gets called once for the system.
1357      * There is one performance monitor per node, so we
1358      * only need to perform this function once per node.
1359      */
1360 
1361     for_each_online_cpu(cpu) {
1362         if (cbe_get_hw_thread_id(cpu))
1363             continue;
1364 
1365         /*
1366          * Setup SPU event-based profiling.
1367          * Set perf_mon_control bit 0 to a zero before
1368          * enabling spu collection hardware.
1369          *
1370          * Only support one SPU event on one SPU per node.
1371          */
1372         if (ctr_enabled & 1) {
1373             cbe_write_ctr(cpu, 0, reset_value[0]);
1374             enable_ctr(cpu, 0, pm_regs.pm07_cntrl);
1375             interrupt_mask |=
1376                 CBE_PM_CTR_OVERFLOW_INTR(0);
1377         } else {
1378             /* Disable counter */
1379             cbe_write_pm07_control(cpu, 0, 0);
1380         }
1381 
1382         cbe_get_and_clear_pm_interrupts(cpu);
1383         cbe_enable_pm_interrupts(cpu, hdw_thread, interrupt_mask);
1384         cbe_enable_pm(cpu);
1385 
1386         /* clear the trace buffer */
1387         cbe_write_pm(cpu, trace_address, 0);
1388     }
1389 
1390     /* Start the timer to time slice collecting the event profile
1391      * on each of the SPUs.  Note, can collect profile on one SPU
1392      * per node at a time.
1393      */
1394     start_spu_event_swap();
1395     start_spu_profiling_events();
1396     oprofile_running = 1;
1397     smp_wmb();
1398 
1399     return rtn;
1400 }
1401 
1402 static int cell_global_start_ppu(struct op_counter_config *ctr)
1403 {
1404     u32 cpu, i;
1405     u32 interrupt_mask = 0;
1406 
1407     /* This routine gets called once for the system.
1408      * There is one performance monitor per node, so we
1409      * only need to perform this function once per node.
1410      */
1411     for_each_online_cpu(cpu) {
1412         if (cbe_get_hw_thread_id(cpu))
1413             continue;
1414 
1415         interrupt_mask = 0;
1416 
1417         for (i = 0; i < num_counters; ++i) {
1418             if (ctr_enabled & (1 << i)) {
1419                 cbe_write_ctr(cpu, i, reset_value[i]);
1420                 enable_ctr(cpu, i, pm_regs.pm07_cntrl);
1421                 interrupt_mask |= CBE_PM_CTR_OVERFLOW_INTR(i);
1422             } else {
1423                 /* Disable counter */
1424                 cbe_write_pm07_control(cpu, i, 0);
1425             }
1426         }
1427 
1428         cbe_get_and_clear_pm_interrupts(cpu);
1429         cbe_enable_pm_interrupts(cpu, hdw_thread, interrupt_mask);
1430         cbe_enable_pm(cpu);
1431     }
1432 
1433     virt_cntr_inter_mask = interrupt_mask;
1434     oprofile_running = 1;
1435     smp_wmb();
1436 
1437     /*
1438      * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
1439      * executed which manipulates the PMU.  We start the "virtual counter"
1440      * here so that we do not need to synchronize access to the PMU in
1441      * the above for-loop.
1442      */
1443     start_virt_cntrs();
1444 
1445     return 0;
1446 }
1447 
1448 static int cell_global_start(struct op_counter_config *ctr)
1449 {
1450     if (profiling_mode == SPU_PROFILING_CYCLES)
1451         return cell_global_start_spu_cycles(ctr);
1452     else if (profiling_mode == SPU_PROFILING_EVENTS)
1453         return cell_global_start_spu_events(ctr);
1454     else
1455         return cell_global_start_ppu(ctr);
1456 }
1457 
1458 
1459 /* The SPU interrupt handler
1460  *
1461  * SPU event profiling works as follows:
1462  * The pm_signal[0] holds the one SPU event to be measured.  It is routed on
1463  * the debug bus using word 0 or 1.  The value of pm_signal[1] and
1464  * pm_signal[2] contain the necessary events to route the SPU program
1465  * counter for the selected SPU onto the debug bus using words 2 and 3.
1466  * The pm_interval register is setup to write the SPU PC value into the
1467  * trace buffer at the maximum rate possible.  The trace buffer is configured
1468  * to store the PCs, wrapping when it is full.  The performance counter is
1469  * initialized to the max hardware count minus the number of events, N, between
1470  * samples.  Once the N events have occurred, a HW counter overflow occurs
1471  * causing the generation of a HW counter interrupt which also stops the
1472  * writing of the SPU PC values to the trace buffer.  Hence the last PC
1473  * written to the trace buffer is the SPU PC that we want.  Unfortunately,
1474  * we have to read from the beginning of the trace buffer to get to the
1475  * last value written.  We just hope the PPU has nothing better to do than
1476  * service this interrupt.  The PC for the specific SPU being profiled is
1477  * extracted from the trace buffer, processed and stored.  The trace buffer
1478  * is cleared, interrupts are cleared, and the counter is reset to max - N.
1479  * A kernel timer is used to periodically call the routine spu_evnt_swap()
1480  * to switch to the next physical SPU in the node to profile in round-robin
1481  * order.  This way data is collected for all SPUs on the node.  It does mean
1482  * that we need to use a relatively small value of N to ensure enough samples
1483  * are collected on each SPU, since each SPU is profiled only 1/8 of the time.
1484  * It may also be necessary to use a longer sample collection period.
1485  */
1486 static void cell_handle_interrupt_spu(struct pt_regs *regs,
1487                       struct op_counter_config *ctr)
1488 {
1489     u32 cpu, cpu_tmp;
1490     u64 trace_entry;
1491     u32 interrupt_mask;
1492     u64 trace_buffer[2];
1493     u64 last_trace_buffer;
1494     u32 sample;
1495     u32 trace_addr;
1496     unsigned long sample_array_lock_flags;
1497     int spu_num;
1498     unsigned long flags;
1499 
1500     /* Make sure spu event interrupt handler and spu event swap
1501      * don't access the counters simultaneously.
1502      */
1503     cpu = smp_processor_id();
1504     spin_lock_irqsave(&cntr_lock, flags);
1505 
1506     cpu_tmp = cpu;
1507     cbe_disable_pm(cpu);
1508 
1509     interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
1510 
1511     sample = 0xABCDEF;
1512     trace_entry = 0xfedcba;
1513     last_trace_buffer = 0xdeadbeaf;
1514 
1515     if ((oprofile_running == 1) && (interrupt_mask != 0)) {
1516         /* disable writes to trace buff */
1517         cbe_write_pm(cpu, pm_interval, 0);
1518 
1519         /* only have one perf cntr being used, cntr 0 */
1520         if ((interrupt_mask & CBE_PM_CTR_OVERFLOW_INTR(0))
1521             && ctr[0].enabled)
1522             /* The SPU PC values will be read
1523              * from the trace buffer; reset the counter
1524              */
1525 
1526             cbe_write_ctr(cpu, 0, reset_value[0]);
1527 
1528         trace_addr = cbe_read_pm(cpu, trace_address);
1529 
1530         while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) {
1531             /* There is data in the trace buffer to process
1532              * Read the buffer until you get to the last
1533              * entry.  This is the value we want.
1534              */
1535 
1536             cbe_read_trace_buffer(cpu, trace_buffer);
1537             trace_addr = cbe_read_pm(cpu, trace_address);
1538         }
1539 
1540         /* SPU Address 16 bit count format for 128 bit
1541          * HW trace buffer is used for the SPU PC storage
1542          *    HDR bits          0:15
1543          *    SPU Addr 0 bits   16:31
1544          *    SPU Addr 1 bits   32:47
1545          *    unused bits       48:127
1546          *
1547          * HDR: bit4 = 1 SPU Address 0 valid
1548          * HDR: bit5 = 1 SPU Address 1 valid
1549          *  - unfortunately, the valid bits don't seem to work
1550          *
1551          * Note trace_buffer[0] holds bits 0:63 of the HW
1552          * trace buffer, trace_buffer[1] holds bits 64:127
1553          */
1554 
1555         trace_entry = trace_buffer[0]
1556             & 0x00000000FFFF0000;
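        /* Keep only the SPU address field from the first doubleword of the
         * entry; trace_buffer[1] (bits 64:127) is not needed here.
         */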
1557 
1558         /* Only the top 16 bits of the 18-bit SPU PC are stored in the
1559          * trace buffer (SPU instructions are word aligned, so the low two
1560          * bits are always zero); shift right by 16 - 2 = 14 bits. */
1561         sample = trace_entry >> 14;
1562         last_trace_buffer = trace_buffer[0];
1563 
1564         spu_num = spu_evnt_phys_spu_indx
1565             + (cbe_cpu_to_node(cpu) * NUM_SPUS_PER_NODE);
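        /* Illustrative example (values are hypothetical): if the PMU that
         * raised the interrupt sits on node 1 and physical SPU index 3 is
         * currently selected, spu_num = 3 + 1 * NUM_SPUS_PER_NODE = 11.
         */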
1566 
1567         /* make sure only one process at a time is calling
1568          * spu_sync_buffer()
1569          */
1570         spin_lock_irqsave(&oprof_spu_smpl_arry_lck,
1571                   sample_array_lock_flags);
1572         spu_sync_buffer(spu_num, &sample, 1);
1573         spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck,
1574                        sample_array_lock_flags);
1575 
1576         smp_wmb();    /* ensure the SPU event buffer updates are written
1577                    * out; we don't want events intermingled... */
1578 
1579         /* The counters were frozen by the interrupt.
1580          * Reenable the interrupt and restart the counters.
1581          */
1582         cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC);
1583         cbe_enable_pm_interrupts(cpu, hdw_thread,
1584                      virt_cntr_inter_mask);
1585 
1586         /* clear the trace buffer, re-enable writes to trace buff */
1587         cbe_write_pm(cpu, trace_address, 0);
1588         cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC);
1589 
1590         /* The writes to the various performance counters only write
1591          * to a latch.  The new values (interrupt setting bits, reset
1592          * counter value etc.) are not copied to the actual registers
1593          * until the performance monitor is enabled.  In order to get
1594          * this to work as desired, the performance monitor needs to
1595          * be disabled while writing to the latches.  This is a
1596          * HW design issue.
1597          */
1598         write_pm_cntrl(cpu);
1599         cbe_enable_pm(cpu);
1600     }
1601     spin_unlock_irqrestore(&cntr_lock, flags);
1602 }
1603 
1604 static void cell_handle_interrupt_ppu(struct pt_regs *regs,
1605                       struct op_counter_config *ctr)
1606 {
1607     u32 cpu;
1608     u64 pc;
1609     int is_kernel;
1610     unsigned long flags = 0;
1611     u32 interrupt_mask;
1612     int i;
1613 
1614     cpu = smp_processor_id();
1615 
1616     /*
1617      * Need to make sure the interrupt handler and the virt counter
1618      * routine are not running at the same time. See the
1619      * cell_virtual_cntr() routine for additional comments.
1620      */
1621     spin_lock_irqsave(&cntr_lock, flags);
1622 
1623     /*
1624      * Need to disable and reenable the performance counters
1625      * to get the desired behavior from the hardware.  This
1626      * is hardware specific.
1627      */
1628 
1629     cbe_disable_pm(cpu);
1630 
1631     interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
1632 
1633     /*
1634      * If the interrupt mask has been cleared, then the virt cntr
1635      * has cleared the interrupt.  When the thread that generated
1636      * the interrupt is restored, the data count will be restored to
1637      * 0xffffff0 to cause the interrupt to be regenerated.
1638      */
1639 
1640     if ((oprofile_running == 1) && (interrupt_mask != 0)) {
1641         pc = regs->nip;
1642         is_kernel = is_kernel_addr(pc);
1643 
1644         for (i = 0; i < num_counters; ++i) {
1645             if ((interrupt_mask & CBE_PM_CTR_OVERFLOW_INTR(i))
1646                 && ctr[i].enabled) {
1647                 oprofile_add_ext_sample(pc, regs, i, is_kernel);
1648                 cbe_write_ctr(cpu, i, reset_value[i]);
1649             }
1650         }
1651 
1652         /*
1653          * The counters were frozen by the interrupt.
1654          * Reenable the interrupt and restart the counters.
1655          * If there was a race between the interrupt handler and
1656          * the virtual counter routine, the virtual counter
1657          * routine may have cleared the interrupts.  Hence we must
1658          * use virt_cntr_inter_mask to re-enable the interrupts.
1659          */
1660         cbe_enable_pm_interrupts(cpu, hdw_thread,
1661                      virt_cntr_inter_mask);
1662 
1663         /*
1664          * The writes to the various performance counters only write
1665          * to a latch.  The new values (interrupt setting bits, reset
1666          * counter value etc.) are not copied to the actual registers
1667          * until the performance monitor is enabled.  In order to get
1668          * this to work as desired, the performance monitor needs to
1669          * be disabled while writing to the latches.  This is a
1670          * HW design issue.
1671          */
1672         cbe_enable_pm(cpu);
1673     }
1674     spin_unlock_irqrestore(&cntr_lock, flags);
1675 }
1676 
1677 static void cell_handle_interrupt(struct pt_regs *regs,
1678                   struct op_counter_config *ctr)
1679 {
1680     if (profiling_mode == PPU_PROFILING)
1681         cell_handle_interrupt_ppu(regs, ctr);
1682     else
1683         cell_handle_interrupt_spu(regs, ctr);
1684 }
1685 
1686 /*
1687  * This function is called from the generic OProfile
1688  * driver.  When profiling PPUs, we need to do the
1689  * generic sync start; otherwise, do spu_sync_start.
1690  */
1691 static int cell_sync_start(void)
1692 {
1693     if ((profiling_mode == SPU_PROFILING_CYCLES) ||
1694         (profiling_mode == SPU_PROFILING_EVENTS))
1695         return spu_sync_start();
1696     else
1697         return DO_GENERIC_SYNC;
1698 }
1699 
1700 static int cell_sync_stop(void)
1701 {
1702     if ((profiling_mode == SPU_PROFILING_CYCLES) ||
1703         (profiling_mode == SPU_PROFILING_EVENTS))
1704         return spu_sync_stop();
1705     else
1706         return 1;
1707 }
1708 
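/*
 * Callback table handed to the generic powerpc oprofile layer.  Note that
 * for the SPU profiling modes the sync_start/sync_stop hooks above defer to
 * the SPU sample-synchronization code (spu_sync_start/spu_sync_stop) rather
 * than requesting the generic buffer sync.
 */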
1709 struct op_powerpc_model op_model_cell = {
1710     .reg_setup = cell_reg_setup,
1711     .cpu_setup = cell_cpu_setup,
1712     .global_start = cell_global_start,
1713     .global_stop = cell_global_stop,
1714     .sync_start = cell_sync_start,
1715     .sync_stop = cell_sync_stop,
1716     .handle_interrupt = cell_handle_interrupt,
1717 };