Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Resource Director Technology(RDT)
0004  * - Monitoring code
0005  *
0006  * Copyright (C) 2017 Intel Corporation
0007  *
0008  * Author:
0009  *    Vikas Shivappa <vikas.shivappa@intel.com>
0010  *
0011  * This replaces the cqm.c based on perf but we reuse a lot of
0012  * code and datastructures originally from Peter Zijlstra and Matt Fleming.
0013  *
 * More information about RDT can be found in the Intel (R) x86 Architecture
0015  * Software Developer Manual June 2016, volume 3, section 17.17.
0016  */
0017 
0018 #include <linux/module.h>
0019 #include <linux/slab.h>
0020 #include <asm/cpu_device_id.h>
0021 #include "internal.h"
0022 
/*
 * struct rmid_entry - per-RMID bookkeeping for the free and limbo handling.
 * @rmid: the hardware RMID this entry tracks (entry i of rmid_ptrs has rmid i)
 * @busy: number of domains in which this RMID is still marked busy, i.e. its
 *        LLC occupancy has not yet dropped below resctrl_cqm_threshold
 * @list: links the entry on rmid_free_lru while the RMID is free; a limbo
 *        RMID is on no list until its busy count reaches zero
 */
struct rmid_entry {
	u32			rmid;
	int			busy;
	struct list_head	list;
};
0028 
/**
 * @rmid_free_lru - A least recently used list of free RMIDs
 *     These RMIDs are guaranteed to have an occupancy less than the
 *     threshold occupancy
 */
static LIST_HEAD(rmid_free_lru);

/**
 * @rmid_limbo_count - count of currently unused but (potentially)
 *     dirty RMIDs.
 *     This counts RMIDs that no one is currently using but that
 *     may have an occupancy value > intel_cqm_threshold. User can change
 *     the threshold occupancy value.
 */
static unsigned int rmid_limbo_count;

/**
 * @rmid_ptrs - Array of rmid_entry structures, one per RMID, indexed by
 *     RMID. The entries are linked onto the limbo and free lists.
 */
static struct rmid_entry	*rmid_ptrs;
0049 
/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

/*
 * This is the threshold cache occupancy at which we will consider an
 * RMID available for re-allocation. Expressed in units of
 * boot_cpu_data.x86_cache_occ_scale (see rdt_get_mon_l3_config()).
 */
unsigned int resctrl_cqm_threshold;
0066 
/* Fixed-point representation of @cf scaled by 2^20, rounded to nearest. */
#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

/*
 * The correction factor table is documented in Documentation/x86/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so don't do correction
 *    for the case.
 * 2. MBM total and local correction table indexed by core counter which is
 *    equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

/* RMID threshold and factor selected for this CPU by the quirk below. */
static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
static u64 mbm_cf __read_mostly;
0120 
0121 static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
0122 {
0123     /* Correct MBM value. */
0124     if (rmid > mbm_cf_rmidthreshold)
0125         val = (val * mbm_cf) >> 20;
0126 
0127     return val;
0128 }
0129 
0130 static inline struct rmid_entry *__rmid_entry(u32 rmid)
0131 {
0132     struct rmid_entry *entry;
0133 
0134     entry = &rmid_ptrs[rmid];
0135     WARN_ON(entry->rmid != rmid);
0136 
0137     return entry;
0138 }
0139 
/*
 * Read one monitoring counter for (@rmid, @eventid) from the hardware.
 * Returns the raw IA32_QM_CTR value, including the Error/Unavailable
 * bits; callers test RMID_VAL_ERROR / RMID_VAL_UNAVAIL themselves.
 *
 * NOTE(review): this accesses per-package MSRs, so callers appear to
 * arrange to run on a CPU in the target domain (see add_rmid_to_limbo()
 * and the IPI in mon_event_count()) — confirm before reusing elsewhere.
 */
static u64 __rmid_read(u32 rmid, u32 eventid)
{
	u64 val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
	rdmsrl(MSR_IA32_QM_CTR, val);

	return val;
}
0157 
0158 static bool rmid_dirty(struct rmid_entry *entry)
0159 {
0160     u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
0161 
0162     return val >= resctrl_cqm_threshold;
0163 }
0164 
0165 /*
0166  * Check the RMIDs that are marked as busy for this domain. If the
0167  * reported LLC occupancy is below the threshold clear the busy bit and
0168  * decrement the count. If the busy count gets to zero on an RMID, we
0169  * free the RMID
0170  */
0171 void __check_limbo(struct rdt_domain *d, bool force_free)
0172 {
0173     struct rmid_entry *entry;
0174     struct rdt_resource *r;
0175     u32 crmid = 1, nrmid;
0176 
0177     r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
0178 
0179     /*
0180      * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
0181      * are marked as busy for occupancy < threshold. If the occupancy
0182      * is less than the threshold decrement the busy counter of the
0183      * RMID and move it to the free list when the counter reaches 0.
0184      */
0185     for (;;) {
0186         nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
0187         if (nrmid >= r->num_rmid)
0188             break;
0189 
0190         entry = __rmid_entry(nrmid);
0191         if (force_free || !rmid_dirty(entry)) {
0192             clear_bit(entry->rmid, d->rmid_busy_llc);
0193             if (!--entry->busy) {
0194                 rmid_limbo_count--;
0195                 list_add_tail(&entry->list, &rmid_free_lru);
0196             }
0197         }
0198         crmid = nrmid + 1;
0199     }
0200 }
0201 
0202 bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
0203 {
0204     return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
0205 }
0206 
0207 /*
0208  * As of now the RMIDs allocation is global.
0209  * However we keep track of which packages the RMIDs
0210  * are used to optimize the limbo list management.
0211  */
0212 int alloc_rmid(void)
0213 {
0214     struct rmid_entry *entry;
0215 
0216     lockdep_assert_held(&rdtgroup_mutex);
0217 
0218     if (list_empty(&rmid_free_lru))
0219         return rmid_limbo_count ? -EBUSY : -ENOSPC;
0220 
0221     entry = list_first_entry(&rmid_free_lru,
0222                  struct rmid_entry, list);
0223     list_del(&entry->list);
0224 
0225     return entry->rmid;
0226 }
0227 
/*
 * Park a freed RMID in limbo until its cache occupancy has drained in
 * every L3 domain, or return it to the free list immediately if it is
 * already below the threshold everywhere we can check.
 */
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
	struct rdt_resource *r;
	struct rdt_domain *d;
	int cpu;
	u64 val;

	r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	entry->busy = 0;
	/* Disable preemption so the domain check against this CPU stays valid. */
	cpu = get_cpu();
	list_for_each_entry(d, &r->domains, list) {
		/*
		 * Only the local domain's occupancy is sampled here; a
		 * below-threshold RMID is skipped (not marked busy).
		 * Remote domains are pessimistically treated as dirty.
		 */
		if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
			val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
			if (val <= resctrl_cqm_threshold)
				continue;
		}

		/*
		 * For the first limbo RMID in the domain,
		 * setup up the limbo worker.
		 */
		if (!has_busy_rmid(r, d))
			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
		set_bit(entry->rmid, d->rmid_busy_llc);
		entry->busy++;
	}
	put_cpu();

	/* Busy in at least one domain: limbo. Otherwise immediately reusable. */
	if (entry->busy)
		rmid_limbo_count++;
	else
		list_add_tail(&entry->list, &rmid_free_lru);
}
0262 
0263 void free_rmid(u32 rmid)
0264 {
0265     struct rmid_entry *entry;
0266 
0267     if (!rmid)
0268         return;
0269 
0270     lockdep_assert_held(&rdtgroup_mutex);
0271 
0272     entry = __rmid_entry(rmid);
0273 
0274     if (is_llc_occupancy_enabled())
0275         add_rmid_to_limbo(entry);
0276     else
0277         list_add_tail(&entry->list, &rmid_free_lru);
0278 }
0279 
0280 static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
0281 {
0282     u64 shift = 64 - width, chunks;
0283 
0284     chunks = (cur_msr << shift) - (prev_msr << shift);
0285     return chunks >> shift;
0286 }
0287 
0288 static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
0289 {
0290     struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
0291     struct mbm_state *m;
0292     u64 chunks, tval;
0293 
0294     tval = __rmid_read(rmid, rr->evtid);
0295     if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
0296         return tval;
0297     }
0298     switch (rr->evtid) {
0299     case QOS_L3_OCCUP_EVENT_ID:
0300         rr->val += tval;
0301         return 0;
0302     case QOS_L3_MBM_TOTAL_EVENT_ID:
0303         m = &rr->d->mbm_total[rmid];
0304         break;
0305     case QOS_L3_MBM_LOCAL_EVENT_ID:
0306         m = &rr->d->mbm_local[rmid];
0307         break;
0308     default:
0309         /*
0310          * Code would never reach here because an invalid
0311          * event id would fail the __rmid_read.
0312          */
0313         return RMID_VAL_ERROR;
0314     }
0315 
0316     if (rr->first) {
0317         memset(m, 0, sizeof(struct mbm_state));
0318         m->prev_bw_msr = m->prev_msr = tval;
0319         return 0;
0320     }
0321 
0322     chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width);
0323     m->chunks += chunks;
0324     m->prev_msr = tval;
0325 
0326     rr->val += get_corrected_mbm_count(rmid, m->chunks);
0327 
0328     return 0;
0329 }
0330 
/*
 * Supporting function to calculate the memory bandwidth
 * and delta bandwidth in MBps.
 */
static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
	struct mbm_state *m = &rr->d->mbm_local[rmid];
	u64 tval, cur_bw, chunks;

	tval = __rmid_read(rmid, rr->evtid);
	/* Skip the sample entirely on hardware error/unavailability. */
	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return;

	/* Chunks consumed since the previous bandwidth sample. */
	chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width);
	/*
	 * chunks * mon_scale = bytes; >> 20 converts to MB. This is MBps
	 * assuming samples are ~1s apart (the MBM overflow timer period).
	 */
	cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20;

	/* Only recompute delta_bw when update_mba_bw() requested it. */
	if (m->delta_comp)
		m->delta_bw = abs(cur_bw - m->prev_bw);
	m->delta_comp = false;
	m->prev_bw = cur_bw;
	m->prev_bw_msr = tval;
}
0354 
0355 /*
0356  * This is called via IPI to read the CQM/MBM counters
0357  * on a domain.
0358  */
0359 void mon_event_count(void *info)
0360 {
0361     struct rdtgroup *rdtgrp, *entry;
0362     struct rmid_read *rr = info;
0363     struct list_head *head;
0364     u64 ret_val;
0365 
0366     rdtgrp = rr->rgrp;
0367 
0368     ret_val = __mon_event_count(rdtgrp->mon.rmid, rr);
0369 
0370     /*
0371      * For Ctrl groups read data from child monitor groups and
0372      * add them together. Count events which are read successfully.
0373      * Discard the rmid_read's reporting errors.
0374      */
0375     head = &rdtgrp->mon.crdtgrp_list;
0376 
0377     if (rdtgrp->type == RDTCTRL_GROUP) {
0378         list_for_each_entry(entry, head, mon.crdtgrp_list) {
0379             if (__mon_event_count(entry->mon.rmid, rr) == 0)
0380                 ret_val = 0;
0381         }
0382     }
0383 
0384     /* Report error if none of rmid_reads are successful */
0385     if (ret_val)
0386         rr->val = ret_val;
0387 }
0388 
0389 /*
0390  * Feedback loop for MBA software controller (mba_sc)
0391  *
0392  * mba_sc is a feedback loop where we periodically read MBM counters and
0393  * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
0394  * that:
0395  *
0396  *   current bandwidth(cur_bw) < user specified bandwidth(user_bw)
0397  *
0398  * This uses the MBM counters to measure the bandwidth and MBA throttle
0399  * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
0400  * fact that resctrl rdtgroups have both monitoring and control.
0401  *
0402  * The frequency of the checks is 1s and we just tag along the MBM overflow
0403  * timer. Having 1s interval makes the calculation of bandwidth simpler.
0404  *
0405  * Although MBA's goal is to restrict the bandwidth to a maximum, there may
0406  * be a need to increase the bandwidth to avoid unnecessarily restricting
0407  * the L2 <-> L3 traffic.
0408  *
0409  * Since MBA controls the L2 external bandwidth where as MBM measures the
0410  * L3 external bandwidth the following sequence could lead to such a
0411  * situation.
0412  *
0413  * Consider an rdtgroup which had high L3 <-> memory traffic in initial
0414  * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
0415  * after some time rdtgroup has mostly L2 <-> L3 traffic.
0416  *
0417  * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
0418  * throttle MSRs already have low percentage values.  To avoid
0419  * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
0420  */
static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
	struct mbm_state *pmbm_data, *cmbm_data;
	struct rdt_hw_resource *hw_r_mba;
	struct rdt_hw_domain *hw_dom_mba;
	u32 cur_bw, delta_bw, user_bw;
	struct rdt_resource *r_mba;
	struct rdt_domain *dom_mba;
	struct list_head *head;
	struct rdtgroup *entry;

	/* The feedback loop works on MBM local bandwidth counts only. */
	if (!is_mbm_local_enabled())
		return;

	hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
	r_mba = &hw_r_mba->r_resctrl;
	closid = rgrp->closid;
	rmid = rgrp->mon.rmid;
	/* Parent group's local bandwidth state in the MBM domain. */
	pmbm_data = &dom_mbm->mbm_local[rmid];

	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
	if (!dom_mba) {
		pr_warn_once("Failure to get domain for MBA update\n");
		return;
	}
	hw_dom_mba = resctrl_to_arch_dom(dom_mba);

	cur_bw = pmbm_data->prev_bw;
	user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
	delta_bw = pmbm_data->delta_bw;
	/*
	 * resctrl_arch_get_config() chooses the mbps/ctrl value to return
	 * based on is_mba_sc(). For now, reach into the hw_dom.
	 */
	cur_msr_val = hw_dom_mba->ctrl_val[closid];

	/*
	 * For Ctrl groups read data from child monitor groups.
	 */
	head = &rgrp->mon.crdtgrp_list;
	list_for_each_entry(entry, head, mon.crdtgrp_list) {
		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
		cur_bw += cmbm_data->prev_bw;
		delta_bw += cmbm_data->delta_bw;
	}

	/*
	 * Scale up/down the bandwidth linearly for the ctrl group.  The
	 * bandwidth step is the bandwidth granularity specified by the
	 * hardware.
	 *
	 * The delta_bw is used when increasing the bandwidth so that we
	 * dont alternately increase and decrease the control values
	 * continuously.
	 *
	 * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
	 * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
	 * switching between 90 and 110 continuously if we only check
	 * cur_bw < user_bw.
	 */
	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
	} else if (cur_msr_val < MAX_MBA_BW &&
		   (user_bw > (cur_bw + delta_bw))) {
		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
	} else {
		/* Already within the hysteresis band: leave the MSR alone. */
		return;
	}

	/* Program the new throttle value and mirror it in the cached ctrl_val. */
	cur_msr = hw_r_mba->msr_base + closid;
	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
	hw_dom_mba->ctrl_val[closid] = new_msr_val;

	/*
	 * Delta values are updated dynamically package wise for each
	 * rdtgrp every time the throttle MSR changes value.
	 *
	 * This is because (1)the increase in bandwidth is not perfectly
	 * linear and only "approximately" linear even when the hardware
	 * says it is linear.(2)Also since MBA is a core specific
	 * mechanism, the delta values vary based on number of cores used
	 * by the rdtgrp.
	 */
	pmbm_data->delta_comp = true;
	list_for_each_entry(entry, head, mon.crdtgrp_list) {
		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
		cmbm_data->delta_comp = true;
	}
}
0511 
0512 static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
0513 {
0514     struct rmid_read rr;
0515 
0516     rr.first = false;
0517     rr.r = r;
0518     rr.d = d;
0519 
0520     /*
0521      * This is protected from concurrent reads from user
0522      * as both the user and we hold the global mutex.
0523      */
0524     if (is_mbm_total_enabled()) {
0525         rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
0526         __mon_event_count(rmid, &rr);
0527     }
0528     if (is_mbm_local_enabled()) {
0529         rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
0530         __mon_event_count(rmid, &rr);
0531 
0532         /*
0533          * Call the MBA software controller only for the
0534          * control groups and when user has enabled
0535          * the software controller explicitly.
0536          */
0537         if (is_mba_sc(NULL))
0538             mbm_bw_count(rmid, &rr);
0539     }
0540 }
0541 
0542 /*
0543  * Handler to scan the limbo list and move the RMIDs
0544  * to free list whose occupancy < threshold_occupancy.
0545  */
0546 void cqm_handle_limbo(struct work_struct *work)
0547 {
0548     unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
0549     int cpu = smp_processor_id();
0550     struct rdt_resource *r;
0551     struct rdt_domain *d;
0552 
0553     mutex_lock(&rdtgroup_mutex);
0554 
0555     r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
0556     d = container_of(work, struct rdt_domain, cqm_limbo.work);
0557 
0558     __check_limbo(d, false);
0559 
0560     if (has_busy_rmid(r, d))
0561         schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
0562 
0563     mutex_unlock(&rdtgroup_mutex);
0564 }
0565 
0566 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
0567 {
0568     unsigned long delay = msecs_to_jiffies(delay_ms);
0569     int cpu;
0570 
0571     cpu = cpumask_any(&dom->cpu_mask);
0572     dom->cqm_work_cpu = cpu;
0573 
0574     schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
0575 }
0576 
0577 void mbm_handle_overflow(struct work_struct *work)
0578 {
0579     unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
0580     struct rdtgroup *prgrp, *crgrp;
0581     int cpu = smp_processor_id();
0582     struct list_head *head;
0583     struct rdt_resource *r;
0584     struct rdt_domain *d;
0585 
0586     mutex_lock(&rdtgroup_mutex);
0587 
0588     if (!static_branch_likely(&rdt_mon_enable_key))
0589         goto out_unlock;
0590 
0591     r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
0592     d = container_of(work, struct rdt_domain, mbm_over.work);
0593 
0594     list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
0595         mbm_update(r, d, prgrp->mon.rmid);
0596 
0597         head = &prgrp->mon.crdtgrp_list;
0598         list_for_each_entry(crgrp, head, mon.crdtgrp_list)
0599             mbm_update(r, d, crgrp->mon.rmid);
0600 
0601         if (is_mba_sc(NULL))
0602             update_mba_bw(prgrp, d);
0603     }
0604 
0605     schedule_delayed_work_on(cpu, &d->mbm_over, delay);
0606 
0607 out_unlock:
0608     mutex_unlock(&rdtgroup_mutex);
0609 }
0610 
0611 void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
0612 {
0613     unsigned long delay = msecs_to_jiffies(delay_ms);
0614     int cpu;
0615 
0616     if (!static_branch_likely(&rdt_mon_enable_key))
0617         return;
0618     cpu = cpumask_any(&dom->cpu_mask);
0619     dom->mbm_work_cpu = cpu;
0620     schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
0621 }
0622 
0623 static int dom_data_init(struct rdt_resource *r)
0624 {
0625     struct rmid_entry *entry = NULL;
0626     int i, nr_rmids;
0627 
0628     nr_rmids = r->num_rmid;
0629     rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
0630     if (!rmid_ptrs)
0631         return -ENOMEM;
0632 
0633     for (i = 0; i < nr_rmids; i++) {
0634         entry = &rmid_ptrs[i];
0635         INIT_LIST_HEAD(&entry->list);
0636 
0637         entry->rmid = i;
0638         list_add_tail(&entry->list, &rmid_free_lru);
0639     }
0640 
0641     /*
0642      * RMID 0 is special and is always allocated. It's used for all
0643      * tasks that are not monitored.
0644      */
0645     entry = __rmid_entry(0);
0646     list_del(&entry->list);
0647 
0648     return 0;
0649 }
0650 
/* LLC occupancy monitoring event. */
static struct mon_evt llc_occupancy_event = {
	.name		= "llc_occupancy",
	.evtid		= QOS_L3_OCCUP_EVENT_ID,
};

/* Total memory bandwidth monitoring event. */
static struct mon_evt mbm_total_event = {
	.name		= "mbm_total_bytes",
	.evtid		= QOS_L3_MBM_TOTAL_EVENT_ID,
};

/* Local (same-package) memory bandwidth monitoring event. */
static struct mon_evt mbm_local_event = {
	.name		= "mbm_local_bytes",
	.evtid		= QOS_L3_MBM_LOCAL_EVENT_ID,
};
0665 
/*
 * Initialize the event list for the resource.
 *
 * Note that MBM events are also part of RDT_RESOURCE_L3 resource
 * because as per the SDM the total and local memory bandwidth
 * are enumerated as part of L3 monitoring.
 */
static void l3_mon_evt_init(struct rdt_resource *r)
{
	INIT_LIST_HEAD(&r->evt_list);

	/* Add only the events the hardware/boot configuration enabled. */
	if (is_llc_occupancy_enabled())
		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
	if (is_mbm_total_enabled())
		list_add_tail(&mbm_total_event.list, &r->evt_list);
	if (is_mbm_local_enabled())
		list_add_tail(&mbm_local_event.list, &r->evt_list);
}
0684 
/*
 * Configure L3 monitoring from CPUID-derived boot_cpu_data: number of
 * RMIDs, counter scale/width, the occupancy re-allocation threshold,
 * the RMID entry table and the event list. Returns 0 on success or a
 * negative errno from dom_data_init().
 */
int rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int cl_size = boot_cpu_data.x86_cache_size;
	int ret;

	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
	r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
	/* Base counter width; the CPU may report an extra width offset. */
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;

	/* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
	resctrl_cqm_threshold /= hw_res->mon_scale;

	ret = dom_data_init(r);
	if (ret)
		return ret;

	l3_mon_evt_init(r);

	r->mon_capable = true;
	r->mon_enabled = true;

	return 0;
}
0724 
0725 void __init intel_rdt_mbm_apply_quirk(void)
0726 {
0727     int cf_index;
0728 
0729     cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
0730     if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
0731         pr_info("No MBM correction factor available\n");
0732         return;
0733     }
0734 
0735     mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
0736     mbm_cf = mbm_cf_table[cf_index].cf;
0737 }