// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/xarray.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>

#include "iommu.h"
#include "pasid.h"
#include "perf.h"
#include "../iommu-sva-lib.h"
#include "trace.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)

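/*
 * Per-PASID private data (struct intel_svm) is kept in a global xarray
 * keyed by the PASID value; xa_alloc() with XA_LIMIT(pasid, pasid) pins
 * the entry at exactly that index.
 */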
static DEFINE_XARRAY_ALLOC(pasid_private_array);
static int pasid_private_add(ioasid_t pasid, void *priv)
{
    return xa_alloc(&pasid_private_array, &pasid, priv,
            XA_LIMIT(pasid, pasid), GFP_ATOMIC);
}

static void pasid_private_remove(ioasid_t pasid)
{
    xa_erase(&pasid_private_array, pasid);
}

static void *pasid_private_find(ioasid_t pasid)
{
    return xa_load(&pasid_private_array, pasid);
}

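/*
 * Look up the intel_svm_dev bound to @svm that matches a PCI source-id
 * or a struct device. The device list is RCU protected; a NULL return
 * means the device is not bound to this PASID.
 */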
static struct intel_svm_dev *
svm_lookup_device_by_sid(struct intel_svm *svm, u16 sid)
{
    struct intel_svm_dev *sdev = NULL, *t;

    rcu_read_lock();
    list_for_each_entry_rcu(t, &svm->devs, list) {
        if (t->sid == sid) {
            sdev = t;
            break;
        }
    }
    rcu_read_unlock();

    return sdev;
}

static struct intel_svm_dev *
svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev)
{
    struct intel_svm_dev *sdev = NULL, *t;

    rcu_read_lock();
    list_for_each_entry_rcu(t, &svm->devs, list) {
        if (t->dev == dev) {
            sdev = t;
            break;
        }
    }
    rcu_read_unlock();

    return sdev;
}

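/*
 * Set up the page request queue for @iommu: allocate the queue pages,
 * an interrupt vector and an IOPF queue, install the threaded handler,
 * and program the PRQ head/tail/address registers.
 */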
int intel_svm_enable_prq(struct intel_iommu *iommu)
{
    struct iopf_queue *iopfq;
    struct page *pages;
    int irq, ret;

    pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
    if (!pages) {
        pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
            iommu->name);
        return -ENOMEM;
    }
    iommu->prq = page_address(pages);

    irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
    if (irq <= 0) {
        pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
               iommu->name);
        ret = -EINVAL;
        goto free_prq;
    }
    iommu->pr_irq = irq;

    snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name),
         "dmar%d-iopfq", iommu->seq_id);
    iopfq = iopf_queue_alloc(iommu->iopfq_name);
    if (!iopfq) {
        pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name);
        ret = -ENOMEM;
        goto free_hwirq;
    }
    iommu->iopf_queue = iopfq;

    snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

    ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
                   iommu->prq_name, iommu);
    if (ret) {
        pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
               iommu->name);
        goto free_iopfq;
    }
    dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
    dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
    dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

    init_completion(&iommu->prq_complete);

    return 0;

free_iopfq:
    iopf_queue_free(iommu->iopf_queue);
    iommu->iopf_queue = NULL;
free_hwirq:
    dmar_free_hwirq(irq);
    iommu->pr_irq = 0;
free_prq:
    free_pages((unsigned long)iommu->prq, PRQ_ORDER);
    iommu->prq = NULL;

    return ret;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
    dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
    dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
    dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

    if (iommu->pr_irq) {
        free_irq(iommu->pr_irq, iommu);
        dmar_free_hwirq(iommu->pr_irq);
        iommu->pr_irq = 0;
    }

    if (iommu->iopf_queue) {
        iopf_queue_free(iommu->iopf_queue);
        iommu->iopf_queue = NULL;
    }

    free_pages((unsigned long)iommu->prq, PRQ_ORDER);
    iommu->prq = NULL;

    return 0;
}

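/*
 * SVM shares CPU page tables with the IOMMU, so the first-level
 * translation capabilities must match what the CPU may use: 1GB pages
 * and 5-level paging. Only then is the unit marked SVM capable.
 */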
void intel_svm_check(struct intel_iommu *iommu)
{
    if (!pasid_supported(iommu))
        return;

    if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
        !cap_fl1gp_support(iommu->cap)) {
        pr_err("%s SVM disabled, incompatible 1GB page capability\n",
               iommu->name);
        return;
    }

    if (cpu_feature_enabled(X86_FEATURE_LA57) &&
        !cap_5lp_support(iommu->cap)) {
        pr_err("%s SVM disabled, incompatible paging mode\n",
               iommu->name);
        return;
    }

    iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

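/*
 * Invalidate the IOTLB (and the device TLB if ATS is enabled) for a
 * naturally aligned, power-of-two sized range of a PASID. Callers go
 * through intel_flush_svm_range_dev(), which splits arbitrary ranges
 * into such chunks.
 */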
static void __flush_svm_range_dev(struct intel_svm *svm,
                  struct intel_svm_dev *sdev,
                  unsigned long address,
                  unsigned long pages, int ih)
{
    struct device_domain_info *info = dev_iommu_priv_get(sdev->dev);

    if (WARN_ON(!pages))
        return;

    qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
    if (info->ats_enabled)
        qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
                     svm->pasid, sdev->qdep, address,
                     order_base_2(pages));
}

static void intel_flush_svm_range_dev(struct intel_svm *svm,
                      struct intel_svm_dev *sdev,
                      unsigned long address,
                      unsigned long pages, int ih)
{
    unsigned long shift = ilog2(__roundup_pow_of_two(pages));
    unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
    unsigned long start = ALIGN_DOWN(address, align);
    unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);

    while (start < end) {
        __flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
        start += align;
    }
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
                unsigned long pages, int ih)
{
    struct intel_svm_dev *sdev;

    rcu_read_lock();
    list_for_each_entry_rcu(sdev, &svm->devs, list)
        intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
    rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
                   struct mm_struct *mm,
                   unsigned long start, unsigned long end)
{
    struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

    intel_flush_svm_range(svm, start,
                  (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
    struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
    struct intel_svm_dev *sdev;

    /* This might end up being called from exit_mmap(), *before* the page
     * tables are cleared. And __mmu_notifier_release() will delete us from
     * the list of notifiers so that our invalidate_range() callback doesn't
     * get called when the page tables are cleared. So we need to protect
     * against hardware accessing those page tables.
     *
     * We do it by clearing the entry in the PASID table and then flushing
     * the IOTLB and the PASID table caches. This might upset hardware;
     * perhaps we'll want to point the PASID to a dummy PGD (like the zero
     * page) so that we end up taking a fault that the hardware really
     * *has* to handle gracefully without affecting other processes.
     */
    rcu_read_lock();
    list_for_each_entry_rcu(sdev, &svm->devs, list)
        intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
                        svm->pasid, true);
    rcu_read_unlock();
}

static const struct mmu_notifier_ops intel_mmuops = {
    .release = intel_mm_release,
    .invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);

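/*
 * Resolve @pasid to its intel_svm and, if @dev is bound to it, the
 * matching intel_svm_dev. Returns 0 with *rsvm/*rsdev set (either may
 * be NULL), or a negative errno for an invalid PASID.
 */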
static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
                 struct intel_svm **rsvm,
                 struct intel_svm_dev **rsdev)
{
    struct intel_svm_dev *sdev = NULL;
    struct intel_svm *svm;

    /* The caller should hold the pasid_mutex lock */
    if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
        return -EINVAL;

    if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
        return -EINVAL;

    svm = pasid_private_find(pasid);
    if (IS_ERR(svm))
        return PTR_ERR(svm);

    if (!svm)
        goto out;

    /*
     * If we found svm for the PASID, there must be at least one device
     * bond.
     */
    if (WARN_ON(list_empty(&svm->devs)))
        return -EINVAL;
    sdev = svm_lookup_device_by_dev(svm, dev);

out:
    *rsvm = svm;
    *rsdev = sdev;

    return 0;
}

static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm,
                 unsigned int flags)
{
    ioasid_t max_pasid = dev_is_pci(dev) ?
            pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id;

    return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
}

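/*
 * Bind @mm to @dev under the PASID that was allocated for the mm. The
 * intel_svm is created (and an MMU notifier registered, except in
 * supervisor mode) on first use; additional devices sharing the PASID
 * only get a new intel_svm_dev entry and a first-level PASID table
 * entry. Called with pasid_mutex held.
 */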
static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
                       struct device *dev,
                       struct mm_struct *mm,
                       unsigned int flags)
{
    struct device_domain_info *info = dev_iommu_priv_get(dev);
    struct intel_svm_dev *sdev;
    struct intel_svm *svm;
    unsigned long sflags;
    int ret = 0;

    svm = pasid_private_find(mm->pasid);
    if (!svm) {
        svm = kzalloc(sizeof(*svm), GFP_KERNEL);
        if (!svm)
            return ERR_PTR(-ENOMEM);

        svm->pasid = mm->pasid;
        svm->mm = mm;
        svm->flags = flags;
        INIT_LIST_HEAD_RCU(&svm->devs);

        if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) {
            svm->notifier.ops = &intel_mmuops;
            ret = mmu_notifier_register(&svm->notifier, mm);
            if (ret) {
                kfree(svm);
                return ERR_PTR(ret);
            }
        }

        ret = pasid_private_add(svm->pasid, svm);
        if (ret) {
            if (svm->notifier.ops)
                mmu_notifier_unregister(&svm->notifier, mm);
            kfree(svm);
            return ERR_PTR(ret);
        }
    }

    /* Find the matching device in svm list */
    sdev = svm_lookup_device_by_dev(svm, dev);
    if (sdev) {
        sdev->users++;
        goto success;
    }

    sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
    if (!sdev) {
        ret = -ENOMEM;
        goto free_svm;
    }

    sdev->dev = dev;
    sdev->iommu = iommu;
    sdev->did = FLPT_DEFAULT_DID;
    sdev->sid = PCI_DEVID(info->bus, info->devfn);
    sdev->users = 1;
    sdev->pasid = svm->pasid;
    sdev->sva.dev = dev;
    init_rcu_head(&sdev->rcu);
    if (info->ats_enabled) {
        sdev->dev_iotlb = 1;
        sdev->qdep = info->ats_qdep;
        if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
            sdev->qdep = 0;
    }

    /* Setup the pasid table: */
    sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ?
            PASID_FLAG_SUPERVISOR_MODE : 0;
    sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
    ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid,
                        FLPT_DEFAULT_DID, sflags);
    if (ret)
        goto free_sdev;

    list_add_rcu(&sdev->list, &svm->devs);
success:
    return &sdev->sva;

free_sdev:
    kfree(sdev);
free_svm:
    if (list_empty(&svm->devs)) {
        if (svm->notifier.ops)
            mmu_notifier_unregister(&svm->notifier, mm);
        pasid_private_remove(mm->pasid);
        kfree(svm);
    }

    return ERR_PTR(ret);
}

/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
    struct intel_svm_dev *sdev;
    struct intel_iommu *iommu;
    struct intel_svm *svm;
    struct mm_struct *mm;
    int ret = -EINVAL;

    iommu = device_to_iommu(dev, NULL, NULL);
    if (!iommu)
        goto out;

    ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
    if (ret)
        goto out;
    mm = svm->mm;

    if (sdev) {
        sdev->users--;
        if (!sdev->users) {
            list_del_rcu(&sdev->list);
            /* Flush the PASID cache and IOTLB for this device.
             * Note that we do depend on the hardware *not* using
             * the PASID any more. Just as we depend on other
             * devices never using PASIDs that they have no right
             * to use. We have a *shared* PASID table, because it's
             * large and has to be physically contiguous. So it's
             * hard to be as defensive as we might like. */
            intel_pasid_tear_down_entry(iommu, dev,
                            svm->pasid, false);
            intel_svm_drain_prq(dev, svm->pasid);
            kfree_rcu(sdev, rcu);

            if (list_empty(&svm->devs)) {
                if (svm->notifier.ops)
                    mmu_notifier_unregister(&svm->notifier, mm);
                pasid_private_remove(svm->pasid);
                /* We mandate that no page faults may be outstanding
                 * for the PASID when intel_svm_unbind_mm() is called.
                 * If that is not obeyed, subtle errors will happen.
                 * Let's make them less subtle... */
                memset(svm, 0x6b, sizeof(*svm));
                kfree(svm);
            }
        }
    }
out:
    return ret;
}

/* Page request queue descriptor */
struct page_req_dsc {
    union {
        struct {
            u64 type:8;
            u64 pasid_present:1;
            u64 priv_data_present:1;
            u64 rsvd:6;
            u64 rid:16;
            u64 pasid:20;
            u64 exe_req:1;
            u64 pm_req:1;
            u64 rsvd2:10;
        };
        u64 qw_0;
    };
    union {
        struct {
            u64 rd_req:1;
            u64 wr_req:1;
            u64 lpig:1;
            u64 prg_index:9;
            u64 addr:52;
        };
        u64 qw_1;
    };
    u64 priv_data[2];
};

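/*
 * A page request address must be canonical, i.e. bits above the
 * virtual address width must be a sign extension of the top valid bit.
 */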
static bool is_canonical_address(u64 addr)
{
    int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
    long saddr = (long) addr;

    return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then follow the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
    struct device_domain_info *info;
    struct dmar_domain *domain;
    struct intel_iommu *iommu;
    struct qi_desc desc[3];
    struct pci_dev *pdev;
    int head, tail;
    u16 sid, did;
    int qdep;

    info = dev_iommu_priv_get(dev);
    if (WARN_ON(!info || !dev_is_pci(dev)))
        return;

    if (!info->pri_enabled)
        return;

    iommu = info->iommu;
    domain = info->domain;
    pdev = to_pci_dev(dev);
    sid = PCI_DEVID(info->bus, info->devfn);
    did = domain_id_iommu(domain, iommu);
    qdep = pci_ats_queue_depth(pdev);

    /*
     * Check and wait until all pending page requests in the queue are
     * handled by the prq handling thread.
     */
prq_retry:
    reinit_completion(&iommu->prq_complete);
    tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
    head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
    while (head != tail) {
        struct page_req_dsc *req;

        req = &iommu->prq[head / sizeof(*req)];
        if (!req->pasid_present || req->pasid != pasid) {
            head = (head + sizeof(*req)) & PRQ_RING_MASK;
            continue;
        }

        wait_for_completion(&iommu->prq_complete);
        goto prq_retry;
    }

    /*
     * A work in IO page fault workqueue may try to lock pasid_mutex now.
     * Holding pasid_mutex while waiting in iopf_queue_flush_dev() for
     * all works in the workqueue to finish may cause deadlock.
     *
     * It's unnecessary to hold pasid_mutex in iopf_queue_flush_dev().
     * Unlock it to allow the works to be handled while waiting for
     * them to finish.
     */
    lockdep_assert_held(&pasid_mutex);
    mutex_unlock(&pasid_mutex);
    iopf_queue_flush_dev(dev);
    mutex_lock(&pasid_mutex);

    /*
     * Perform steps described in VT-d spec CH7.10 to drain page
     * requests and responses in hardware.
     */
    memset(desc, 0, sizeof(desc));
    desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
            QI_IWD_FENCE |
            QI_IWD_TYPE;
    desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
            QI_EIOTLB_DID(did) |
            QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
            QI_EIOTLB_TYPE;
    desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
            QI_DEV_EIOTLB_SID(sid) |
            QI_DEV_EIOTLB_QDEP(qdep) |
            QI_DEIOTLB_TYPE |
            QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
    reinit_completion(&iommu->prq_complete);
    qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
    if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
        wait_for_completion(&iommu->prq_complete);
        goto qi_retry;
    }
}

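/* Translate the access bits of a page request into IOMMU_FAULT_PERM_* flags. */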
static int prq_to_iommu_prot(struct page_req_dsc *req)
{
    int prot = 0;

    if (req->rd_req)
        prot |= IOMMU_FAULT_PERM_READ;
    if (req->wr_req)
        prot |= IOMMU_FAULT_PERM_WRITE;
    if (req->exe_req)
        prot |= IOMMU_FAULT_PERM_EXEC;
    if (req->pm_req)
        prot |= IOMMU_FAULT_PERM_PRIV;

    return prot;
}

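/*
 * Convert a page request descriptor into a struct iommu_fault_event and
 * report it through iommu_report_device_fault() for handling by the I/O
 * page fault machinery.
 */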
static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev,
                struct page_req_dsc *desc)
{
    struct iommu_fault_event event;

    if (!dev || !dev_is_pci(dev))
        return -ENODEV;

    /* Fill in event data for device specific processing */
    memset(&event, 0, sizeof(struct iommu_fault_event));
    event.fault.type = IOMMU_FAULT_PAGE_REQ;
    event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
    event.fault.prm.pasid = desc->pasid;
    event.fault.prm.grpid = desc->prg_index;
    event.fault.prm.perm = prq_to_iommu_prot(desc);

    if (desc->lpig)
        event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
    if (desc->pasid_present) {
        event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
        event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
    }
    if (desc->priv_data_present) {
        /*
         * Set last page in group bit if private data is present,
         * page response is required as it does for LPIG.
         * iommu_report_device_fault() doesn't understand this vendor
         * specific requirement thus we set last_page as a workaround.
         */
        event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
        event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
        event.fault.prm.private_data[0] = desc->priv_data[0];
        event.fault.prm.private_data[1] = desc->priv_data[1];
    } else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) {
        /*
         * If the private data fields are not used by hardware, use it
         * to monitor the prq handle latency.
         */
        event.fault.prm.private_data[0] = ktime_to_ns(ktime_get());
    }

    return iommu_report_device_fault(dev, &event);
}

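/*
 * Report a malformed page request and, when the descriptor demands a
 * response (LPIG or private data present), send a page group response
 * carrying @result back to the device.
 */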
static void handle_bad_prq_event(struct intel_iommu *iommu,
                 struct page_req_dsc *req, int result)
{
    struct qi_desc desc;

    pr_err("%s: Invalid page request: %08llx %08llx\n",
           iommu->name, ((unsigned long long *)req)[0],
           ((unsigned long long *)req)[1]);

    /*
     * Per VT-d spec. v3.0 ch7.7, system software must
     * respond with page group response if private data
     * is present (PDP) or last page in group (LPIG) bit
     * is set. This is an additional VT-d feature beyond
     * PCI ATS spec.
     */
    if (!req->lpig && !req->priv_data_present)
        return;

    desc.qw0 = QI_PGRP_PASID(req->pasid) |
            QI_PGRP_DID(req->rid) |
            QI_PGRP_PASID_P(req->pasid_present) |
            QI_PGRP_PDP(req->priv_data_present) |
            QI_PGRP_RESP_CODE(result) |
            QI_PGRP_RESP_TYPE;
    desc.qw1 = QI_PGRP_IDX(req->prg_index) |
            QI_PGRP_LPIG(req->lpig);

    if (req->priv_data_present) {
        desc.qw2 = req->priv_data[0];
        desc.qw3 = req->priv_data[1];
    } else {
        desc.qw2 = 0;
        desc.qw3 = 0;
    }

    qi_submit_sync(iommu, &desc, 1, 0);
}

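/*
 * Threaded handler for the page request queue interrupt. Walk the queue
 * from head to tail, sanity check each descriptor, match it to its SVM
 * bond and device, and report it as an I/O page fault. The queue head
 * is advanced past everything that was consumed, and any overflow
 * condition is cleared once the queue has drained.
 */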
static irqreturn_t prq_event_thread(int irq, void *d)
{
    struct intel_svm_dev *sdev = NULL;
    struct intel_iommu *iommu = d;
    struct intel_svm *svm = NULL;
    struct page_req_dsc *req;
    int head, tail, handled;
    u64 address;

    /*
     * Clear PPR bit before reading head/tail registers, to ensure that
     * we get a new interrupt if needed.
     */
    writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

    tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
    head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
    handled = (head != tail);
    while (head != tail) {
        req = &iommu->prq[head / sizeof(*req)];
        address = (u64)req->addr << VTD_PAGE_SHIFT;

        if (unlikely(!req->pasid_present)) {
            pr_err("IOMMU: %s: Page request without PASID\n",
                   iommu->name);
bad_req:
            svm = NULL;
            sdev = NULL;
            handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
            goto prq_advance;
        }

        if (unlikely(!is_canonical_address(address))) {
            pr_err("IOMMU: %s: Address is not canonical\n",
                   iommu->name);
            goto bad_req;
        }

        if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) {
            pr_err("IOMMU: %s: Page request in Privilege Mode\n",
                   iommu->name);
            goto bad_req;
        }

        if (unlikely(req->exe_req && req->rd_req)) {
            pr_err("IOMMU: %s: Execution request not supported\n",
                   iommu->name);
            goto bad_req;
        }

        /* Drop Stop Marker message. No need for a response. */
        if (unlikely(req->lpig && !req->rd_req && !req->wr_req))
            goto prq_advance;

        if (!svm || svm->pasid != req->pasid) {
            /*
             * It can't go away, because the driver is not permitted
             * to unbind the mm while any page faults are outstanding.
             */
            svm = pasid_private_find(req->pasid);
            if (IS_ERR_OR_NULL(svm) || (svm->flags & SVM_FLAG_SUPERVISOR_MODE))
                goto bad_req;
        }

        if (!sdev || sdev->sid != req->rid) {
            sdev = svm_lookup_device_by_sid(svm, req->rid);
            if (!sdev)
                goto bad_req;
        }

        sdev->prq_seq_number++;

        /*
         * If prq is to be handled outside iommu driver via receiver of
         * the fault notifiers, we skip the page response here.
         */
        if (intel_svm_prq_report(iommu, sdev->dev, req))
            handle_bad_prq_event(iommu, req, QI_RESP_INVALID);

        trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1,
                 req->priv_data[0], req->priv_data[1],
                 sdev->prq_seq_number);
prq_advance:
        head = (head + sizeof(*req)) & PRQ_RING_MASK;
    }

    dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

    /*
     * Clear the page request overflow bit and wake up all threads that
     * are waiting for the completion of this handling.
     */
    if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
        pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
                    iommu->name);
        head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
        tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
        if (head == tail) {
            iopf_queue_discard_partial(iommu->iopf_queue);
            writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
            pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared",
                        iommu->name);
        }
    }

    if (!completion_done(&iommu->prq_complete))
        complete(&iommu->prq_complete);

    return IRQ_RETVAL(handled);
}

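/*
 * intel_svm_bind() backs the SVA bind operation for VT-d. A purely
 * illustrative driver-side sequence that reaches this function through
 * the generic SVA API might look like this (the exact flags, error
 * handling and device programming are up to the calling driver):
 *
 *    struct iommu_sva *handle;
 *    u32 pasid;
 *
 *    handle = iommu_sva_bind_device(dev, current->mm, NULL);
 *    if (IS_ERR(handle))
 *        return PTR_ERR(handle);
 *    pasid = iommu_sva_get_pasid(handle);
 *    ...program the PASID into the device and start DMA...
 *    iommu_sva_unbind_device(handle);
 */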
struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
    struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
    unsigned int flags = 0;
    struct iommu_sva *sva;
    int ret;

    if (drvdata)
        flags = *(unsigned int *)drvdata;

    if (flags & SVM_FLAG_SUPERVISOR_MODE) {
        if (!ecap_srs(iommu->ecap)) {
            dev_err(dev, "%s: Supervisor PASID not supported\n",
                iommu->name);
            return ERR_PTR(-EOPNOTSUPP);
        }

        if (mm) {
            dev_err(dev, "%s: Supervisor PASID with user provided mm\n",
                iommu->name);
            return ERR_PTR(-EINVAL);
        }

        mm = &init_mm;
    }

    mutex_lock(&pasid_mutex);
    ret = intel_svm_alloc_pasid(dev, mm, flags);
    if (ret) {
        mutex_unlock(&pasid_mutex);
        return ERR_PTR(ret);
    }

    sva = intel_svm_bind_mm(iommu, dev, mm, flags);
    mutex_unlock(&pasid_mutex);

    return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
    struct intel_svm_dev *sdev = to_intel_svm_dev(sva);

    mutex_lock(&pasid_mutex);
    intel_svm_unbind_mm(sdev->dev, sdev->pasid);
    mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
    struct intel_svm_dev *sdev;
    u32 pasid;

    mutex_lock(&pasid_mutex);
    sdev = to_intel_svm_dev(sva);
    pasid = sdev->pasid;
    mutex_unlock(&pasid_mutex);

    return pasid;
}

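/*
 * Complete a page request group on behalf of a fault handler: validate
 * the response against the original fault event and, when the request
 * requires it, send the page group response descriptor to hardware.
 */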
int intel_svm_page_response(struct device *dev,
                struct iommu_fault_event *evt,
                struct iommu_page_response *msg)
{
    struct iommu_fault_page_request *prm;
    struct intel_svm_dev *sdev = NULL;
    struct intel_svm *svm = NULL;
    struct intel_iommu *iommu;
    bool private_present;
    bool pasid_present;
    bool last_page;
    u8 bus, devfn;
    int ret = 0;
    u16 sid;

    if (!dev || !dev_is_pci(dev))
        return -ENODEV;

    iommu = device_to_iommu(dev, &bus, &devfn);
    if (!iommu)
        return -ENODEV;

    if (!msg || !evt)
        return -EINVAL;

    mutex_lock(&pasid_mutex);

    prm = &evt->fault.prm;
    sid = PCI_DEVID(bus, devfn);
    pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
    private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
    last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

    if (!pasid_present) {
        ret = -EINVAL;
        goto out;
    }

    if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
        ret = -EINVAL;
        goto out;
    }

    ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
    if (ret || !sdev) {
        ret = -ENODEV;
        goto out;
    }

    /*
     * Per VT-d spec. v3.0 ch7.7, system software must respond
     * with page group response if private data is present (PDP)
     * or last page in group (LPIG) bit is set. This is an
     * additional VT-d requirement beyond PCI ATS spec.
     */
    if (last_page || private_present) {
        struct qi_desc desc;

        desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
                QI_PGRP_PASID_P(pasid_present) |
                QI_PGRP_PDP(private_present) |
                QI_PGRP_RESP_CODE(msg->code) |
                QI_PGRP_RESP_TYPE;
        desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
        desc.qw2 = 0;
        desc.qw3 = 0;

        if (private_present) {
            desc.qw2 = prm->private_data[0];
            desc.qw3 = prm->private_data[1];
        } else if (prm->private_data[0]) {
            dmar_latency_update(iommu, DMAR_LATENCY_PRQ,
                ktime_to_ns(ktime_get()) - prm->private_data[0]);
        }

        qi_submit_sync(iommu, &desc, 1, 0);
    }
out:
    mutex_unlock(&pasid_mutex);
    return ret;
}