0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Copyright © 2006-2014 Intel Corporation.
0004  *
0005  * Authors: David Woodhouse <dwmw2@infradead.org>,
0006  *          Ashok Raj <ashok.raj@intel.com>,
0007  *          Shaohua Li <shaohua.li@intel.com>,
0008  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
0009  *          Fenghua Yu <fenghua.yu@intel.com>
0010  *          Joerg Roedel <jroedel@suse.de>
0011  */
0012 
0013 #define pr_fmt(fmt)     "DMAR: " fmt
0014 #define dev_fmt(fmt)    pr_fmt(fmt)
0015 
0016 #include <linux/crash_dump.h>
0017 #include <linux/dma-direct.h>
0018 #include <linux/dma-iommu.h>
0019 #include <linux/dmi.h>
0020 #include <linux/intel-svm.h>
0021 #include <linux/memory.h>
0022 #include <linux/pci.h>
0023 #include <linux/pci-ats.h>
0024 #include <linux/spinlock.h>
0025 #include <linux/syscore_ops.h>
0026 #include <linux/tboot.h>
0027 
0028 #include "iommu.h"
0029 #include "../irq_remapping.h"
0030 #include "../iommu-sva-lib.h"
0031 #include "pasid.h"
0032 #include "cap_audit.h"
0033 
0034 #define ROOT_SIZE       VTD_PAGE_SIZE
0035 #define CONTEXT_SIZE        VTD_PAGE_SIZE
0036 
0037 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
0038 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
0039 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
0040 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
0041 
0042 #define IOAPIC_RANGE_START  (0xfee00000)
0043 #define IOAPIC_RANGE_END    (0xfeefffff)
0044 #define IOVA_START_ADDR     (0x1000)
0045 
0046 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
0047 
0048 #define MAX_AGAW_WIDTH 64
0049 #define MAX_AGAW_PFN_WIDTH  (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
0050 
0051 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
0052 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
0053 
0054 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
0055    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
0056 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
0057                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
0058 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
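
/*
 * Worked example (editor's note, assuming VTD_PAGE_SHIFT == 12): with
 * gaw == 48, __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 and
 * DOMAIN_MAX_ADDR(48) == ((1ULL << 36) - 1) << 12, i.e. just under 256TiB.
 * On 64-bit builds the min_t() clamp in DOMAIN_MAX_PFN() is a no-op; it
 * only matters where 'unsigned long' is 32 bits wide.
 */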
0059 
0060 /* IO virtual address start page frame number */
0061 #define IOVA_START_PFN      (1)
0062 
0063 #define IOVA_PFN(addr)      ((addr) >> PAGE_SHIFT)
0064 
0065 /* page table handling */
0066 #define LEVEL_STRIDE        (9)
0067 #define LEVEL_MASK      (((u64)1 << LEVEL_STRIDE) - 1)
0068 
0069 static inline int agaw_to_level(int agaw)
0070 {
0071     return agaw + 2;
0072 }
0073 
0074 static inline int agaw_to_width(int agaw)
0075 {
0076     return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
0077 }
0078 
0079 static inline int width_to_agaw(int width)
0080 {
0081     return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
0082 }
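
/*
 * Worked example (editor's note): SAGAW bit 2 denotes a 4-level table,
 * i.e. agaw 2, so agaw_to_level(2) == 4 and agaw_to_width(2) ==
 * 30 + 2 * 9 == 48 bits of guest address width. SAGAW bit 3 (5-level)
 * gives agaw 3, level 5 and a 57-bit width. width_to_agaw() is the
 * inverse: width_to_agaw(48) == DIV_ROUND_UP(18, 9) == 2.
 */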
0083 
0084 static inline unsigned int level_to_offset_bits(int level)
0085 {
0086     return (level - 1) * LEVEL_STRIDE;
0087 }
0088 
0089 static inline int pfn_level_offset(u64 pfn, int level)
0090 {
0091     return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
0092 }
0093 
0094 static inline u64 level_mask(int level)
0095 {
0096     return -1ULL << level_to_offset_bits(level);
0097 }
0098 
0099 static inline u64 level_size(int level)
0100 {
0101     return 1ULL << level_to_offset_bits(level);
0102 }
0103 
0104 static inline u64 align_to_level(u64 pfn, int level)
0105 {
0106     return (pfn + level_size(level) - 1) & level_mask(level);
0107 }
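
/*
 * Worked example (editor's note): at level 2 each entry covers
 * level_size(2) == 1 << 9 == 512 pages, so for pfn 0x12345
 * pfn_level_offset(0x12345, 2) == (0x12345 >> 9) & 0x1ff == 0x91 and
 * align_to_level(0x12345, 2) rounds up to 0x12400, the next 512-page
 * boundary.
 */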
0108 
0109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
0110 {
0111     return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
0112 }
0113 
0114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
0115    are never going to work. */
0116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
0117 {
0118     return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
0119 }
0120 static inline unsigned long page_to_dma_pfn(struct page *pg)
0121 {
0122     return mm_to_dma_pfn(page_to_pfn(pg));
0123 }
0124 static inline unsigned long virt_to_dma_pfn(void *p)
0125 {
0126     return page_to_dma_pfn(virt_to_page(p));
0127 }
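
/*
 * Editor's note: on x86, PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, so the
 * three conversions above are identity shifts today; they only scale on
 * configurations where the MM page size is larger than the 4KiB VT-d page.
 */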
0128 
0129 static void __init check_tylersburg_isoch(void);
0130 static int rwbf_quirk;
0131 
0132 /*
0133  * Set to 1 to panic the kernel if VT-d cannot be successfully enabled
0134  * (used when the kernel is launched with TXT).
0135  */
0136 static int force_on = 0;
0137 static int intel_iommu_tboot_noforce;
0138 static int no_platform_optin;
0139 
0140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
0141 
0142 /*
0143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
0144  * if marked present.
0145  */
0146 static phys_addr_t root_entry_lctp(struct root_entry *re)
0147 {
0148     if (!(re->lo & 1))
0149         return 0;
0150 
0151     return re->lo & VTD_PAGE_MASK;
0152 }
0153 
0154 /*
0155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
0156  * if marked present.
0157  */
0158 static phys_addr_t root_entry_uctp(struct root_entry *re)
0159 {
0160     if (!(re->hi & 1))
0161         return 0;
0162 
0163     return re->hi & VTD_PAGE_MASK;
0164 }
0165 
0166 static inline void context_set_present(struct context_entry *context)
0167 {
0168     context->lo |= 1;
0169 }
0170 
0171 static inline void context_set_fault_enable(struct context_entry *context)
0172 {
0173     context->lo &= (((u64)-1) << 2) | 1;
0174 }
0175 
0176 static inline void context_set_translation_type(struct context_entry *context,
0177                         unsigned long value)
0178 {
0179     context->lo &= (((u64)-1) << 4) | 3;
0180     context->lo |= (value & 3) << 2;
0181 }
0182 
0183 static inline void context_set_address_root(struct context_entry *context,
0184                         unsigned long value)
0185 {
0186     context->lo &= ~VTD_PAGE_MASK;
0187     context->lo |= value & VTD_PAGE_MASK;
0188 }
0189 
0190 static inline void context_set_address_width(struct context_entry *context,
0191                          unsigned long value)
0192 {
0193     context->hi |= value & 7;
0194 }
0195 
0196 static inline void context_set_domain_id(struct context_entry *context,
0197                      unsigned long value)
0198 {
0199     context->hi |= (value & ((1 << 16) - 1)) << 8;
0200 }
0201 
0202 static inline int context_domain_id(struct context_entry *c)
0203 {
0204     return((c->hi >> 8) & 0xffff);
0205 }
0206 
0207 static inline void context_clear_entry(struct context_entry *context)
0208 {
0209     context->lo = 0;
0210     context->hi = 0;
0211 }
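
/*
 * Editor's note: a minimal sketch of how the setters above combine to
 * build a present legacy-mode context entry (field positions are taken
 * from the helpers themselves; ce, did, agaw and pgd are illustrative
 * names, not code from this file):
 *
 *   context_clear_entry(ce);
 *   context_set_domain_id(ce, did);                        // hi[23:8]
 *   context_set_address_width(ce, agaw);                   // hi[2:0]
 *   context_set_address_root(ce, virt_to_phys(pgd));       // lo[63:12]
 *   context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL); // lo[3:2]
 *   context_set_fault_enable(ce);                          // clear lo[1]
 *   context_set_present(ce);                               // lo[0]
 */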
0212 
0213 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
0214 {
0215     if (!iommu->copied_tables)
0216         return false;
0217 
0218     return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
0219 }
0220 
0221 static inline void
0222 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
0223 {
0224     set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
0225 }
0226 
0227 static inline void
0228 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
0229 {
0230     clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
0231 }
0232 
0233 /*
0234  * This domain is a static identity mapping domain.
0235  *  1. This domain creates a static 1:1 mapping to all usable memory.
0236  *  2. It maps to each iommu if successful.
0237  *  3. Each iommu maps to this domain if successful.
0238  */
0239 static struct dmar_domain *si_domain;
0240 static int hw_pass_through = 1;
0241 
0242 struct dmar_rmrr_unit {
0243     struct list_head list;      /* list of rmrr units   */
0244     struct acpi_dmar_header *hdr;   /* ACPI header      */
0245     u64 base_address;       /* reserved base address*/
0246     u64 end_address;        /* reserved end address */
0247     struct dmar_dev_scope *devices; /* target devices */
0248     int devices_cnt;        /* target device count */
0249 };
0250 
0251 struct dmar_atsr_unit {
0252     struct list_head list;      /* list of ATSR units */
0253     struct acpi_dmar_header *hdr;   /* ACPI header */
0254     struct dmar_dev_scope *devices; /* target devices */
0255     int devices_cnt;        /* target device count */
0256     u8 include_all:1;       /* include all ports */
0257 };
0258 
0259 struct dmar_satc_unit {
0260     struct list_head list;      /* list of SATC units */
0261     struct acpi_dmar_header *hdr;   /* ACPI header */
0262     struct dmar_dev_scope *devices; /* target devices */
0263     struct intel_iommu *iommu;  /* the corresponding iommu */
0264     int devices_cnt;        /* target device count */
0265     u8 atc_required:1;      /* ATS is required */
0266 };
0267 
0268 static LIST_HEAD(dmar_atsr_units);
0269 static LIST_HEAD(dmar_rmrr_units);
0270 static LIST_HEAD(dmar_satc_units);
0271 
0272 #define for_each_rmrr_units(rmrr) \
0273     list_for_each_entry(rmrr, &dmar_rmrr_units, list)
0274 
0275 static void dmar_remove_one_dev_info(struct device *dev);
0276 
0277 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
0278 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
0279 
0280 int intel_iommu_enabled = 0;
0281 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
0282 
0283 static int dmar_map_gfx = 1;
0284 static int intel_iommu_superpage = 1;
0285 static int iommu_identity_mapping;
0286 static int iommu_skip_te_disable;
0287 
0288 #define IDENTMAP_GFX        2
0289 #define IDENTMAP_AZALIA     4
0290 
0291 const struct iommu_ops intel_iommu_ops;
0292 
0293 static bool translation_pre_enabled(struct intel_iommu *iommu)
0294 {
0295     return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
0296 }
0297 
0298 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
0299 {
0300     iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
0301 }
0302 
0303 static void init_translation_status(struct intel_iommu *iommu)
0304 {
0305     u32 gsts;
0306 
0307     gsts = readl(iommu->reg + DMAR_GSTS_REG);
0308     if (gsts & DMA_GSTS_TES)
0309         iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
0310 }
0311 
0312 static int __init intel_iommu_setup(char *str)
0313 {
0314     if (!str)
0315         return -EINVAL;
0316 
0317     while (*str) {
0318         if (!strncmp(str, "on", 2)) {
0319             dmar_disabled = 0;
0320             pr_info("IOMMU enabled\n");
0321         } else if (!strncmp(str, "off", 3)) {
0322             dmar_disabled = 1;
0323             no_platform_optin = 1;
0324             pr_info("IOMMU disabled\n");
0325         } else if (!strncmp(str, "igfx_off", 8)) {
0326             dmar_map_gfx = 0;
0327             pr_info("Disable GFX device mapping\n");
0328         } else if (!strncmp(str, "forcedac", 8)) {
0329             pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
0330             iommu_dma_forcedac = true;
0331         } else if (!strncmp(str, "strict", 6)) {
0332             pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
0333             iommu_set_dma_strict();
0334         } else if (!strncmp(str, "sp_off", 6)) {
0335             pr_info("Disable supported super page\n");
0336             intel_iommu_superpage = 0;
0337         } else if (!strncmp(str, "sm_on", 5)) {
0338             pr_info("Enable scalable mode if hardware supports\n");
0339             intel_iommu_sm = 1;
0340         } else if (!strncmp(str, "sm_off", 6)) {
0341             pr_info("Scalable mode is disallowed\n");
0342             intel_iommu_sm = 0;
0343         } else if (!strncmp(str, "tboot_noforce", 13)) {
0344             pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
0345             intel_iommu_tboot_noforce = 1;
0346         } else {
0347             pr_notice("Unknown option - '%s'\n", str);
0348         }
0349 
0350         str += strcspn(str, ",");
0351         while (*str == ',')
0352             str++;
0353     }
0354 
0355     return 1;
0356 }
0357 __setup("intel_iommu=", intel_iommu_setup);
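
/*
 * Editor's note: the parser above takes a comma-separated option list on
 * the kernel command line; e.g. "intel_iommu=on,sm_on,igfx_off" enables
 * the IOMMU, requests scalable mode (if the hardware supports it) and
 * leaves the integrated graphics device unmapped.
 */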
0358 
0359 void *alloc_pgtable_page(int node)
0360 {
0361     struct page *page;
0362     void *vaddr = NULL;
0363 
0364     page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
0365     if (page)
0366         vaddr = page_address(page);
0367     return vaddr;
0368 }
0369 
0370 void free_pgtable_page(void *vaddr)
0371 {
0372     free_page((unsigned long)vaddr);
0373 }
0374 
0375 static inline int domain_type_is_si(struct dmar_domain *domain)
0376 {
0377     return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
0378 }
0379 
0380 static inline bool domain_use_first_level(struct dmar_domain *domain)
0381 {
0382     return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
0383 }
0384 
0385 static inline int domain_pfn_supported(struct dmar_domain *domain,
0386                        unsigned long pfn)
0387 {
0388     int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
0389 
0390     return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
0391 }
0392 
0393 /*
0394  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
0395  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
0396  * the returned SAGAW.
0397  */
0398 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
0399 {
0400     unsigned long fl_sagaw, sl_sagaw;
0401 
0402     fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0);
0403     sl_sagaw = cap_sagaw(iommu->cap);
0404 
0405     /* Second level only. */
0406     if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
0407         return sl_sagaw;
0408 
0409     /* First level only. */
0410     if (!ecap_slts(iommu->ecap))
0411         return fl_sagaw;
0412 
0413     return fl_sagaw & sl_sagaw;
0414 }
0415 
0416 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
0417 {
0418     unsigned long sagaw;
0419     int agaw;
0420 
0421     sagaw = __iommu_calculate_sagaw(iommu);
0422     for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
0423         if (test_bit(agaw, &sagaw))
0424             break;
0425     }
0426 
0427     return agaw;
0428 }
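
/*
 * Worked example (editor's note): on hardware whose SAGAW reports only
 * 4-level support (bit 2 set), __iommu_calculate_sagaw() returns a mask
 * with bit 2 set and __iommu_calculate_agaw(iommu, 57) walks down from
 * width_to_agaw(57) == 3 until it hits that bit, so the iommu ends up
 * with agaw 2, i.e. a 48-bit, 4-level page table.
 */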
0429 
0430 /*
0431  * Calculate max SAGAW for each iommu.
0432  */
0433 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
0434 {
0435     return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
0436 }
0437 
0438 /*
0439  * Calculate the agaw for each iommu.
0440  * "SAGAW" may be different across iommus; use a default agaw, and
0441  * fall back to a smaller supported agaw for iommus that don't support it.
0442  */
0443 int iommu_calculate_agaw(struct intel_iommu *iommu)
0444 {
0445     return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
0446 }
0447 
0448 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
0449 {
0450     return sm_supported(iommu) ?
0451             ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
0452 }
0453 
0454 static void domain_update_iommu_coherency(struct dmar_domain *domain)
0455 {
0456     struct iommu_domain_info *info;
0457     struct dmar_drhd_unit *drhd;
0458     struct intel_iommu *iommu;
0459     bool found = false;
0460     unsigned long i;
0461 
0462     domain->iommu_coherency = true;
0463     xa_for_each(&domain->iommu_array, i, info) {
0464         found = true;
0465         if (!iommu_paging_structure_coherency(info->iommu)) {
0466             domain->iommu_coherency = false;
0467             break;
0468         }
0469     }
0470     if (found)
0471         return;
0472 
0473     /* No hardware attached; use lowest common denominator */
0474     rcu_read_lock();
0475     for_each_active_iommu(iommu, drhd) {
0476         if (!iommu_paging_structure_coherency(iommu)) {
0477             domain->iommu_coherency = false;
0478             break;
0479         }
0480     }
0481     rcu_read_unlock();
0482 }
0483 
0484 static int domain_update_iommu_superpage(struct dmar_domain *domain,
0485                      struct intel_iommu *skip)
0486 {
0487     struct dmar_drhd_unit *drhd;
0488     struct intel_iommu *iommu;
0489     int mask = 0x3;
0490 
0491     if (!intel_iommu_superpage)
0492         return 0;
0493 
0494     /* set iommu_superpage to the smallest common denominator */
0495     rcu_read_lock();
0496     for_each_active_iommu(iommu, drhd) {
0497         if (iommu != skip) {
0498             if (domain && domain_use_first_level(domain)) {
0499                 if (!cap_fl1gp_support(iommu->cap))
0500                     mask = 0x1;
0501             } else {
0502                 mask &= cap_super_page_val(iommu->cap);
0503             }
0504 
0505             if (!mask)
0506                 break;
0507         }
0508     }
0509     rcu_read_unlock();
0510 
0511     return fls(mask);
0512 }
0513 
0514 static int domain_update_device_node(struct dmar_domain *domain)
0515 {
0516     struct device_domain_info *info;
0517     int nid = NUMA_NO_NODE;
0518     unsigned long flags;
0519 
0520     spin_lock_irqsave(&domain->lock, flags);
0521     list_for_each_entry(info, &domain->devices, link) {
0522         /*
0523          * There could be multiple device NUMA nodes, as devices within
0524          * the same domain may sit behind different IOMMUs. There is no
0525          * perfect answer in such a situation, so we use a first-come,
0526          * first-served policy.
0527          */
0528         nid = dev_to_node(info->dev);
0529         if (nid != NUMA_NO_NODE)
0530             break;
0531     }
0532     spin_unlock_irqrestore(&domain->lock, flags);
0533 
0534     return nid;
0535 }
0536 
0537 static void domain_update_iotlb(struct dmar_domain *domain);
0538 
0539 /* Return the super pagesize bitmap if supported. */
0540 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
0541 {
0542     unsigned long bitmap = 0;
0543 
0544     /*
0545      * 1-level super page supports page size of 2MiB, 2-level super page
0546      * supports page size of both 2MiB and 1GiB.
0547      */
0548     if (domain->iommu_superpage == 1)
0549         bitmap |= SZ_2M;
0550     else if (domain->iommu_superpage == 2)
0551         bitmap |= SZ_2M | SZ_1G;
0552 
0553     return bitmap;
0554 }
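
/*
 * Editor's note: the sizes follow from the 9-bit stride: a level-2 leaf
 * maps 512 * 4KiB == 2MiB and a level-3 leaf maps 512 * 2MiB == 1GiB,
 * which is why iommu_superpage values 1 and 2 translate to SZ_2M and
 * SZ_2M | SZ_1G above.
 */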
0555 
0556 /* Some capabilities may be different across iommus */
0557 static void domain_update_iommu_cap(struct dmar_domain *domain)
0558 {
0559     domain_update_iommu_coherency(domain);
0560     domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
0561 
0562     /*
0563      * If RHSA is missing, we should default to the device numa domain
0564      * as fall back.
0565      */
0566     if (domain->nid == NUMA_NO_NODE)
0567         domain->nid = domain_update_device_node(domain);
0568 
0569     /*
0570      * First-level translation restricts the input-address to a
0571      * canonical address (i.e., address bits 63:N have the same
0572      * value as address bit [N-1], where N is 48-bits with 4-level
0573      * paging and 57-bits with 5-level paging). Hence, skip bit
0574      * [N-1].
0575      */
0576     if (domain_use_first_level(domain))
0577         domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
0578     else
0579         domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
0580 
0581     domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
0582     domain_update_iotlb(domain);
0583 }
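
/*
 * Worked example (editor's note): with a 48-bit gaw, first-level
 * translation caps the aperture at __DOMAIN_MAX_ADDR(47) ==
 * (1ULL << 47) - 1, keeping IOVAs in the lower canonical half, while
 * second-level translation may use the full (1ULL << 48) - 1.
 */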
0584 
0585 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
0586                      u8 devfn, int alloc)
0587 {
0588     struct root_entry *root = &iommu->root_entry[bus];
0589     struct context_entry *context;
0590     u64 *entry;
0591 
0592     /*
0593      * Unless the caller requested to allocate a new entry,
0594      * returning a copied context entry makes no sense.
0595      */
0596     if (!alloc && context_copied(iommu, bus, devfn))
0597         return NULL;
0598 
0599     entry = &root->lo;
0600     if (sm_supported(iommu)) {
0601         if (devfn >= 0x80) {
0602             devfn -= 0x80;
0603             entry = &root->hi;
0604         }
0605         devfn *= 2;
0606     }
0607     if (*entry & 1)
0608         context = phys_to_virt(*entry & VTD_PAGE_MASK);
0609     else {
0610         unsigned long phy_addr;
0611         if (!alloc)
0612             return NULL;
0613 
0614         context = alloc_pgtable_page(iommu->node);
0615         if (!context)
0616             return NULL;
0617 
0618         __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
0619         phy_addr = virt_to_phys((void *)context);
0620         *entry = phy_addr | 1;
0621         __iommu_flush_cache(iommu, entry, sizeof(*entry));
0622     }
0623     return &context[devfn];
0624 }
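
/*
 * Editor's note on the scalable-mode indexing above: with SM enabled each
 * root entry is split in two, the lower half covering devfn 0x00-0x7f and
 * the upper half covering devfn 0x80-0xff, and each context entry is 256
 * bits wide instead of 128, hence the "devfn *= 2". For example, devfn
 * 0x83 is looked up through root->hi at index (0x83 - 0x80) * 2 == 6.
 */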
0625 
0626 /**
0627  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
0628  *               sub-hierarchy of a candidate PCI-PCI bridge
0629  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
0630  * @bridge: the candidate PCI-PCI bridge
0631  *
0632  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
0633  */
0634 static bool
0635 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
0636 {
0637     struct pci_dev *pdev, *pbridge;
0638 
0639     if (!dev_is_pci(dev) || !dev_is_pci(bridge))
0640         return false;
0641 
0642     pdev = to_pci_dev(dev);
0643     pbridge = to_pci_dev(bridge);
0644 
0645     if (pbridge->subordinate &&
0646         pbridge->subordinate->number <= pdev->bus->number &&
0647         pbridge->subordinate->busn_res.end >= pdev->bus->number)
0648         return true;
0649 
0650     return false;
0651 }
0652 
0653 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
0654 {
0655     struct dmar_drhd_unit *drhd;
0656     u32 vtbar;
0657     int rc;
0658 
0659     /* We know that this device on this chipset has its own IOMMU.
0660      * If we find it under a different IOMMU, then the BIOS is lying
0661      * to us. Hope that the IOMMU for this device is actually
0662      * disabled, and it needs no translation...
0663      */
0664     rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
0665     if (rc) {
0666         /* "can't" happen */
0667         dev_info(&pdev->dev, "failed to run vt-d quirk\n");
0668         return false;
0669     }
0670     vtbar &= 0xffff0000;
0671 
0672     /* we know that this iommu should be at offset 0xa000 from vtbar */
0673     drhd = dmar_find_matched_drhd_unit(pdev);
0674     if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
0675         pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
0676         add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
0677         return true;
0678     }
0679 
0680     return false;
0681 }
0682 
0683 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
0684 {
0685     if (!iommu || iommu->drhd->ignored)
0686         return true;
0687 
0688     if (dev_is_pci(dev)) {
0689         struct pci_dev *pdev = to_pci_dev(dev);
0690 
0691         if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
0692             pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
0693             quirk_ioat_snb_local_iommu(pdev))
0694             return true;
0695     }
0696 
0697     return false;
0698 }
0699 
0700 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
0701 {
0702     struct dmar_drhd_unit *drhd = NULL;
0703     struct pci_dev *pdev = NULL;
0704     struct intel_iommu *iommu;
0705     struct device *tmp;
0706     u16 segment = 0;
0707     int i;
0708 
0709     if (!dev)
0710         return NULL;
0711 
0712     if (dev_is_pci(dev)) {
0713         struct pci_dev *pf_pdev;
0714 
0715         pdev = pci_real_dma_dev(to_pci_dev(dev));
0716 
0717         /* VFs aren't listed in scope tables; we need to look up
0718          * the PF instead to find the IOMMU. */
0719         pf_pdev = pci_physfn(pdev);
0720         dev = &pf_pdev->dev;
0721         segment = pci_domain_nr(pdev->bus);
0722     } else if (has_acpi_companion(dev))
0723         dev = &ACPI_COMPANION(dev)->dev;
0724 
0725     rcu_read_lock();
0726     for_each_iommu(iommu, drhd) {
0727         if (pdev && segment != drhd->segment)
0728             continue;
0729 
0730         for_each_active_dev_scope(drhd->devices,
0731                       drhd->devices_cnt, i, tmp) {
0732             if (tmp == dev) {
0733                 /* For a VF use its original BDF# not that of the PF
0734                  * which we used for the IOMMU lookup. Strictly speaking
0735                  * we could do this for all PCI devices; we only need to
0736                  * get the BDF# from the scope table for ACPI matches. */
0737                 if (pdev && pdev->is_virtfn)
0738                     goto got_pdev;
0739 
0740                 if (bus && devfn) {
0741                     *bus = drhd->devices[i].bus;
0742                     *devfn = drhd->devices[i].devfn;
0743                 }
0744                 goto out;
0745             }
0746 
0747             if (is_downstream_to_pci_bridge(dev, tmp))
0748                 goto got_pdev;
0749         }
0750 
0751         if (pdev && drhd->include_all) {
0752 got_pdev:
0753             if (bus && devfn) {
0754                 *bus = pdev->bus->number;
0755                 *devfn = pdev->devfn;
0756             }
0757             goto out;
0758         }
0759     }
0760     iommu = NULL;
0761 out:
0762     if (iommu_is_dummy(iommu, dev))
0763         iommu = NULL;
0764 
0765     rcu_read_unlock();
0766 
0767     return iommu;
0768 }
0769 
0770 static void domain_flush_cache(struct dmar_domain *domain,
0771                    void *addr, int size)
0772 {
0773     if (!domain->iommu_coherency)
0774         clflush_cache_range(addr, size);
0775 }
0776 
0777 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
0778 {
0779     struct context_entry *context;
0780     int ret = 0;
0781 
0782     spin_lock(&iommu->lock);
0783     context = iommu_context_addr(iommu, bus, devfn, 0);
0784     if (context)
0785         ret = context_present(context);
0786     spin_unlock(&iommu->lock);
0787     return ret;
0788 }
0789 
0790 static void free_context_table(struct intel_iommu *iommu)
0791 {
0792     struct context_entry *context;
0793     int i;
0794 
0795     if (!iommu->root_entry)
0796         return;
0797 
0798     for (i = 0; i < ROOT_ENTRY_NR; i++) {
0799         context = iommu_context_addr(iommu, i, 0, 0);
0800         if (context)
0801             free_pgtable_page(context);
0802 
0803         if (!sm_supported(iommu))
0804             continue;
0805 
0806         context = iommu_context_addr(iommu, i, 0x80, 0);
0807         if (context)
0808             free_pgtable_page(context);
0809     }
0810 
0811     free_pgtable_page(iommu->root_entry);
0812     iommu->root_entry = NULL;
0813 }
0814 
0815 #ifdef CONFIG_DMAR_DEBUG
0816 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
0817              u8 bus, u8 devfn, struct dma_pte *parent, int level)
0818 {
0819     struct dma_pte *pte;
0820     int offset;
0821 
0822     while (1) {
0823         offset = pfn_level_offset(pfn, level);
0824         pte = &parent[offset];
0825         if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
0826             pr_info("PTE not present at level %d\n", level);
0827             break;
0828         }
0829 
0830         pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
0831 
0832         if (level == 1)
0833             break;
0834 
0835         parent = phys_to_virt(dma_pte_addr(pte));
0836         level--;
0837     }
0838 }
0839 
0840 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
0841               unsigned long long addr, u32 pasid)
0842 {
0843     struct pasid_dir_entry *dir, *pde;
0844     struct pasid_entry *entries, *pte;
0845     struct context_entry *ctx_entry;
0846     struct root_entry *rt_entry;
0847     int i, dir_index, index, level;
0848     u8 devfn = source_id & 0xff;
0849     u8 bus = source_id >> 8;
0850     struct dma_pte *pgtable;
0851 
0852     pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
0853 
0854     /* root entry dump */
0855     rt_entry = &iommu->root_entry[bus];
0856     if (!rt_entry) {
0857         pr_info("root table entry is not present\n");
0858         return;
0859     }
0860 
0861     if (sm_supported(iommu))
0862         pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
0863             rt_entry->hi, rt_entry->lo);
0864     else
0865         pr_info("root entry: 0x%016llx", rt_entry->lo);
0866 
0867     /* context entry dump */
0868     ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
0869     if (!ctx_entry) {
0870         pr_info("context table entry is not present\n");
0871         return;
0872     }
0873 
0874     pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
0875         ctx_entry->hi, ctx_entry->lo);
0876 
0877     /* legacy mode does not require PASID entries */
0878     if (!sm_supported(iommu)) {
0879         level = agaw_to_level(ctx_entry->hi & 7);
0880         pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
0881         goto pgtable_walk;
0882     }
0883 
0884     /* get the pointer to pasid directory entry */
0885     dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
0886     if (!dir) {
0887         pr_info("pasid directory entry is not present\n");
0888         return;
0889     }
0890     /* For request-without-pasid, get the pasid from context entry */
0891     if (intel_iommu_sm && pasid == INVALID_IOASID)
0892         pasid = PASID_RID2PASID;
0893 
0894     dir_index = pasid >> PASID_PDE_SHIFT;
0895     pde = &dir[dir_index];
0896     pr_info("pasid dir entry: 0x%016llx\n", pde->val);
0897 
0898     /* get the pointer to the pasid table entry */
0899     entries = get_pasid_table_from_pde(pde);
0900     if (!entries) {
0901         pr_info("pasid table entry is not present\n");
0902         return;
0903     }
0904     index = pasid & PASID_PTE_MASK;
0905     pte = &entries[index];
0906     for (i = 0; i < ARRAY_SIZE(pte->val); i++)
0907         pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
0908 
0909     if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
0910         level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
0911         pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
0912     } else {
0913         level = agaw_to_level((pte->val[0] >> 2) & 0x7);
0914         pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
0915     }
0916 
0917 pgtable_walk:
0918     pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
0919 }
0920 #endif
0921 
0922 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
0923                       unsigned long pfn, int *target_level)
0924 {
0925     struct dma_pte *parent, *pte;
0926     int level = agaw_to_level(domain->agaw);
0927     int offset;
0928 
0929     BUG_ON(!domain->pgd);
0930 
0931     if (!domain_pfn_supported(domain, pfn))
0932         /* Address beyond IOMMU's addressing capabilities. */
0933         return NULL;
0934 
0935     parent = domain->pgd;
0936 
0937     while (1) {
0938         void *tmp_page;
0939 
0940         offset = pfn_level_offset(pfn, level);
0941         pte = &parent[offset];
0942         if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
0943             break;
0944         if (level == *target_level)
0945             break;
0946 
0947         if (!dma_pte_present(pte)) {
0948             uint64_t pteval;
0949 
0950             tmp_page = alloc_pgtable_page(domain->nid);
0951 
0952             if (!tmp_page)
0953                 return NULL;
0954 
0955             domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
0956             pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
0957             if (domain_use_first_level(domain)) {
0958                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
0959                 if (iommu_is_dma_domain(&domain->domain))
0960                     pteval |= DMA_FL_PTE_ACCESS;
0961             }
0962             if (cmpxchg64(&pte->val, 0ULL, pteval))
0963                 /* Someone else set it while we were thinking; use theirs. */
0964                 free_pgtable_page(tmp_page);
0965             else
0966                 domain_flush_cache(domain, pte, sizeof(*pte));
0967         }
0968         if (level == 1)
0969             break;
0970 
0971         parent = phys_to_virt(dma_pte_addr(pte));
0972         level--;
0973     }
0974 
0975     if (!*target_level)
0976         *target_level = level;
0977 
0978     return pte;
0979 }
0980 
0981 /* return the address's pte at a specific level */
0982 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
0983                      unsigned long pfn,
0984                      int level, int *large_page)
0985 {
0986     struct dma_pte *parent, *pte;
0987     int total = agaw_to_level(domain->agaw);
0988     int offset;
0989 
0990     parent = domain->pgd;
0991     while (level <= total) {
0992         offset = pfn_level_offset(pfn, total);
0993         pte = &parent[offset];
0994         if (level == total)
0995             return pte;
0996 
0997         if (!dma_pte_present(pte)) {
0998             *large_page = total;
0999             break;
1000         }
1001 
1002         if (dma_pte_superpage(pte)) {
1003             *large_page = total;
1004             return pte;
1005         }
1006 
1007         parent = phys_to_virt(dma_pte_addr(pte));
1008         total--;
1009     }
1010     return NULL;
1011 }
1012 
1013 /* clear last level pte; a TLB flush should follow */
1014 static void dma_pte_clear_range(struct dmar_domain *domain,
1015                 unsigned long start_pfn,
1016                 unsigned long last_pfn)
1017 {
1018     unsigned int large_page;
1019     struct dma_pte *first_pte, *pte;
1020 
1021     BUG_ON(!domain_pfn_supported(domain, start_pfn));
1022     BUG_ON(!domain_pfn_supported(domain, last_pfn));
1023     BUG_ON(start_pfn > last_pfn);
1024 
1025     /* we don't need lock here; nobody else touches the iova range */
1026     do {
1027         large_page = 1;
1028         first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1029         if (!pte) {
1030             start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1031             continue;
1032         }
1033         do {
1034             dma_clear_pte(pte);
1035             start_pfn += lvl_to_nr_pages(large_page);
1036             pte++;
1037         } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1038 
1039         domain_flush_cache(domain, first_pte,
1040                    (void *)pte - (void *)first_pte);
1041 
1042     } while (start_pfn && start_pfn <= last_pfn);
1043 }
1044 
1045 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1046                    int retain_level, struct dma_pte *pte,
1047                    unsigned long pfn, unsigned long start_pfn,
1048                    unsigned long last_pfn)
1049 {
1050     pfn = max(start_pfn, pfn);
1051     pte = &pte[pfn_level_offset(pfn, level)];
1052 
1053     do {
1054         unsigned long level_pfn;
1055         struct dma_pte *level_pte;
1056 
1057         if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1058             goto next;
1059 
1060         level_pfn = pfn & level_mask(level);
1061         level_pte = phys_to_virt(dma_pte_addr(pte));
1062 
1063         if (level > 2) {
1064             dma_pte_free_level(domain, level - 1, retain_level,
1065                        level_pte, level_pfn, start_pfn,
1066                        last_pfn);
1067         }
1068 
1069         /*
1070          * Free the page table if we're below the level we want to
1071          * retain and the range covers the entire table.
1072          */
1073         if (level < retain_level && !(start_pfn > level_pfn ||
1074               last_pfn < level_pfn + level_size(level) - 1)) {
1075             dma_clear_pte(pte);
1076             domain_flush_cache(domain, pte, sizeof(*pte));
1077             free_pgtable_page(level_pte);
1078         }
1079 next:
1080         pfn += level_size(level);
1081     } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1082 }
1083 
1084 /*
1085  * clear last level (leaf) ptes and free page table pages below the
1086  * level we wish to keep intact.
1087  */
1088 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1089                    unsigned long start_pfn,
1090                    unsigned long last_pfn,
1091                    int retain_level)
1092 {
1093     dma_pte_clear_range(domain, start_pfn, last_pfn);
1094 
1095     /* We don't need lock here; nobody else touches the iova range */
1096     dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1097                domain->pgd, 0, start_pfn, last_pfn);
1098 
1099     /* free pgd */
1100     if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1101         free_pgtable_page(domain->pgd);
1102         domain->pgd = NULL;
1103     }
1104 }
1105 
1106 /* When a page at a given level is being unlinked from its parent, we don't
1107    need to *modify* it at all. All we need to do is make a list of all the
1108    pages which can be freed just as soon as we've flushed the IOTLB and we
1109    know the hardware page-walk will no longer touch them.
1110    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1111    be freed. */
1112 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1113                     int level, struct dma_pte *pte,
1114                     struct list_head *freelist)
1115 {
1116     struct page *pg;
1117 
1118     pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1119     list_add_tail(&pg->lru, freelist);
1120 
1121     if (level == 1)
1122         return;
1123 
1124     pte = page_address(pg);
1125     do {
1126         if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1127             dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1128         pte++;
1129     } while (!first_pte_in_page(pte));
1130 }
1131 
1132 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1133                 struct dma_pte *pte, unsigned long pfn,
1134                 unsigned long start_pfn, unsigned long last_pfn,
1135                 struct list_head *freelist)
1136 {
1137     struct dma_pte *first_pte = NULL, *last_pte = NULL;
1138 
1139     pfn = max(start_pfn, pfn);
1140     pte = &pte[pfn_level_offset(pfn, level)];
1141 
1142     do {
1143         unsigned long level_pfn = pfn & level_mask(level);
1144 
1145         if (!dma_pte_present(pte))
1146             goto next;
1147 
1148         /* If range covers entire pagetable, free it */
1149         if (start_pfn <= level_pfn &&
1150             last_pfn >= level_pfn + level_size(level) - 1) {
1151             /* These subordinate page tables are going away entirely. Don't
1152                bother to clear them; we're just going to *free* them. */
1153             if (level > 1 && !dma_pte_superpage(pte))
1154                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1155 
1156             dma_clear_pte(pte);
1157             if (!first_pte)
1158                 first_pte = pte;
1159             last_pte = pte;
1160         } else if (level > 1) {
1161             /* Recurse down into a level that isn't *entirely* obsolete */
1162             dma_pte_clear_level(domain, level - 1,
1163                         phys_to_virt(dma_pte_addr(pte)),
1164                         level_pfn, start_pfn, last_pfn,
1165                         freelist);
1166         }
1167 next:
1168         pfn = level_pfn + level_size(level);
1169     } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1170 
1171     if (first_pte)
1172         domain_flush_cache(domain, first_pte,
1173                    (void *)++last_pte - (void *)first_pte);
1174 }
1175 
1176 /* We can't just free the pages because the IOMMU may still be walking
1177    the page tables, and may have cached the intermediate levels. The
1178    pages can only be freed after the IOTLB flush has been done. */
1179 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1180              unsigned long last_pfn, struct list_head *freelist)
1181 {
1182     BUG_ON(!domain_pfn_supported(domain, start_pfn));
1183     BUG_ON(!domain_pfn_supported(domain, last_pfn));
1184     BUG_ON(start_pfn > last_pfn);
1185 
1186     /* we don't need lock here; nobody else touches the iova range */
1187     dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1188                 domain->pgd, 0, start_pfn, last_pfn, freelist);
1189 
1190     /* free pgd */
1191     if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1192         struct page *pgd_page = virt_to_page(domain->pgd);
1193         list_add_tail(&pgd_page->lru, freelist);
1194         domain->pgd = NULL;
1195     }
1196 }
1197 
1198 /* iommu handling */
1199 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1200 {
1201     struct root_entry *root;
1202 
1203     root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1204     if (!root) {
1205         pr_err("Allocating root entry for %s failed\n",
1206             iommu->name);
1207         return -ENOMEM;
1208     }
1209 
1210     __iommu_flush_cache(iommu, root, ROOT_SIZE);
1211     iommu->root_entry = root;
1212 
1213     return 0;
1214 }
1215 
1216 static void iommu_set_root_entry(struct intel_iommu *iommu)
1217 {
1218     u64 addr;
1219     u32 sts;
1220     unsigned long flag;
1221 
1222     addr = virt_to_phys(iommu->root_entry);
1223     if (sm_supported(iommu))
1224         addr |= DMA_RTADDR_SMT;
1225 
1226     raw_spin_lock_irqsave(&iommu->register_lock, flag);
1227     dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1228 
1229     writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1230 
1231     /* Make sure hardware complete it */
1232     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1233               readl, (sts & DMA_GSTS_RTPS), sts);
1234 
1235     raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1236 
1237     iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1238     if (sm_supported(iommu))
1239         qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1240     iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1241 }
1242 
1243 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1244 {
1245     u32 val;
1246     unsigned long flag;
1247 
1248     if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1249         return;
1250 
1251     raw_spin_lock_irqsave(&iommu->register_lock, flag);
1252     writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1253 
1254     /* Make sure hardware complete it */
1255     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1256               readl, (!(val & DMA_GSTS_WBFS)), val);
1257 
1258     raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1259 }
1260 
1261 /* return value determines whether we need a write buffer flush */
1262 static void __iommu_flush_context(struct intel_iommu *iommu,
1263                   u16 did, u16 source_id, u8 function_mask,
1264                   u64 type)
1265 {
1266     u64 val = 0;
1267     unsigned long flag;
1268 
1269     switch (type) {
1270     case DMA_CCMD_GLOBAL_INVL:
1271         val = DMA_CCMD_GLOBAL_INVL;
1272         break;
1273     case DMA_CCMD_DOMAIN_INVL:
1274         val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1275         break;
1276     case DMA_CCMD_DEVICE_INVL:
1277         val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1278             | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1279         break;
1280     default:
1281         BUG();
1282     }
1283     val |= DMA_CCMD_ICC;
1284 
1285     raw_spin_lock_irqsave(&iommu->register_lock, flag);
1286     dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1287 
1288     /* Make sure hardware complete it */
1289     IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1290         dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1291 
1292     raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1293 }
1294 
1295 /* return value determines whether we need a write buffer flush */
1296 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1297                 u64 addr, unsigned int size_order, u64 type)
1298 {
1299     int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1300     u64 val = 0, val_iva = 0;
1301     unsigned long flag;
1302 
1303     switch (type) {
1304     case DMA_TLB_GLOBAL_FLUSH:
1305         /* global flush doesn't need to set IVA_REG */
1306         val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1307         break;
1308     case DMA_TLB_DSI_FLUSH:
1309         val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1310         break;
1311     case DMA_TLB_PSI_FLUSH:
1312         val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1313         /* IH bit is passed in as part of address */
1314         val_iva = size_order | addr;
1315         break;
1316     default:
1317         BUG();
1318     }
1319     /* Note: set drain read/write */
1320 #if 0
1321     /*
1322      * This is probably meant to be extra safe. It looks like we can
1323      * ignore it without any impact.
1324      */
1325     if (cap_read_drain(iommu->cap))
1326         val |= DMA_TLB_READ_DRAIN;
1327 #endif
1328     if (cap_write_drain(iommu->cap))
1329         val |= DMA_TLB_WRITE_DRAIN;
1330 
1331     raw_spin_lock_irqsave(&iommu->register_lock, flag);
1332     /* Note: Only uses first TLB reg currently */
1333     if (val_iva)
1334         dmar_writeq(iommu->reg + tlb_offset, val_iva);
1335     dmar_writeq(iommu->reg + tlb_offset + 8, val);
1336 
1337     /* Make sure hardware complete it */
1338     IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1339         dmar_readq, (!(val & DMA_TLB_IVT)), val);
1340 
1341     raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1342 
1343     /* check IOTLB invalidation granularity */
1344     if (DMA_TLB_IAIG(val) == 0)
1345         pr_err("Flush IOTLB failed\n");
1346     if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1347         pr_debug("TLB flush request %Lx, actual %Lx\n",
1348             (unsigned long long)DMA_TLB_IIRG(type),
1349             (unsigned long long)DMA_TLB_IAIG(val));
1350 }
1351 
1352 static struct device_domain_info *
1353 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1354             u8 bus, u8 devfn)
1355 {
1356     struct device_domain_info *info;
1357     unsigned long flags;
1358 
1359     if (!iommu->qi)
1360         return NULL;
1361 
1362     spin_lock_irqsave(&domain->lock, flags);
1363     list_for_each_entry(info, &domain->devices, link) {
1364         if (info->iommu == iommu && info->bus == bus &&
1365             info->devfn == devfn) {
1366             spin_unlock_irqrestore(&domain->lock, flags);
1367             return info->ats_supported ? info : NULL;
1368         }
1369     }
1370     spin_unlock_irqrestore(&domain->lock, flags);
1371 
1372     return NULL;
1373 }
1374 
1375 static void domain_update_iotlb(struct dmar_domain *domain)
1376 {
1377     struct device_domain_info *info;
1378     bool has_iotlb_device = false;
1379     unsigned long flags;
1380 
1381     spin_lock_irqsave(&domain->lock, flags);
1382     list_for_each_entry(info, &domain->devices, link) {
1383         if (info->ats_enabled) {
1384             has_iotlb_device = true;
1385             break;
1386         }
1387     }
1388     domain->has_iotlb_device = has_iotlb_device;
1389     spin_unlock_irqrestore(&domain->lock, flags);
1390 }
1391 
1392 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1393 {
1394     struct pci_dev *pdev;
1395 
1396     if (!info || !dev_is_pci(info->dev))
1397         return;
1398 
1399     pdev = to_pci_dev(info->dev);
1400     /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1401      * the PFSID to the invalidation desc of a VF so that the IOMMU HW can
1402      * gauge queue depth at the PF level. If DIT is not set, PFSID is treated
1403      * as reserved and should be set to 0.
1404      */
1405     if (!ecap_dit(info->iommu->ecap))
1406         info->pfsid = 0;
1407     else {
1408         struct pci_dev *pf_pdev;
1409 
1410         /* pdev will be returned if the device is not a VF */
1411         pf_pdev = pci_physfn(pdev);
1412         info->pfsid = pci_dev_id(pf_pdev);
1413     }
1414 
1415 #ifdef CONFIG_INTEL_IOMMU_SVM
1416     /* The PCIe spec, in its wisdom, declares that the behaviour of
1417        the device if you enable PASID support after ATS support is
1418        undefined. So always enable PASID support on devices which
1419        have it, even if we can't yet know if we're ever going to
1420        use it. */
1421     if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1422         info->pasid_enabled = 1;
1423 
1424     if (info->pri_supported &&
1425         (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1426         !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1427         info->pri_enabled = 1;
1428 #endif
1429     if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1430         !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1431         info->ats_enabled = 1;
1432         domain_update_iotlb(info->domain);
1433         info->ats_qdep = pci_ats_queue_depth(pdev);
1434     }
1435 }
1436 
1437 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1438 {
1439     struct pci_dev *pdev;
1440 
1441     if (!dev_is_pci(info->dev))
1442         return;
1443 
1444     pdev = to_pci_dev(info->dev);
1445 
1446     if (info->ats_enabled) {
1447         pci_disable_ats(pdev);
1448         info->ats_enabled = 0;
1449         domain_update_iotlb(info->domain);
1450     }
1451 #ifdef CONFIG_INTEL_IOMMU_SVM
1452     if (info->pri_enabled) {
1453         pci_disable_pri(pdev);
1454         info->pri_enabled = 0;
1455     }
1456     if (info->pasid_enabled) {
1457         pci_disable_pasid(pdev);
1458         info->pasid_enabled = 0;
1459     }
1460 #endif
1461 }
1462 
1463 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1464                     u64 addr, unsigned int mask)
1465 {
1466     u16 sid, qdep;
1467 
1468     if (!info || !info->ats_enabled)
1469         return;
1470 
1471     sid = info->bus << 8 | info->devfn;
1472     qdep = info->ats_qdep;
1473     qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1474                qdep, addr, mask);
1475 }
1476 
1477 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1478                   u64 addr, unsigned mask)
1479 {
1480     struct device_domain_info *info;
1481     unsigned long flags;
1482 
1483     if (!domain->has_iotlb_device)
1484         return;
1485 
1486     spin_lock_irqsave(&domain->lock, flags);
1487     list_for_each_entry(info, &domain->devices, link)
1488         __iommu_flush_dev_iotlb(info, addr, mask);
1489     spin_unlock_irqrestore(&domain->lock, flags);
1490 }
1491 
1492 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1493                   struct dmar_domain *domain,
1494                   unsigned long pfn, unsigned int pages,
1495                   int ih, int map)
1496 {
1497     unsigned int aligned_pages = __roundup_pow_of_two(pages);
1498     unsigned int mask = ilog2(aligned_pages);
1499     uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1500     u16 did = domain_id_iommu(domain, iommu);
1501 
1502     BUG_ON(pages == 0);
1503 
1504     if (ih)
1505         ih = 1 << 6;
1506 
1507     if (domain_use_first_level(domain)) {
1508         qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1509     } else {
1510         unsigned long bitmask = aligned_pages - 1;
1511 
1512         /*
1513          * PSI masks the low order bits of the base address. If the
1514          * address isn't aligned to the mask, then compute a mask value
1515          * needed to ensure the target range is flushed.
1516          */
1517         if (unlikely(bitmask & pfn)) {
1518             unsigned long end_pfn = pfn + pages - 1, shared_bits;
1519 
1520             /*
1521              * Since end_pfn <= pfn + bitmask, the only way bits
1522              * higher than bitmask can differ in pfn and end_pfn is
1523              * by carrying. This means after masking out bitmask,
1524              * high bits starting with the first set bit in
1525              * shared_bits are all equal in both pfn and end_pfn.
1526              */
1527             shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1528             mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1529         }
1530 
1531         /*
1532          * Fallback to domain selective flush if no PSI support or
1533          * the size is too big.
1534          */
1535         if (!cap_pgsel_inv(iommu->cap) ||
1536             mask > cap_max_amask_val(iommu->cap))
1537             iommu->flush.flush_iotlb(iommu, did, 0, 0,
1538                             DMA_TLB_DSI_FLUSH);
1539         else
1540             iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1541                             DMA_TLB_PSI_FLUSH);
1542     }
1543 
1544     /*
1545      * In caching mode, changes of pages from non-present to present require
1546      * a flush. However, the device IOTLB doesn't need to be flushed here.
1547      */
1548     if (!cap_caching_mode(iommu->cap) || !map)
1549         iommu_flush_dev_iotlb(domain, addr, mask);
1550 }
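
/*
 * Worked example (editor's note): for pfn == 0x1003 and pages == 4 the
 * initial mask would be 2, but the range is not 4-page aligned, so the
 * code above computes end_pfn == 0x1006 and shared_bits ==
 * ~(0x1003 ^ 0x1006) & ~0x3, whose lowest set bit is 3. The flush is
 * then issued with mask == 3, i.e. the aligned 8-page window
 * 0x1000-0x1007 that covers the whole requested range.
 */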
1551 
1552 /* Notification for newly created mappings */
1553 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1554                     struct dmar_domain *domain,
1555                     unsigned long pfn, unsigned int pages)
1556 {
1557     /*
1558      * It's a non-present to present mapping. Only flush in caching mode
1559      * and for second-level translation.
1560      */
1561     if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1562         iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1563     else
1564         iommu_flush_write_buffer(iommu);
1565 }
1566 
1567 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1568 {
1569     struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1570     struct iommu_domain_info *info;
1571     unsigned long idx;
1572 
1573     xa_for_each(&dmar_domain->iommu_array, idx, info) {
1574         struct intel_iommu *iommu = info->iommu;
1575         u16 did = domain_id_iommu(dmar_domain, iommu);
1576 
1577         if (domain_use_first_level(dmar_domain))
1578             qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1579         else
1580             iommu->flush.flush_iotlb(iommu, did, 0, 0,
1581                          DMA_TLB_DSI_FLUSH);
1582 
1583         if (!cap_caching_mode(iommu->cap))
1584             iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1585     }
1586 }
1587 
1588 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1589 {
1590     u32 pmen;
1591     unsigned long flags;
1592 
1593     if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1594         return;
1595 
1596     raw_spin_lock_irqsave(&iommu->register_lock, flags);
1597     pmen = readl(iommu->reg + DMAR_PMEN_REG);
1598     pmen &= ~DMA_PMEN_EPM;
1599     writel(pmen, iommu->reg + DMAR_PMEN_REG);
1600 
1601     /* wait for the protected region status bit to clear */
1602     IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1603         readl, !(pmen & DMA_PMEN_PRS), pmen);
1604 
1605     raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1606 }
1607 
1608 static void iommu_enable_translation(struct intel_iommu *iommu)
1609 {
1610     u32 sts;
1611     unsigned long flags;
1612 
1613     raw_spin_lock_irqsave(&iommu->register_lock, flags);
1614     iommu->gcmd |= DMA_GCMD_TE;
1615     writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1616 
1617     /* Make sure hardware complete it */
1618     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1619               readl, (sts & DMA_GSTS_TES), sts);
1620 
1621     raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1622 }
1623 
1624 static void iommu_disable_translation(struct intel_iommu *iommu)
1625 {
1626     u32 sts;
1627     unsigned long flag;
1628 
1629     if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1630         (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1631         return;
1632 
1633     raw_spin_lock_irqsave(&iommu->register_lock, flag);
1634     iommu->gcmd &= ~DMA_GCMD_TE;
1635     writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1636 
1637     /* Make sure the hardware completes it */
1638     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639               readl, (!(sts & DMA_GSTS_TES)), sts);
1640 
1641     raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1642 }
1643 
1644 static int iommu_init_domains(struct intel_iommu *iommu)
1645 {
1646     u32 ndomains;
1647 
1648     ndomains = cap_ndoms(iommu->cap);
1649     pr_debug("%s: Number of Domains supported <%d>\n",
1650          iommu->name, ndomains);
1651 
1652     spin_lock_init(&iommu->lock);
1653 
1654     iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1655     if (!iommu->domain_ids)
1656         return -ENOMEM;
1657 
1658     /*
1659      * If Caching mode is set, then invalid translations are tagged
1660      * with domain-id 0, hence we need to pre-allocate it. We also
1661      * use domain-id 0 as a marker for non-allocated domain-id, so
1662      * make sure it is not used for a real domain.
1663      */
1664     set_bit(0, iommu->domain_ids);
1665 
1666     /*
1667      * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1668      * entry for first-level or pass-through translation modes should
1669      * be programmed with a domain ID different from those used for
1670      * second-level or nested translation. We reserve a domain ID for
1671      * this purpose.
1672      */
1673     if (sm_supported(iommu))
1674         set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1675 
1676     return 0;
1677 }
1678 
1679 static void disable_dmar_iommu(struct intel_iommu *iommu)
1680 {
1681     if (!iommu->domain_ids)
1682         return;
1683 
1684     /*
1685      * All iommu domains must have been detached from the devices,
1686      * hence there should be no domain IDs in use.
1687      */
1688     if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1689             > NUM_RESERVED_DID))
1690         return;
1691 
1692     if (iommu->gcmd & DMA_GCMD_TE)
1693         iommu_disable_translation(iommu);
1694 }
1695 
1696 static void free_dmar_iommu(struct intel_iommu *iommu)
1697 {
1698     if (iommu->domain_ids) {
1699         bitmap_free(iommu->domain_ids);
1700         iommu->domain_ids = NULL;
1701     }
1702 
1703     if (iommu->copied_tables) {
1704         bitmap_free(iommu->copied_tables);
1705         iommu->copied_tables = NULL;
1706     }
1707 
1708     /* free context mapping */
1709     free_context_table(iommu);
1710 
1711 #ifdef CONFIG_INTEL_IOMMU_SVM
1712     if (pasid_supported(iommu)) {
1713         if (ecap_prs(iommu->ecap))
1714             intel_svm_finish_prq(iommu);
1715     }
1716     if (vccap_pasid(iommu->vccap))
1717         ioasid_unregister_allocator(&iommu->pasid_allocator);
1718 
1719 #endif
1720 }
1721 
1722 /*
1723  * Check and return whether first level is used by default for
1724  * DMA translation.
1725  */
1726 static bool first_level_by_default(unsigned int type)
1727 {
1728     /* Only SL is available in legacy mode */
1729     if (!scalable_mode_support())
1730         return false;
1731 
1732     /* Only one level (either FL or SL) is available, just use it */
1733     if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1734         return intel_cap_flts_sanity();
1735 
1736     /* Both levels are available, decide it based on domain type */
1737     return type != IOMMU_DOMAIN_UNMANAGED;
1738 }
1739 
1740 static struct dmar_domain *alloc_domain(unsigned int type)
1741 {
1742     struct dmar_domain *domain;
1743 
1744     domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1745     if (!domain)
1746         return NULL;
1747 
1748     domain->nid = NUMA_NO_NODE;
1749     if (first_level_by_default(type))
1750         domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1751     domain->has_iotlb_device = false;
1752     INIT_LIST_HEAD(&domain->devices);
1753     spin_lock_init(&domain->lock);
1754     xa_init(&domain->iommu_array);
1755 
1756     return domain;
1757 }
1758 
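     /*
      * Attach the domain to an IOMMU: the per-IOMMU info is kept in an xarray
      * indexed by iommu->seq_id.  The first attach allocates a domain ID from
      * the IOMMU's bitmap; subsequent attaches only bump the refcount.
      */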
1759 static int domain_attach_iommu(struct dmar_domain *domain,
1760                    struct intel_iommu *iommu)
1761 {
1762     struct iommu_domain_info *info, *curr;
1763     unsigned long ndomains;
1764     int num, ret = -ENOSPC;
1765 
1766     info = kzalloc(sizeof(*info), GFP_KERNEL);
1767     if (!info)
1768         return -ENOMEM;
1769 
1770     spin_lock(&iommu->lock);
1771     curr = xa_load(&domain->iommu_array, iommu->seq_id);
1772     if (curr) {
1773         curr->refcnt++;
1774         spin_unlock(&iommu->lock);
1775         kfree(info);
1776         return 0;
1777     }
1778 
1779     ndomains = cap_ndoms(iommu->cap);
1780     num = find_first_zero_bit(iommu->domain_ids, ndomains);
1781     if (num >= ndomains) {
1782         pr_err("%s: No free domain ids\n", iommu->name);
1783         goto err_unlock;
1784     }
1785 
1786     set_bit(num, iommu->domain_ids);
1787     info->refcnt    = 1;
1788     info->did   = num;
1789     info->iommu = iommu;
1790     curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1791               NULL, info, GFP_ATOMIC);
1792     if (curr) {
1793         ret = xa_err(curr) ? : -EBUSY;
1794         goto err_clear;
1795     }
1796     domain_update_iommu_cap(domain);
1797 
1798     spin_unlock(&iommu->lock);
1799     return 0;
1800 
1801 err_clear:
1802     clear_bit(info->did, iommu->domain_ids);
1803 err_unlock:
1804     spin_unlock(&iommu->lock);
1805     kfree(info);
1806     return ret;
1807 }
1808 
1809 static void domain_detach_iommu(struct dmar_domain *domain,
1810                 struct intel_iommu *iommu)
1811 {
1812     struct iommu_domain_info *info;
1813 
1814     spin_lock(&iommu->lock);
1815     info = xa_load(&domain->iommu_array, iommu->seq_id);
1816     if (--info->refcnt == 0) {
1817         clear_bit(info->did, iommu->domain_ids);
1818         xa_erase(&domain->iommu_array, iommu->seq_id);
1819         domain->nid = NUMA_NO_NODE;
1820         domain_update_iommu_cap(domain);
1821         kfree(info);
1822     }
1823     spin_unlock(&iommu->lock);
1824 }
1825 
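     /*
      * Round a guest address width up to an adjusted width that the page-table
      * walk can express: 12 bits of page offset plus a multiple of the 9-bit
      * per-level stride, capped at 64.  For example, gaw = 48 is already a
      * valid width ((48 - 12) % 9 == 0), while gaw = 50 rounds up to 57.
      */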
1826 static inline int guestwidth_to_adjustwidth(int gaw)
1827 {
1828     int agaw;
1829     int r = (gaw - 12) % 9;
1830 
1831     if (r == 0)
1832         agaw = gaw;
1833     else
1834         agaw = gaw + 9 - r;
1835     if (agaw > 64)
1836         agaw = 64;
1837     return agaw;
1838 }
1839 
1840 static void domain_exit(struct dmar_domain *domain)
1841 {
1842     if (domain->pgd) {
1843         LIST_HEAD(freelist);
1844 
1845         domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1846         put_pages_list(&freelist);
1847     }
1848 
1849     if (WARN_ON(!list_empty(&domain->devices)))
1850         return;
1851 
1852     kfree(domain);
1853 }
1854 
1855 /*
1856  * Get the PASID directory size for a scalable mode context entry.
1857  * A value of X in the PDTS field of a scalable mode context entry
1858  * indicates a PASID directory with 2^(X + 7) entries.
1859  */
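 /*
  * For example, with a maximum of 64K PASIDs and 64 PASIDs per directory
  * entry (i.e. assuming PASID_PDE_SHIFT is 6), max_pde is 1024, the first
  * set bit is bit 10, and the function returns 3: a PDTS value of 3 encodes
  * a directory of 2^(3 + 7) = 1024 entries.
  */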
1860 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1861 {
1862     unsigned long pds, max_pde;
1863 
1864     max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1865     pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1866     if (pds < 7)
1867         return 0;
1868 
1869     return pds - 7;
1870 }
1871 
1872 /*
1873  * Set the RID_PASID field of a scalable mode context entry. The
1874  * IOMMU hardware will use the PASID value set in this field for
1875  * DMA translations of DMA requests without PASID.
1876  */
1877 static inline void
1878 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1879 {
1880     context->hi |= pasid & ((1 << 20) - 1);
1881 }
1882 
1883 /*
1884  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1885  * entry.
1886  */
1887 static inline void context_set_sm_dte(struct context_entry *context)
1888 {
1889     context->lo |= (1 << 2);
1890 }
1891 
1892 /*
1893  * Set the PRE(Page Request Enable) field of a scalable mode context
1894  * entry.
1895  */
1896 static inline void context_set_sm_pre(struct context_entry *context)
1897 {
1898     context->lo |= (1 << 4);
1899 }
1900 
1901 /* Convert value to context PASID directory size field coding. */
1902 #define context_pdts(pds)   (((pds) & 0x7) << 9)
1903 
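 /*
  * Install the context entry for (bus, devfn) on the given IOMMU.  In
  * scalable mode the entry points at the device's PASID directory; in
  * legacy mode it points at the domain's second-level page table (or
  * selects pass-through).  Cache flushing afterwards depends on caching
  * mode, as noted below.
  */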
1904 static int domain_context_mapping_one(struct dmar_domain *domain,
1905                       struct intel_iommu *iommu,
1906                       struct pasid_table *table,
1907                       u8 bus, u8 devfn)
1908 {
1909     struct device_domain_info *info =
1910             iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1911     u16 did = domain_id_iommu(domain, iommu);
1912     int translation = CONTEXT_TT_MULTI_LEVEL;
1913     struct context_entry *context;
1914     int ret;
1915 
1916     WARN_ON(did == 0);
1917 
1918     if (hw_pass_through && domain_type_is_si(domain))
1919         translation = CONTEXT_TT_PASS_THROUGH;
1920 
1921     pr_debug("Set context mapping for %02x:%02x.%d\n",
1922         bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1923 
1924     BUG_ON(!domain->pgd);
1925 
1926     spin_lock(&iommu->lock);
1927     ret = -ENOMEM;
1928     context = iommu_context_addr(iommu, bus, devfn, 1);
1929     if (!context)
1930         goto out_unlock;
1931 
1932     ret = 0;
1933     if (context_present(context) && !context_copied(iommu, bus, devfn))
1934         goto out_unlock;
1935 
1936     /*
1937      * For kdump cases, old valid entries may be cached due to the
1938      * in-flight DMA and copied pgtable, but there is no unmapping
1939      * behaviour for them, so we need an explicit cache flush for
1940      * the newly-mapped device. For kdump, at this point, the device
1941      * is supposed to have finished its reset at driver probe time, so no
1942      * in-flight DMA will exist and no further flushing is needed
1943      * afterwards.
1944      */
1945     if (context_copied(iommu, bus, devfn)) {
1946         u16 did_old = context_domain_id(context);
1947 
1948         if (did_old < cap_ndoms(iommu->cap)) {
1949             iommu->flush.flush_context(iommu, did_old,
1950                            (((u16)bus) << 8) | devfn,
1951                            DMA_CCMD_MASK_NOBIT,
1952                            DMA_CCMD_DEVICE_INVL);
1953             iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1954                          DMA_TLB_DSI_FLUSH);
1955         }
1956 
1957         clear_context_copied(iommu, bus, devfn);
1958     }
1959 
1960     context_clear_entry(context);
1961 
1962     if (sm_supported(iommu)) {
1963         unsigned long pds;
1964 
1965         WARN_ON(!table);
1966 
1967         /* Setup the PASID DIR pointer: */
1968         pds = context_get_sm_pds(table);
1969         context->lo = (u64)virt_to_phys(table->table) |
1970                 context_pdts(pds);
1971 
1972         /* Setup the RID_PASID field: */
1973         context_set_sm_rid2pasid(context, PASID_RID2PASID);
1974 
1975         /*
1976          * Setup the Device-TLB enable bit and Page request
1977          * Enable bit:
1978          */
1979         if (info && info->ats_supported)
1980             context_set_sm_dte(context);
1981         if (info && info->pri_supported)
1982             context_set_sm_pre(context);
1983     } else {
1984         struct dma_pte *pgd = domain->pgd;
1985         int agaw;
1986 
1987         context_set_domain_id(context, did);
1988 
1989         if (translation != CONTEXT_TT_PASS_THROUGH) {
1990             /*
1991              * Skip top levels of the page tables for an IOMMU whose
1992              * AGAW is smaller than the domain's. Unnecessary for PT mode.
1993              */
1994             for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1995                 ret = -ENOMEM;
1996                 pgd = phys_to_virt(dma_pte_addr(pgd));
1997                 if (!dma_pte_present(pgd))
1998                     goto out_unlock;
1999             }
2000 
2001             if (info && info->ats_supported)
2002                 translation = CONTEXT_TT_DEV_IOTLB;
2003             else
2004                 translation = CONTEXT_TT_MULTI_LEVEL;
2005 
2006             context_set_address_root(context, virt_to_phys(pgd));
2007             context_set_address_width(context, agaw);
2008         } else {
2009             /*
2010              * In pass through mode, AW must be programmed to
2011              * indicate the largest AGAW value supported by
2012              * hardware. And ASR is ignored by hardware.
2013              */
2014             context_set_address_width(context, iommu->msagaw);
2015         }
2016 
2017         context_set_translation_type(context, translation);
2018     }
2019 
2020     context_set_fault_enable(context);
2021     context_set_present(context);
2022     if (!ecap_coherent(iommu->ecap))
2023         clflush_cache_range(context, sizeof(*context));
2024 
2025     /*
2026      * It's a non-present to present mapping. If the hardware doesn't cache
2027      * non-present entries, we only need to flush the write-buffer. If it
2028      * _does_ cache non-present entries, then it does so in the special
2029      * domain #0, which we have to flush:
2030      */
2031     if (cap_caching_mode(iommu->cap)) {
2032         iommu->flush.flush_context(iommu, 0,
2033                        (((u16)bus) << 8) | devfn,
2034                        DMA_CCMD_MASK_NOBIT,
2035                        DMA_CCMD_DEVICE_INVL);
2036         iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2037     } else {
2038         iommu_flush_write_buffer(iommu);
2039     }
2040     iommu_enable_dev_iotlb(info);
2041 
2042     ret = 0;
2043 
2044 out_unlock:
2045     spin_unlock(&iommu->lock);
2046 
2047     return ret;
2048 }
2049 
2050 struct domain_context_mapping_data {
2051     struct dmar_domain *domain;
2052     struct intel_iommu *iommu;
2053     struct pasid_table *table;
2054 };
2055 
2056 static int domain_context_mapping_cb(struct pci_dev *pdev,
2057                      u16 alias, void *opaque)
2058 {
2059     struct domain_context_mapping_data *data = opaque;
2060 
2061     return domain_context_mapping_one(data->domain, data->iommu,
2062                       data->table, PCI_BUS_NUM(alias),
2063                       alias & 0xff);
2064 }
2065 
2066 static int
2067 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2068 {
2069     struct domain_context_mapping_data data;
2070     struct pasid_table *table;
2071     struct intel_iommu *iommu;
2072     u8 bus, devfn;
2073 
2074     iommu = device_to_iommu(dev, &bus, &devfn);
2075     if (!iommu)
2076         return -ENODEV;
2077 
2078     table = intel_pasid_get_table(dev);
2079 
2080     if (!dev_is_pci(dev))
2081         return domain_context_mapping_one(domain, iommu, table,
2082                           bus, devfn);
2083 
2084     data.domain = domain;
2085     data.iommu = iommu;
2086     data.table = table;
2087 
2088     return pci_for_each_dma_alias(to_pci_dev(dev),
2089                       &domain_context_mapping_cb, &data);
2090 }
2091 
2092 static int domain_context_mapped_cb(struct pci_dev *pdev,
2093                     u16 alias, void *opaque)
2094 {
2095     struct intel_iommu *iommu = opaque;
2096 
2097     return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2098 }
2099 
2100 static int domain_context_mapped(struct device *dev)
2101 {
2102     struct intel_iommu *iommu;
2103     u8 bus, devfn;
2104 
2105     iommu = device_to_iommu(dev, &bus, &devfn);
2106     if (!iommu)
2107         return -ENODEV;
2108 
2109     if (!dev_is_pci(dev))
2110         return device_context_mapped(iommu, bus, devfn);
2111 
2112     return !pci_for_each_dma_alias(to_pci_dev(dev),
2113                        domain_context_mapped_cb, iommu);
2114 }
2115 
2116 /* Returns a number of VTD pages, but aligned to MM page size */
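 /*
  * For example, with 4KiB MM and VT-d pages, host_addr = 0x1234 and
  * size = 0x2000 leave an in-page offset of 0x234; 0x234 + 0x2000 rounds
  * up to 0x3000, i.e. three VT-d pages.
  */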
2117 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2118                         size_t size)
2119 {
2120     host_addr &= ~PAGE_MASK;
2121     return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2122 }
2123 
2124 /* Return largest possible superpage level for a given mapping */
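 /*
  * For example, with a 9-bit stride, if both the IOVA and physical PFNs
  * are 512-page aligned, at least 512 pages are being mapped, and the
  * domain supports one level of superpages, this returns level 2 (2MiB
  * pages, assuming a 4KiB base page size).
  */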
2125 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2126                       unsigned long iov_pfn,
2127                       unsigned long phy_pfn,
2128                       unsigned long pages)
2129 {
2130     int support, level = 1;
2131     unsigned long pfnmerge;
2132 
2133     support = domain->iommu_superpage;
2134 
2135     /* To use a large page, the virtual *and* physical addresses
2136        must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2137        of them will mean we have to use smaller pages. So just
2138        merge them and check both at once. */
2139     pfnmerge = iov_pfn | phy_pfn;
2140 
2141     while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2142         pages >>= VTD_STRIDE_SHIFT;
2143         if (!pages)
2144             break;
2145         pfnmerge >>= VTD_STRIDE_SHIFT;
2146         level++;
2147         support--;
2148     }
2149     return level;
2150 }
2151 
2152 /*
2153  * Ensure that old small page tables are removed to make room for superpage(s).
2154  * We're going to add new large pages, so make sure we don't remove their parent
2155  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2156  */
2157 static void switch_to_super_page(struct dmar_domain *domain,
2158                  unsigned long start_pfn,
2159                  unsigned long end_pfn, int level)
2160 {
2161     unsigned long lvl_pages = lvl_to_nr_pages(level);
2162     struct iommu_domain_info *info;
2163     struct dma_pte *pte = NULL;
2164     unsigned long i;
2165 
2166     while (start_pfn <= end_pfn) {
2167         if (!pte)
2168             pte = pfn_to_dma_pte(domain, start_pfn, &level);
2169 
2170         if (dma_pte_present(pte)) {
2171             dma_pte_free_pagetable(domain, start_pfn,
2172                            start_pfn + lvl_pages - 1,
2173                            level + 1);
2174 
2175             xa_for_each(&domain->iommu_array, i, info)
2176                 iommu_flush_iotlb_psi(info->iommu, domain,
2177                               start_pfn, lvl_pages,
2178                               0, 0);
2179         }
2180 
2181         pte++;
2182         start_pfn += lvl_pages;
2183         if (first_pte_in_page(pte))
2184             pte = NULL;
2185     }
2186 }
2187 
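 /*
  * Map nr_pages starting at iov_pfn to the physical range starting at
  * phys_pfn.  Each iteration picks the largest superpage level that the
  * alignment and remaining length allow, clears any existing small-page
  * tables in that range, fills PTEs with cmpxchg so an already-set entry
  * is caught, and flushes the CPU cache once per PTE page when needed.
  */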
2188 static int
2189 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2190          unsigned long phys_pfn, unsigned long nr_pages, int prot)
2191 {
2192     struct dma_pte *first_pte = NULL, *pte = NULL;
2193     unsigned int largepage_lvl = 0;
2194     unsigned long lvl_pages = 0;
2195     phys_addr_t pteval;
2196     u64 attr;
2197 
2198     BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2199 
2200     if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2201         return -EINVAL;
2202 
2203     attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2204     attr |= DMA_FL_PTE_PRESENT;
2205     if (domain_use_first_level(domain)) {
2206         attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2207         if (prot & DMA_PTE_WRITE)
2208             attr |= DMA_FL_PTE_DIRTY;
2209     }
2210 
2211     pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2212 
2213     while (nr_pages > 0) {
2214         uint64_t tmp;
2215 
2216         if (!pte) {
2217             largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2218                     phys_pfn, nr_pages);
2219 
2220             pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2221             if (!pte)
2222                 return -ENOMEM;
2223             first_pte = pte;
2224 
2225             lvl_pages = lvl_to_nr_pages(largepage_lvl);
2226 
2227             /* It is a large page */
2228             if (largepage_lvl > 1) {
2229                 unsigned long end_pfn;
2230                 unsigned long pages_to_remove;
2231 
2232                 pteval |= DMA_PTE_LARGE_PAGE;
2233                 pages_to_remove = min_t(unsigned long, nr_pages,
2234                             nr_pte_to_next_page(pte) * lvl_pages);
2235                 end_pfn = iov_pfn + pages_to_remove - 1;
2236                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2237             } else {
2238                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2239             }
2240 
2241         }
2242         /* We don't need a lock here; nobody else
2243          * touches this IOVA range.
2244          */
2245         tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2246         if (tmp) {
2247             static int dumps = 5;
2248             pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2249                 iov_pfn, tmp, (unsigned long long)pteval);
2250             if (dumps) {
2251                 dumps--;
2252                 debug_dma_dump_mappings(NULL);
2253             }
2254             WARN_ON(1);
2255         }
2256 
2257         nr_pages -= lvl_pages;
2258         iov_pfn += lvl_pages;
2259         phys_pfn += lvl_pages;
2260         pteval += lvl_pages * VTD_PAGE_SIZE;
2261 
2262         /* If the next PTE would be the first in a new page, then we
2263          * need to flush the cache on the entries we've just written.
2264          * And then we'll need to recalculate 'pte', so clear it and
2265          * let it get set again in the if (!pte) block above.
2266          *
2267          * If we're done (!nr_pages) we need to flush the cache too.
2268          *
2269          * Also if we've been setting superpages, we may need to
2270          * recalculate 'pte' and switch back to smaller pages for the
2271          * end of the mapping, if the trailing size is not enough to
2272          * use another superpage (i.e. nr_pages < lvl_pages).
2273          */
2274         pte++;
2275         if (!nr_pages || first_pte_in_page(pte) ||
2276             (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2277             domain_flush_cache(domain, first_pte,
2278                        (void *)pte - (void *)first_pte);
2279             pte = NULL;
2280         }
2281     }
2282 
2283     return 0;
2284 }
2285 
2286 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2287 {
2288     struct intel_iommu *iommu = info->iommu;
2289     struct context_entry *context;
2290     u16 did_old;
2291 
2292     if (!iommu)
2293         return;
2294 
2295     spin_lock(&iommu->lock);
2296     context = iommu_context_addr(iommu, bus, devfn, 0);
2297     if (!context) {
2298         spin_unlock(&iommu->lock);
2299         return;
2300     }
2301 
2302     if (sm_supported(iommu)) {
2303         if (hw_pass_through && domain_type_is_si(info->domain))
2304             did_old = FLPT_DEFAULT_DID;
2305         else
2306             did_old = domain_id_iommu(info->domain, iommu);
2307     } else {
2308         did_old = context_domain_id(context);
2309     }
2310 
2311     context_clear_entry(context);
2312     __iommu_flush_cache(iommu, context, sizeof(*context));
2313     spin_unlock(&iommu->lock);
2314     iommu->flush.flush_context(iommu,
2315                    did_old,
2316                    (((u16)bus) << 8) | devfn,
2317                    DMA_CCMD_MASK_NOBIT,
2318                    DMA_CCMD_DEVICE_INVL);
2319 
2320     if (sm_supported(iommu))
2321         qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2322 
2323     iommu->flush.flush_iotlb(iommu,
2324                  did_old,
2325                  0,
2326                  0,
2327                  DMA_TLB_DSI_FLUSH);
2328 
2329     __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2330 }
2331 
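 /*
  * Program a first-level (scalable mode) PASID table entry for the given
  * PASID, pointing at the domain's page table.  The paging level derived
  * from the AGAW must be 4 or 5; level 5 selects the 5-level paging flag.
  */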
2332 static int domain_setup_first_level(struct intel_iommu *iommu,
2333                     struct dmar_domain *domain,
2334                     struct device *dev,
2335                     u32 pasid)
2336 {
2337     struct dma_pte *pgd = domain->pgd;
2338     int agaw, level;
2339     int flags = 0;
2340 
2341     /*
2342      * Skip top levels of the page tables for an IOMMU whose
2343      * AGAW is smaller than the domain's. Unnecessary for PT mode.
2344      */
2345     for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2346         pgd = phys_to_virt(dma_pte_addr(pgd));
2347         if (!dma_pte_present(pgd))
2348             return -ENOMEM;
2349     }
2350 
2351     level = agaw_to_level(agaw);
2352     if (level != 4 && level != 5)
2353         return -EINVAL;
2354 
2355     if (pasid != PASID_RID2PASID)
2356         flags |= PASID_FLAG_SUPERVISOR_MODE;
2357     if (level == 5)
2358         flags |= PASID_FLAG_FL5LP;
2359 
2360     if (domain->force_snooping)
2361         flags |= PASID_FLAG_PAGE_SNOOP;
2362 
2363     return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2364                          domain_id_iommu(domain, iommu),
2365                          flags);
2366 }
2367 
2368 static bool dev_is_real_dma_subdevice(struct device *dev)
2369 {
2370     return dev && dev_is_pci(dev) &&
2371            pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2372 }
2373 
2374 static int iommu_domain_identity_map(struct dmar_domain *domain,
2375                      unsigned long first_vpfn,
2376                      unsigned long last_vpfn)
2377 {
2378     /*
2379      * RMRR range might have overlap with physical memory range,
2380      * clear it first
2381      */
2382     dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2383 
2384     return __domain_mapping(domain, first_vpfn,
2385                 first_vpfn, last_vpfn - first_vpfn + 1,
2386                 DMA_PTE_READ|DMA_PTE_WRITE);
2387 }
2388 
2389 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2390 
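 /*
  * Build the static identity (si) domain.  With hardware pass-through
  * (hw != 0) no page tables are needed; otherwise every online memory
  * range and every RMRR is identity-mapped so that devices with RMRRs
  * can also use the si_domain.
  */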
2391 static int __init si_domain_init(int hw)
2392 {
2393     struct dmar_rmrr_unit *rmrr;
2394     struct device *dev;
2395     int i, nid, ret;
2396 
2397     si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2398     if (!si_domain)
2399         return -EFAULT;
2400 
2401     if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2402         domain_exit(si_domain);
2403         return -EFAULT;
2404     }
2405 
2406     if (hw)
2407         return 0;
2408 
2409     for_each_online_node(nid) {
2410         unsigned long start_pfn, end_pfn;
2411         int i;
2412 
2413         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2414             ret = iommu_domain_identity_map(si_domain,
2415                     mm_to_dma_pfn(start_pfn),
2416                     mm_to_dma_pfn(end_pfn));
2417             if (ret)
2418                 return ret;
2419         }
2420     }
2421 
2422     /*
2423      * Identity map the RMRRs so that devices with RMRRs could also use
2424      * the si_domain.
2425      */
2426     for_each_rmrr_units(rmrr) {
2427         for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2428                       i, dev) {
2429             unsigned long long start = rmrr->base_address;
2430             unsigned long long end = rmrr->end_address;
2431 
2432             if (WARN_ON(end < start ||
2433                     end >> agaw_to_width(si_domain->agaw)))
2434                 continue;
2435 
2436             ret = iommu_domain_identity_map(si_domain,
2437                     mm_to_dma_pfn(start >> PAGE_SHIFT),
2438                     mm_to_dma_pfn(end >> PAGE_SHIFT));
2439             if (ret)
2440                 return ret;
2441         }
2442     }
2443 
2444     return 0;
2445 }
2446 
2447 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2448 {
2449     struct device_domain_info *info = dev_iommu_priv_get(dev);
2450     struct intel_iommu *iommu;
2451     unsigned long flags;
2452     u8 bus, devfn;
2453     int ret;
2454 
2455     iommu = device_to_iommu(dev, &bus, &devfn);
2456     if (!iommu)
2457         return -ENODEV;
2458 
2459     ret = domain_attach_iommu(domain, iommu);
2460     if (ret)
2461         return ret;
2462     info->domain = domain;
2463     spin_lock_irqsave(&domain->lock, flags);
2464     list_add(&info->link, &domain->devices);
2465     spin_unlock_irqrestore(&domain->lock, flags);
2466 
2467     /* PASID table is mandatory for a PCI device in scalable mode. */
2468     if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2469         ret = intel_pasid_alloc_table(dev);
2470         if (ret) {
2471             dev_err(dev, "PASID table allocation failed\n");
2472             dmar_remove_one_dev_info(dev);
2473             return ret;
2474         }
2475 
2476         /* Setup the PASID entry for requests without PASID: */
2477         if (hw_pass_through && domain_type_is_si(domain))
2478             ret = intel_pasid_setup_pass_through(iommu, domain,
2479                     dev, PASID_RID2PASID);
2480         else if (domain_use_first_level(domain))
2481             ret = domain_setup_first_level(iommu, domain, dev,
2482                     PASID_RID2PASID);
2483         else
2484             ret = intel_pasid_setup_second_level(iommu, domain,
2485                     dev, PASID_RID2PASID);
2486         if (ret) {
2487             dev_err(dev, "Setup RID2PASID failed\n");
2488             dmar_remove_one_dev_info(dev);
2489             return ret;
2490         }
2491     }
2492 
2493     ret = domain_context_mapping(domain, dev);
2494     if (ret) {
2495         dev_err(dev, "Domain context map failed\n");
2496         dmar_remove_one_dev_info(dev);
2497         return ret;
2498     }
2499 
2500     return 0;
2501 }
2502 
2503 static bool device_has_rmrr(struct device *dev)
2504 {
2505     struct dmar_rmrr_unit *rmrr;
2506     struct device *tmp;
2507     int i;
2508 
2509     rcu_read_lock();
2510     for_each_rmrr_units(rmrr) {
2511         /*
2512          * Return TRUE if this RMRR contains the device that
2513          * is passed in.
2514          */
2515         for_each_active_dev_scope(rmrr->devices,
2516                       rmrr->devices_cnt, i, tmp)
2517             if (tmp == dev ||
2518                 is_downstream_to_pci_bridge(dev, tmp)) {
2519                 rcu_read_unlock();
2520                 return true;
2521             }
2522     }
2523     rcu_read_unlock();
2524     return false;
2525 }
2526 
2527 /**
2528  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2529  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2530  * @dev: device handle
2531  *
2532  * We assume that PCI USB devices with RMRRs have them largely
2533  * for historical reasons and that the RMRR space is not actively used post
2534  * boot.  This exclusion may change if vendors begin to abuse it.
2535  *
2536  * The same exception is made for graphics devices, with the requirement that
2537  * any use of the RMRR regions will be torn down before assigning the device
2538  * to a guest.
2539  *
2540  * Return: true if the RMRR is relaxable, false otherwise
2541  */
2542 static bool device_rmrr_is_relaxable(struct device *dev)
2543 {
2544     struct pci_dev *pdev;
2545 
2546     if (!dev_is_pci(dev))
2547         return false;
2548 
2549     pdev = to_pci_dev(dev);
2550     if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2551         return true;
2552     else
2553         return false;
2554 }
2555 
2556 /*
2557  * There are a couple cases where we need to restrict the functionality of
2558  * devices associated with RMRRs.  The first is when evaluating a device for
2559  * identity mapping because problems exist when devices are moved in and out
2560  * of domains and their respective RMRR information is lost.  This means that
2561  * a device with associated RMRRs will never be in a "passthrough" domain.
2562  * The second is use of the device through the IOMMU API.  This interface
2563  * expects to have full control of the IOVA space for the device.  We cannot
2564  * satisfy both the requirement that RMRR access is maintained and have an
2565  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2566  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2567  * We therefore prevent devices associated with an RMRR from participating in
2568  * the IOMMU API, which eliminates them from device assignment.
2569  *
2570  * In both cases, devices which have relaxable RMRRs are not concerned by this
2571  * restriction. See device_rmrr_is_relaxable comment.
2572  */
2573 static bool device_is_rmrr_locked(struct device *dev)
2574 {
2575     if (!device_has_rmrr(dev))
2576         return false;
2577 
2578     if (device_rmrr_is_relaxable(dev))
2579         return false;
2580 
2581     return true;
2582 }
2583 
2584 /*
2585  * Return the required default domain type for a specific device.
2586  *
2587  * @dev: the device in query
2588  * @startup: true if this is during early boot
2589  *
2590  * Returns:
2591  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2592  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2593  *  - 0: both identity and dynamic domains work for this device
2594  */
2595 static int device_def_domain_type(struct device *dev)
2596 {
2597     if (dev_is_pci(dev)) {
2598         struct pci_dev *pdev = to_pci_dev(dev);
2599 
2600         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2601             return IOMMU_DOMAIN_IDENTITY;
2602 
2603         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2604             return IOMMU_DOMAIN_IDENTITY;
2605     }
2606 
2607     return 0;
2608 }
2609 
2610 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2611 {
2612     /*
2613      * Start from a sane IOMMU hardware state.
2614      * If queued invalidation was already initialized by us
2615      * (for example, while enabling interrupt remapping), then
2616      * things are already rolling from a sane state.
2617      */
2618     if (!iommu->qi) {
2619         /*
2620          * Clear any previous faults.
2621          */
2622         dmar_fault(-1, iommu);
2623         /*
2624          * Disable queued invalidation if supported and already enabled
2625          * before OS handover.
2626          */
2627         dmar_disable_qi(iommu);
2628     }
2629 
2630     if (dmar_enable_qi(iommu)) {
2631         /*
2632          * Queued Invalidate not enabled, use Register Based Invalidate
2633          */
2634         iommu->flush.flush_context = __iommu_flush_context;
2635         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2636         pr_info("%s: Using Register based invalidation\n",
2637             iommu->name);
2638     } else {
2639         iommu->flush.flush_context = qi_flush_context;
2640         iommu->flush.flush_iotlb = qi_flush_iotlb;
2641         pr_info("%s: Using Queued invalidation\n", iommu->name);
2642     }
2643 }
2644 
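 /*
  * Copy one bus's context table from the previous (kdump'd) kernel.  In
  * scalable ("ext") mode each bus has two halves (devfn < 0x80 under the
  * lower context table pointer, the rest under the upper one), so two new
  * tables may be produced.  Domain IDs found in present entries are
  * reserved and the entries are marked as copied so that later context
  * programming can flush the stale cached entries.
  */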
2645 static int copy_context_table(struct intel_iommu *iommu,
2646                   struct root_entry *old_re,
2647                   struct context_entry **tbl,
2648                   int bus, bool ext)
2649 {
2650     int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2651     struct context_entry *new_ce = NULL, ce;
2652     struct context_entry *old_ce = NULL;
2653     struct root_entry re;
2654     phys_addr_t old_ce_phys;
2655 
2656     tbl_idx = ext ? bus * 2 : bus;
2657     memcpy(&re, old_re, sizeof(re));
2658 
2659     for (devfn = 0; devfn < 256; devfn++) {
2660         /* First calculate the correct index */
2661         idx = (ext ? devfn * 2 : devfn) % 256;
2662 
2663         if (idx == 0) {
2664             /* First save what we may have and clean up */
2665             if (new_ce) {
2666                 tbl[tbl_idx] = new_ce;
2667                 __iommu_flush_cache(iommu, new_ce,
2668                             VTD_PAGE_SIZE);
2669                 pos = 1;
2670             }
2671 
2672             if (old_ce)
2673                 memunmap(old_ce);
2674 
2675             ret = 0;
2676             if (devfn < 0x80)
2677                 old_ce_phys = root_entry_lctp(&re);
2678             else
2679                 old_ce_phys = root_entry_uctp(&re);
2680 
2681             if (!old_ce_phys) {
2682                 if (ext && devfn == 0) {
2683                     /* No LCTP, try UCTP */
2684                     devfn = 0x7f;
2685                     continue;
2686                 } else {
2687                     goto out;
2688                 }
2689             }
2690 
2691             ret = -ENOMEM;
2692             old_ce = memremap(old_ce_phys, PAGE_SIZE,
2693                     MEMREMAP_WB);
2694             if (!old_ce)
2695                 goto out;
2696 
2697             new_ce = alloc_pgtable_page(iommu->node);
2698             if (!new_ce)
2699                 goto out_unmap;
2700 
2701             ret = 0;
2702         }
2703 
2704         /* Now copy the context entry */
2705         memcpy(&ce, old_ce + idx, sizeof(ce));
2706 
2707         if (!context_present(&ce))
2708             continue;
2709 
2710         did = context_domain_id(&ce);
2711         if (did >= 0 && did < cap_ndoms(iommu->cap))
2712             set_bit(did, iommu->domain_ids);
2713 
2714         set_context_copied(iommu, bus, devfn);
2715         new_ce[idx] = ce;
2716     }
2717 
2718     tbl[tbl_idx + pos] = new_ce;
2719 
2720     __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2721 
2722 out_unmap:
2723     memunmap(old_ce);
2724 
2725 out:
2726     return ret;
2727 }
2728 
2729 static int copy_translation_tables(struct intel_iommu *iommu)
2730 {
2731     struct context_entry **ctxt_tbls;
2732     struct root_entry *old_rt;
2733     phys_addr_t old_rt_phys;
2734     int ctxt_table_entries;
2735     u64 rtaddr_reg;
2736     int bus, ret;
2737     bool new_ext, ext;
2738 
2739     rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2740     ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2741     new_ext    = !!sm_supported(iommu);
2742 
2743     /*
2744      * The RTT bit can only be changed when translation is disabled,
2745      * but disabling translation would open a window for data
2746      * corruption. So bail out and don't copy anything if we would
2747      * have to change the bit.
2748      */
2749     if (new_ext != ext)
2750         return -EINVAL;
2751 
2752     iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2753     if (!iommu->copied_tables)
2754         return -ENOMEM;
2755 
2756     old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2757     if (!old_rt_phys)
2758         return -EINVAL;
2759 
2760     old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2761     if (!old_rt)
2762         return -ENOMEM;
2763 
2764     /* This is too big for the stack - allocate it from slab */
2765     ctxt_table_entries = ext ? 512 : 256;
2766     ret = -ENOMEM;
2767     ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2768     if (!ctxt_tbls)
2769         goto out_unmap;
2770 
2771     for (bus = 0; bus < 256; bus++) {
2772         ret = copy_context_table(iommu, &old_rt[bus],
2773                      ctxt_tbls, bus, ext);
2774         if (ret) {
2775             pr_err("%s: Failed to copy context table for bus %d\n",
2776                 iommu->name, bus);
2777             continue;
2778         }
2779     }
2780 
2781     spin_lock(&iommu->lock);
2782 
2783     /* Context tables are copied, now write them to the root_entry table */
2784     for (bus = 0; bus < 256; bus++) {
2785         int idx = ext ? bus * 2 : bus;
2786         u64 val;
2787 
2788         if (ctxt_tbls[idx]) {
2789             val = virt_to_phys(ctxt_tbls[idx]) | 1;
2790             iommu->root_entry[bus].lo = val;
2791         }
2792 
2793         if (!ext || !ctxt_tbls[idx + 1])
2794             continue;
2795 
2796         val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2797         iommu->root_entry[bus].hi = val;
2798     }
2799 
2800     spin_unlock(&iommu->lock);
2801 
2802     kfree(ctxt_tbls);
2803 
2804     __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2805 
2806     ret = 0;
2807 
2808 out_unmap:
2809     memunmap(old_rt);
2810 
2811     return ret;
2812 }
2813 
2814 #ifdef CONFIG_INTEL_IOMMU_SVM
2815 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2816 {
2817     struct intel_iommu *iommu = data;
2818     ioasid_t ioasid;
2819 
2820     if (!iommu)
2821         return INVALID_IOASID;
2822     /*
2823      * The VT-d virtual command interface always uses the full 20-bit
2824      * PASID range. The host can partition the guest PASID range based on
2825      * policies, but this is out of the guest's control.
2826      */
2827     if (min < PASID_MIN || max > intel_pasid_max_id)
2828         return INVALID_IOASID;
2829 
2830     if (vcmd_alloc_pasid(iommu, &ioasid))
2831         return INVALID_IOASID;
2832 
2833     return ioasid;
2834 }
2835 
2836 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2837 {
2838     struct intel_iommu *iommu = data;
2839 
2840     if (!iommu)
2841         return;
2842     /*
2843      * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
2844      * We can only free the PASID when all the devices are unbound.
2845      */
2846     if (ioasid_find(NULL, ioasid, NULL)) {
2847         pr_alert("Cannot free active IOASID %d\n", ioasid);
2848         return;
2849     }
2850     vcmd_free_pasid(iommu, ioasid);
2851 }
2852 
2853 static void register_pasid_allocator(struct intel_iommu *iommu)
2854 {
2855     /*
2856      * If we are running in the host, there is no need for a custom
2857      * allocator because PASIDs are allocated host system-wide.
2858      */
2859     if (!cap_caching_mode(iommu->cap))
2860         return;
2861 
2862     if (!sm_supported(iommu)) {
2863         pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2864         return;
2865     }
2866 
2867     /*
2868      * Register a custom PASID allocator if we are running in a guest;
2869      * guest PASIDs must be obtained via the virtual command interface.
2870      * There can be multiple vIOMMUs in each guest but only one allocator
2871      * is active. All vIOMMU allocators will eventually be calling the same
2872      * host allocator.
2873      */
2874     if (!vccap_pasid(iommu->vccap))
2875         return;
2876 
2877     pr_info("Register custom PASID allocator\n");
2878     iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2879     iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2880     iommu->pasid_allocator.pdata = (void *)iommu;
2881     if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2882         pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2883         /*
2884          * Disable scalable mode on this IOMMU if there
2885          * is no custom allocator. Mixing SM-capable vIOMMUs
2886          * and non-SM vIOMMUs is not supported.
2887          */
2888         intel_iommu_sm = 0;
2889     }
2890 }
2891 #endif
2892 
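 /*
  * Boot-time initialization of all DMAR units: audit capabilities, set up
  * queued invalidation and the domain-ID bitmap for each IOMMU, allocate
  * (or copy, in the kdump case) the root/context tables, program the root
  * entry, build the si_domain, and finally enable the page request queue
  * (where supported) and fault interrupts per IOMMU.
  */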
2893 static int __init init_dmars(void)
2894 {
2895     struct dmar_drhd_unit *drhd;
2896     struct intel_iommu *iommu;
2897     int ret;
2898 
2899     ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2900     if (ret)
2901         goto free_iommu;
2902 
2903     for_each_iommu(iommu, drhd) {
2904         if (drhd->ignored) {
2905             iommu_disable_translation(iommu);
2906             continue;
2907         }
2908 
2909         /*
2910          * Find the max PASID size of all IOMMUs in the system.
2911          * We need to ensure the system PASID table is no bigger
2912          * than the smallest supported size.
2913          */
2914         if (pasid_supported(iommu)) {
2915             u32 temp = 2 << ecap_pss(iommu->ecap);
2916 
2917             intel_pasid_max_id = min_t(u32, temp,
2918                            intel_pasid_max_id);
2919         }
2920 
2921         intel_iommu_init_qi(iommu);
2922 
2923         ret = iommu_init_domains(iommu);
2924         if (ret)
2925             goto free_iommu;
2926 
2927         init_translation_status(iommu);
2928 
2929         if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2930             iommu_disable_translation(iommu);
2931             clear_translation_pre_enabled(iommu);
2932             pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2933                 iommu->name);
2934         }
2935 
2936         /*
2937          * TBD:
2938          * we could share the same root & context tables
2939          * among all IOMMUs. This needs to be split out later.
2940          */
2941         ret = iommu_alloc_root_entry(iommu);
2942         if (ret)
2943             goto free_iommu;
2944 
2945         if (translation_pre_enabled(iommu)) {
2946             pr_info("Translation already enabled - trying to copy translation structures\n");
2947 
2948             ret = copy_translation_tables(iommu);
2949             if (ret) {
2950                 /*
2951                  * We found the IOMMU with translation
2952                  * enabled - but failed to copy over the
2953                  * old root-entry table. Try to proceed
2954                  * by disabling translation now and
2955                  * allocating a clean root-entry table.
2956                  * This might cause DMAR faults, but
2957                  * probably the dump will still succeed.
2958                  */
2959                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2960                        iommu->name);
2961                 iommu_disable_translation(iommu);
2962                 clear_translation_pre_enabled(iommu);
2963             } else {
2964                 pr_info("Copied translation tables from previous kernel for %s\n",
2965                     iommu->name);
2966             }
2967         }
2968 
2969         if (!ecap_pass_through(iommu->ecap))
2970             hw_pass_through = 0;
2971         intel_svm_check(iommu);
2972     }
2973 
2974     /*
2975      * Now that QI is enabled on all IOMMUs, set the root entry and flush
2976      * caches. This is required on some Intel X58 chipsets, otherwise the
2977      * flush_context function will loop forever and the boot hangs.
2978      */
2979     for_each_active_iommu(iommu, drhd) {
2980         iommu_flush_write_buffer(iommu);
2981 #ifdef CONFIG_INTEL_IOMMU_SVM
2982         register_pasid_allocator(iommu);
2983 #endif
2984         iommu_set_root_entry(iommu);
2985     }
2986 
2987 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2988     dmar_map_gfx = 0;
2989 #endif
2990 
2991     if (!dmar_map_gfx)
2992         iommu_identity_mapping |= IDENTMAP_GFX;
2993 
2994     check_tylersburg_isoch();
2995 
2996     ret = si_domain_init(hw_pass_through);
2997     if (ret)
2998         goto free_iommu;
2999 
3000     /*
3001      * for each drhd
3002      *   enable fault log
3003      *   global invalidate context cache
3004      *   global invalidate iotlb
3005      *   enable translation
3006      */
3007     for_each_iommu(iommu, drhd) {
3008         if (drhd->ignored) {
3009             /*
3010              * we always have to disable PMRs or DMA may fail on
3011              * this device
3012              */
3013             if (force_on)
3014                 iommu_disable_protect_mem_regions(iommu);
3015             continue;
3016         }
3017 
3018         iommu_flush_write_buffer(iommu);
3019 
3020 #ifdef CONFIG_INTEL_IOMMU_SVM
3021         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3022             /*
3023              * Calling dmar_alloc_hwirq() with dmar_global_lock held
3024              * could cause a lock race, so drop the lock around it.
3025              */
3026             up_write(&dmar_global_lock);
3027             ret = intel_svm_enable_prq(iommu);
3028             down_write(&dmar_global_lock);
3029             if (ret)
3030                 goto free_iommu;
3031         }
3032 #endif
3033         ret = dmar_set_interrupt(iommu);
3034         if (ret)
3035             goto free_iommu;
3036     }
3037 
3038     return 0;
3039 
3040 free_iommu:
3041     for_each_active_iommu(iommu, drhd) {
3042         disable_dmar_iommu(iommu);
3043         free_dmar_iommu(iommu);
3044     }
3045 
3046     return ret;
3047 }
3048 
3049 static void __init init_no_remapping_devices(void)
3050 {
3051     struct dmar_drhd_unit *drhd;
3052     struct device *dev;
3053     int i;
3054 
3055     for_each_drhd_unit(drhd) {
3056         if (!drhd->include_all) {
3057             for_each_active_dev_scope(drhd->devices,
3058                           drhd->devices_cnt, i, dev)
3059                 break;
3060             /* ignore DMAR unit if no devices exist */
3061             if (i == drhd->devices_cnt)
3062                 drhd->ignored = 1;
3063         }
3064     }
3065 
3066     for_each_active_drhd_unit(drhd) {
3067         if (drhd->include_all)
3068             continue;
3069 
3070         for_each_active_dev_scope(drhd->devices,
3071                       drhd->devices_cnt, i, dev)
3072             if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3073                 break;
3074         if (i < drhd->devices_cnt)
3075             continue;
3076 
3077         /* This IOMMU has *only* gfx devices. Either bypass it or
3078            set the gfx_dedicated flag, as appropriate */
3079         drhd->gfx_dedicated = 1;
3080         if (!dmar_map_gfx)
3081             drhd->ignored = 1;
3082     }
3083 }
3084 
3085 #ifdef CONFIG_SUSPEND
3086 static int init_iommu_hw(void)
3087 {
3088     struct dmar_drhd_unit *drhd;
3089     struct intel_iommu *iommu = NULL;
3090 
3091     for_each_active_iommu(iommu, drhd)
3092         if (iommu->qi)
3093             dmar_reenable_qi(iommu);
3094 
3095     for_each_iommu(iommu, drhd) {
3096         if (drhd->ignored) {
3097             /*
3098              * we always have to disable PMRs or DMA may fail on
3099              * this device
3100              */
3101             if (force_on)
3102                 iommu_disable_protect_mem_regions(iommu);
3103             continue;
3104         }
3105 
3106         iommu_flush_write_buffer(iommu);
3107         iommu_set_root_entry(iommu);
3108         iommu_enable_translation(iommu);
3109         iommu_disable_protect_mem_regions(iommu);
3110     }
3111 
3112     return 0;
3113 }
3114 
3115 static void iommu_flush_all(void)
3116 {
3117     struct dmar_drhd_unit *drhd;
3118     struct intel_iommu *iommu;
3119 
3120     for_each_active_iommu(iommu, drhd) {
3121         iommu->flush.flush_context(iommu, 0, 0, 0,
3122                        DMA_CCMD_GLOBAL_INVL);
3123         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3124                      DMA_TLB_GLOBAL_FLUSH);
3125     }
3126 }
3127 
3128 static int iommu_suspend(void)
3129 {
3130     struct dmar_drhd_unit *drhd;
3131     struct intel_iommu *iommu = NULL;
3132     unsigned long flag;
3133 
3134     for_each_active_iommu(iommu, drhd) {
3135         iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3136                          GFP_KERNEL);
3137         if (!iommu->iommu_state)
3138             goto nomem;
3139     }
3140 
3141     iommu_flush_all();
3142 
3143     for_each_active_iommu(iommu, drhd) {
3144         iommu_disable_translation(iommu);
3145 
3146         raw_spin_lock_irqsave(&iommu->register_lock, flag);
3147 
3148         iommu->iommu_state[SR_DMAR_FECTL_REG] =
3149             readl(iommu->reg + DMAR_FECTL_REG);
3150         iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3151             readl(iommu->reg + DMAR_FEDATA_REG);
3152         iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3153             readl(iommu->reg + DMAR_FEADDR_REG);
3154         iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3155             readl(iommu->reg + DMAR_FEUADDR_REG);
3156 
3157         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3158     }
3159     return 0;
3160 
3161 nomem:
3162     for_each_active_iommu(iommu, drhd)
3163         kfree(iommu->iommu_state);
3164 
3165     return -ENOMEM;
3166 }
3167 
3168 static void iommu_resume(void)
3169 {
3170     struct dmar_drhd_unit *drhd;
3171     struct intel_iommu *iommu = NULL;
3172     unsigned long flag;
3173 
3174     if (init_iommu_hw()) {
3175         if (force_on)
3176             panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3177         else
3178             WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3179         return;
3180     }
3181 
3182     for_each_active_iommu(iommu, drhd) {
3183 
3184         raw_spin_lock_irqsave(&iommu->register_lock, flag);
3185 
3186         writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3187             iommu->reg + DMAR_FECTL_REG);
3188         writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3189             iommu->reg + DMAR_FEDATA_REG);
3190         writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3191             iommu->reg + DMAR_FEADDR_REG);
3192         writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3193             iommu->reg + DMAR_FEUADDR_REG);
3194 
3195         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3196     }
3197 
3198     for_each_active_iommu(iommu, drhd)
3199         kfree(iommu->iommu_state);
3200 }
3201 
3202 static struct syscore_ops iommu_syscore_ops = {
3203     .resume     = iommu_resume,
3204     .suspend    = iommu_suspend,
3205 };
3206 
3207 static void __init init_iommu_pm_ops(void)
3208 {
3209     register_syscore_ops(&iommu_syscore_ops);
3210 }
3211 
3212 #else
3213 static inline void init_iommu_pm_ops(void) {}
3214 #endif  /* CONFIG_SUSPEND */
3215 
3216 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3217 {
3218     if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3219         !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3220         rmrr->end_address <= rmrr->base_address ||
3221         arch_rmrr_sanity_check(rmrr))
3222         return -EINVAL;
3223 
3224     return 0;
3225 }
3226 
3227 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3228 {
3229     struct acpi_dmar_reserved_memory *rmrr;
3230     struct dmar_rmrr_unit *rmrru;
3231 
3232     rmrr = (struct acpi_dmar_reserved_memory *)header;
3233     if (rmrr_sanity_check(rmrr)) {
3234         pr_warn(FW_BUG
3235                "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3236                "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3237                rmrr->base_address, rmrr->end_address,
3238                dmi_get_system_info(DMI_BIOS_VENDOR),
3239                dmi_get_system_info(DMI_BIOS_VERSION),
3240                dmi_get_system_info(DMI_PRODUCT_VERSION));
3241         add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3242     }
3243 
3244     rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3245     if (!rmrru)
3246         goto out;
3247 
3248     rmrru->hdr = header;
3249 
3250     rmrru->base_address = rmrr->base_address;
3251     rmrru->end_address = rmrr->end_address;
3252 
3253     rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3254                 ((void *)rmrr) + rmrr->header.length,
3255                 &rmrru->devices_cnt);
3256     if (rmrru->devices_cnt && rmrru->devices == NULL)
3257         goto free_rmrru;
3258 
3259     list_add(&rmrru->list, &dmar_rmrr_units);
3260 
3261     return 0;
3262 free_rmrru:
3263     kfree(rmrru);
3264 out:
3265     return -ENOMEM;
3266 }
3267 
3268 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3269 {
3270     struct dmar_atsr_unit *atsru;
3271     struct acpi_dmar_atsr *tmp;
3272 
3273     list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3274                 dmar_rcu_check()) {
3275         tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3276         if (atsr->segment != tmp->segment)
3277             continue;
3278         if (atsr->header.length != tmp->header.length)
3279             continue;
3280         if (memcmp(atsr, tmp, atsr->header.length) == 0)
3281             return atsru;
3282     }
3283 
3284     return NULL;
3285 }
3286 
3287 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3288 {
3289     struct acpi_dmar_atsr *atsr;
3290     struct dmar_atsr_unit *atsru;
3291 
3292     if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3293         return 0;
3294 
3295     atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3296     atsru = dmar_find_atsr(atsr);
3297     if (atsru)
3298         return 0;
3299 
3300     atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3301     if (!atsru)
3302         return -ENOMEM;
3303 
3304     /*
3305      * If memory is allocated from slab by ACPI _DSM method, we need to
3306      * copy the memory content because the memory buffer will be freed
3307      * on return.
3308      */
3309     atsru->hdr = (void *)(atsru + 1);
3310     memcpy(atsru->hdr, hdr, hdr->length);
3311     atsru->include_all = atsr->flags & 0x1;
3312     if (!atsru->include_all) {
3313         atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3314                 (void *)atsr + atsr->header.length,
3315                 &atsru->devices_cnt);
3316         if (atsru->devices_cnt && atsru->devices == NULL) {
3317             kfree(atsru);
3318             return -ENOMEM;
3319         }
3320     }
3321 
3322     list_add_rcu(&atsru->list, &dmar_atsr_units);
3323 
3324     return 0;
3325 }
3326 
3327 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3328 {
3329     dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3330     kfree(atsru);
3331 }
3332 
3333 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3334 {
3335     struct acpi_dmar_atsr *atsr;
3336     struct dmar_atsr_unit *atsru;
3337 
3338     atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3339     atsru = dmar_find_atsr(atsr);
3340     if (atsru) {
3341         list_del_rcu(&atsru->list);
3342         synchronize_rcu();
3343         intel_iommu_free_atsr(atsru);
3344     }
3345 
3346     return 0;
3347 }
3348 
3349 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3350 {
3351     int i;
3352     struct device *dev;
3353     struct acpi_dmar_atsr *atsr;
3354     struct dmar_atsr_unit *atsru;
3355 
3356     atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3357     atsru = dmar_find_atsr(atsr);
3358     if (!atsru)
3359         return 0;
3360 
3361     if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3362         for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3363                       i, dev)
3364             return -EBUSY;
3365     }
3366 
3367     return 0;
3368 }
3369 
3370 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3371 {
3372     struct dmar_satc_unit *satcu;
3373     struct acpi_dmar_satc *tmp;
3374 
3375     list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3376                 dmar_rcu_check()) {
3377         tmp = (struct acpi_dmar_satc *)satcu->hdr;
3378         if (satc->segment != tmp->segment)
3379             continue;
3380         if (satc->header.length != tmp->header.length)
3381             continue;
3382         if (memcmp(satc, tmp, satc->header.length) == 0)
3383             return satcu;
3384     }
3385 
3386     return NULL;
3387 }
3388 
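/*
 * Parse one SATC (SoC Integrated Address Translation Cache) structure,
 * copying it and its device scope into a dmar_satc_unit and adding it to
 * dmar_satc_units, unless an identical unit has already been registered.
 */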
3389 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3390 {
3391     struct acpi_dmar_satc *satc;
3392     struct dmar_satc_unit *satcu;
3393 
3394     if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3395         return 0;
3396 
3397     satc = container_of(hdr, struct acpi_dmar_satc, header);
3398     satcu = dmar_find_satc(satc);
3399     if (satcu)
3400         return 0;
3401 
3402     satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3403     if (!satcu)
3404         return -ENOMEM;
3405 
3406     satcu->hdr = (void *)(satcu + 1);
3407     memcpy(satcu->hdr, hdr, hdr->length);
3408     satcu->atc_required = satc->flags & 0x1;
3409     satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3410                           (void *)satc + satc->header.length,
3411                           &satcu->devices_cnt);
3412     if (satcu->devices_cnt && !satcu->devices) {
3413         kfree(satcu);
3414         return -ENOMEM;
3415     }
3416     list_add_rcu(&satcu->list, &dmar_satc_units);
3417 
3418     return 0;
3419 }
3420 
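/*
 * Bring up a hot-added DMAR unit: audit its capabilities, reject it if it
 * lacks pass-through or the super-page sizes already in use, then set up
 * domains, the root entry, the invalidation queue, the fault interrupt
 * and (where supported) the page request queue before enabling
 * translation. On failure the partially initialized IOMMU is torn down.
 */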
3421 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3422 {
3423     int sp, ret;
3424     struct intel_iommu *iommu = dmaru->iommu;
3425 
3426     ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3427     if (ret)
3428         goto out;
3429 
3430     if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3431         pr_warn("%s: Doesn't support hardware pass through.\n",
3432             iommu->name);
3433         return -ENXIO;
3434     }
3435 
3436     sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3437     if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3438         pr_warn("%s: Doesn't support large page.\n",
3439             iommu->name);
3440         return -ENXIO;
3441     }
3442 
3443     /*
3444      * Disable translation if already enabled prior to OS handover.
3445      */
3446     if (iommu->gcmd & DMA_GCMD_TE)
3447         iommu_disable_translation(iommu);
3448 
3449     ret = iommu_init_domains(iommu);
3450     if (ret == 0)
3451         ret = iommu_alloc_root_entry(iommu);
3452     if (ret)
3453         goto out;
3454 
3455     intel_svm_check(iommu);
3456 
3457     if (dmaru->ignored) {
3458         /*
3459          * We must always disable PMRs here, or DMA may fail on this device.
3460          */
3461         if (force_on)
3462             iommu_disable_protect_mem_regions(iommu);
3463         return 0;
3464     }
3465 
3466     intel_iommu_init_qi(iommu);
3467     iommu_flush_write_buffer(iommu);
3468 
3469 #ifdef CONFIG_INTEL_IOMMU_SVM
3470     if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3471         ret = intel_svm_enable_prq(iommu);
3472         if (ret)
3473             goto disable_iommu;
3474     }
3475 #endif
3476     ret = dmar_set_interrupt(iommu);
3477     if (ret)
3478         goto disable_iommu;
3479 
3480     iommu_set_root_entry(iommu);
3481     iommu_enable_translation(iommu);
3482 
3483     iommu_disable_protect_mem_regions(iommu);
3484     return 0;
3485 
3486 disable_iommu:
3487     disable_dmar_iommu(iommu);
3488 out:
3489     free_dmar_iommu(iommu);
3490     return ret;
3491 }
3492 
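/*
 * DMAR hotplug entry point: initialize and enable a newly inserted IOMMU,
 * or disable and free one that is being removed. Does nothing if the
 * Intel IOMMU driver is not enabled.
 */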
3493 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3494 {
3495     int ret = 0;
3496     struct intel_iommu *iommu = dmaru->iommu;
3497 
3498     if (!intel_iommu_enabled)
3499         return 0;
3500     if (iommu == NULL)
3501         return -EINVAL;
3502 
3503     if (insert) {
3504         ret = intel_iommu_add(dmaru);
3505     } else {
3506         disable_dmar_iommu(iommu);
3507         free_dmar_iommu(iommu);
3508     }
3509 
3510     return ret;
3511 }
3512 
3513 static void intel_iommu_free_dmars(void)
3514 {
3515     struct dmar_rmrr_unit *rmrru, *rmrr_n;
3516     struct dmar_atsr_unit *atsru, *atsr_n;
3517     struct dmar_satc_unit *satcu, *satc_n;
3518 
3519     list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3520         list_del(&rmrru->list);
3521         dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3522         kfree(rmrru);
3523     }
3524 
3525     list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3526         list_del(&atsru->list);
3527         intel_iommu_free_atsr(atsru);
3528     }
3529     list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3530         list_del(&satcu->list);
3531         dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3532         kfree(satcu);
3533     }
3534 }
3535 
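/*
 * Find the SATC unit whose device scope lists the given PCI device (VFs
 * are matched through their PF). Returns NULL if the device does not
 * appear in any SATC table.
 */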
3536 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3537 {
3538     struct dmar_satc_unit *satcu;
3539     struct acpi_dmar_satc *satc;
3540     struct device *tmp;
3541     int i;
3542 
3543     dev = pci_physfn(dev);
3544     rcu_read_lock();
3545 
3546     list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3547         satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3548         if (satc->segment != pci_domain_nr(dev->bus))
3549             continue;
3550         for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3551             if (to_pci_dev(tmp) == dev)
3552                 goto out;
3553     }
3554     satcu = NULL;
3555 out:
3556     rcu_read_unlock();
3557     return satcu;
3558 }
3559 
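/*
 * Decide whether ATS may be enabled for @dev under @iommu: either the
 * device is listed in a SATC table, or the root port above it is covered
 * by an ATSR entry (or an "include all" ATSR) for the device's segment.
 */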
3560 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3561 {
3562     int i, ret = 1;
3563     struct pci_bus *bus;
3564     struct pci_dev *bridge = NULL;
3565     struct device *tmp;
3566     struct acpi_dmar_atsr *atsr;
3567     struct dmar_atsr_unit *atsru;
3568     struct dmar_satc_unit *satcu;
3569 
3570     dev = pci_physfn(dev);
3571     satcu = dmar_find_matched_satc_unit(dev);
3572     if (satcu)
3573         /*
3574          * This device is listed in the SATC table, so it supports ATS.
3575          * When the IOMMU operates in legacy mode, the hardware enables
3576          * ATS automatically for devices that require it, so the OS must
3577          * not enable ATS on the device as well, to avoid duplicated
3578          * TLB invalidations.
3579          */
3580         return !(satcu->atc_required && !sm_supported(iommu));
3581 
3582     for (bus = dev->bus; bus; bus = bus->parent) {
3583         bridge = bus->self;
3584         /* If it's an integrated device, allow ATS */
3585         if (!bridge)
3586             return 1;
3587         /* Connected via non-PCIe: no ATS */
3588         if (!pci_is_pcie(bridge) ||
3589             pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3590             return 0;
3591         /* If we found the root port, look it up in the ATSR */
3592         if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3593             break;
3594     }
3595 
3596     rcu_read_lock();
3597     list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3598         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3599         if (atsr->segment != pci_domain_nr(dev->bus))
3600             continue;
3601 
3602         for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3603             if (tmp == &bridge->dev)
3604                 goto out;
3605 
3606         if (atsru->include_all)
3607             goto out;
3608     }
3609     ret = 0;
3610 out:
3611     rcu_read_unlock();
3612 
3613     return ret;
3614 }
3615 
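/*
 * Called from the DMAR PCI bus notifier when a device is added to or
 * removed from the bus: update the device lists of every RMRR, ATSR and
 * SATC unit whose scope matches the device.
 */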
3616 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3617 {
3618     int ret;
3619     struct dmar_rmrr_unit *rmrru;
3620     struct dmar_atsr_unit *atsru;
3621     struct dmar_satc_unit *satcu;
3622     struct acpi_dmar_atsr *atsr;
3623     struct acpi_dmar_reserved_memory *rmrr;
3624     struct acpi_dmar_satc *satc;
3625 
3626     if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3627         return 0;
3628 
3629     list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3630         rmrr = container_of(rmrru->hdr,
3631                     struct acpi_dmar_reserved_memory, header);
3632         if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3633             ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3634                 ((void *)rmrr) + rmrr->header.length,
3635                 rmrr->segment, rmrru->devices,
3636                 rmrru->devices_cnt);
3637             if (ret < 0)
3638                 return ret;
3639         } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3640             dmar_remove_dev_scope(info, rmrr->segment,
3641                 rmrru->devices, rmrru->devices_cnt);
3642         }
3643     }
3644 
3645     list_for_each_entry(atsru, &dmar_atsr_units, list) {
3646         if (atsru->include_all)
3647             continue;
3648 
3649         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3650         if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3651             ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3652                     (void *)atsr + atsr->header.length,
3653                     atsr->segment, atsru->devices,
3654                     atsru->devices_cnt);
3655             if (ret > 0)
3656                 break;
3657             else if (ret < 0)
3658                 return ret;
3659         } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3660             if (dmar_remove_dev_scope(info, atsr->segment,
3661                     atsru->devices, atsru->devices_cnt))
3662                 break;
3663         }
3664     }
3665     list_for_each_entry(satcu, &dmar_satc_units, list) {
3666         satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3667         if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3668             ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3669                     (void *)satc + satc->header.length,
3670                     satc->segment, satcu->devices,
3671                     satcu->devices_cnt);
3672             if (ret > 0)
3673                 break;
3674             else if (ret < 0)
3675                 return ret;
3676         } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3677             if (dmar_remove_dev_scope(info, satc->segment,
3678                     satcu->devices, satcu->devices_cnt))
3679                 break;
3680         }
3681     }
3682 
3683     return 0;
3684 }
3685 
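/*
 * Memory hotplug notifier: extend the static identity map (si_domain)
 * when memory goes online, and unmap and flush the corresponding range
 * when memory is taken offline or onlining is cancelled.
 */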
3686 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3687                        unsigned long val, void *v)
3688 {
3689     struct memory_notify *mhp = v;
3690     unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3691     unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3692             mhp->nr_pages - 1);
3693 
3694     switch (val) {
3695     case MEM_GOING_ONLINE:
3696         if (iommu_domain_identity_map(si_domain,
3697                           start_vpfn, last_vpfn)) {
3698             pr_warn("Failed to build identity map for [%lx-%lx]\n",
3699                 start_vpfn, last_vpfn);
3700             return NOTIFY_BAD;
3701         }
3702         break;
3703 
3704     case MEM_OFFLINE:
3705     case MEM_CANCEL_ONLINE:
3706         {
3707             struct dmar_drhd_unit *drhd;
3708             struct intel_iommu *iommu;
3709             LIST_HEAD(freelist);
3710 
3711             domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3712 
3713             rcu_read_lock();
3714             for_each_active_iommu(iommu, drhd)
3715                 iommu_flush_iotlb_psi(iommu, si_domain,
3716                     start_vpfn, mhp->nr_pages,
3717                     list_empty(&freelist), 0);
3718             rcu_read_unlock();
3719             put_pages_list(&freelist);
3720         }
3721         break;
3722     }
3723 
3724     return NOTIFY_OK;
3725 }
3726 
3727 static struct notifier_block intel_iommu_memory_nb = {
3728     .notifier_call = intel_iommu_memory_notifier,
3729     .priority = 0
3730 };
3731 
3732 static void intel_disable_iommus(void)
3733 {
3734     struct intel_iommu *iommu = NULL;
3735     struct dmar_drhd_unit *drhd;
3736 
3737     for_each_iommu(iommu, drhd)
3738         iommu_disable_translation(iommu);
3739 }
3740 
3741 void intel_iommu_shutdown(void)
3742 {
3743     struct dmar_drhd_unit *drhd;
3744     struct intel_iommu *iommu = NULL;
3745 
3746     if (no_iommu || dmar_disabled)
3747         return;
3748 
3749     down_write(&dmar_global_lock);
3750 
3751     /* Disable PMRs explicitly here. */
3752     for_each_iommu(iommu, drhd)
3753         iommu_disable_protect_mem_regions(iommu);
3754 
3755     /* Make sure the IOMMUs are switched off */
3756     intel_disable_iommus();
3757 
3758     up_write(&dmar_global_lock);
3759 }
3760 
3761 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3762 {
3763     struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3764 
3765     return container_of(iommu_dev, struct intel_iommu, iommu);
3766 }
3767 
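/*
 * Read-only sysfs attributes exported for each DMAR unit through
 * intel_iommu_groups, registered from intel_iommu_init() below. As an
 * illustrative example (assuming a unit named "dmar0" and the usual
 * sysfs layout for IOMMU devices), reading
 *   /sys/class/iommu/dmar0/intel-iommu/version
 * would print the architecture version as "major:minor".
 */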
3768 static ssize_t version_show(struct device *dev,
3769                 struct device_attribute *attr, char *buf)
3770 {
3771     struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3772     u32 ver = readl(iommu->reg + DMAR_VER_REG);
3773     return sprintf(buf, "%d:%d\n",
3774                DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3775 }
3776 static DEVICE_ATTR_RO(version);
3777 
3778 static ssize_t address_show(struct device *dev,
3779                 struct device_attribute *attr, char *buf)
3780 {
3781     struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3782     return sprintf(buf, "%llx\n", iommu->reg_phys);
3783 }
3784 static DEVICE_ATTR_RO(address);
3785 
3786 static ssize_t cap_show(struct device *dev,
3787             struct device_attribute *attr, char *buf)
3788 {
3789     struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3790     return sprintf(buf, "%llx\n", iommu->cap);
3791 }
3792 static DEVICE_ATTR_RO(cap);
3793 
3794 static ssize_t ecap_show(struct device *dev,
3795              struct device_attribute *attr, char *buf)
3796 {
3797     struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3798     return sprintf(buf, "%llx\n", iommu->ecap);
3799 }
3800 static DEVICE_ATTR_RO(ecap);
3801 
3802 static ssize_t domains_supported_show(struct device *dev,
3803                       struct device_attribute *attr, char *buf)
3804 {
3805     struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3806     return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3807 }
3808 static DEVICE_ATTR_RO(domains_supported);
3809 
3810 static ssize_t domains_used_show(struct device *dev,
3811                  struct device_attribute *attr, char *buf)
3812 {
3813     struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3814     return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3815                           cap_ndoms(iommu->cap)));
3816 }
3817 static DEVICE_ATTR_RO(domains_used);
3818 
3819 static struct attribute *intel_iommu_attrs[] = {
3820     &dev_attr_version.attr,
3821     &dev_attr_address.attr,
3822     &dev_attr_cap.attr,
3823     &dev_attr_ecap.attr,
3824     &dev_attr_domains_supported.attr,
3825     &dev_attr_domains_used.attr,
3826     NULL,
3827 };
3828 
3829 static struct attribute_group intel_iommu_group = {
3830     .name = "intel-iommu",
3831     .attrs = intel_iommu_attrs,
3832 };
3833 
3834 const struct attribute_group *intel_iommu_groups[] = {
3835     &intel_iommu_group,
3836     NULL,
3837 };
3838 
3839 static inline bool has_external_pci(void)
3840 {
3841     struct pci_dev *pdev = NULL;
3842 
3843     for_each_pci_dev(pdev)
3844         if (pdev->external_facing)
3845             return true;
3846 
3847     return false;
3848 }
3849 
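/*
 * If the platform has opted in to DMA protection via the DMAR opt-in flag
 * and at least one external-facing PCI port is present, force the IOMMU
 * on even if it was disabled on the command line, defaulting to
 * pass-through domains when translation had been disabled. Returns 1 if
 * the IOMMU was force-enabled, 0 otherwise.
 */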
3850 static int __init platform_optin_force_iommu(void)
3851 {
3852     if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3853         return 0;
3854 
3855     if (no_iommu || dmar_disabled)
3856         pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3857 
3858     /*
3859      * If Intel-IOMMU is disabled by default, we will apply identity
3860      * map for all devices except those marked as being untrusted.
3861      */
3862     if (dmar_disabled)
3863         iommu_set_default_passthrough(false);
3864 
3865     dmar_disabled = 0;
3866     no_iommu = 0;
3867 
3868     return 1;
3869 }
3870 
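/*
 * Probe ACPI namespace devices listed in the DRHD device scopes: for each
 * physical node of such an ACPI device that is not already in an IOMMU
 * group, install intel_iommu_ops on its bus and probe it.
 */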
3871 static int __init probe_acpi_namespace_devices(void)
3872 {
3873     struct dmar_drhd_unit *drhd;
3874     /* To avoid a -Wunused-but-set-variable warning. */
3875     struct intel_iommu *iommu __maybe_unused;
3876     struct device *dev;
3877     int i, ret = 0;
3878 
3879     for_each_active_iommu(iommu, drhd) {
3880         for_each_active_dev_scope(drhd->devices,
3881                       drhd->devices_cnt, i, dev) {
3882             struct acpi_device_physical_node *pn;
3883             struct iommu_group *group;
3884             struct acpi_device *adev;
3885 
3886             if (dev->bus != &acpi_bus_type)
3887                 continue;
3888 
3889             adev = to_acpi_device(dev);
3890             mutex_lock(&adev->physical_node_lock);
3891             list_for_each_entry(pn,
3892                         &adev->physical_node_list, node) {
3893                 group = iommu_group_get(pn->dev);
3894                 if (group) {
3895                     iommu_group_put(group);
3896                     continue;
3897                 }
3898 
3899                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
3900                 ret = iommu_probe_device(pn->dev);
3901                 if (ret)
3902                     break;
3903             }
3904             mutex_unlock(&adev->physical_node_lock);
3905 
3906             if (ret)
3907                 return ret;
3908         }
3909     }
3910 
3911     return 0;
3912 }
3913 
3914 static __init int tboot_force_iommu(void)
3915 {
3916     if (!tboot_enabled())
3917         return 0;
3918 
3919     if (no_iommu || dmar_disabled)
3920         pr_warn("Forcing Intel-IOMMU to enabled\n");
3921 
3922     dmar_disabled = 0;
3923     no_iommu = 0;
3924 
3925     return 1;
3926 }
3927 
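/*
 * Main boot-time entry point: parse the DMAR table and device scopes,
 * bail out (after disabling any pre-enabled translation and PMRs) when
 * the IOMMU is disabled, otherwise initialize the DMAR units via
 * init_dmars(), register sysfs entries, the IOMMU core ops, the memory
 * hotplug notifier and ACPI namespace devices, and finally enable
 * translation on every unit.
 */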
3928 int __init intel_iommu_init(void)
3929 {
3930     int ret = -ENODEV;
3931     struct dmar_drhd_unit *drhd;
3932     struct intel_iommu *iommu;
3933 
3934     /*
3935      * Intel IOMMU is required for a TXT/tboot launch or platform
3936      * opt in, so enforce that.
3937      */
3938     force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3939             platform_optin_force_iommu();
3940 
3941     down_write(&dmar_global_lock);
3942     if (dmar_table_init()) {
3943         if (force_on)
3944             panic("tboot: Failed to initialize DMAR table\n");
3945         goto out_free_dmar;
3946     }
3947 
3948     if (dmar_dev_scope_init() < 0) {
3949         if (force_on)
3950             panic("tboot: Failed to initialize DMAR device scope\n");
3951         goto out_free_dmar;
3952     }
3953 
3954     up_write(&dmar_global_lock);
3955 
3956     /*
3957      * The bus notifier takes the dmar_global_lock, so lockdep will
3958      * complain later when we register it under the lock.
3959      */
3960     dmar_register_bus_notifier();
3961 
3962     down_write(&dmar_global_lock);
3963 
3964     if (!no_iommu)
3965         intel_iommu_debugfs_init();
3966 
3967     if (no_iommu || dmar_disabled) {
3968         /*
3969          * We return here so that the IOMMU's remapping and mempool
3970          * are never set up, which also means the IOMMU's PMRs won't
3971          * be disabled via the call to init_dmars(). So disable them
3972          * explicitly here. The PMRs were set up by tboot prior to
3973          * calling SENTER, but the kernel is expected to reset/tear
3974          * them down.
3975          */
3976         if (intel_iommu_tboot_noforce) {
3977             for_each_iommu(iommu, drhd)
3978                 iommu_disable_protect_mem_regions(iommu);
3979         }
3980 
3981         /*
3982          * Make sure the IOMMUs are switched off, even when we
3983          * boot into a kexec kernel and the previous kernel left
3984          * them enabled
3985          */
3986         intel_disable_iommus();
3987         goto out_free_dmar;
3988     }
3989 
3990     if (list_empty(&dmar_rmrr_units))
3991         pr_info("No RMRR found\n");
3992 
3993     if (list_empty(&dmar_atsr_units))
3994         pr_info("No ATSR found\n");
3995 
3996     if (list_empty(&dmar_satc_units))
3997         pr_info("No SATC found\n");
3998 
3999     init_no_remapping_devices();
4000 
4001     ret = init_dmars();
4002     if (ret) {
4003         if (force_on)
4004             panic("tboot: Failed to initialize DMARs\n");
4005         pr_err("Initialization failed\n");
4006         goto out_free_dmar;
4007     }
4008     up_write(&dmar_global_lock);
4009 
4010     init_iommu_pm_ops();
4011 
4012     down_read(&dmar_global_lock);
4013     for_each_active_iommu(iommu, drhd) {
4014         /*
4015          * The flush queue implementation does not perform
4016          * page-selective invalidations that are required for efficient
4017          * TLB flushes in virtual environments.  The benefit of batching
4018          * is likely to be much lower than the overhead of synchronizing
4019          * the virtual and physical IOMMU page-tables.
4020          */
4021         if (cap_caching_mode(iommu->cap)) {
4022             pr_info_once("IOMMU batching disallowed due to virtualization\n");
4023             iommu_set_dma_strict();
4024         }
4025         iommu_device_sysfs_add(&iommu->iommu, NULL,
4026                        intel_iommu_groups,
4027                        "%s", iommu->name);
4028         iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4029     }
4030     up_read(&dmar_global_lock);
4031 
4032     bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4033     if (si_domain && !hw_pass_through)
4034         register_memory_notifier(&intel_iommu_memory_nb);
4035 
4036     down_read(&dmar_global_lock);
4037     if (probe_acpi_namespace_devices())
4038         pr_warn("ACPI name space devices didn't probe correctly\n");
4039 
4040     /* Finally, we enable the DMA remapping hardware. */
4041     for_each_iommu(iommu, drhd) {
4042         if (!drhd->ignored && !translation_pre_enabled(iommu))
4043             iommu_enable_translation(iommu);
4044 
4045         iommu_disable_protect_mem_regions(iommu);
4046     }
4047     up_read(&dmar_global_lock);
4048 
4049     pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4050 
4051     intel_iommu_enabled = 1;
4052 
4053     return 0;
4054 
4055 out_free_dmar:
4056     intel_iommu_free_dmars();
4057     up_write(&dmar_global_lock);
4058     return ret;
4059 }
4060 
4061 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4062 {
4063     struct device_domain_info *info = opaque;
4064 
4065     domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4066     return 0;
4067 }
4068 
4069 /*
4070  * NB - intel-iommu lacks any sort of reference counting for the users of
4071  * dependent devices.  If multiple endpoints have intersecting dependent
4072  * devices, unbinding the driver from any one of them will possibly leave
4073  * the others unable to operate.
4074  */
4075 static void domain_context_clear(struct device_domain_info *info)
4076 {
4077     if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4078         return;
4079 
4080     pci_for_each_dma_alias(to_pci_dev(info->dev),
4081                    &domain_context_clear_one_cb, info);
4082 }
4083 
4084 static void dmar_remove_one_dev_info(struct device *dev)
4085 {
4086     struct device_domain_info *info = dev_iommu_priv_get(dev);
4087     struct dmar_domain *domain = info->domain;
4088     struct intel_iommu *iommu = info->iommu;
4089     unsigned long flags;
4090 
4091     if (!dev_is_real_dma_subdevice(info->dev)) {
4092         if (dev_is_pci(info->dev) && sm_supported(iommu))
4093             intel_pasid_tear_down_entry(iommu, info->dev,
4094                     PASID_RID2PASID, false);
4095 
4096         iommu_disable_dev_iotlb(info);
4097         domain_context_clear(info);
4098         intel_pasid_free_table(info->dev);
4099     }
4100 
4101     spin_lock_irqsave(&domain->lock, flags);
4102     list_del(&info->link);
4103     spin_unlock_irqrestore(&domain->lock, flags);
4104 
4105     domain_detach_iommu(domain, iommu);
4106     info->domain = NULL;
4107 }
4108 
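/*
 * Initialize an externally managed domain: derive the adjusted address
 * width and AGAW from @guest_width and allocate the top-level page
 * directory.
 */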
4109 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4110 {
4111     int adjust_width;
4112 
4113     /* calculate AGAW */
4114     domain->gaw = guest_width;
4115     adjust_width = guestwidth_to_adjustwidth(guest_width);
4116     domain->agaw = width_to_agaw(adjust_width);
4117 
4118     domain->iommu_coherency = false;
4119     domain->iommu_superpage = 0;
4120     domain->max_addr = 0;
4121 
4122     /* always allocate the top pgd */
4123     domain->pgd = alloc_pgtable_page(domain->nid);
4124     if (!domain->pgd)
4125         return -ENOMEM;
4126     domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4127     return 0;
4128 }
4129 
4130 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4131 {
4132     struct dmar_domain *dmar_domain;
4133     struct iommu_domain *domain;
4134 
4135     switch (type) {
4136     case IOMMU_DOMAIN_DMA:
4137     case IOMMU_DOMAIN_DMA_FQ:
4138     case IOMMU_DOMAIN_UNMANAGED:
4139         dmar_domain = alloc_domain(type);
4140         if (!dmar_domain) {
4141             pr_err("Can't allocate dmar_domain\n");
4142             return NULL;
4143         }
4144         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4145             pr_err("Domain initialization failed\n");
4146             domain_exit(dmar_domain);
4147             return NULL;
4148         }
4149 
4150         domain = &dmar_domain->domain;
4151         domain->geometry.aperture_start = 0;
4152         domain->geometry.aperture_end   =
4153                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4154         domain->geometry.force_aperture = true;
4155 
4156         return domain;
4157     case IOMMU_DOMAIN_IDENTITY:
4158         return &si_domain->domain;
4159     default:
4160         return NULL;
4161     }
4162 
4163     return NULL;
4164 }
4165 
4166 static void intel_iommu_domain_free(struct iommu_domain *domain)
4167 {
4168     if (domain != &si_domain->domain)
4169         domain_exit(to_dmar_domain(domain));
4170 }
4171 
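/*
 * Before attaching @dev, verify that the domain's address width does not
 * exceed what the device's IOMMU can handle, and trim extra page-table
 * levels if the IOMMU supports fewer levels than the domain currently
 * uses.
 */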
4172 static int prepare_domain_attach_device(struct iommu_domain *domain,
4173                     struct device *dev)
4174 {
4175     struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4176     struct intel_iommu *iommu;
4177     int addr_width;
4178 
4179     iommu = device_to_iommu(dev, NULL, NULL);
4180     if (!iommu)
4181         return -ENODEV;
4182 
4183     if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4184         return -EOPNOTSUPP;
4185 
4186     /* check if this iommu agaw is sufficient for max mapped address */
4187     addr_width = agaw_to_width(iommu->agaw);
4188     if (addr_width > cap_mgaw(iommu->cap))
4189         addr_width = cap_mgaw(iommu->cap);
4190 
4191     if (dmar_domain->max_addr > (1LL << addr_width)) {
4192         dev_err(dev, "%s: iommu width (%d) is not "
4193                 "sufficient for the mapped address (%llx)\n",
4194                 __func__, addr_width, dmar_domain->max_addr);
4195         return -EFAULT;
4196     }
4197     dmar_domain->gaw = addr_width;
4198 
4199     /*
4200      * Knock out extra levels of page tables if necessary
4201      */
4202     while (iommu->agaw < dmar_domain->agaw) {
4203         struct dma_pte *pte;
4204 
4205         pte = dmar_domain->pgd;
4206         if (dma_pte_present(pte)) {
4207             dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4208             free_pgtable_page(pte);
4209         }
4210         dmar_domain->agaw--;
4211     }
4212 
4213     return 0;
4214 }
4215 
4216 static int intel_iommu_attach_device(struct iommu_domain *domain,
4217                      struct device *dev)
4218 {
4219     int ret;
4220 
4221     if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4222         device_is_rmrr_locked(dev)) {
4223         dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4224         return -EPERM;
4225     }
4226 
4227     /* normally dev is not mapped */
4228     if (unlikely(domain_context_mapped(dev))) {
4229         struct device_domain_info *info = dev_iommu_priv_get(dev);
4230 
4231         if (info->domain)
4232             dmar_remove_one_dev_info(dev);
4233     }
4234 
4235     ret = prepare_domain_attach_device(domain, dev);
4236     if (ret)
4237         return ret;
4238 
4239     return domain_add_dev_info(to_dmar_domain(domain), dev);
4240 }
4241 
4242 static void intel_iommu_detach_device(struct iommu_domain *domain,
4243                       struct device *dev)
4244 {
4245     dmar_remove_one_dev_info(dev);
4246 }
4247 
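/*
 * Map @size bytes starting at @hpa to the IOVA @iova with the requested
 * protection, tracking the domain's maximum mapped address and rejecting
 * mappings that would exceed the domain's address width.
 */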
4248 static int intel_iommu_map(struct iommu_domain *domain,
4249                unsigned long iova, phys_addr_t hpa,
4250                size_t size, int iommu_prot, gfp_t gfp)
4251 {
4252     struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4253     u64 max_addr;
4254     int prot = 0;
4255 
4256     if (iommu_prot & IOMMU_READ)
4257         prot |= DMA_PTE_READ;
4258     if (iommu_prot & IOMMU_WRITE)
4259         prot |= DMA_PTE_WRITE;
4260     if (dmar_domain->set_pte_snp)
4261         prot |= DMA_PTE_SNP;
4262 
4263     max_addr = iova + size;
4264     if (dmar_domain->max_addr < max_addr) {
4265         u64 end;
4266 
4267         /* check if minimum agaw is sufficient for mapped address */
4268         end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4269         if (end < max_addr) {
4270             pr_err("%s: iommu width (%d) is not "
4271                    "sufficient for the mapped address (%llx)\n",
4272                    __func__, dmar_domain->gaw, max_addr);
4273             return -EFAULT;
4274         }
4275         dmar_domain->max_addr = max_addr;
4276     }
4277     /* Round size up to the next multiple of PAGE_SIZE if it, together
4278        with the low bits of hpa, would take us onto the next page. */
4279     size = aligned_nrpages(hpa, size);
4280     return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4281                 hpa >> VTD_PAGE_SHIFT, size, prot);
4282 }
4283 
4284 static int intel_iommu_map_pages(struct iommu_domain *domain,
4285                  unsigned long iova, phys_addr_t paddr,
4286                  size_t pgsize, size_t pgcount,
4287                  int prot, gfp_t gfp, size_t *mapped)
4288 {
4289     unsigned long pgshift = __ffs(pgsize);
4290     size_t size = pgcount << pgshift;
4291     int ret;
4292 
4293     if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4294         return -EINVAL;
4295 
4296     if (!IS_ALIGNED(iova | paddr, pgsize))
4297         return -EINVAL;
4298 
4299     ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4300     if (!ret && mapped)
4301         *mapped = size;
4302 
4303     return ret;
4304 }
4305 
4306 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4307                 unsigned long iova, size_t size,
4308                 struct iommu_iotlb_gather *gather)
4309 {
4310     struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4311     unsigned long start_pfn, last_pfn;
4312     int level = 0;
4313 
4314     /* Cope with horrid API which requires us to unmap more than the
4315        size argument if it happens to be a large-page mapping. */
4316     BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4317 
4318     if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4319         size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4320 
4321     start_pfn = iova >> VTD_PAGE_SHIFT;
4322     last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4323 
4324     domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4325 
4326     if (dmar_domain->max_addr == iova + size)
4327         dmar_domain->max_addr = iova;
4328 
4329     iommu_iotlb_gather_add_page(domain, gather, iova, size);
4330 
4331     return size;
4332 }
4333 
4334 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4335                       unsigned long iova,
4336                       size_t pgsize, size_t pgcount,
4337                       struct iommu_iotlb_gather *gather)
4338 {
4339     unsigned long pgshift = __ffs(pgsize);
4340     size_t size = pgcount << pgshift;
4341 
4342     return intel_iommu_unmap(domain, iova, size, gather);
4343 }
4344 
4345 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4346                  struct iommu_iotlb_gather *gather)
4347 {
4348     struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4349     unsigned long iova_pfn = IOVA_PFN(gather->start);
4350     size_t size = gather->end - gather->start;
4351     struct iommu_domain_info *info;
4352     unsigned long start_pfn;
4353     unsigned long nrpages;
4354     unsigned long i;
4355 
4356     nrpages = aligned_nrpages(gather->start, size);
4357     start_pfn = mm_to_dma_pfn(iova_pfn);
4358 
4359     xa_for_each(&dmar_domain->iommu_array, i, info)
4360         iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4361                       start_pfn, nrpages,
4362                       list_empty(&gather->freelist), 0);
4363 
4364     put_pages_list(&gather->freelist);
4365 }
4366 
4367 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4368                         dma_addr_t iova)
4369 {
4370     struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4371     struct dma_pte *pte;
4372     int level = 0;
4373     u64 phys = 0;
4374 
4375     pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4376     if (pte && dma_pte_present(pte))
4377         phys = dma_pte_addr(pte) +
4378             (iova & (BIT_MASK(level_to_offset_bits(level) +
4379                         VTD_PAGE_SHIFT) - 1));
4380 
4381     return phys;
4382 }
4383 
4384 static bool domain_support_force_snooping(struct dmar_domain *domain)
4385 {
4386     struct device_domain_info *info;
4387     bool support = true;
4388 
4389     assert_spin_locked(&domain->lock);
4390     list_for_each_entry(info, &domain->devices, link) {
4391         if (!ecap_sc_support(info->iommu->ecap)) {
4392             support = false;
4393             break;
4394         }
4395     }
4396 
4397     return support;
4398 }
4399 
4400 static void domain_set_force_snooping(struct dmar_domain *domain)
4401 {
4402     struct device_domain_info *info;
4403 
4404     assert_spin_locked(&domain->lock);
4405     /*
4406      * Second-level page tables support per-PTE snoop control; the
4407      * iommu_map() interface handles this by setting the SNP bit.
4408      */
4409     if (!domain_use_first_level(domain)) {
4410         domain->set_pte_snp = true;
4411         return;
4412     }
4413 
4414     list_for_each_entry(info, &domain->devices, link)
4415         intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4416                              PASID_RID2PASID);
4417 }
4418 
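/*
 * Enforce DMA cache coherency (snooping) for the whole domain. This is
 * only possible if every attached IOMMU supports snoop control: for
 * second-level page tables the SNP bit is set in newly created PTEs, for
 * first-level page tables the PASID entries of attached devices are
 * updated.
 */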
4419 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4420 {
4421     struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4422     unsigned long flags;
4423 
4424     if (dmar_domain->force_snooping)
4425         return true;
4426 
4427     spin_lock_irqsave(&dmar_domain->lock, flags);
4428     if (!domain_support_force_snooping(dmar_domain)) {
4429         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4430         return false;
4431     }
4432 
4433     domain_set_force_snooping(dmar_domain);
4434     dmar_domain->force_snooping = true;
4435     spin_unlock_irqrestore(&dmar_domain->lock, flags);
4436 
4437     return true;
4438 }
4439 
4440 static bool intel_iommu_capable(enum iommu_cap cap)
4441 {
4442     if (cap == IOMMU_CAP_CACHE_COHERENCY)
4443         return true;
4444     if (cap == IOMMU_CAP_INTR_REMAP)
4445         return irq_remapping_enabled == 1;
4446     if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4447         return dmar_platform_optin();
4448 
4449     return false;
4450 }
4451 
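/*
 * IOMMU core probe hook: allocate the per-device device_domain_info,
 * record the device's bus/devfn/segment and which optional features
 * (ATS, PASID, PRI) both the device and its IOMMU support, and return
 * the iommu_device to register the device against.
 */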
4452 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4453 {
4454     struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4455     struct device_domain_info *info;
4456     struct intel_iommu *iommu;
4457     u8 bus, devfn;
4458 
4459     iommu = device_to_iommu(dev, &bus, &devfn);
4460     if (!iommu)
4461         return ERR_PTR(-ENODEV);
4462 
4463     info = kzalloc(sizeof(*info), GFP_KERNEL);
4464     if (!info)
4465         return ERR_PTR(-ENOMEM);
4466 
4467     if (dev_is_real_dma_subdevice(dev)) {
4468         info->bus = pdev->bus->number;
4469         info->devfn = pdev->devfn;
4470         info->segment = pci_domain_nr(pdev->bus);
4471     } else {
4472         info->bus = bus;
4473         info->devfn = devfn;
4474         info->segment = iommu->segment;
4475     }
4476 
4477     info->dev = dev;
4478     info->iommu = iommu;
4479     if (dev_is_pci(dev)) {
4480         if (ecap_dev_iotlb_support(iommu->ecap) &&
4481             pci_ats_supported(pdev) &&
4482             dmar_ats_supported(pdev, iommu))
4483             info->ats_supported = 1;
4484 
4485         if (sm_supported(iommu)) {
4486             if (pasid_supported(iommu)) {
4487                 int features = pci_pasid_features(pdev);
4488 
4489                 if (features >= 0)
4490                     info->pasid_supported = features | 1;
4491             }
4492 
4493             if (info->ats_supported && ecap_prs(iommu->ecap) &&
4494                 pci_pri_supported(pdev))
4495                 info->pri_supported = 1;
4496         }
4497     }
4498 
4499     dev_iommu_priv_set(dev, info);
4500 
4501     return &iommu->iommu;
4502 }
4503 
4504 static void intel_iommu_release_device(struct device *dev)
4505 {
4506     struct device_domain_info *info = dev_iommu_priv_get(dev);
4507 
4508     dmar_remove_one_dev_info(dev);
4509     dev_iommu_priv_set(dev, NULL);
4510     kfree(info);
4511     set_dma_ops(dev, NULL);
4512 }
4513 
4514 static void intel_iommu_probe_finalize(struct device *dev)
4515 {
4516     set_dma_ops(dev, NULL);
4517     iommu_setup_dma_ops(dev, 0, U64_MAX);
4518 }
4519 
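/*
 * Report reserved regions for @device: every RMRR whose scope covers the
 * device (relaxable where permitted), the low 16M for ISA bridges when
 * CONFIG_INTEL_IOMMU_FLOPPY_WA is enabled, and the IOAPIC MSI window.
 */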
4520 static void intel_iommu_get_resv_regions(struct device *device,
4521                      struct list_head *head)
4522 {
4523     int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4524     struct iommu_resv_region *reg;
4525     struct dmar_rmrr_unit *rmrr;
4526     struct device *i_dev;
4527     int i;
4528 
4529     down_read(&dmar_global_lock);
4530     for_each_rmrr_units(rmrr) {
4531         for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4532                       i, i_dev) {
4533             struct iommu_resv_region *resv;
4534             enum iommu_resv_type type;
4535             size_t length;
4536 
4537             if (i_dev != device &&
4538                 !is_downstream_to_pci_bridge(device, i_dev))
4539                 continue;
4540 
4541             length = rmrr->end_address - rmrr->base_address + 1;
4542 
4543             type = device_rmrr_is_relaxable(device) ?
4544                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4545 
4546             resv = iommu_alloc_resv_region(rmrr->base_address,
4547                                length, prot, type);
4548             if (!resv)
4549                 break;
4550 
4551             list_add_tail(&resv->list, head);
4552         }
4553     }
4554     up_read(&dmar_global_lock);
4555 
4556 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4557     if (dev_is_pci(device)) {
4558         struct pci_dev *pdev = to_pci_dev(device);
4559 
4560         if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4561             reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4562                            IOMMU_RESV_DIRECT_RELAXABLE);
4563             if (reg)
4564                 list_add_tail(&reg->list, head);
4565         }
4566     }
4567 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4568 
4569     reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4570                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4571                       0, IOMMU_RESV_MSI);
4572     if (!reg)
4573         return;
4574     list_add_tail(&reg->list, head);
4575 }
4576 
4577 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4578 {
4579     struct device_domain_info *info = dev_iommu_priv_get(dev);
4580     struct context_entry *context;
4581     struct dmar_domain *domain;
4582     u64 ctx_lo;
4583     int ret;
4584 
4585     domain = info->domain;
4586     if (!domain)
4587         return -EINVAL;
4588 
4589     spin_lock(&iommu->lock);
4590     ret = -EINVAL;
4591     if (!info->pasid_supported)
4592         goto out;
4593 
4594     context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4595     if (WARN_ON(!context))
4596         goto out;
4597 
4598     ctx_lo = context[0].lo;
4599 
4600     if (!(ctx_lo & CONTEXT_PASIDE)) {
4601         ctx_lo |= CONTEXT_PASIDE;
4602         context[0].lo = ctx_lo;
4603         wmb();
4604         iommu->flush.flush_context(iommu,
4605                        domain_id_iommu(domain, iommu),
4606                        PCI_DEVID(info->bus, info->devfn),
4607                        DMA_CCMD_MASK_NOBIT,
4608                        DMA_CCMD_DEVICE_INVL);
4609     }
4610 
4611     /* Enable PASID support in the device, if it wasn't already */
4612     if (!info->pasid_enabled)
4613         iommu_enable_dev_iotlb(info);
4614 
4615     ret = 0;
4616 
4617  out:
4618     spin_unlock(&iommu->lock);
4619 
4620     return ret;
4621 }
4622 
4623 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4624 {
4625     if (dev_is_pci(dev))
4626         return pci_device_group(dev);
4627     return generic_device_group(dev);
4628 }
4629 
4630 static int intel_iommu_enable_sva(struct device *dev)
4631 {
4632     struct device_domain_info *info = dev_iommu_priv_get(dev);
4633     struct intel_iommu *iommu;
4634     int ret;
4635 
4636     if (!info || dmar_disabled)
4637         return -EINVAL;
4638 
4639     iommu = info->iommu;
4640     if (!iommu)
4641         return -EINVAL;
4642 
4643     if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4644         return -ENODEV;
4645 
4646     if (intel_iommu_enable_pasid(iommu, dev))
4647         return -ENODEV;
4648 
4649     if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4650         return -EINVAL;
4651 
4652     ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4653     if (!ret)
4654         ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4655 
4656     return ret;
4657 }
4658 
4659 static int intel_iommu_disable_sva(struct device *dev)
4660 {
4661     struct device_domain_info *info = dev_iommu_priv_get(dev);
4662     struct intel_iommu *iommu = info->iommu;
4663     int ret;
4664 
4665     ret = iommu_unregister_device_fault_handler(dev);
4666     if (!ret)
4667         ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4668 
4669     return ret;
4670 }
4671 
4672 static int intel_iommu_enable_iopf(struct device *dev)
4673 {
4674     struct device_domain_info *info = dev_iommu_priv_get(dev);
4675 
4676     if (info && info->pri_supported)
4677         return 0;
4678 
4679     return -ENODEV;
4680 }
4681 
4682 static int
4683 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4684 {
4685     switch (feat) {
4686     case IOMMU_DEV_FEAT_IOPF:
4687         return intel_iommu_enable_iopf(dev);
4688 
4689     case IOMMU_DEV_FEAT_SVA:
4690         return intel_iommu_enable_sva(dev);
4691 
4692     default:
4693         return -ENODEV;
4694     }
4695 }
4696 
4697 static int
4698 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4699 {
4700     switch (feat) {
4701     case IOMMU_DEV_FEAT_IOPF:
4702         return 0;
4703 
4704     case IOMMU_DEV_FEAT_SVA:
4705         return intel_iommu_disable_sva(dev);
4706 
4707     default:
4708         return -ENODEV;
4709     }
4710 }
4711 
4712 static bool intel_iommu_is_attach_deferred(struct device *dev)
4713 {
4714     struct device_domain_info *info = dev_iommu_priv_get(dev);
4715 
4716     return translation_pre_enabled(info->iommu) && !info->domain;
4717 }
4718 
4719 /*
4720  * Check whether the device sits behind an external-facing PCI port that is
4721  * marked as untrusted. Such devices should not be allowed to apply quirks,
4722  * and thus must not be able to bypass the IOMMU restrictions.
4723  */
4724 static bool risky_device(struct pci_dev *pdev)
4725 {
4726     if (pdev->untrusted) {
4727         pci_info(pdev,
4728              "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4729              pdev->vendor, pdev->device);
4730         pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4731         return true;
4732     }
4733     return false;
4734 }
4735 
4736 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4737                        unsigned long iova, size_t size)
4738 {
4739     struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4740     unsigned long pages = aligned_nrpages(iova, size);
4741     unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4742     struct iommu_domain_info *info;
4743     unsigned long i;
4744 
4745     xa_for_each(&dmar_domain->iommu_array, i, info)
4746         __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4747 }
4748 
4749 const struct iommu_ops intel_iommu_ops = {
4750     .capable        = intel_iommu_capable,
4751     .domain_alloc       = intel_iommu_domain_alloc,
4752     .probe_device       = intel_iommu_probe_device,
4753     .probe_finalize     = intel_iommu_probe_finalize,
4754     .release_device     = intel_iommu_release_device,
4755     .get_resv_regions   = intel_iommu_get_resv_regions,
4756     .device_group       = intel_iommu_device_group,
4757     .dev_enable_feat    = intel_iommu_dev_enable_feat,
4758     .dev_disable_feat   = intel_iommu_dev_disable_feat,
4759     .is_attach_deferred = intel_iommu_is_attach_deferred,
4760     .def_domain_type    = device_def_domain_type,
4761     .pgsize_bitmap      = SZ_4K,
4762 #ifdef CONFIG_INTEL_IOMMU_SVM
4763     .sva_bind       = intel_svm_bind,
4764     .sva_unbind     = intel_svm_unbind,
4765     .sva_get_pasid      = intel_svm_get_pasid,
4766     .page_response      = intel_svm_page_response,
4767 #endif
4768     .default_domain_ops = &(const struct iommu_domain_ops) {
4769         .attach_dev     = intel_iommu_attach_device,
4770         .detach_dev     = intel_iommu_detach_device,
4771         .map_pages      = intel_iommu_map_pages,
4772         .unmap_pages        = intel_iommu_unmap_pages,
4773         .iotlb_sync_map     = intel_iommu_iotlb_sync_map,
4774         .flush_iotlb_all        = intel_flush_iotlb_all,
4775         .iotlb_sync     = intel_iommu_tlb_sync,
4776         .iova_to_phys       = intel_iommu_iova_to_phys,
4777         .free           = intel_iommu_domain_free,
4778         .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4779     }
4780 };
4781 
4782 static void quirk_iommu_igfx(struct pci_dev *dev)
4783 {
4784     if (risky_device(dev))
4785         return;
4786 
4787     pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4788     dmar_map_gfx = 0;
4789 }
4790 
4791 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4799 
4800 /* Broadwell igfx malfunctions with dmar */
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4821 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4825 
4826 static void quirk_iommu_rwbf(struct pci_dev *dev)
4827 {
4828     if (risky_device(dev))
4829         return;
4830 
4831     /*
4832      * Mobile 4 Series Chipset neglects to set RWBF capability,
4833      * but needs it. Same seems to hold for the desktop versions.
4834      */
4835     pci_info(dev, "Forcing write-buffer flush capability\n");
4836     rwbf_quirk = 1;
4837 }
4838 
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4846 
4847 #define GGC 0x52
4848 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4849 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4850 #define GGC_MEMORY_SIZE_1M  (0x1 << 8)
4851 #define GGC_MEMORY_SIZE_2M  (0x3 << 8)
4852 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4853 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4854 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4855 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4856 
4857 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4858 {
4859     unsigned short ggc;
4860 
4861     if (risky_device(dev))
4862         return;
4863 
4864     if (pci_read_config_word(dev, GGC, &ggc))
4865         return;
4866 
4867     if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4868         pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4869         dmar_map_gfx = 0;
4870     } else if (dmar_map_gfx) {
4871         /* we have to ensure the gfx device is idle before we flush */
4872         pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4873         iommu_set_dma_strict();
4874     }
4875 }
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4880 
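/*
 * Quirk for certain integrated graphics device IDs: set
 * iommu_skip_te_disable so that translation-enable is not cleared for
 * the graphics DMAR unit when IOMMUs are otherwise being disabled.
 */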
4881 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4882 {
4883     unsigned short ver;
4884 
4885     if (!IS_GFX_DEVICE(dev))
4886         return;
4887 
4888     ver = (dev->device >> 8) & 0xff;
4889     if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4890         ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4891         ver != 0x9a && ver != 0xa7)
4892         return;
4893 
4894     if (risky_device(dev))
4895         return;
4896 
4897     pci_info(dev, "Skip IOMMU disabling for graphics\n");
4898     iommu_skip_te_disable = 1;
4899 }
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4901 
4902 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4903    ISOCH DMAR unit for the Azalia sound device, but not give it any
4904    TLB entries, which causes it to deadlock. Check for that.  We do
4905    this in a function called from init_dmars(), instead of in a PCI
4906    quirk, because we don't want to print the obnoxious "BIOS broken"
4907    message if VT-d is actually disabled.
4908 */
4909 static void __init check_tylersburg_isoch(void)
4910 {
4911     struct pci_dev *pdev;
4912     uint32_t vtisochctrl;
4913 
4914     /* If there's no Azalia in the system anyway, forget it. */
4915     pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4916     if (!pdev)
4917         return;
4918 
4919     if (risky_device(pdev)) {
4920         pci_dev_put(pdev);
4921         return;
4922     }
4923 
4924     pci_dev_put(pdev);
4925 
4926     /* System Management Registers. Might be hidden, in which case
4927        we can't do the sanity check. But that's OK, because the
4928        known-broken BIOSes _don't_ actually hide it, so far. */
4929     pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4930     if (!pdev)
4931         return;
4932 
4933     if (risky_device(pdev)) {
4934         pci_dev_put(pdev);
4935         return;
4936     }
4937 
4938     if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4939         pci_dev_put(pdev);
4940         return;
4941     }
4942 
4943     pci_dev_put(pdev);
4944 
4945     /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4946     if (vtisochctrl & 1)
4947         return;
4948 
4949     /* Drop all bits other than the number of TLB entries */
4950     vtisochctrl &= 0x1c;
4951 
4952     /* If we have the recommended number of TLB entries (16), fine. */
4953     if (vtisochctrl == 0x10)
4954         return;
4955 
4956     /* Zero TLB entries? You get to ride the short bus to school. */
4957     if (!vtisochctrl) {
4958         WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4959              "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4960              dmi_get_system_info(DMI_BIOS_VENDOR),
4961              dmi_get_system_info(DMI_BIOS_VERSION),
4962              dmi_get_system_info(DMI_PRODUCT_VERSION));
4963         iommu_identity_mapping |= IDENTMAP_AZALIA;
4964         return;
4965     }
4966 
4967     pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4968            vtisochctrl);
4969 }