// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)     "AMD-Vi: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"

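/*
 * TLB maintenance callbacks required by the io-pgtable framework. They are
 * intentionally empty: the AMD IOMMU driver issues its own IOTLB flushes
 * (see iommu_v1_map_page() below) rather than relying on these hooks.
 */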
static void v1_tlb_flush_all(void *cookie)
{
}

static void v1_tlb_flush_walk(unsigned long iova, size_t size,
                              size_t granule, void *cookie)
{
}

static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
                            unsigned long iova, size_t granule,
                            void *cookie)
{
}

static const struct iommu_flush_ops v1_flush_ops = {
    .tlb_flush_all  = v1_tlb_flush_all,
    .tlb_flush_walk = v1_tlb_flush_walk,
    .tlb_add_page   = v1_tlb_add_page,
};

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
                         unsigned long *count)
{
    unsigned long pte_mask, pg_size, cnt;
    u64 *fpte;

    pg_size  = PTE_PAGE_SIZE(*pte);
    cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
    pte_mask = ~((cnt << 3) - 1);
    fpte     = (u64 *)(((unsigned long)pte) & pte_mask);

    if (page_size)
        *page_size = pg_size;

    if (count)
        *count = cnt;

    return fpte;
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

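/*
 * Queue a single page-table page on @freelist so it can be released with
 * put_pages_list() once the IOMMU no longer references it.
 */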
static void free_pt_page(u64 *pt, struct list_head *freelist)
{
    struct page *p = virt_to_page(pt);

    list_add_tail(&p->lru, freelist);
}

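/*
 * Recursively queue all page-table pages below @pt (a level @lvl table) on
 * @freelist, skipping non-present entries and large-page (leaf) PTEs, and
 * finally queue @pt itself.
 */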
static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
    u64 *p;
    int i;

    for (i = 0; i < 512; ++i) {
        /* PTE present? */
        if (!IOMMU_PTE_PRESENT(pt[i]))
            continue;

        /* Large PTE? */
        if (PM_PTE_LEVEL(pt[i]) == 0 ||
            PM_PTE_LEVEL(pt[i]) == 7)
            continue;

        /*
         * Free the next level. No need to look at l1 tables here since
         * they can only contain leaf PTEs; just free them directly.
         */
        p = IOMMU_PTE_PAGE(pt[i]);
        if (lvl > 2)
            free_pt_lvl(p, freelist, lvl - 1);
        else
            free_pt_page(p, freelist);
    }

    free_pt_page(pt, freelist);
}

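/*
 * Queue an entire sub-page-table rooted at @root for freeing, based on its
 * page mode. PAGE_MODE_NONE and PAGE_MODE_7_LEVEL have no lower-level
 * tables of their own, so there is nothing to queue for them.
 */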
static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
    switch (mode) {
    case PAGE_MODE_NONE:
    case PAGE_MODE_7_LEVEL:
        break;
    case PAGE_MODE_1_LEVEL:
        free_pt_page(root, freelist);
        break;
    case PAGE_MODE_2_LEVEL:
    case PAGE_MODE_3_LEVEL:
    case PAGE_MODE_4_LEVEL:
    case PAGE_MODE_5_LEVEL:
    case PAGE_MODE_6_LEVEL:
        free_pt_lvl(root, freelist, mode);
        break;
    default:
        BUG();
    }
}

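/*
 * Publish a new page-table root and paging mode for @domain by packing the
 * root pointer and the 3-bit mode into a single 64-bit value.
 */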
void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
                                  u64 *root, int mode)
{
    u64 pt_root;

    /* lowest 3 bits encode pgtable mode */
    pt_root = mode & 7;
    pt_root |= (u64)root;

    amd_iommu_domain_set_pt_root(domain, pt_root);
}

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct protection_domain *domain,
                                   unsigned long address,
                                   gfp_t gfp)
{
    unsigned long flags;
    bool ret = true;
    u64 *pte;

    pte = (void *)get_zeroed_page(gfp);
    if (!pte)
        return false;

    spin_lock_irqsave(&domain->lock, flags);

    if (address <= PM_LEVEL_SIZE(domain->iop.mode))
        goto out;

    ret = false;
    if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
        goto out;

    *pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));

    domain->iop.root  = pte;
    domain->iop.mode += 1;
    amd_iommu_update_and_flush_device_table(domain);
    amd_iommu_domain_flush_complete(domain);

    /*
     * Device Table needs to be updated and flushed before the new root can
     * be published.
     */
    amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);

    pte = NULL;
    ret = true;

out:
    spin_unlock_irqrestore(&domain->lock, flags);
    free_page((unsigned long)pte);

    return ret;
}

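/*
 * Walk the page table for @address and return a pointer to the PTE that maps
 * a page of @page_size, allocating intermediate page-table pages as needed.
 * *updated is set when an existing mapping had to be torn down along the way.
 * Returns NULL on allocation failure or when an unexpected level is found.
 */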
static u64 *alloc_pte(struct protection_domain *domain,
                      unsigned long address,
                      unsigned long page_size,
                      u64 **pte_page,
                      gfp_t gfp,
                      bool *updated)
{
    int level, end_lvl;
    u64 *pte, *page;

    BUG_ON(!is_power_of_2(page_size));

    while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
        /*
         * Return an error if there is no memory to update the
         * page-table.
         */
        if (!increase_address_space(domain, address, gfp))
            return NULL;
    }

    level   = domain->iop.mode - 1;
    pte     = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
    address = PAGE_SIZE_ALIGN(address, page_size);
    end_lvl = PAGE_SIZE_LEVEL(page_size);

    while (level > end_lvl) {
        u64 __pte, __npte;
        int pte_level;

        __pte     = *pte;
        pte_level = PM_PTE_LEVEL(__pte);

        /*
         * If we replace a series of large PTEs, we need
         * to tear down all of them.
         */
        if (IOMMU_PTE_PRESENT(__pte) &&
            pte_level == PAGE_MODE_7_LEVEL) {
            unsigned long count, i;
            u64 *lpte;

            lpte = first_pte_l7(pte, NULL, &count);

            /*
             * Unmap the replicated PTEs that still match the
             * original large mapping
             */
            for (i = 0; i < count; ++i)
                cmpxchg64(&lpte[i], __pte, 0ULL);

            *updated = true;
            continue;
        }

        if (!IOMMU_PTE_PRESENT(__pte) ||
            pte_level == PAGE_MODE_NONE) {
            page = (u64 *)get_zeroed_page(gfp);

            if (!page)
                return NULL;

            __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

            /* pte could have been changed somewhere. */
            if (!try_cmpxchg64(pte, &__pte, __npte))
                free_page((unsigned long)page);
            else if (IOMMU_PTE_PRESENT(__pte))
                *updated = true;

            continue;
        }

        /* No level skipping support yet */
        if (pte_level != level)
            return NULL;

        level -= 1;

        pte = IOMMU_PTE_PAGE(__pte);

        if (pte_page && level == end_lvl)
            *pte_page = pte;

        pte = &pte[PM_LEVEL_INDEX(level, address)];
    }

    return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
                      unsigned long address,
                      unsigned long *page_size)
{
    int level;
    u64 *pte;

    *page_size = 0;

    if (address > PM_LEVEL_SIZE(pgtable->mode))
        return NULL;

    level      =  pgtable->mode - 1;
    pte        = &pgtable->root[PM_LEVEL_INDEX(level, address)];
    *page_size =  PTE_LEVEL_PAGE_SIZE(level);

    while (level > 0) {

        /* Not Present */
        if (!IOMMU_PTE_PRESENT(*pte))
            return NULL;

        /* Large PTE */
        if (PM_PTE_LEVEL(*pte) == 7 ||
            PM_PTE_LEVEL(*pte) == 0)
            break;

        /* No level skipping support yet */
        if (PM_PTE_LEVEL(*pte) != level)
            return NULL;

        level -= 1;

        /* Walk to the next level */
        pte        = IOMMU_PTE_PAGE(*pte);
        pte        = &pte[PM_LEVEL_INDEX(level, address)];
        *page_size = PTE_LEVEL_PAGE_SIZE(level);
    }

    /*
     * If we have a series of large PTEs, make
     * sure to return a pointer to the first one.
     */
    if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
        pte = first_pte_l7(pte, page_size, NULL);

    return pte;
}

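/*
 * Atomically clear *pte and, if it pointed to a lower-level page table,
 * queue that sub-table on @freelist for later freeing.
 */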
static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
    u64 *pt;
    int mode;

    while (!try_cmpxchg64(pte, &pteval, 0))
        pr_warn("IOMMU pte changed since we read it\n");

    if (!IOMMU_PTE_PRESENT(pteval))
        return;

    pt   = IOMMU_PTE_PAGE(pteval);
    mode = IOMMU_PTE_MODE(pteval);

    free_sub_pt(pt, mode, freelist);
}

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_page(struct io_pgtable_ops *ops, unsigned long iova,
                             phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
    struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
    LIST_HEAD(freelist);
    bool updated = false;
    u64 __pte, *pte;
    int ret, i, count;

    BUG_ON(!IS_ALIGNED(iova, size));
    BUG_ON(!IS_ALIGNED(paddr, size));

    ret = -EINVAL;
    if (!(prot & IOMMU_PROT_MASK))
        goto out;

    count = PAGE_SIZE_PTE_COUNT(size);
    pte   = alloc_pte(dom, iova, size, NULL, gfp, &updated);

    ret = -ENOMEM;
    if (!pte)
        goto out;

    for (i = 0; i < count; ++i)
        free_clear_pte(&pte[i], pte[i], &freelist);

    if (!list_empty(&freelist))
        updated = true;

    if (count > 1) {
        __pte = PAGE_SIZE_PTE(__sme_set(paddr), size);
        __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
    } else {
        __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
    }

    if (prot & IOMMU_PROT_IR)
        __pte |= IOMMU_PTE_IR;
    if (prot & IOMMU_PROT_IW)
        __pte |= IOMMU_PTE_IW;

    for (i = 0; i < count; ++i)
        pte[i] = __pte;

    ret = 0;

out:
    if (updated) {
        unsigned long flags;

        spin_lock_irqsave(&dom->lock, flags);
        /*
         * Flush domain TLB(s) and wait for completion. Any Device-Table
         * Updates and flushing already happened in
         * increase_address_space().
         */
        amd_iommu_domain_flush_tlb_pde(dom);
        amd_iommu_domain_flush_complete(dom);
        spin_unlock_irqrestore(&dom->lock, flags);
    }

    /* Everything flushed out, free pages now */
    put_pages_list(&freelist);

    return ret;
}

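/*
 * Unmap @size bytes at @iova by clearing the PTEs covering the range. No
 * IOTLB flush is performed here; that is left to the caller.
 */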
static unsigned long iommu_v1_unmap_page(struct io_pgtable_ops *ops,
                                         unsigned long iova,
                                         size_t size,
                                         struct iommu_iotlb_gather *gather)
{
    struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
    unsigned long long unmapped;
    unsigned long unmap_size;
    u64 *pte;

    BUG_ON(!is_power_of_2(size));

    unmapped = 0;

    while (unmapped < size) {
        pte = fetch_pte(pgtable, iova, &unmap_size);
        if (pte) {
            int i, count;

            count = PAGE_SIZE_PTE_COUNT(unmap_size);
            for (i = 0; i < count; i++)
                pte[i] = 0ULL;
        }

        iova = (iova & ~(unmap_size - 1)) + unmap_size;
        unmapped += unmap_size;
    }

    BUG_ON(unmapped && !is_power_of_2(unmapped));

    return unmapped;
}

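/*
 * Translate an IO virtual address into the physical address it maps to, or
 * return 0 if no mapping exists.
 */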
static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
    struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
    unsigned long offset_mask, pte_pgsize;
    u64 *pte, __pte;

    pte = fetch_pte(pgtable, iova, &pte_pgsize);

    if (!pte || !IOMMU_PTE_PRESENT(*pte))
        return 0;

    offset_mask = pte_pgsize - 1;
    __pte       = __sme_clr(*pte & PM_ADDR_MASK);

    return (__pte & ~offset_mask) | (iova & offset_mask);
}

/*
 * ----------------------------------------------------
 */
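/*
 * Tear down the whole v1 page table: detach the root from the domain, make
 * the change visible to the IOMMUs, then free all page-table pages.
 */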
static void v1_free_pgtable(struct io_pgtable *iop)
{
    struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
    struct protection_domain *dom;
    LIST_HEAD(freelist);

    if (pgtable->mode == PAGE_MODE_NONE)
        return;

    dom = container_of(pgtable, struct protection_domain, iop);

    /* Page-table is not visible to IOMMU anymore, so free it */
    BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
           pgtable->mode > PAGE_MODE_6_LEVEL);

    free_sub_pt(pgtable->root, pgtable->mode, &freelist);

    /* Update data structure */
    amd_iommu_domain_clr_pt_root(dom);

    /* Make changes visible to IOMMUs */
    amd_iommu_domain_update(dom);

    put_pages_list(&freelist);
}

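/*
 * Fill in the io-pgtable configuration (supported page sizes, address widths,
 * TLB ops) and hook up the v1 map/unmap/iova_to_phys callbacks.
 */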
static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
    struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

    cfg->pgsize_bitmap  = AMD_IOMMU_PGSIZES;
    cfg->ias            = IOMMU_IN_ADDR_BIT_SIZE;
    cfg->oas            = IOMMU_OUT_ADDR_BIT_SIZE;
    cfg->tlb            = &v1_flush_ops;

    pgtable->iop.ops.map          = iommu_v1_map_page;
    pgtable->iop.ops.unmap        = iommu_v1_unmap_page;
    pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;

    return &pgtable->iop;
}

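/* Format descriptor handed to the io-pgtable core for AMD IOMMU v1 tables. */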
struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
    .alloc  = v1_alloc_pgtable,
    .free   = v1_free_pgtable,
};