// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup:
 *
 * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
 * Copyright (C) 2006 Olof Johansson <olof@lixom.net>
 *
 * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/crash_dump.h>
#include <linux/memory.h>
#include <linux/of.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/tce.h>
#include <asm/ppc-pci.h>
#include <asm/udbg.h>
#include <asm/mmzone.h>
#include <asm/plpar_wrappers.h>

#include "pseries.h"

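/*
 * Indices of the three RTAS tokens carried by the "ibm,ddw-applicable"
 * device-tree property: ibm,query-pe-dma-window, ibm,create-pe-dma-window
 * and ibm,remove-pe-dma-window, in that order (see enable_ddw() below).
 */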
enum {
	DDW_QUERY_PE_DMA_WIN  = 0,
	DDW_CREATE_PE_DMA_WIN = 1,
	DDW_REMOVE_PE_DMA_WIN = 2,

	DDW_APPLICABLE_SIZE
};

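/*
 * Indices into the "ibm,ddw-extensions" property: the number of extensions,
 * the RTAS token of ibm,reset-pe-dma-windows, and the output size of
 * ibm,query-pe-dma-windows (see ddw_read_ext() below).
 */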
enum {
	DDW_EXT_SIZE = 0,
	DDW_EXT_RESET_DMA_WIN = 1,
	DDW_EXT_QUERY_OUT_SIZE = 2
};

static struct iommu_table *iommu_pseries_alloc_table(int node)
{
	struct iommu_table *tbl;

	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
	if (!tbl)
		return NULL;

	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
	kref_init(&tbl->it_kref);
	return tbl;
}

static struct iommu_table_group *iommu_pseries_alloc_group(int node)
{
	struct iommu_table_group *table_group;

	table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
	if (!table_group)
		return NULL;

	table_group->tables[0] = iommu_pseries_alloc_table(node);
	if (table_group->tables[0])
		return table_group;

	kfree(table_group);
	return NULL;
}

static void iommu_pseries_free_group(struct iommu_table_group *table_group,
		const char *node_name)
{
	struct iommu_table *tbl;

	if (!table_group)
		return;

	tbl = table_group->tables[0];
#ifdef CONFIG_IOMMU_API
	if (table_group->group) {
		iommu_group_put(table_group->group);
		BUG_ON(table_group->group);
	}
#endif
	iommu_tce_table_put(tbl);

	kfree(table_group);
}

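/*
 * On non-LPAR (bare-metal) pSeries the OS owns the TCE table: each 64-bit
 * entry at tbl->it_base maps one IOMMU page and carries the real page
 * number plus the TCE_PCI_READ/TCE_PCI_WRITE permission bits.
 */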
static int tce_build_pSeries(struct iommu_table *tbl, long index,
			     long npages, unsigned long uaddr,
			     enum dma_data_direction direction,
			     unsigned long attrs)
{
	u64 proto_tce;
	__be64 *tcep;
	u64 rpn;
	const unsigned long tceshift = tbl->it_page_shift;
	const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);

	proto_tce = TCE_PCI_READ; /* Read allowed */

	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	tcep = ((__be64 *)tbl->it_base) + index;

	while (npages--) {
		/* can't move this out since we might cross MEMBLOCK boundary */
		rpn = __pa(uaddr) >> tceshift;
		*tcep = cpu_to_be64(proto_tce | rpn << tceshift);

		uaddr += pagesize;
		tcep++;
	}
	return 0;
}

static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
{
	__be64 *tcep;

	tcep = ((__be64 *)tbl->it_base) + index;

	while (npages--)
		*(tcep++) = 0;
}

static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
{
	__be64 *tcep;

	tcep = ((__be64 *)tbl->it_base) + index;

	return be64_to_cpu(*tcep);
}

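/*
 * On LPAR the hypervisor owns the TCE table, so every update goes through
 * an hcall: plpar_tce_put() wraps H_PUT_TCE, plpar_tce_put_indirect()
 * wraps H_PUT_TCE_INDIRECT and plpar_tce_stuff() wraps H_STUFF_TCE.
 */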
static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);

static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
			       long npages, unsigned long uaddr,
			       enum dma_data_direction direction,
			       unsigned long attrs)
{
	u64 rc = 0;
	u64 proto_tce, tce;
	u64 rpn;
	int ret = 0;
	long tcenum_start = tcenum, npages_start = npages;

	rpn = __pa(uaddr) >> tceshift;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	while (npages--) {
		tce = proto_tce | rpn << tceshift;
		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);

		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
			ret = (int)rc;
			tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
					   (npages_start - (npages + 1)));
			break;
		}

		if (rc && printk_ratelimit()) {
			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex   = 0x%llx\n", (u64)liobn);
			printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
			printk("\ttce val = 0x%llx\n", tce);
			dump_stack();
		}

		tcenum++;
		rpn++;
	}
	return ret;
}

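/*
 * Per-CPU scratch page used to batch TCE updates: it is filled with TCE
 * entries and handed to the hypervisor in a single H_PUT_TCE_INDIRECT
 * call. Allocated lazily below, with interrupts disabled, and never freed.
 */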
static DEFINE_PER_CPU(__be64 *, tce_page);

static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
				    long npages, unsigned long uaddr,
				    enum dma_data_direction direction,
				    unsigned long attrs)
{
	u64 rc = 0;
	u64 proto_tce;
	__be64 *tcep;
	u64 rpn;
	long l, limit;
	long tcenum_start = tcenum, npages_start = npages;
	int ret = 0;
	unsigned long flags;
	const unsigned long tceshift = tbl->it_page_shift;

	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
		return tce_build_pSeriesLP(tbl->it_index, tcenum,
					   tceshift, npages, uaddr,
					   direction, attrs);
	}

	local_irq_save(flags);	/* to protect tcep and the page behind it */

	tcep = __this_cpu_read(tce_page);

	/* This is safe to do since interrupts are off when we're called
	 * from iommu_alloc{,_sg}()
	 */
	if (!tcep) {
		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
		/* If allocation fails, fall back to the loop implementation */
		if (!tcep) {
			local_irq_restore(flags);
			return tce_build_pSeriesLP(tbl->it_index, tcenum,
					tceshift,
					npages, uaddr, direction, attrs);
		}
		__this_cpu_write(tce_page, tcep);
	}

	rpn = __pa(uaddr) >> tceshift;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, npages, 4096 / TCE_ENTRY_SIZE);

		for (l = 0; l < limit; l++) {
			tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift);
			rpn++;
		}

		rc = plpar_tce_put_indirect((u64)tbl->it_index,
					    (u64)tcenum << tceshift,
					    (u64)__pa(tcep),
					    limit);

		npages -= limit;
		tcenum += limit;
	} while (npages > 0 && !rc);

	local_irq_restore(flags);

	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
		ret = (int)rc;
		tce_freemulti_pSeriesLP(tbl, tcenum_start,
					(npages_start - (npages + limit)));
		return ret;
	}

	if (rc && printk_ratelimit()) {
		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages  = 0x%llx\n", (u64)npages);
		printk("\ttce[0] val = 0x%llx\n", tcep[0]);
		dump_stack();
	}
	return ret;
}

static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
			       long npages)
{
	u64 rc;

	while (npages--) {
		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);

		if (rc && printk_ratelimit()) {
			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex   = 0x%llx\n", (u64)liobn);
			printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
			dump_stack();
		}

		tcenum++;
	}
}

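/*
 * Clear a run of TCEs with a single H_STUFF_TCE hcall when the firmware
 * supports it, falling back to one H_PUT_TCE per entry otherwise.
 */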
static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
{
	u64 rc;

	if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))
		return tce_free_pSeriesLP(tbl->it_index, tcenum,
					  tbl->it_page_shift, npages);

	rc = plpar_tce_stuff((u64)tbl->it_index,
			     (u64)tcenum << tbl->it_page_shift, 0, npages);

	if (rc && printk_ratelimit()) {
		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
		printk("\trc      = %lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages  = 0x%llx\n", (u64)npages);
		dump_stack();
	}
}

static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
{
	u64 rc;
	unsigned long tce_ret;

	rc = plpar_tce_get((u64)tbl->it_index,
			   (u64)tcenum << tbl->it_page_shift, &tce_ret);

	if (rc && printk_ratelimit()) {
		printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
		dump_stack();
	}

	return tce_ret;
}

/* this is compatible with cells for the device tree property */
struct dynamic_dma_window_prop {
	__be32	liobn;		/* tce table number */
	__be64	dma_base;	/* address hi,lo */
	__be32	tce_shift;	/* ilog2(tce_page_size) */
	__be32	window_shift;	/* ilog2(tce_window_size) */
};

struct dma_win {
	struct device_node *device;
	const struct dynamic_dma_window_prop *prop;
	struct list_head list;
};

/* Dynamic DMA Window support */
struct ddw_query_response {
	u32 windows_available;
	u64 largest_available_block;
	u32 page_size;
	u32 migration_capable;
};

struct ddw_create_response {
	u32 liobn;
	u32 addr_hi;
	u32 addr_lo;
};

static LIST_HEAD(dma_win_list);
/* prevents races between memory on/offline and window creation */
static DEFINE_SPINLOCK(dma_win_list_lock);
/* protects initializing window twice for same device */
static DEFINE_MUTEX(dma_win_init_mutex);
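/*
 * Device-tree property names recorded for a created DDW: DIRECT64 marks a
 * window large enough to map all of RAM linearly (direct mapping), DMA64 a
 * smaller 64-bit window that still needs dynamic TCE mapping.
 */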
#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
#define DMA64_PROPNAME "linux,dma64-ddr-window-info"

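/*
 * Helpers that clear or populate the TCEs covering a pfn range of a dynamic
 * window; used when memory is onlined/offlined and when a window is removed.
 */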
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
					  unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	int rc;
	u64 tce_size, num_tce, dma_offset, next;
	u32 tce_shift;
	long limit;

	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	do {
		/*
		 * Clear the TCEs in batches, up to 512 entries per
		 * H_STUFF_TCE call.
		 */
		limit = min_t(long, num_tce, 512);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
				     dma_offset,
				     0, limit);
		next += limit * tce_size;
		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	return rc;
}

static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
					unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
	__be64 *tcep;
	u32 tce_shift;
	u64 rc = 0;
	long l, limit;

	if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
		unsigned long tceshift = be32_to_cpu(maprange->tce_shift);
		unsigned long dmastart = (start_pfn << PAGE_SHIFT) +
				be64_to_cpu(maprange->dma_base);
		unsigned long tcenum = dmastart >> tceshift;
		unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;
		void *uaddr = __va(start_pfn << PAGE_SHIFT);

		return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),
				tcenum, tceshift, npages, (unsigned long)uaddr,
				DMA_BIDIRECTIONAL, 0);
	}

	local_irq_disable();	/* to protect tcep and the page behind it */
	tcep = __this_cpu_read(tce_page);

	if (!tcep) {
		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
		if (!tcep) {
			local_irq_enable();
			return -ENOMEM;
		}
		__this_cpu_write(tce_page, tcep);
	}

	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;

	liobn = (u64)be32_to_cpu(maprange->liobn);
	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, num_tce, 4096 / TCE_ENTRY_SIZE);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		for (l = 0; l < limit; l++) {
			tcep[l] = cpu_to_be64(proto_tce | next);
			next += tce_size;
		}

		rc = plpar_tce_put_indirect(liobn,
					    dma_offset,
					    (u64)__pa(tcep),
					    limit);

		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	/* error cleanup: caller will clear whole range */

	local_irq_enable();
	return rc;
}

static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
		unsigned long num_pfn, void *arg)
{
	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
}

static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno,
					unsigned long liobn, unsigned long win_addr,
					unsigned long window_size, unsigned long page_shift,
					void *base, struct iommu_table_ops *table_ops)
{
	tbl->it_busno = busno;
	tbl->it_index = liobn;
	tbl->it_offset = win_addr >> page_shift;
	tbl->it_size = window_size >> page_shift;
	tbl->it_page_shift = page_shift;
	tbl->it_base = (unsigned long)base;
	tbl->it_blocksize = 16;
	tbl->it_type = TCE_PCI;
	tbl->it_ops = table_ops;
}
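/*
 * For example (illustrative values): a 2GB window starting at DMA address 0
 * with 4K IOMMU pages gives it_offset = 0 and it_size = 0x80000 entries;
 * it_base is the kernel virtual address of the TCE table on bare metal, or
 * NULL on LPAR where the table is owned by the hypervisor.
 */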

struct iommu_table_ops iommu_table_pseries_ops;

static void iommu_table_setparms(struct pci_controller *phb,
				 struct device_node *dn,
				 struct iommu_table *tbl)
{
	struct device_node *node;
	const unsigned long *basep;
	const u32 *sizep;

	/* Test if we are going over 2GB of DMA space */
	if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
		udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
		panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
	}

	node = phb->dn;
	basep = of_get_property(node, "linux,tce-base", NULL);
	sizep = of_get_property(node, "linux,tce-size", NULL);
	if (basep == NULL || sizep == NULL) {
		printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has "
				"missing tce entries !\n", dn);
		return;
	}

	iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,
				    phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
				    __va(*basep), &iommu_table_pseries_ops);

	if (!is_kdump_kernel())
		memset((void *)tbl->it_base, 0, *sizep);

	phb->dma_window_base_cur += phb->dma_window_size;
}

struct iommu_table_ops iommu_table_lpar_multi_ops;

/*
 * iommu_table_setparms_lpar
 *
 * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
 */
static void iommu_table_setparms_lpar(struct pci_controller *phb,
				      struct device_node *dn,
				      struct iommu_table *tbl,
				      struct iommu_table_group *table_group,
				      const __be32 *dma_window)
{
	unsigned long offset, size, liobn;

	of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);

	iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size,
				    IOMMU_PAGE_SHIFT_4K, NULL,
				    &iommu_table_lpar_multi_ops);

	table_group->tce32_start = offset;
	table_group->tce32_size = size;
}

struct iommu_table_ops iommu_table_pseries_ops = {
	.set = tce_build_pSeries,
	.clear = tce_free_pSeries,
	.get = tce_get_pseries
};

static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
{
	struct device_node *dn;
	struct iommu_table *tbl;
	struct device_node *isa_dn, *isa_dn_orig;
	struct device_node *tmp;
	struct pci_dn *pci;
	int children;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn);

	if (bus->self) {
		/* This is not a root bus, any setup will be done for the
		 * device-side of the bridge in iommu_dev_setup_pSeries().
		 */
		return;
	}
	pci = PCI_DN(dn);

	/* Check if the ISA bus on the system is under
	 * this PHB.
	 */
	isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");

	while (isa_dn && isa_dn != dn)
		isa_dn = isa_dn->parent;

	of_node_put(isa_dn_orig);

	/* Count number of direct PCI children of the PHB. */
	for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
		children++;

	pr_debug("Children: %d\n", children);

	/* Calculate amount of DMA window per slot. Each window must be
	 * a power of two (due to pci_alloc_consistent requirements).
	 *
	 * Keep 256MB aside for PHBs with ISA.
	 */

	if (!isa_dn) {
		/* No ISA/IDE - just set window size and return */
		pci->phb->dma_window_size = 0x80000000ul; /* To be divided */

		while (pci->phb->dma_window_size * children > 0x80000000ul)
			pci->phb->dma_window_size >>= 1;
		pr_debug("No ISA/IDE, window size is 0x%llx\n",
			 pci->phb->dma_window_size);
		pci->phb->dma_window_base_cur = 0;

		return;
	}

	/* If we have ISA, then we probably have an IDE
	 * controller too. Allocate a 128MB table but
	 * skip the first 128MB to avoid stepping on ISA
	 * space.
	 */
	pci->phb->dma_window_size = 0x8000000ul;
	pci->phb->dma_window_base_cur = 0x8000000ul;

	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
	tbl = pci->table_group->tables[0];

	iommu_table_setparms(pci->phb, dn, tbl);

	if (!iommu_init_table(tbl, pci->phb->node, 0, 0))
		panic("Failed to initialize iommu table");

	/* Divide the rest (1.75GB) among the children */
	pci->phb->dma_window_size = 0x80000000ul;
	while (pci->phb->dma_window_size * children > 0x70000000ul)
		pci->phb->dma_window_size >>= 1;

	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
}

#ifdef CONFIG_IOMMU_API
static int tce_exchange_pseries(struct iommu_table *tbl, long index,
				unsigned long *tce,
				enum dma_data_direction *direction)
{
	long rc;
	unsigned long ioba = (unsigned long)index << tbl->it_page_shift;
	unsigned long flags, oldtce = 0;
	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
	unsigned long newtce = *tce | proto_tce;

	spin_lock_irqsave(&tbl->large_pool.lock, flags);

	rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
	if (!rc)
		rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);

	if (!rc) {
		*direction = iommu_tce_direction(oldtce);
		*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
	}

	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);

	return rc;
}
#endif

struct iommu_table_ops iommu_table_lpar_multi_ops = {
	.set = tce_buildmulti_pSeriesLP,
#ifdef CONFIG_IOMMU_API
	.xchg_no_kill = tce_exchange_pseries,
#endif
	.clear = tce_freemulti_pSeriesLP,
	.get = tce_get_pSeriesLP
};

/*
 * Find nearest ibm,dma-window (default DMA window) or direct DMA window or
 * dynamic 64bit DMA window, walking up the device tree.
 */
static struct device_node *pci_dma_find(struct device_node *dn,
					const __be32 **dma_window)
{
	const __be32 *dw = NULL;

	for ( ; dn && PCI_DN(dn); dn = dn->parent) {
		dw = of_get_property(dn, "ibm,dma-window", NULL);
		if (dw) {
			if (dma_window)
				*dma_window = dw;
			return dn;
		}
		dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
		if (dw)
			return dn;
		dw = of_get_property(dn, DMA64_PROPNAME, NULL);
		if (dw)
			return dn;
	}

	return NULL;
}

static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
{
	struct iommu_table *tbl;
	struct device_node *dn, *pdn;
	struct pci_dn *ppci;
	const __be32 *dma_window = NULL;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
		 dn);

	pdn = pci_dma_find(dn, &dma_window);

	if (dma_window == NULL)
		pr_debug("  no ibm,dma-window property !\n");

	ppci = PCI_DN(pdn);

	pr_debug("  parent is %pOF, iommu_table: 0x%p\n",
		 pdn, ppci->table_group);

	if (!ppci->table_group) {
		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
		tbl = ppci->table_group->tables[0];
		if (dma_window) {
			iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
						  ppci->table_group, dma_window);

			if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
				panic("Failed to initialize iommu table");
		}
		iommu_register_group(ppci->table_group,
				pci_domain_nr(bus), 0);
		pr_debug("  created table: %p\n", ppci->table_group);
	}
}

static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
{
	struct device_node *dn;
	struct iommu_table *tbl;

	pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));

	dn = dev->dev.of_node;

	/* If we're the direct child of a root bus, then we need to allocate
	 * an iommu table ourselves. The bus setup code should have setup
	 * the window sizes already.
	 */
	if (!dev->bus->self) {
		struct pci_controller *phb = PCI_DN(dn)->phb;

		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
		tbl = PCI_DN(dn)->table_group->tables[0];
		iommu_table_setparms(phb, dn, tbl);

		if (!iommu_init_table(tbl, phb->node, 0, 0))
			panic("Failed to initialize iommu table");

		set_iommu_table_base(&dev->dev, tbl);
		return;
	}

	/* If this device is further down the bus tree, search upwards until
	 * an already allocated iommu table is found and use that.
	 */

	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
		dn = dn->parent;

	if (dn && PCI_DN(dn))
		set_iommu_table_base(&dev->dev,
				PCI_DN(dn)->table_group->tables[0]);
	else
		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
		       pci_name(dev));
}

static int __read_mostly disable_ddw;

static int __init disable_ddw_setup(char *str)
{
	disable_ddw = 1;
	printk(KERN_INFO "ppc iommu: disabling ddw.\n");

	return 0;
}

early_param("disable_ddw", disable_ddw_setup);

static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp)
{
	int ret;

	ret = tce_clearrange_multi_pSeriesLP(0,
		1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
	if (ret)
		pr_warn("%pOF failed to clear tces in window.\n",
			np);
	else
		pr_debug("%pOF successfully cleared tces in window.\n",
			 np);
}

/*
 * Call only if DMA window is clean.
 */
static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn)
{
	int ret;

	ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
	if (ret)
		pr_warn("%pOF: failed to remove DMA window: rtas returned "
			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
	else
		pr_debug("%pOF: successfully removed DMA window: rtas returned "
			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
}

static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
			      struct property *win)
{
	struct dynamic_dma_window_prop *dwp;
	u64 liobn;

	dwp = win->value;
	liobn = (u64)be32_to_cpu(dwp->liobn);

	clean_dma_window(np, dwp);
	__remove_dma_window(np, ddw_avail, liobn);
}

static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name)
{
	struct property *win;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	int ret = 0;

	win = of_find_property(np, win_name, NULL);
	if (!win)
		return -EINVAL;

	ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		return 0;

	if (win->length >= sizeof(struct dynamic_dma_window_prop))
		remove_dma_window(np, ddw_avail, win);

	if (!remove_prop)
		return 0;

	ret = of_remove_property(np, win);
	if (ret)
		pr_warn("%pOF: failed to remove DMA window property: %d\n",
			np, ret);
	return 0;
}

static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift)
{
	struct dma_win *window;
	const struct dynamic_dma_window_prop *dma64;
	bool found = false;

	spin_lock(&dma_win_list_lock);
	/* check if we already created a window and dupe that config if so */
	list_for_each_entry(window, &dma_win_list, list) {
		if (window->device == pdn) {
			dma64 = window->prop;
			*dma_addr = be64_to_cpu(dma64->dma_base);
			*window_shift = be32_to_cpu(dma64->window_shift);
			found = true;
			break;
		}
	}
	spin_unlock(&dma_win_list_lock);

	return found;
}

static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
					  const struct dynamic_dma_window_prop *dma64)
{
	struct dma_win *window;

	window = kzalloc(sizeof(*window), GFP_KERNEL);
	if (!window)
		return NULL;

	window->device = pdn;
	window->prop = dma64;

	return window;
}

static void find_existing_ddw_windows_named(const char *name)
{
	int len;
	struct device_node *pdn;
	struct dma_win *window;
	const struct dynamic_dma_window_prop *dma64;

	for_each_node_with_property(pdn, name) {
		dma64 = of_get_property(pdn, name, &len);
		if (!dma64 || len < sizeof(*dma64)) {
			remove_ddw(pdn, true, name);
			continue;
		}

		window = ddw_list_new_entry(pdn, dma64);
		if (!window) {
			of_node_put(pdn);
			break;
		}

		spin_lock(&dma_win_list_lock);
		list_add(&window->list, &dma_win_list);
		spin_unlock(&dma_win_list_lock);
	}
}

static int find_existing_ddw_windows(void)
{
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		return 0;

	find_existing_ddw_windows_named(DIRECT64_PROPNAME);
	find_existing_ddw_windows_named(DMA64_PROPNAME);

	return 0;
}
machine_arch_initcall(pseries, find_existing_ddw_windows);

/**
 * ddw_read_ext - Get the value of a DDW extension
 * @np:		device node from which the extension value is to be read.
 * @extnum:	index number of the extension.
 * @value:	pointer to return value, modified when extension is available.
 *
 * Checks if "ibm,ddw-extensions" exists for this node, and gets the value
 * at index 'extnum'.
 * It can be used only to check if a property exists, passing value == NULL.
 *
 * Returns:
 *	0 if extension successfully read
 *	-EINVAL if the "ibm,ddw-extensions" does not exist,
 *	-ENODATA if "ibm,ddw-extensions" does not have a value, and
 *	-EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
 */
static inline int ddw_read_ext(const struct device_node *np, int extnum,
			       u32 *value)
{
	static const char propname[] = "ibm,ddw-extensions";
	u32 count;
	int ret;

	ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
	if (ret)
		return ret;

	if (count < extnum)
		return -EOVERFLOW;

	if (!value)
		value = &count;

	return of_property_read_u32_index(np, propname, extnum, value);
}

static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		     struct ddw_query_response *query,
		     struct device_node *parent)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr, ext_query, query_out[5];
	u64 buid;
	int ret, out_sz;

	/*
	 * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
	 * output parameters ibm,query-pe-dma-windows will have, ranging from
	 * 5 to 6.
	 */
	ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
	if (!ret && ext_query == 1)
		out_sz = 6;
	else
		out_sz = 5;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
			cfg_addr, BUID_HI(buid), BUID_LO(buid));

	switch (out_sz) {
	case 5:
		query->windows_available = query_out[0];
		query->largest_available_block = query_out[1];
		query->page_size = query_out[2];
		query->migration_capable = query_out[3];
		break;
	case 6:
		query->windows_available = query_out[0];
		query->largest_available_block = ((u64)query_out[1] << 32) |
						 query_out[2];
		query->page_size = query_out[3];
		query->migration_capable = query_out[4];
		break;
	}

	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d, lb=%llx ps=%x wn=%d\n",
		 ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), ret, query->largest_available_block,
		 query->page_size, query->windows_available);

	return ret;
}

static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		      struct ddw_create_response *create, int page_shift,
		      int window_shift)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr;
	u64 buid;
	int ret;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	do {
		/* extra outputs are LIOBN and dma-addr (hi, lo) */
		ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
				(u32 *)create, cfg_addr, BUID_HI(buid),
				BUID_LO(buid), page_shift, window_shift);
	} while (rtas_busy_delay(ret));
	dev_info(&dev->dev,
		"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
		"(liobn = 0x%x starting addr = %x %x)\n",
		 ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
		 create->addr_hi, create->addr_lo);

	return ret;
}

struct failed_ddw_pdn {
	struct device_node *pdn;
	struct list_head list;
};

static LIST_HEAD(failed_ddw_pdn_list);

static phys_addr_t ddw_memory_hotplug_max(void)
{
	phys_addr_t max_addr = memory_hotplug_max();
	struct device_node *memory;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int n_mem_addr_cells, n_mem_size_cells, len;
		const __be32 *memcell_buf;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		n_mem_addr_cells = of_n_addr_cells(memory);
		n_mem_size_cells = of_n_size_cells(memory);

		start = of_read_number(memcell_buf, n_mem_addr_cells);
		memcell_buf += n_mem_addr_cells;
		size = of_read_number(memcell_buf, n_mem_size_cells);
		memcell_buf += n_mem_size_cells;

		max_addr = max_t(phys_addr_t, max_addr, start + size);
	}

	return max_addr;
}

/*
 * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
 * ibm,ddw-extensions, which carries the rtas token for
 * ibm,reset-pe-dma-windows.
 * That rtas-call can be used to restore the default DMA window for the device.
 */
static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		return;

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid));
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);
}

/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */
static int iommu_get_page_shift(u32 query_page_size)
{
	/* Supported IO page-sizes according to LoPAR, note that 2M is out of order */
	const int shift[] = {
		__builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M),
		__builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M),
		__builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M)
	};

	int i = ARRAY_SIZE(shift) - 1;
	int ret = 0;

	/*
	 * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field:
	 * - bit 31 means 4k pages are supported,
	 * - bit 30 means 64k pages are supported, and so on.
	 * Larger pagesizes map more memory with the same amount of TCEs, so start probing them.
	 */
	for (; i >= 0; i--) {
		if (query_page_size & (1 << i))
			ret = max(ret, shift[i]);
	}

	return ret;
}
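/*
 * For example (illustrative mask): query_page_size = 0x3 sets the two
 * low-order bits (LoPAR "bit 31" and "bit 30"), i.e. 4K and 64K pages are
 * supported, and the loop above returns 16 (64K), the larger of the two.
 */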

static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
					    u32 page_shift, u32 window_shift)
{
	struct dynamic_dma_window_prop *ddwprop;
	struct property *win64;

	win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
	if (!win64)
		return NULL;

	win64->name = kstrdup(propname, GFP_KERNEL);
	ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
	win64->value = ddwprop;
	win64->length = sizeof(*ddwprop);
	if (!win64->name || !win64->value) {
		kfree(win64->name);
		kfree(win64->value);
		kfree(win64);
		return NULL;
	}

	ddwprop->liobn = cpu_to_be32(liobn);
	ddwprop->dma_base = cpu_to_be64(dma_addr);
	ddwprop->tce_shift = cpu_to_be32(page_shift);
	ddwprop->window_shift = cpu_to_be32(window_shift);

	return win64;
}
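/*
 * For example (illustrative values): a direct-mapped 1TB window with 16M
 * IOMMU pages would be recorded as tce_shift = 24 and window_shift = 40,
 * so later boots can re-adopt the window from the device tree instead of
 * recreating it (see find_existing_ddw_windows()).
 */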

/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then setup such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma-window property
 * Future: also check if we can remap the base window for our base page size
 *
 * returns true if can map all pages (direct mapping), false otherwise.
 */
static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
{
	int len = 0, ret;
	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
	struct ddw_query_response query;
	struct ddw_create_response create;
	int page_shift;
	u64 win_addr;
	const char *win_name;
	struct device_node *dn;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct dma_win *window;
	struct property *win64;
	struct failed_ddw_pdn *fpdn;
	bool default_win_removed = false, direct_mapping = false;
	bool pmem_present;
	struct pci_dn *pci = PCI_DN(pdn);
	struct property *default_win = NULL;

	dn = of_find_node_by_type(NULL, "ibm,pmemory");
	pmem_present = dn != NULL;
	of_node_put(dn);

	mutex_lock(&dma_win_init_mutex);

	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
		direct_mapping = (len >= max_ram_len);
		goto out_unlock;
	}

	/*
	 * If we already went through this for a previous function of
	 * the same device and failed, we don't want to muck with the
	 * DMA window again, as it will race with in-flight operations
	 * and can lead to EEHs. The above mutex protects access to the
	 * list.
	 */
	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
		if (fpdn->pdn == pdn)
			goto out_unlock;
	}

	/*
	 * the ibm,ddw-applicable property holds the tokens for:
	 * ibm,query-pe-dma-window
	 * ibm,create-pe-dma-window
	 * ibm,remove-pe-dma-window
	 * for the given node in that order.
	 * the property is actually in the parent, not the PE
	 */
	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		goto out_failed;

	/*
	 * Query if there is a second window of size to map the
	 * whole partition.  Query returns number of windows, largest
	 * block assigned to PE (partition endpoint), and two bitmasks
	 * of page sizes: supported and supported for migrate-dma.
	 */
	dn = pci_device_to_OF_node(dev);
	ret = query_ddw(dev, ddw_avail, &query, pdn);
	if (ret != 0)
		goto out_failed;

	/*
	 * If there is no window available, remove the default DMA window,
	 * if it's present. This will make all the resources available to the
	 * new DDW window.
	 * If anything fails after this, we need to restore it, so also check
	 * for extensions presence.
	 */
	if (query.windows_available == 0) {
		int reset_win_ext;

		/* DDW + IOMMU on single window may fail if there is any allocation */
		if (iommu_table_in_use(pci->table_group->tables[0])) {
			dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
			goto out_failed;
		}

		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
		if (!default_win)
			goto out_failed;

		reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
		if (reset_win_ext)
			goto out_failed;

		remove_dma_window(pdn, ddw_avail, default_win);
		default_win_removed = true;

		/* Query again, to check if the window is available */
		ret = query_ddw(dev, ddw_avail, &query, pdn);
		if (ret != 0)
			goto out_failed;

		if (query.windows_available == 0) {
			/* no windows are available for this device. */
			dev_dbg(&dev->dev, "no free dynamic windows");
			goto out_failed;
		}
	}

	page_shift = iommu_get_page_shift(query.page_size);
	if (!page_shift) {
		dev_dbg(&dev->dev, "no supported page size in mask %x",
			query.page_size);
		goto out_failed;
	}

	/*
	 * The "ibm,pmemory" can appear anywhere in the address space.
	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
	 * for the upper limit and fallback to max RAM otherwise but this
	 * disables device::dma_ops_bypass.
	 */
	len = max_ram_len;
	if (pmem_present) {
		if (query.largest_available_block >=
		    (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
			len = MAX_PHYSMEM_BITS;
		else
			dev_info(&dev->dev, "Skipping ibm,pmemory");
	}

	/* check if the available block * number of ptes will map everything */
	if (query.largest_available_block < (1ULL << (len - page_shift))) {
		dev_dbg(&dev->dev,
			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
			1ULL << len,
			query.largest_available_block,
			1ULL << page_shift);

		len = order_base_2(query.largest_available_block << page_shift);
		win_name = DMA64_PROPNAME;
	} else {
		direct_mapping = !default_win_removed ||
			(len == MAX_PHYSMEM_BITS) ||
			(!pmem_present && (len == max_ram_len));
		win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME;
	}

	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
	if (ret != 0)
		goto out_failed;

	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
		create.liobn, dn);

	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
	win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);

	if (!win64) {
		dev_info(&dev->dev,
			 "couldn't allocate property, property name, or value\n");
		goto out_remove_win;
	}

	ret = of_add_property(pdn, win64);
	if (ret) {
		dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",
			pdn, ret);
		goto out_free_prop;
	}

	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto out_del_prop;

	if (direct_mapping) {
		/* DDW maps the whole partition, so enable direct DMA mapping */
		ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
					    win64->value, tce_setrange_multi_pSeriesLP_walk);
		if (ret) {
			dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
				 dn, ret);

			/* Make sure to clean DDW if any TCE was set */
			clean_dma_window(pdn, win64->value);
			goto out_del_list;
		}
	} else {
		struct iommu_table *newtbl;
		int i;
		unsigned long start = 0, end = 0;

		for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
			const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

			/* Look for MMIO32 */
			if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
				start = pci->phb->mem_resources[i].start;
				end = pci->phb->mem_resources[i].end;
				break;
			}
		}

		/* New table for using DDW instead of the default DMA window */
		newtbl = iommu_pseries_alloc_table(pci->phb->node);
		if (!newtbl) {
			dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
			goto out_del_list;
		}

		iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
					    1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
		iommu_init_table(newtbl, pci->phb->node, start, end);

		pci->table_group->tables[1] = newtbl;

		set_iommu_table_base(&dev->dev, newtbl);
	}

	if (default_win_removed) {
		iommu_tce_table_put(pci->table_group->tables[0]);
		pci->table_group->tables[0] = NULL;

		/* default_win is valid here because default_win_removed == true */
		of_remove_property(pdn, default_win);
		dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn);
	}

	spin_lock(&dma_win_list_lock);
	list_add(&window->list, &dma_win_list);
	spin_unlock(&dma_win_list_lock);

	dev->dev.archdata.dma_offset = win_addr;
	goto out_unlock;

out_del_list:
	kfree(window);

out_del_prop:
	of_remove_property(pdn, win64);

out_free_prop:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);

out_remove_win:
	/* DDW is clean, so it's ok to call this directly. */
	__remove_dma_window(pdn, ddw_avail, create.liobn);

out_failed:
	if (default_win_removed)
		reset_dma_window(dev, pdn);

	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	/*
	 * If we have persistent memory and the window size is only as big
	 * as RAM, then we failed to create a window to cover persistent
	 * memory and need to set the DMA limit.
	 */
	if (pmem_present && direct_mapping && len == max_ram_len)
		dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len);

	return direct_mapping;
}

static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
{
	struct device_node *pdn, *dn;
	struct iommu_table *tbl;
	const __be32 *dma_window = NULL;
	struct pci_dn *pci;

	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));

	/* dev setup for LPAR is a little tricky, since the device tree might
	 * contain the dma-window properties per-device and not necessarily
	 * for the bus. So we need to search upwards in the tree until we
	 * either hit a dma-window property, OR find a parent with a table
	 * already allocated.
	 */
	dn = pci_device_to_OF_node(dev);
	pr_debug("  node is %pOF\n", dn);

	pdn = pci_dma_find(dn, &dma_window);
	if (!pdn || !PCI_DN(pdn)) {
		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
		       "no DMA window found for pci dev=%s dn=%pOF\n",
		       pci_name(dev), dn);
		return;
	}
	pr_debug("  parent is %pOF\n", pdn);

	pci = PCI_DN(pdn);
	if (!pci->table_group) {
		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
		tbl = pci->table_group->tables[0];
		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
				pci->table_group, dma_window);

		iommu_init_table(tbl, pci->phb->node, 0, 0);
		iommu_register_group(pci->table_group,
				pci_domain_nr(pci->phb->bus), 0);
		pr_debug("  created table: %p\n", pci->table_group);
	} else {
		pr_debug("  found DMA window, table: %p\n", pci->table_group);
	}

	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
	iommu_add_device(pci->table_group, &dev->dev);
}

static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
{
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;

	/* only attempt to use a new window if 64-bit DMA is requested */
	if (dma_mask < DMA_BIT_MASK(64))
		return false;

	dev_dbg(&pdev->dev, "node is %pOF\n", dn);

	/*
	 * the device tree might contain the dma-window properties
	 * per-device and not necessarily for the bus. So we need to
	 * search upwards in the tree until we either hit a dma-window
	 * property, OR find a parent with a table already allocated.
	 */
	pdn = pci_dma_find(dn, NULL);
	if (pdn && PCI_DN(pdn))
		return enable_ddw(pdev, pdn);

	return false;
}

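/*
 * Memory hotplug hook: direct-mapped windows cover all of RAM, so TCEs for
 * a newly onlined range must be added to every window on the list before
 * the range is used, and cleared again when the range goes offline.
 */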
static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
		void *data)
{
	struct dma_win *window;
	struct memory_notify *arg = data;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
					arg->nr_pages, window->prop);
			/* XXX log error */
		}
		spin_unlock(&dma_win_list_lock);
		break;
	case MEM_CANCEL_ONLINE:
	case MEM_OFFLINE:
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
					arg->nr_pages, window->prop);
			/* XXX log error */
		}
		spin_unlock(&dma_win_list_lock);
		break;
	default:
		break;
	}
	if (ret && action != MEM_CANCEL_ONLINE)
		return NOTIFY_BAD;

	return NOTIFY_OK;
}

static struct notifier_block iommu_mem_nb = {
	.notifier_call = iommu_mem_notifier,
};

static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)
{
	int err = NOTIFY_OK;
	struct of_reconfig_data *rd = data;
	struct device_node *np = rd->dn;
	struct pci_dn *pci = PCI_DN(np);
	struct dma_win *window;

	switch (action) {
	case OF_RECONFIG_DETACH_NODE:
		/*
		 * Removing the property will invoke the reconfig
		 * notifier again, which causes dead-lock on the
		 * read-write semaphore of the notifier chain. So
		 * we have to remove the property when releasing
		 * the device node.
		 */
		if (remove_ddw(np, false, DIRECT64_PROPNAME))
			remove_ddw(np, false, DMA64_PROPNAME);

		if (pci && pci->table_group)
			iommu_pseries_free_group(pci->table_group,
					np->full_name);

		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			if (window->device == np) {
				list_del(&window->list);
				kfree(window);
				break;
			}
		}
		spin_unlock(&dma_win_list_lock);
		break;
	default:
		err = NOTIFY_DONE;
		break;
	}
	return err;
}

static struct notifier_block iommu_reconfig_nb = {
	.notifier_call = iommu_reconfig_notifier,
};

/* These are called very early. */
void __init iommu_init_early_pSeries(void)
{
	if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
		return;

	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
		if (!disable_ddw)
			pseries_pci_controller_ops.iommu_bypass_supported =
				iommu_bypass_supported_pSeriesLP;
	} else {
		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
	}

	of_reconfig_notifier_register(&iommu_reconfig_nb);
	register_memory_notifier(&iommu_mem_nb);

	set_pci_dma_ops(&dma_iommu_ops);
}

static int __init disable_multitce(char *str)
{
	if (strcmp(str, "off") == 0 &&
	    firmware_has_feature(FW_FEATURE_LPAR) &&
	    (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) ||
	     firmware_has_feature(FW_FEATURE_STUFF_TCE))) {
		printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
		powerpc_firmware_features &=
			~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE);
	}
	return 1;
}

__setup("multitce=", disable_multitce);

static int tce_iommu_bus_notifier(struct notifier_block *nb,
		unsigned long action, void *data)
{
	struct device *dev = data;

	switch (action) {
	case BUS_NOTIFY_DEL_DEVICE:
		iommu_del_device(dev);
		return 0;
	default:
		return 0;
	}
}

static struct notifier_block tce_iommu_bus_nb = {
	.notifier_call = tce_iommu_bus_notifier,
};

static int __init tce_iommu_bus_notifier_init(void)
{
	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
	return 0;
}
machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);