powerpc/pnv/pci: Fix incorrect PE reservation attempt on some 64-bit BARs
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index c5baaf3..fd9444f 100644
 #include "powernv.h"
 #include "pci.h"
 
-/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
-#define TCE32_TABLE_SIZE       ((0x10000000 / 0x1000) * 8)
+#define PNV_IODA1_M64_NUM      16      /* Number of M64 BARs   */
+#define PNV_IODA1_M64_SEGS     8       /* Segments per M64 BAR */
+#define PNV_IODA1_DMA32_SEGSIZE        0x10000000
 
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
 #define POWERNV_IOMMU_MAX_LEVELS       5
 
+static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU" };
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
 
-static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
                            const char *fmt, ...)
 {
        struct va_format vaf;
@@ -87,13 +89,6 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
        va_end(args);
 }
 
-#define pe_err(pe, fmt, ...)                                   \
-       pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
-#define pe_warn(pe, fmt, ...)                                  \
-       pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
-#define pe_info(pe, fmt, ...)                                  \
-       pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
-
 static bool pnv_iommu_bypass_disabled __read_mostly;
 
 static int __init iommu_setup(char *str)
@@ -116,15 +111,30 @@ static int __init iommu_setup(char *str)
 }
 early_param("iommu", iommu_setup);
 
-static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
+static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
+{
+       /*
+        * WARNING: We cannot rely on the resource flags. The Linux PCI
+        * allocation code sometimes decides to put a 64-bit prefetchable
+        * BAR in the 32-bit window, so we have to compare the addresses.
+        *
+        * For simplicity we only test resource start.
+        */
+       return (r->start >= phb->ioda.m64_base &&
+               r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
+}
+
+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
 {
-       return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==
-               (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
+       phb->ioda.pe_array[pe_no].phb = phb;
+       phb->ioda.pe_array[pe_no].pe_number = pe_no;
+
+       return &phb->ioda.pe_array[pe_no];
 }
 
 static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
 {
-       if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
+       if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
                pr_warn("%s: Invalid PE %d on PHB#%x\n",
                        __func__, pe_no, phb->hose->global_number);
                return;
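For context on the pnv_pci_is_m64() check above, a minimal standalone sketch (illustrative only, with made-up window addresses rather than values from this driver) of how a 64-bit prefetchable BAR that the PCI core placed below the M64 window passes the old flag-based test but fails the new address-based one:

/*
 * Hedged illustration only (not driver code): old flag-based M64 test vs.
 * new address-based test, using hypothetical window values.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RES_MEM_64   0x1u
#define RES_PREFETCH 0x2u

struct res {
	uint64_t start;
	unsigned int flags;
};

static bool is_m64_by_flags(const struct res *r)
{
	return (r->flags & (RES_MEM_64 | RES_PREFETCH)) ==
	       (RES_MEM_64 | RES_PREFETCH);
}

static bool is_m64_by_addr(const struct res *r, uint64_t m64_base,
			   uint64_t m64_size)
{
	return r->start >= m64_base && r->start < m64_base + m64_size;
}

int main(void)
{
	/* Hypothetical M64 window, and a 64-bit prefetchable BAR that the
	 * PCI core assigned inside the 32-bit window instead. */
	uint64_t m64_base = 0x3fe000000000ull, m64_size = 0x800000000ull;
	struct res bar = { .start = 0xe0000000ull,
			   .flags = RES_MEM_64 | RES_PREFETCH };

	printf("flag test says M64: %d, address test says M64: %d\n",
	       is_m64_by_flags(&bar),
	       is_m64_by_addr(&bar, m64_base, m64_size));
	return 0;
}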
@@ -134,32 +144,29 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
                pr_debug("%s: PE %d was reserved on PHB#%x\n",
                         __func__, pe_no, phb->hose->global_number);
 
-       phb->ioda.pe_array[pe_no].phb = phb;
-       phb->ioda.pe_array[pe_no].pe_number = pe_no;
+       pnv_ioda_init_pe(phb, pe_no);
 }
 
-static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
 {
-       unsigned long pe;
+       unsigned long pe = phb->ioda.total_pe_num - 1;
 
-       do {
-               pe = find_next_zero_bit(phb->ioda.pe_alloc,
-                                       phb->ioda.total_pe, 0);
-               if (pe >= phb->ioda.total_pe)
-                       return IODA_INVALID_PE;
-       } while(test_and_set_bit(pe, phb->ioda.pe_alloc));
-
-       phb->ioda.pe_array[pe].phb = phb;
-       phb->ioda.pe_array[pe].pe_number = pe;
-       return pe;
+       for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
+               if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
+                       return pnv_ioda_init_pe(phb, pe);
+       }
+
+       return NULL;
 }
 
-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
+static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
 {
-       WARN_ON(phb->ioda.pe_array[pe].pdev);
+       struct pnv_phb *phb = pe->phb;
+
+       WARN_ON(pe->pdev);
 
-       memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
-       clear_bit(pe, phb->ioda.pe_alloc);
+       memset(pe, 0, sizeof(struct pnv_ioda_pe));
+       clear_bit(pe->pe_number, phb->ioda.pe_alloc);
 }
 
 /* The default M64 BAR is shared by all PEs */
@@ -191,21 +198,18 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb)
                goto fail;
        }
 
-       /* Mark the M64 BAR assigned */
-       set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc);
-
        /*
-        * Strip off the segment used by the reserved PE, which is
-        * expected to be 0 or last one of PE capabicity.
+        * Exclude the segments for the reserved and root bus PEs, which
+        * are the first or last two PEs.
         */
        r = &phb->hose->mem_resources[1];
-       if (phb->ioda.reserved_pe == 0)
-               r->start += phb->ioda.m64_segsize;
-       else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1))
-               r->end -= phb->ioda.m64_segsize;
+       if (phb->ioda.reserved_pe_idx == 0)
+               r->start += (2 * phb->ioda.m64_segsize);
+       else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
+               r->end -= (2 * phb->ioda.m64_segsize);
        else
                pr_warn("  Cannot strip M64 segment for reserved PE#%d\n",
-                       phb->ioda.reserved_pe);
+                       phb->ioda.reserved_pe_idx);
 
        return 0;
 
@@ -219,7 +223,7 @@ fail:
        return -EIO;
 }
 
-static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
+static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
                                         unsigned long *pe_bitmap)
 {
        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
@@ -232,7 +236,7 @@ static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
        sgsz = phb->ioda.m64_segsize;
        for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
                r = &pdev->resource[i];
-               if (!r->parent || !pnv_pci_is_mem_pref_64(r->flags))
+               if (!r->parent || !pnv_pci_is_m64(phb, r))
                        continue;
 
                start = _ALIGN_DOWN(r->start - base, sgsz);
@@ -246,22 +250,80 @@ static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
        }
 }
 
-static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus,
-                                    unsigned long *pe_bitmap,
-                                    bool all)
+static int pnv_ioda1_init_m64(struct pnv_phb *phb)
+{
+       struct resource *r;
+       int index;
+
+       /*
+        * There are 16 M64 BARs, each of which has 8 segments. So
+        * there are as many M64 segments as the maximum number of
+        * PEs, which is 128.
+        */
+       for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
+               unsigned long base, segsz = phb->ioda.m64_segsize;
+               int64_t rc;
+
+               base = phb->ioda.m64_base +
+                      index * PNV_IODA1_M64_SEGS * segsz;
+               rc = opal_pci_set_phb_mem_window(phb->opal_id,
+                               OPAL_M64_WINDOW_TYPE, index, base, 0,
+                               PNV_IODA1_M64_SEGS * segsz);
+               if (rc != OPAL_SUCCESS) {
+                       pr_warn("  Error %lld setting M64 PHB#%d-BAR#%d\n",
+                               rc, phb->hose->global_number, index);
+                       goto fail;
+               }
+
+               rc = opal_pci_phb_mmio_enable(phb->opal_id,
+                               OPAL_M64_WINDOW_TYPE, index,
+                               OPAL_ENABLE_M64_SPLIT);
+               if (rc != OPAL_SUCCESS) {
+                       pr_warn("  Error %lld enabling M64 PHB#%d-BAR#%d\n",
+                               rc, phb->hose->global_number, index);
+                       goto fail;
+               }
+       }
+
+       /*
+        * Exclude the segments for the reserved and root bus PEs, which
+        * are the first or last two PEs.
+        */
+       r = &phb->hose->mem_resources[1];
+       if (phb->ioda.reserved_pe_idx == 0)
+               r->start += (2 * phb->ioda.m64_segsize);
+       else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
+               r->end -= (2 * phb->ioda.m64_segsize);
+       else
+               WARN(1, "Wrong reserved PE#%d on PHB#%d\n",
+                    phb->ioda.reserved_pe_idx, phb->hose->global_number);
+
+       return 0;
+
+fail:
+       for ( ; index >= 0; index--)
+               opal_pci_phb_mmio_enable(phb->opal_id,
+                       OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
+
+       return -EIO;
+}
+
+static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
+                                   unsigned long *pe_bitmap,
+                                   bool all)
 {
        struct pci_dev *pdev;
 
        list_for_each_entry(pdev, &bus->devices, bus_list) {
-               pnv_ioda2_reserve_dev_m64_pe(pdev, pe_bitmap);
+               pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
 
                if (all && pdev->subordinate)
-                       pnv_ioda2_reserve_m64_pe(pdev->subordinate,
-                                                pe_bitmap, all);
+                       pnv_ioda_reserve_m64_pe(pdev->subordinate,
+                                               pe_bitmap, all);
        }
 }
 
-static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
 {
        struct pci_controller *hose = pci_bus_to_host(bus);
        struct pnv_phb *phb = hose->private_data;
@@ -271,28 +333,28 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
 
        /* Root bus shouldn't use M64 */
        if (pci_is_root_bus(bus))
-               return IODA_INVALID_PE;
+               return NULL;
 
        /* Allocate bitmap */
-       size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+       size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
        pe_alloc = kzalloc(size, GFP_KERNEL);
        if (!pe_alloc) {
                pr_warn("%s: Out of memory !\n",
                        __func__);
-               return IODA_INVALID_PE;
+               return NULL;
        }
 
        /* Figure out reserved PE numbers by the PE */
-       pnv_ioda2_reserve_m64_pe(bus, pe_alloc, all);
+       pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
 
        /*
         * the current bus might not own M64 window and that's all
         * contributed by its child buses. For the case, we needn't
         * pick M64 dependent PE#.
         */
-       if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) {
+       if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
                kfree(pe_alloc);
-               return IODA_INVALID_PE;
+               return NULL;
        }
 
        /*
@@ -301,10 +363,11 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
         */
        master_pe = NULL;
        i = -1;
-       while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) <
-               phb->ioda.total_pe) {
+       while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
+               phb->ioda.total_pe_num) {
                pe = &phb->ioda.pe_array[i];
 
+               phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
                if (!master_pe) {
                        pe->flags |= PNV_IODA_PE_MASTER;
                        INIT_LIST_HEAD(&pe->slaves);
@@ -314,10 +377,30 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
                        pe->master = master_pe;
                        list_add_tail(&pe->list, &master_pe->slaves);
                }
+
+               /*
+                * P7IOC supports M64DT, which allows mapping an M64 segment
+                * to a particular PE#. PHB3, however, has a fixed mapping
+                * between M64 segments and PE#s. To keep the same logic for
+                * P7IOC and PHB3, we enforce the fixed mapping on P7IOC as
+                * well.
+                */
+               if (phb->type == PNV_PHB_IODA1) {
+                       int64_t rc;
+
+                       rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+                                       pe->pe_number, OPAL_M64_WINDOW_TYPE,
+                                       pe->pe_number / PNV_IODA1_M64_SEGS,
+                                       pe->pe_number % PNV_IODA1_M64_SEGS);
+                       if (rc != OPAL_SUCCESS)
+                               pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n",
+                                       __func__, rc, phb->hose->global_number,
+                                       pe->pe_number);
+               }
        }
 
        kfree(pe_alloc);
-       return master_pe->pe_number;
+       return master_pe;
 }
 
 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
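A worked illustration of the fixed M64 mapping enforced for P7IOC above (standalone sketch, not driver code): with PNV_IODA1_M64_SEGS equal to 8, PE# 37 falls in M64 BAR 37 / 8 = 4, segment 37 % 8 = 5.

/* Hedged sketch: the PE# -> (M64 BAR, segment) arithmetic used above. */
#include <stdio.h>

#define PNV_IODA1_M64_SEGS 8   /* segments per M64 BAR */

int main(void)
{
	int pe;

	for (pe = 0; pe < 128; pe += 37)
		printf("PE#%3d -> M64 BAR %d, segment %d\n",
		       pe, pe / PNV_IODA1_M64_SEGS, pe % PNV_IODA1_M64_SEGS);
	return 0;
}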
@@ -325,11 +408,11 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
        struct pci_controller *hose = phb->hose;
        struct device_node *dn = hose->dn;
        struct resource *res;
+       u32 m64_range[2], i;
        const u32 *r;
        u64 pci_addr;
 
-       /* FIXME: Support M64 for P7IOC */
-       if (phb->type != PNV_PHB_IODA2) {
+       if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
                pr_info("  Not support M64 window\n");
                return;
        }
@@ -346,6 +429,30 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
                return;
        }
 
+       /*
+        * Find the available M64 BAR range and pick the last BAR in it to
+        * cover the whole 64-bit space. We support only one range.
+        */
+       if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges",
+                                      m64_range, 2)) {
+               /* In absence of the property, assume 0..15 */
+               m64_range[0] = 0;
+               m64_range[1] = 16;
+       }
+       /* We only support 64 bits in our allocator */
+       if (m64_range[1] > 63) {
+               pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n",
+                       __func__, m64_range[1], phb->hose->global_number);
+               m64_range[1] = 63;
+       }
+       /* Empty range, no m64 */
+       if (m64_range[1] <= m64_range[0]) {
+               pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n",
+                       __func__, phb->hose->global_number);
+               return;
+       }
+
+       /* Configure M64 information */
        res = &hose->mem_resources[1];
        res->name = dn->full_name;
        res->start = of_translate_address(dn, r + 2);
@@ -355,17 +462,37 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
        hose->mem_offset[1] = res->start - pci_addr;
 
        phb->ioda.m64_size = resource_size(res);
-       phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe;
+       phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
        phb->ioda.m64_base = pci_addr;
 
-       pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n",
-                       res->start, res->end, pci_addr);
+       /* This lines up nicely with the display from processing OF ranges */
+       pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n",
+               res->start, res->end, pci_addr, m64_range[0],
+               m64_range[0] + m64_range[1] - 1);
+
+       /* Mark all M64 used up by default */
+       phb->ioda.m64_bar_alloc = (unsigned long)-1;
 
        /* Use last M64 BAR to cover M64 window */
-       phb->ioda.m64_bar_idx = 15;
-       phb->init_m64 = pnv_ioda2_init_m64;
-       phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe;
-       phb->pick_m64_pe = pnv_ioda2_pick_m64_pe;
+       m64_range[1]--;
+       phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1];
+
+       pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx);
+
+       /* Mark remaining ones free */
+       for (i = m64_range[0]; i < m64_range[1]; i++)
+               clear_bit(i, &phb->ioda.m64_bar_alloc);
+
+       /*
+        * Setup init functions for M64 based on IODA version, IODA3 uses
+        * the IODA2 code.
+        */
+       if (phb->type == PNV_PHB_IODA1)
+               phb->init_m64 = pnv_ioda1_init_m64;
+       else
+               phb->init_m64 = pnv_ioda2_init_m64;
+       phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
+       phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
 }
 
 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
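To make the BAR bookkeeping above concrete, here is a small standalone sketch (assuming the fallback range of {0, 16}; not driver code) of how the range turns into a default-window BAR index and a free mask:

/* Hedged sketch of the M64 BAR bookkeeping above, for the default range. */
#include <stdio.h>

int main(void)
{
	unsigned int m64_range[2] = { 0, 16 };  /* fallback: 16 BARs from #0 */
	unsigned long m64_bar_alloc = ~0ul;     /* mark all BARs used */
	unsigned int i, m64_bar_idx;

	m64_range[1]--;                             /* index of the last BAR */
	m64_bar_idx = m64_range[0] + m64_range[1];  /* default window BAR */

	for (i = m64_range[0]; i < m64_range[1]; i++)
		m64_bar_alloc &= ~(1ul << i);       /* mark remaining BARs free */

	printf("default window uses M64 BAR #%u, alloc mask 0x%lx\n",
	       m64_bar_idx, m64_bar_alloc);
	return 0;
}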
@@ -456,7 +583,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
        s64 rc;
 
        /* Sanity check on PE number */
-       if (pe_no < 0 || pe_no >= phb->ioda.total_pe)
+       if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
                return OPAL_EEH_STOPPED_PERM_UNAVAIL;
 
        /*
@@ -514,7 +641,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
  * but in the meantime, we need to protect them to avoid warnings
  */
 #ifdef CONFIG_PCI_MSI
-static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
+struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 {
        struct pci_controller *hose = pci_bus_to_host(dev->bus);
        struct pnv_phb *phb = hose->private_data;
@@ -632,7 +759,6 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
        return 0;
 }
 
-#ifdef CONFIG_PCI_IOV
 static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
        struct pci_dev *parent;
@@ -667,9 +793,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
                }
                rid_end = pe->rid + (count << 8);
        } else {
+#ifdef CONFIG_PCI_IOV
                if (pe->flags & PNV_IODA_PE_VF)
                        parent = pe->parent_dev;
                else
+#endif
                        parent = pe->pdev->bus->self;
                bcomp = OpalPciBusAll;
                dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
@@ -679,7 +807,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 
        /* Clear the reverse map */
        for (rid = pe->rid; rid < rid_end; rid++)
-               phb->ioda.pe_rmap[rid] = 0;
+               phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
 
        /* Release from all parents PELT-V */
        while (parent) {
@@ -707,11 +835,12 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 
        pe->pbus = NULL;
        pe->pdev = NULL;
+#ifdef CONFIG_PCI_IOV
        pe->parent_dev = NULL;
+#endif
 
        return 0;
 }
-#endif /* CONFIG_PCI_IOV */
 
 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
@@ -808,44 +937,6 @@ out:
        return 0;
 }
 
-static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
-                                      struct pnv_ioda_pe *pe)
-{
-       struct pnv_ioda_pe *lpe;
-
-       list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) {
-               if (lpe->dma_weight < pe->dma_weight) {
-                       list_add_tail(&pe->dma_link, &lpe->dma_link);
-                       return;
-               }
-       }
-       list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list);
-}
-
-static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
-{
-       /* This is quite simplistic. The "base" weight of a device
-        * is 10. 0 means no DMA is to be accounted for it.
-        */
-
-       /* If it's a bridge, no DMA */
-       if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
-               return 0;
-
-       /* Reduce the weight of slow USB controllers */
-       if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
-           dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
-           dev->class == PCI_CLASS_SERIAL_USB_EHCI)
-               return 3;
-
-       /* Increase the weight of RAID (includes Obsidian) */
-       if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
-               return 15;
-
-       /* Default */
-       return 10;
-}
-
 #ifdef CONFIG_PCI_IOV
 static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 {
@@ -919,7 +1010,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
        struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn = pci_get_pdn(dev);
        struct pnv_ioda_pe *pe;
-       int pe_num;
 
        if (!pdn) {
                pr_err("%s: Device tree node not associated properly\n",
@@ -929,8 +1019,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
        if (pdn->pe_number != IODA_INVALID_PE)
                return NULL;
 
-       pe_num = pnv_ioda_alloc_pe(phb);
-       if (pe_num == IODA_INVALID_PE) {
+       pe = pnv_ioda_alloc_pe(phb);
+       if (!pe) {
                pr_warning("%s: Not enough PE# available, disabling device\n",
                           pci_name(dev));
                return NULL;
@@ -943,14 +1033,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
         *
         * At some point we want to remove the PDN completely anyways
         */
-       pe = &phb->ioda.pe_array[pe_num];
        pci_dev_get(dev);
        pdn->pcidev = dev;
-       pdn->pe_number = pe_num;
+       pdn->pe_number = pe->pe_number;
        pe->flags = PNV_IODA_PE_DEV;
        pe->pdev = dev;
        pe->pbus = NULL;
-       pe->tce32_seg = -1;
        pe->mve_number = -1;
        pe->rid = dev->bus->number << 8 | pdn->devfn;
 
@@ -958,23 +1046,15 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 
        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
-               if (pe_num)
-                       pnv_ioda_free_pe(phb, pe_num);
+               pnv_ioda_free_pe(pe);
                pdn->pe_number = IODA_INVALID_PE;
                pe->pdev = NULL;
                pci_dev_put(dev);
                return NULL;
        }
 
-       /* Assign a DMA weight to the device */
-       pe->dma_weight = pnv_ioda_dma_weight(dev);
-       if (pe->dma_weight != 0) {
-               phb->ioda.dma_weight += pe->dma_weight;
-               phb->ioda.dma_pe_count++;
-       }
-
-       /* Link the PE */
-       pnv_ioda_link_pe_by_weight(phb, pe);
+       /* Put PE to the list */
+       list_add_tail(&pe->list, &phb->ioda.pe_list);
 
        return pe;
 }
@@ -991,9 +1071,18 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
                                pci_name(dev));
                        continue;
                }
+
+               /*
+                * In the partial hotplug case, the PCI device might still be
+                * associated with the PE and needn't be attached to it
+                * again.
+                */
+               if (pdn->pe_number != IODA_INVALID_PE)
+                       continue;
+
+               pe->device_count++;
                pdn->pcidev = dev;
                pdn->pe_number = pe->pe_number;
-               pe->dma_weight += pnv_ioda_dma_weight(dev);
                if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
                        pnv_ioda_setup_same_PE(dev->subordinate, pe);
        }
@@ -1005,49 +1094,61 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
  * subordinate PCI devices and buses. The second type of PE is normally
  * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
  */
-static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
+static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 {
        struct pci_controller *hose = pci_bus_to_host(bus);
        struct pnv_phb *phb = hose->private_data;
-       struct pnv_ioda_pe *pe;
-       int pe_num = IODA_INVALID_PE;
+       struct pnv_ioda_pe *pe = NULL;
+       unsigned int pe_num;
+
+       /*
+        * In the partial hotplug case, the PE instance might still be alive.
+        * We should reuse it instead of allocating a new one.
+        */
+       pe_num = phb->ioda.pe_rmap[bus->number << 8];
+       if (pe_num != IODA_INVALID_PE) {
+               pe = &phb->ioda.pe_array[pe_num];
+               pnv_ioda_setup_same_PE(bus, pe);
+               return NULL;
+       }
+
+       /* PE number for root bus should have been reserved */
+       if (pci_is_root_bus(bus) &&
+           phb->ioda.root_pe_idx != IODA_INVALID_PE)
+               pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
 
        /* Check if PE is determined by M64 */
-       if (phb->pick_m64_pe)
-               pe_num = phb->pick_m64_pe(bus, all);
+       if (!pe && phb->pick_m64_pe)
+               pe = phb->pick_m64_pe(bus, all);
 
        /* The PE number isn't pinned by M64 */
-       if (pe_num == IODA_INVALID_PE)
-               pe_num = pnv_ioda_alloc_pe(phb);
+       if (!pe)
+               pe = pnv_ioda_alloc_pe(phb);
 
-       if (pe_num == IODA_INVALID_PE) {
+       if (!pe) {
                pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
                        __func__, pci_domain_nr(bus), bus->number);
-               return;
+               return NULL;
        }
 
-       pe = &phb->ioda.pe_array[pe_num];
        pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
        pe->pbus = bus;
        pe->pdev = NULL;
-       pe->tce32_seg = -1;
        pe->mve_number = -1;
        pe->rid = bus->busn_res.start << 8;
-       pe->dma_weight = 0;
 
        if (all)
                pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
-                       bus->busn_res.start, bus->busn_res.end, pe_num);
+                       bus->busn_res.start, bus->busn_res.end, pe->pe_number);
        else
                pe_info(pe, "Secondary bus %d associated with PE#%d\n",
-                       bus->busn_res.start, pe_num);
+                       bus->busn_res.start, pe->pe_number);
 
        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
-               if (pe_num)
-                       pnv_ioda_free_pe(phb, pe_num);
+               pnv_ioda_free_pe(pe);
                pe->pbus = NULL;
-               return;
+               return NULL;
        }
 
        /* Associate it with all child devices */
@@ -1056,16 +1157,7 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
        /* Put PE to the list */
        list_add_tail(&pe->list, &phb->ioda.pe_list);
 
-       /* Account for one DMA PE if at least one DMA capable device exist
-        * below the bridge
-        */
-       if (pe->dma_weight != 0) {
-               phb->ioda.dma_weight += pe->dma_weight;
-               phb->ioda.dma_pe_count++;
-       }
-
-       /* Link the PE */
-       pnv_ioda_link_pe_by_weight(phb, pe);
+       return pe;
 }
 
 static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
@@ -1088,7 +1180,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
         * same GPU get assigned the same PE.
         */
        gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
-       for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) {
+       for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
                pe = &phb->ioda.pe_array[pe_num];
                if (!pe->pdev)
                        continue;
@@ -1106,7 +1198,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
                        rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
                        npu_pdn->pcidev = npu_pdev;
                        npu_pdn->pe_number = pe_num;
-                       pe->dma_weight += pnv_ioda_dma_weight(npu_pdev);
                        phb->ioda.pe_rmap[rid] = pe->pe_number;
 
                        /* Map the PE to this link */
@@ -1139,30 +1230,6 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
                pnv_ioda_setup_npu_PE(pdev);
 }
 
-static void pnv_ioda_setup_PEs(struct pci_bus *bus)
-{
-       struct pci_dev *dev;
-
-       pnv_ioda_setup_bus_PE(bus, false);
-
-       list_for_each_entry(dev, &bus->devices, bus_list) {
-               if (dev->subordinate) {
-                       if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE)
-                               pnv_ioda_setup_bus_PE(dev->subordinate, true);
-                       else
-                               pnv_ioda_setup_PEs(dev->subordinate);
-               }
-       }
-}
-
-/*
- * Configure PEs so that the downstream PCI buses and devices
- * could have their associated PE#. Unfortunately, we didn't
- * figure out the way to identify the PLX bridge yet. So we
- * simply put the PCI bus and the subordinate behind the root
- * port to PE# here. The game rule here is expected to be changed
- * as soon as we can detected PLX bridge correctly.
- */
 static void pnv_pci_ioda_setup_PEs(void)
 {
        struct pci_controller *hose, *tmp;
@@ -1170,22 +1237,11 @@ static void pnv_pci_ioda_setup_PEs(void)
 
        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
                phb = hose->private_data;
-
-               /* M64 layout might affect PE allocation */
-               if (phb->reserve_m64_pe)
-                       phb->reserve_m64_pe(hose->bus, NULL, true);
-
-               /*
-                * On NPU PHB, we expect separate PEs for individual PCI
-                * functions. PCI bus dependent PEs are required for the
-                * remaining types of PHBs.
-                */
                if (phb->type == PNV_PHB_NPU) {
                        /* PE#0 is needed for error reporting */
                        pnv_ioda_reserve_pe(phb, 0);
                        pnv_ioda_setup_npu_PEs(hose->bus);
-               } else
-                       pnv_ioda_setup_PEs(hose->bus);
+               }
        }
 }
 
@@ -1378,7 +1434,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 
                pnv_ioda_deconfigure_pe(phb, pe);
 
-               pnv_ioda_free_pe(phb, pe->pe_number);
+               pnv_ioda_free_pe(pe);
        }
 }
 
@@ -1387,6 +1443,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
+       struct pnv_ioda_pe    *pe;
        struct pci_dn         *pdn;
        struct pci_sriov      *iov;
        u16                    num_vfs, i;
@@ -1411,8 +1468,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
                /* Release PE numbers */
                if (pdn->m64_single_mode) {
                        for (i = 0; i < num_vfs; i++) {
-                               if (pdn->pe_num_map[i] != IODA_INVALID_PE)
-                                       pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+                               if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+                                       continue;
+
+                               pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+                               pnv_ioda_free_pe(pe);
                        }
                } else
                        bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
@@ -1454,7 +1514,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
                pe->flags = PNV_IODA_PE_VF;
                pe->pbus = NULL;
                pe->parent_dev = pdev;
-               pe->tce32_seg = -1;
                pe->mve_number = -1;
                pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
                           pci_iov_virtfn_devfn(pdev, vf_index);
@@ -1466,8 +1525,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
                if (pnv_ioda_configure_pe(phb, pe)) {
                        /* XXX What do we do here ? */
-                       if (pe_num)
-                               pnv_ioda_free_pe(phb, pe_num);
+                       pnv_ioda_free_pe(pe);
                        pe->pdev = NULL;
                        continue;
                }
@@ -1486,6 +1544,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
+       struct pnv_ioda_pe    *pe;
        struct pci_dn         *pdn;
        int                    ret;
        u16                    i;
@@ -1528,18 +1587,20 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
                /* Calculate available PE for required VFs */
                if (pdn->m64_single_mode) {
                        for (i = 0; i < num_vfs; i++) {
-                               pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb);
-                               if (pdn->pe_num_map[i] == IODA_INVALID_PE) {
+                               pe = pnv_ioda_alloc_pe(phb);
+                               if (!pe) {
                                        ret = -EBUSY;
                                        goto m64_failed;
                                }
+
+                               pdn->pe_num_map[i] = pe->pe_number;
                        }
                } else {
                        mutex_lock(&phb->ioda.pe_alloc_mutex);
                        *pdn->pe_num_map = bitmap_find_next_zero_area(
-                               phb->ioda.pe_alloc, phb->ioda.total_pe,
+                               phb->ioda.pe_alloc, phb->ioda.total_pe_num,
                                0, num_vfs, 0);
-                       if (*pdn->pe_num_map >= phb->ioda.total_pe) {
+                       if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
                                mutex_unlock(&phb->ioda.pe_alloc_mutex);
                                dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
                                kfree(pdn->pe_num_map);
@@ -1577,8 +1638,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 m64_failed:
        if (pdn->m64_single_mode) {
                for (i = 0; i < num_vfs; i++) {
-                       if (pdn->pe_num_map[i] != IODA_INVALID_PE)
-                               pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+                       if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+                               continue;
+
+                       pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+                       pnv_ioda_free_pe(pe);
                }
        } else
                bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
@@ -1640,8 +1704,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
        struct pnv_ioda_pe *pe;
        uint64_t top;
        bool bypass = false;
-       struct pci_dev *linked_npu_dev;
-       int i;
 
        if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
                return -ENODEV;;
@@ -1662,15 +1724,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
        *pdev->dev.dma_mask = dma_mask;
 
        /* Update peer npu devices */
-       if (pe->flags & PNV_IODA_PE_PEER)
-               for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
-                       if (!pe->peers[i])
-                               continue;
-
-                       linked_npu_dev = pe->peers[i]->pdev;
-                       if (dma_get_mask(&linked_npu_dev->dev) != dma_mask)
-                               dma_set_mask(&linked_npu_dev->dev, dma_mask);
-               }
+       pnv_npu_try_dma_set_bypass(pdev, bypass);
 
        return 0;
 }
@@ -1713,7 +1767,14 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
        }
 }
 
-static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
+static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
+                                                    bool real_mode)
+{
+       return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
+               (phb->regs + 0x210);
+}
+
+static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
                unsigned long index, unsigned long npages, bool rm)
 {
        struct iommu_table_group_link *tgl = list_first_entry_or_null(
@@ -1721,33 +1782,17 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
                        next);
        struct pnv_ioda_pe *pe = container_of(tgl->table_group,
                        struct pnv_ioda_pe, table_group);
-       __be64 __iomem *invalidate = rm ?
-               (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
-               pe->phb->ioda.tce_inval_reg;
+       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
        unsigned long start, end, inc;
-       const unsigned shift = tbl->it_page_shift;
 
        start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
        end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
                        npages - 1);
 
-       /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
-       if (tbl->it_busno) {
-               start <<= shift;
-               end <<= shift;
-               inc = 128ull << shift;
-               start |= tbl->it_busno;
-               end |= tbl->it_busno;
-       } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) {
-               /* p7ioc-style invalidation, 2 TCEs per write */
-               start |= (1ull << 63);
-               end |= (1ull << 63);
-               inc = 16;
-        } else {
-               /* Default (older HW) */
-                inc = 128;
-       }
-
+       /* p7ioc-style invalidation, 2 TCEs per write */
+       start |= (1ull << 63);
+       end |= (1ull << 63);
+       inc = 16;
         end |= inc - 1;        /* round up end to be different than start */
 
         mb(); /* Ensure above stores are visible */
@@ -1768,13 +1813,13 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
                long npages, unsigned long uaddr,
                enum dma_data_direction direction,
-               struct dma_attrs *attrs)
+               unsigned long attrs)
 {
        int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
                        attrs);
 
-       if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
-               pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+       if (!ret)
+               pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
 
        return ret;
 }
@@ -1785,9 +1830,8 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
 {
        long ret = pnv_tce_xchg(tbl, index, hpa, direction);
 
-       if (!ret && (tbl->it_type &
-                       (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
-               pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false);
+       if (!ret)
+               pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
 
        return ret;
 }
@@ -1798,8 +1842,7 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
 {
        pnv_tce_free(tbl, index, npages);
 
-       if (tbl->it_type & TCE_PCI_SWINV_FREE)
-               pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+       pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
 }
 
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
@@ -1811,39 +1854,42 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
        .get = pnv_tce_get,
 };
 
-static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
-{
-       /* 01xb - invalidate TCEs that match the specified PE# */
-       unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
-       struct pnv_phb *phb = pe->phb;
-       struct pnv_ioda_pe *npe;
-       int i;
+#define PHB3_TCE_KILL_INVAL_ALL                PPC_BIT(0)
+#define PHB3_TCE_KILL_INVAL_PE         PPC_BIT(1)
+#define PHB3_TCE_KILL_INVAL_ONE                PPC_BIT(2)
 
-       if (!phb->ioda.tce_inval_reg)
-               return;
+void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
+       const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
 
-       mb(); /* Ensure above stores are visible */
-       __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
+       mb(); /* Ensure previous TCE table stores are visible */
+       if (rm)
+               __raw_rm_writeq(cpu_to_be64(val), invalidate);
+       else
+               __raw_writeq(cpu_to_be64(val), invalidate);
+}
 
-       if (pe->flags & PNV_IODA_PE_PEER)
-               for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
-                       npe = pe->peers[i];
-                       if (!npe || npe->phb->type != PNV_PHB_NPU)
-                               continue;
+static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
+{
+       /* 01xb - invalidate TCEs that match the specified PE# */
+       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
+       unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
 
-                       pnv_npu_tce_invalidate_entire(npe);
-               }
+       mb(); /* Ensure above stores are visible */
+       __raw_writeq(cpu_to_be64(val), invalidate);
 }
 
-static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
-               __be64 __iomem *invalidate, unsigned shift,
-               unsigned long index, unsigned long npages)
+static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
+                                       unsigned shift, unsigned long index,
+                                       unsigned long npages)
 {
+       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
        unsigned long start, end, inc;
 
        /* We'll invalidate DMA address in PE scope */
-       start = 0x2ull << 60;
-       start |= (pe_number & 0xFF);
+       start = PHB3_TCE_KILL_INVAL_ONE;
+       start |= (pe->pe_number & 0xFF);
        end = start;
 
        /* Figure out the start, end and step */
@@ -1861,46 +1907,62 @@ static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
        }
 }
 
+static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
+{
+       struct pnv_phb *phb = pe->phb;
+
+       if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
+               pnv_pci_phb3_tce_invalidate_pe(pe);
+       else
+               opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE,
+                                 pe->pe_number, 0, 0, 0);
+}
+
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
                unsigned long index, unsigned long npages, bool rm)
 {
        struct iommu_table_group_link *tgl;
 
        list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
-               struct pnv_ioda_pe *npe;
                struct pnv_ioda_pe *pe = container_of(tgl->table_group,
                                struct pnv_ioda_pe, table_group);
-               __be64 __iomem *invalidate = rm ?
-                       (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
-                       pe->phb->ioda.tce_inval_reg;
-               int i;
-
-               pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
-                       invalidate, tbl->it_page_shift,
-                       index, npages);
-
-               if (pe->flags & PNV_IODA_PE_PEER)
-                       /* Invalidate PEs using the same TCE table */
-                       for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
-                               npe = pe->peers[i];
-                               if (!npe || npe->phb->type != PNV_PHB_NPU)
-                                       continue;
+               struct pnv_phb *phb = pe->phb;
+               unsigned int shift = tbl->it_page_shift;
 
-                               pnv_npu_tce_invalidate(npe, tbl, index,
-                                                       npages, rm);
-                       }
+               if (phb->type == PNV_PHB_NPU) {
+                       /*
+                        * The NVLink hardware does not support TCE kill
+                        * per TCE entry so we have to invalidate
+                        * the entire cache for it.
+                        */
+                       pnv_pci_phb3_tce_invalidate_entire(phb, rm);
+                       continue;
+               }
+               if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
+                       pnv_pci_phb3_tce_invalidate(pe, rm, shift,
+                                                   index, npages);
+               else if (rm)
+                       opal_rm_pci_tce_kill(phb->opal_id,
+                                            OPAL_PCI_TCE_KILL_PAGES,
+                                            pe->pe_number, 1u << shift,
+                                            index << shift, npages);
+               else
+                       opal_pci_tce_kill(phb->opal_id,
+                                         OPAL_PCI_TCE_KILL_PAGES,
+                                         pe->pe_number, 1u << shift,
+                                         index << shift, npages);
        }
 }
 
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
                long npages, unsigned long uaddr,
                enum dma_data_direction direction,
-               struct dma_attrs *attrs)
+               unsigned long attrs)
 {
        int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
                        attrs);
 
-       if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+       if (!ret)
                pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 
        return ret;
@@ -1912,8 +1974,7 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
 {
        long ret = pnv_tce_xchg(tbl, index, hpa, direction);
 
-       if (!ret && (tbl->it_type &
-                       (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
+       if (!ret)
                pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
 
        return ret;
@@ -1925,8 +1986,7 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 {
        pnv_tce_free(tbl, index, npages);
 
-       if (tbl->it_type & TCE_PCI_SWINV_FREE)
-               pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+       pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 }
 
 static void pnv_ioda2_table_free(struct iommu_table *tbl)
@@ -1945,56 +2005,140 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
        .free = pnv_ioda2_table_free,
 };
 
-static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
-                                     struct pnv_ioda_pe *pe, unsigned int base,
-                                     unsigned int segs)
+static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
+{
+       unsigned int *weight = (unsigned int *)data;
+
+       /* This is quite simplistic. The "base" weight of a device
+        * is 10. 0 means no DMA is to be accounted for it.
+        */
+       if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+               return 0;
+
+       if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
+           dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
+           dev->class == PCI_CLASS_SERIAL_USB_EHCI)
+               *weight += 3;
+       else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
+               *weight += 15;
+       else
+               *weight += 10;
+
+       return 0;
+}
+
+static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
+{
+       unsigned int weight = 0;
+
+       /* SRIOV VF has same DMA32 weight as its PF */
+#ifdef CONFIG_PCI_IOV
+       if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
+               pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
+               return weight;
+       }
+#endif
+
+       if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
+               pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
+       } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
+               struct pci_dev *pdev;
+
+               list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
+                       pnv_pci_ioda_dev_dma_weight(pdev, &weight);
+       } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
+               pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
+       }
+
+       return weight;
+}
+
+static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
+                                      struct pnv_ioda_pe *pe)
 {
 
        struct page *tce_mem = NULL;
        struct iommu_table *tbl;
-       unsigned int i;
+       unsigned int weight, total_weight = 0;
+       unsigned int tce32_segsz, base, segs, avail, i;
        int64_t rc;
        void *addr;
 
        /* XXX FIXME: Handle 64-bit only DMA devices */
        /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
        /* XXX FIXME: Allocate multi-level tables on PHB3 */
+       weight = pnv_pci_ioda_pe_dma_weight(pe);
+       if (!weight)
+               return;
+
+       pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
+                    &total_weight);
+       segs = (weight * phb->ioda.dma32_count) / total_weight;
+       if (!segs)
+               segs = 1;
+
+       /*
+        * Allocate contiguous DMA32 segments. We begin with the expected
+        * number of segments. On each failed attempt, the number of DMA32
+        * segments to be allocated is decreased by one, until the
+        * allocation succeeds or no segments are left.
+        */
+       do {
+               for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
+                       for (avail = 0, i = base; i < base + segs; i++) {
+                               if (phb->ioda.dma32_segmap[i] ==
+                                   IODA_INVALID_PE)
+                                       avail++;
+                       }
 
-       /* We shouldn't already have a 32-bit DMA associated */
-       if (WARN_ON(pe->tce32_seg >= 0))
+                       if (avail == segs)
+                               goto found;
+               }
+       } while (--segs);
+
+       if (!segs) {
+               pe_warn(pe, "No available DMA32 segments\n");
                return;
+       }
 
+found:
        tbl = pnv_pci_table_alloc(phb->hose->node);
        iommu_register_group(&pe->table_group, phb->hose->global_number,
                        pe->pe_number);
        pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
        /* Grab a 32-bit TCE table */
-       pe->tce32_seg = base;
+       pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
+               weight, total_weight, base, segs);
        pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
-               (base << 28), ((base + segs) << 28) - 1);
+               base * PNV_IODA1_DMA32_SEGSIZE,
+               (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
 
        /* XXX Currently, we allocate one big contiguous table for the
         * TCEs. We only really need one chunk per 256M of TCE space
         * (ie per segment) but that's an optimization for later, it
         * requires some added smarts with our get/put_tce implementation
+        *
+        * Each TCE page is 4KB in size and each TCE entry occupies 8
+        * bytes
         */
+       tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
        tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
-                                  get_order(TCE32_TABLE_SIZE * segs));
+                                  get_order(tce32_segsz * segs));
        if (!tce_mem) {
                pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
                goto fail;
        }
        addr = page_address(tce_mem);
-       memset(addr, 0, TCE32_TABLE_SIZE * segs);
+       memset(addr, 0, tce32_segsz * segs);
 
        /* Configure HW */
        for (i = 0; i < segs; i++) {
                rc = opal_pci_map_pe_dma_window(phb->opal_id,
                                              pe->pe_number,
                                              base + i, 1,
-                                             __pa(addr) + TCE32_TABLE_SIZE * i,
-                                             TCE32_TABLE_SIZE, 0x1000);
+                                             __pa(addr) + tce32_segsz * i,
+                                             tce32_segsz, IOMMU_PAGE_SIZE_4K);
                if (rc) {
                        pe_err(pe, " Failed to configure 32-bit TCE table,"
                               " err %ld\n", rc);
@@ -2002,15 +2146,14 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
                }
        }
 
-       /* Setup linux iommu table */
-       pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
-                                 base << 28, IOMMU_PAGE_SHIFT_4K);
+       /* Setup DMA32 segment mapping */
+       for (i = base; i < base + segs; i++)
+               phb->ioda.dma32_segmap[i] = pe->pe_number;
 
-       /* OPAL variant of P7IOC SW invalidated TCEs */
-       if (phb->ioda.tce_inval_reg)
-               tbl->it_type |= (TCE_PCI_SWINV_CREATE |
-                                TCE_PCI_SWINV_FREE   |
-                                TCE_PCI_SWINV_PAIR);
+       /* Setup linux iommu table */
+       pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
+                                 base * PNV_IODA1_DMA32_SEGSIZE,
+                                 IOMMU_PAGE_SHIFT_4K);
 
        tbl->it_ops = &pnv_ioda1_iommu_ops;
        pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
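As a rough numeric sketch of the accounting above (assumed weights and segment count, not values read from hardware): a segment of PNV_IODA1_DMA32_SEGSIZE (256MB) mapped with 4KB TCE pages at 8 bytes per entry needs a 512KB TCE table, and a PE's share of the DMA32 segments scales with its DMA weight.

/* Hedged sketch of the DMA32 segment accounting above, with assumed values. */
#include <stdio.h>

#define PNV_IODA1_DMA32_SEGSIZE 0x10000000u
#define IOMMU_PAGE_SHIFT_4K     12

int main(void)
{
	unsigned int dma32_count = 16;   /* assumed DMA32 segments on the PHB */
	unsigned int total_weight = 40;  /* assumed sum over all devices */
	unsigned int weight = 15;        /* assumed weight of this PE (RAID) */
	unsigned int segs = (weight * dma32_count) / total_weight;
	unsigned int tce32_segsz =
		PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);

	if (!segs)
		segs = 1;

	printf("PE gets %u of %u DMA32 segments, TCE bytes per segment: 0x%x\n",
	       segs, dma32_count, tce32_segsz);
	return 0;
}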
@@ -2031,10 +2174,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
        return;
  fail:
        /* XXX Failure: Try to fallback to 64-bit only ? */
-       if (pe->tce32_seg >= 0)
-               pe->tce32_seg = -1;
        if (tce_mem)
-               __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+               __free_pages(tce_mem, get_order(tce32_segsz * segs));
        if (tbl) {
                pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
                iommu_free_table(tbl, "pnv");
@@ -2075,7 +2216,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 
        pnv_pci_link_table_and_group(phb->hose->node, num,
                        tbl, &pe->table_group);
-       pnv_pci_ioda2_tce_invalidate_entire(pe);
+       pnv_pci_phb3_tce_invalidate_pe(pe);
 
        return 0;
 }
@@ -2136,8 +2277,6 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
        }
 
        tbl->it_ops = &pnv_ioda2_iommu_ops;
-       if (pe->phb->ioda.tce_inval_reg)
-               tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
 
        *ptbl = tbl;
 
@@ -2186,10 +2325,6 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
        if (!pnv_iommu_bypass_disabled)
                pnv_pci_ioda2_set_bypass(pe, true);
 
-       /* OPAL variant of PHB3 invalidated TCEs */
-       if (pe->phb->ioda.tce_inval_reg)
-               tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
-
        /*
         * Setting table base here only for carrying iommu_group
         * further down to let iommu_add_device() do the job.
@@ -2219,7 +2354,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
        if (ret)
                pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
        else
-               pnv_pci_ioda2_tce_invalidate_entire(pe);
+               pnv_pci_phb3_tce_invalidate_pe(pe);
 
        pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
 
@@ -2288,38 +2423,135 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
        .take_ownership = pnv_ioda2_take_ownership,
        .release_ownership = pnv_ioda2_release_ownership,
 };
-#endif
 
-static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
+static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
 {
-       const __be64 *swinvp;
+       struct pci_controller *hose;
+       struct pnv_phb *phb;
+       struct pnv_ioda_pe **ptmppe = opaque;
+       struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+       struct pci_dn *pdn = pci_get_pdn(pdev);
 
-       /* OPAL variant of PHB3 invalidated TCEs */
-       swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-       if (!swinvp)
-               return;
+       if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+               return 0;
+
+       hose = pci_bus_to_host(pdev->bus);
+       phb = hose->private_data;
+       if (phb->type != PNV_PHB_NPU)
+               return 0;
 
-       phb->ioda.tce_inval_reg_phys = be64_to_cpup(swinvp);
-       phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
+       *ptmppe = &phb->ioda.pe_array[pdn->pe_number];
+
+       return 1;
 }
 
-static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
-               unsigned levels, unsigned long limit,
-               unsigned long *current_offset, unsigned long *total_allocated)
+/*
+ * This returns the PE of the associated NPU.
+ * This assumes that the NPU is in the same IOMMU group as the GPU and that
+ * there are no other PEs.
+ */
+static struct pnv_ioda_pe *gpe_table_group_to_npe(
+               struct iommu_table_group *table_group)
 {
-       struct page *tce_mem = NULL;
-       __be64 *addr, *tmp;
-       unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
-       unsigned long allocated = 1UL << (order + PAGE_SHIFT);
-       unsigned entries = 1UL << (shift - 3);
-       long i;
+       struct pnv_ioda_pe *npe = NULL;
+       int ret = iommu_group_for_each_dev(table_group->group, &npe,
+                       gpe_table_group_to_npe_cb);
 
-       tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
-       if (!tce_mem) {
-               pr_err("Failed to allocate a TCE memory, order=%d\n", order);
-               return NULL;
-       }
-       addr = page_address(tce_mem);
+       BUG_ON(!ret || !npe);
+
+       return npe;
+}
+
+static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
+               int num, struct iommu_table *tbl)
+{
+       long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
+
+       if (ret)
+               return ret;
+
+       ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl);
+       if (ret)
+               pnv_pci_ioda2_unset_window(table_group, num);
+
+       return ret;
+}
+
+static long pnv_pci_ioda2_npu_unset_window(
+               struct iommu_table_group *table_group,
+               int num)
+{
+       long ret = pnv_pci_ioda2_unset_window(table_group, num);
+
+       if (ret)
+               return ret;
+
+       return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num);
+}
+
+static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
+{
+       /*
+        * Detach the NPU first, as pnv_ioda2_take_ownership() will destroy
+        * the iommu_table if 32-bit DMA is enabled.
+        */
+       pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
+       pnv_ioda2_take_ownership(table_group);
+}
+
+static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
+       .get_table_size = pnv_pci_ioda2_get_table_size,
+       .create_table = pnv_pci_ioda2_create_table,
+       .set_window = pnv_pci_ioda2_npu_set_window,
+       .unset_window = pnv_pci_ioda2_npu_unset_window,
+       .take_ownership = pnv_ioda2_npu_take_ownership,
+       .release_ownership = pnv_ioda2_release_ownership,
+};
+
+static void pnv_pci_ioda_setup_iommu_api(void)
+{
+       struct pci_controller *hose, *tmp;
+       struct pnv_phb *phb;
+       struct pnv_ioda_pe *pe, *gpe;
+
+       /*
+        * Now that all PHBs have been discovered, add the NPU devices to
+        * the corresponding IOMMU groups.
+        */
+       list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+               phb = hose->private_data;
+
+               if (phb->type != PNV_PHB_NPU)
+                       continue;
+
+               list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+                       gpe = pnv_pci_npu_setup_iommu(pe);
+                       if (gpe)
+                               gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
+               }
+       }
+}
+#else /* !CONFIG_IOMMU_API */
+static void pnv_pci_ioda_setup_iommu_api(void) { };
+#endif
+
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
+               unsigned levels, unsigned long limit,
+               unsigned long *current_offset, unsigned long *total_allocated)
+{
+       struct page *tce_mem = NULL;
+       __be64 *addr, *tmp;
+       unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+       unsigned long allocated = 1UL << (order + PAGE_SHIFT);
+       unsigned entries = 1UL << (shift - 3);
+       long i;
+
+       tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+       if (!tce_mem) {
+               pr_err("Failed to allocate a TCE memory, order=%d\n", order);
+               return NULL;
+       }
+       addr = page_address(tce_mem);
        memset(addr, 0, allocated);
        *total_allocated += allocated;
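[Editor's note] As a side note on the allocation above: each table level is 1 << shift bytes, a TCE entry is 8 bytes, so a level holds 1 << (shift - 3) entries, and the page order is taken relative to PAGE_SHIFT. A small stand-alone illustration of that arithmetic (the 64KB page size and 2MB level size are assumptions for the example):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 16;	/* assume 64KB kernel pages */
	unsigned shift = 21;		/* assume a 2MB table level */
	unsigned order = (shift > page_shift ? shift : page_shift) - page_shift;
	unsigned long allocated = 1UL << (order + page_shift);
	unsigned long entries = 1UL << (shift - 3);	/* 8 bytes per TCE */

	printf("order=%u -> %lu bytes allocated, %lu TCEs per level\n",
	       order, allocated, entries);
	return 0;
}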
 
@@ -2443,8 +2675,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 {
        int64_t rc;
 
-       /* We shouldn't already have a 32-bit DMA associated */
-       if (WARN_ON(pe->tce32_seg >= 0))
+       if (!pnv_pci_ioda_pe_dma_weight(pe))
                return;
 
        /* TVE #1 is selected by PCI address bit 59 */
@@ -2454,7 +2685,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
                        pe->pe_number);
 
        /* The PE will reserve all possible 32-bits space */
-       pe->tce32_seg = 0;
        pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
                phb->ioda.m32_pci_base);
 
@@ -2470,11 +2700,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 #endif
 
        rc = pnv_pci_ioda2_setup_default_config(pe);
-       if (rc) {
-               if (pe->tce32_seg >= 0)
-                       pe->tce32_seg = -1;
+       if (rc)
                return;
-       }
 
        if (pe->flags & PNV_IODA_PE_DEV)
                iommu_add_device(&pe->pdev->dev);
@@ -2482,78 +2709,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
                pnv_ioda_setup_bus_dma(pe, pe->pbus);
 }
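[Editor's note] For context on the "TVE #1 is selected by PCI address bit 59" comment above: with the bypass window enabled, a device can reach system memory directly through TVE#1 by using bus addresses with bit 59 set, while lower addresses keep going through the 32-bit TCE table. A tiny illustrative sketch of forming such a bypass address; the base is an assumption drawn from that comment, not a quote of the kernel's DMA ops:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	uint64_t bypass_base = 1ULL << 59;	/* assumed: PCI address bit 59 */
	uint64_t host_phys = 0x2000000000ULL;	/* hypothetical RAM address */
	uint64_t bus_addr = bypass_base + host_phys;

	printf("bypass bus address 0x%" PRIx64 " (bit 59 set: %d)\n",
	       bus_addr, (int)((bus_addr >> 59) & 1));
	return 0;
}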
 
-static void pnv_ioda_setup_dma(struct pnv_phb *phb)
-{
-       struct pci_controller *hose = phb->hose;
-       unsigned int residual, remaining, segs, tw, base;
-       struct pnv_ioda_pe *pe;
-
-       /* If we have more PE# than segments available, hand out one
-        * per PE until we run out and let the rest fail. If not,
-        * then we assign at least one segment per PE, plus more based
-        * on the amount of devices under that PE
-        */
-       if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
-               residual = 0;
-       else
-               residual = phb->ioda.tce32_count -
-                       phb->ioda.dma_pe_count;
-
-       pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
-               hose->global_number, phb->ioda.tce32_count);
-       pr_info("PCI: %d PE# for a total weight of %d\n",
-               phb->ioda.dma_pe_count, phb->ioda.dma_weight);
-
-       pnv_pci_ioda_setup_opal_tce_kill(phb);
-
-       /* Walk our PE list and configure their DMA segments, hand them
-        * out one base segment plus any residual segments based on
-        * weight
-        */
-       remaining = phb->ioda.tce32_count;
-       tw = phb->ioda.dma_weight;
-       base = 0;
-       list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
-               if (!pe->dma_weight)
-                       continue;
-               if (!remaining) {
-                       pe_warn(pe, "No DMA32 resources available\n");
-                       continue;
-               }
-               segs = 1;
-               if (residual) {
-                       segs += ((pe->dma_weight * residual)  + (tw / 2)) / tw;
-                       if (segs > remaining)
-                               segs = remaining;
-               }
-
-               /*
-                * For IODA2 compliant PHB3, we needn't care about the weight.
-                * The all available 32-bits DMA space will be assigned to
-                * the specific PE.
-                */
-               if (phb->type == PNV_PHB_IODA1) {
-                       pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
-                               pe->dma_weight, segs);
-                       pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
-               } else if (phb->type == PNV_PHB_IODA2) {
-                       pe_info(pe, "Assign DMA32 space\n");
-                       segs = 0;
-                       pnv_pci_ioda2_setup_dma_pe(phb, pe);
-               } else if (phb->type == PNV_PHB_NPU) {
-                       /*
-                        * We initialise the DMA space for an NPU PHB
-                        * after setup of the PHB is complete as we
-                        * point the NPU TVT to the the same location
-                        * as the PHB3 TVT.
-                        */
-               }
-
-               remaining -= segs;
-               base += segs;
-       }
-}
-
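[Editor's note] The removed IODA1 path above hands every DMA-capable PE one base segment and then distributes the residual segments in proportion to each PE's DMA weight, rounding to nearest (the + tw/2 term) and clamping to what is still available. A stand-alone sketch of that distribution rule with made-up weights:

#include <stdio.h>

int main(void)
{
	unsigned weights[] = { 3, 10, 2 };	/* hypothetical per-PE DMA weights */
	unsigned npes = 3, total_segs = 16;
	unsigned tw = 0, residual, remaining = total_segs, base = 0, i;

	for (i = 0; i < npes; i++)
		tw += weights[i];
	residual = npes > total_segs ? 0 : total_segs - npes;

	for (i = 0; i < npes && remaining; i++) {
		unsigned segs = 1;		/* every PE gets at least one segment */

		if (residual)
			segs += (weights[i] * residual + tw / 2) / tw;
		if (segs > remaining)
			segs = remaining;
		printf("PE%u: weight %2u -> segments [%2u..%2u)\n",
		       i, weights[i], base, base + segs);
		remaining -= segs;
		base += segs;
	}
	return 0;
}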
 #ifdef CONFIG_PCI_MSI
 static void pnv_ioda2_msi_eoi(struct irq_data *d)
 {
@@ -2570,12 +2725,13 @@ static void pnv_ioda2_msi_eoi(struct irq_data *d)
 }
 
 
-static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
+void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
 {
        struct irq_data *idata;
        struct irq_chip *ichip;
 
-       if (phb->type != PNV_PHB_IODA2)
+       /* The MSI EOI OPAL call is only needed on PHB3 */
+       if (phb->model != PNV_PHB_MODEL_PHB3)
                return;
 
        if (!phb->ioda.irq_chip_init) {
@@ -2592,157 +2748,6 @@ static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
        irq_set_chip(virq, &phb->ioda.irq_chip);
 }
 
-#ifdef CONFIG_CXL_BASE
-
-struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
-{
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-
-       return of_node_get(hose->dn);
-}
-EXPORT_SYMBOL(pnv_pci_get_phb_node);
-
-int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
-{
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       struct pnv_phb *phb = hose->private_data;
-       struct pnv_ioda_pe *pe;
-       int rc;
-
-       pe = pnv_ioda_get_pe(dev);
-       if (!pe)
-               return -ENODEV;
-
-       pe_info(pe, "Switching PHB to CXL\n");
-
-       rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number);
-       if (rc)
-               dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc);
-
-       return rc;
-}
-EXPORT_SYMBOL(pnv_phb_to_cxl_mode);
-
-/* Find PHB for cxl dev and allocate MSI hwirqs?
- * Returns the absolute hardware IRQ number
- */
-int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num)
-{
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       struct pnv_phb *phb = hose->private_data;
-       int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num);
-
-       if (hwirq < 0) {
-               dev_warn(&dev->dev, "Failed to find a free MSI\n");
-               return -ENOSPC;
-       }
-
-       return phb->msi_base + hwirq;
-}
-EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs);
-
-void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num)
-{
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       struct pnv_phb *phb = hose->private_data;
-
-       msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num);
-}
-EXPORT_SYMBOL(pnv_cxl_release_hwirqs);
-
-void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs,
-                                 struct pci_dev *dev)
-{
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       struct pnv_phb *phb = hose->private_data;
-       int i, hwirq;
-
-       for (i = 1; i < CXL_IRQ_RANGES; i++) {
-               if (!irqs->range[i])
-                       continue;
-               pr_devel("cxl release irq range 0x%x: offset: 0x%lx  limit: %ld\n",
-                        i, irqs->offset[i],
-                        irqs->range[i]);
-               hwirq = irqs->offset[i] - phb->msi_base;
-               msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq,
-                                      irqs->range[i]);
-       }
-}
-EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges);
-
-int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
-                              struct pci_dev *dev, int num)
-{
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       struct pnv_phb *phb = hose->private_data;
-       int i, hwirq, try;
-
-       memset(irqs, 0, sizeof(struct cxl_irq_ranges));
-
-       /* 0 is reserved for the multiplexed PSL DSI interrupt */
-       for (i = 1; i < CXL_IRQ_RANGES && num; i++) {
-               try = num;
-               while (try) {
-                       hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try);
-                       if (hwirq >= 0)
-                               break;
-                       try /= 2;
-               }
-               if (!try)
-                       goto fail;
-
-               irqs->offset[i] = phb->msi_base + hwirq;
-               irqs->range[i] = try;
-               pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx  limit: %li\n",
-                        i, irqs->offset[i], irqs->range[i]);
-               num -= try;
-       }
-       if (num)
-               goto fail;
-
-       return 0;
-fail:
-       pnv_cxl_release_hwirq_ranges(irqs, dev);
-       return -ENOSPC;
-}
-EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges);
-
-int pnv_cxl_get_irq_count(struct pci_dev *dev)
-{
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       struct pnv_phb *phb = hose->private_data;
-
-       return phb->msi_bmp.irq_count;
-}
-EXPORT_SYMBOL(pnv_cxl_get_irq_count);
-
-int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
-                          unsigned int virq)
-{
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       struct pnv_phb *phb = hose->private_data;
-       unsigned int xive_num = hwirq - phb->msi_base;
-       struct pnv_ioda_pe *pe;
-       int rc;
-
-       if (!(pe = pnv_ioda_get_pe(dev)))
-               return -ENODEV;
-
-       /* Assign XIVE to PE */
-       rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
-       if (rc) {
-               pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x "
-                       "hwirq 0x%x XIVE 0x%x PE\n",
-                       pci_name(dev), rc, phb->msi_base, hwirq, xive_num);
-               return -EIO;
-       }
-       set_msi_irq_chip(phb, virq);
-
-       return 0;
-}
-EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup);
-#endif
-
 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
                                  unsigned int hwirq, unsigned int virq,
                                  unsigned int is_64, struct msi_msg *msg)
@@ -2799,7 +2804,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
        }
        msg->data = be32_to_cpu(data);
 
-       set_msi_irq_chip(phb, virq);
+       pnv_set_msi_irq_chip(phb, virq);
 
        pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
                 " address=%x_%08x data=%x PE# %d\n",
@@ -2858,14 +2863,14 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
        pdn->m64_single_mode = false;
 
        total_vfs = pci_sriov_get_totalvfs(pdev);
-       mul = phb->ioda.total_pe;
+       mul = phb->ioda.total_pe_num;
        total_vf_bar_sz = 0;
 
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || res->parent)
                        continue;
-               if (!pnv_pci_is_mem_pref_64(res->flags)) {
+               if (!pnv_pci_is_m64(phb, res)) {
                        dev_warn(&pdev->dev, "Don't support SR-IOV with"
                                        " non M64 VF BAR%d: %pR. \n",
                                 i, res);
@@ -2929,111 +2934,94 @@ truncate_iov:
 }
 #endif /* CONFIG_PCI_IOV */
 
-/*
- * This function is supposed to be called on basis of PE from top
- * to bottom style. So the the I/O or MMIO segment assigned to
- * parent PE could be overrided by its child PEs if necessary.
- */
-static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
-                                 struct pnv_ioda_pe *pe)
+static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
+                                 struct resource *res)
 {
-       struct pnv_phb *phb = hose->private_data;
+       struct pnv_phb *phb = pe->phb;
        struct pci_bus_region region;
-       struct resource *res;
-       int i, index;
-       int rc;
-
-       /*
-        * NOTE: We only care PCI bus based PE for now. For PCI
-        * device based PE, for example SRIOV sensitive VF should
-        * be figured out later.
-        */
-       BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
-
-       pci_bus_for_each_resource(pe->pbus, res, i) {
-               if (!res || !res->flags ||
-                   res->start > res->end)
-                       continue;
+       int index;
+       int64_t rc;
 
-               if (res->flags & IORESOURCE_IO) {
-                       region.start = res->start - phb->ioda.io_pci_base;
-                       region.end   = res->end - phb->ioda.io_pci_base;
-                       index = region.start / phb->ioda.io_segsize;
+       if (!res || !res->flags || res->start > res->end)
+               return;
 
-                       while (index < phb->ioda.total_pe &&
-                              region.start <= region.end) {
-                               phb->ioda.io_segmap[index] = pe->pe_number;
-                               rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-                                       pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
-                               if (rc != OPAL_SUCCESS) {
-                                       pr_err("%s: OPAL error %d when mapping IO "
-                                              "segment #%d to PE#%d\n",
-                                              __func__, rc, index, pe->pe_number);
-                                       break;
-                               }
+       if (res->flags & IORESOURCE_IO) {
+               region.start = res->start - phb->ioda.io_pci_base;
+               region.end   = res->end - phb->ioda.io_pci_base;
+               index = region.start / phb->ioda.io_segsize;
 
-                               region.start += phb->ioda.io_segsize;
-                               index++;
+               while (index < phb->ioda.total_pe_num &&
+                      region.start <= region.end) {
+                       phb->ioda.io_segmap[index] = pe->pe_number;
+                       rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+                               pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
+                       if (rc != OPAL_SUCCESS) {
+                               pr_err("%s: Error %lld mapping IO segment#%d to PE#%d\n",
+                                      __func__, rc, index, pe->pe_number);
+                               break;
                        }
-               } else if ((res->flags & IORESOURCE_MEM) &&
-                          !pnv_pci_is_mem_pref_64(res->flags)) {
-                       region.start = res->start -
-                                      hose->mem_offset[0] -
-                                      phb->ioda.m32_pci_base;
-                       region.end   = res->end -
-                                      hose->mem_offset[0] -
-                                      phb->ioda.m32_pci_base;
-                       index = region.start / phb->ioda.m32_segsize;
-
-                       while (index < phb->ioda.total_pe &&
-                              region.start <= region.end) {
-                               phb->ioda.m32_segmap[index] = pe->pe_number;
-                               rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-                                       pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
-                               if (rc != OPAL_SUCCESS) {
-                                       pr_err("%s: OPAL error %d when mapping M32 "
-                                              "segment#%d to PE#%d",
-                                              __func__, rc, index, pe->pe_number);
-                                       break;
-                               }
 
-                               region.start += phb->ioda.m32_segsize;
-                               index++;
-                       }
+                       region.start += phb->ioda.io_segsize;
+                       index++;
                }
-       }
-}
-
-static void pnv_pci_ioda_setup_seg(void)
-{
-       struct pci_controller *tmp, *hose;
-       struct pnv_phb *phb;
-       struct pnv_ioda_pe *pe;
-
-       list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-               phb = hose->private_data;
-
-               /* NPU PHB does not support IO or MMIO segmentation */
-               if (phb->type == PNV_PHB_NPU)
-                       continue;
+       } else if ((res->flags & IORESOURCE_MEM) &&
+                  !pnv_pci_is_m64(phb, res)) {
+               region.start = res->start -
+                              phb->hose->mem_offset[0] -
+                              phb->ioda.m32_pci_base;
+               region.end   = res->end -
+                              phb->hose->mem_offset[0] -
+                              phb->ioda.m32_pci_base;
+               index = region.start / phb->ioda.m32_segsize;
+
+               while (index < phb->ioda.total_pe_num &&
+                      region.start <= region.end) {
+                       phb->ioda.m32_segmap[index] = pe->pe_number;
+                       rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+                               pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
+                       if (rc != OPAL_SUCCESS) {
+                               pr_err("%s: Error %lld mapping M32 segment#%d to PE#%d",
+                                      __func__, rc, index, pe->pe_number);
+                               break;
+                       }
 
-               list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-                       pnv_ioda_setup_pe_seg(hose, pe);
+                       region.start += phb->ioda.m32_segsize;
+                       index++;
                }
        }
 }
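[Editor's note] pnv_ioda_setup_pe_res() above turns a window into PHB segment numbers by rebasing the resource against the window start and dividing by the per-PE segment size, then claims every segment the resource spans. A stand-alone walk through that index arithmetic (window base and segment size are made up for the example):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t m32_pci_base = 0x80000000ULL;	/* hypothetical M32 window base */
	uint64_t segsize = 0x02000000ULL;	/* hypothetical 32MB segments */
	uint64_t res_start = 0x84000000ULL, res_end = 0x87ffffffULL;

	uint64_t start = res_start - m32_pci_base;
	uint64_t end = res_end - m32_pci_base;
	unsigned idx = (unsigned)(start / segsize);

	while (start <= end) {			/* claim each segment the BAR spans */
		printf("segment #%u -> this PE\n", idx);
		start += segsize;
		idx++;
	}
	return 0;
}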
 
-static void pnv_pci_ioda_setup_DMA(void)
+/*
+ * This function is supposed to be called on a per-PE basis, from top
+ * to bottom, so the I/O or MMIO segments assigned to a parent PE can
+ * be overridden by its child PEs if necessary.
+ */
+static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
 {
-       struct pci_controller *hose, *tmp;
-       struct pnv_phb *phb;
+       struct pci_dev *pdev;
+       int i;
 
-       list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-               pnv_ioda_setup_dma(hose->private_data);
+       /*
+        * NOTE: We only handle PCI-bus-based PEs for now. PCI-device-based
+        * PEs, for example SR-IOV VFs, still need to be figured out
+        * later.
+        */
+       BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
 
-               /* Mark the PHB initialization done */
-               phb = hose->private_data;
-               phb->initialized = 1;
+       list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
+               for (i = 0; i <= PCI_ROM_RESOURCE; i++)
+                       pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
+
+               /*
+                * If the PE contains all subordinate PCI buses, the
+                * windows of the child bridges should be mapped to
+                * the PE as well.
+                */
+               if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
+                       continue;
+               for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
+                       pnv_ioda_setup_pe_res(pe,
+                               &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
        }
 }
 
@@ -3047,6 +3035,9 @@ static void pnv_pci_ioda_create_dbgfs(void)
        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
                phb = hose->private_data;
 
+               /* Mark the PHB initialization as done */
+               phb->initialized = 1;
+
                sprintf(name, "PCI%04x", hose->global_number);
                phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
                if (!phb->dbgfs)
@@ -3056,42 +3047,16 @@ static void pnv_pci_ioda_create_dbgfs(void)
 #endif /* CONFIG_DEBUG_FS */
 }
 
-static void pnv_npu_ioda_fixup(void)
-{
-       bool enable_bypass;
-       struct pci_controller *hose, *tmp;
-       struct pnv_phb *phb;
-       struct pnv_ioda_pe *pe;
-
-       list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-               phb = hose->private_data;
-               if (phb->type != PNV_PHB_NPU)
-                       continue;
-
-               list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
-                       enable_bypass = dma_get_mask(&pe->pdev->dev) ==
-                               DMA_BIT_MASK(64);
-                       pnv_npu_init_dma_pe(pe);
-                       pnv_npu_dma_set_bypass(pe, enable_bypass);
-               }
-       }
-}
-
 static void pnv_pci_ioda_fixup(void)
 {
        pnv_pci_ioda_setup_PEs();
-       pnv_pci_ioda_setup_seg();
-       pnv_pci_ioda_setup_DMA();
-
+       pnv_pci_ioda_setup_iommu_api();
        pnv_pci_ioda_create_dbgfs();
 
 #ifdef CONFIG_EEH
        eeh_init();
        eeh_addr_cache_build();
 #endif
-
-       /* Link NPU IODA tables to their PCI devices. */
-       pnv_npu_ioda_fixup();
 }
 
 /*
@@ -3125,9 +3090,12 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
                bridge = bridge->bus->self;
        }
 
-       /* We fail back to M32 if M64 isn't supported */
-       if (phb->ioda.m64_segsize &&
-           pnv_pci_is_mem_pref_64(type))
+       /*
+        * We fall back to M32 if M64 isn't supported. We enforce the M64
+        * alignment for any 64-bit resource; PCIe doesn't care, and
+        * bridges only do 64-bit prefetchable anyway.
+        */
+       if (phb->ioda.m64_segsize && (type & IORESOURCE_MEM_64))
                return phb->ioda.m64_segsize;
        if (type & IORESOURCE_MEM)
                return phb->ioda.m32_segsize;
@@ -3135,6 +3103,115 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
        return phb->ioda.io_segsize;
 }
 
+/*
+ * We are updating the root port, or the upstream port of the
+ * bridge behind the root port, with the PHB's windows in order
+ * to accommodate changes in the resources required during PCI
+ * (slot) hotplug. The hotplug slot is connected to either the
+ * root port or a downstream port of a PCIe switch behind the
+ * root port.
+ */
+static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
+                                          unsigned long type)
+{
+       struct pci_controller *hose = pci_bus_to_host(bus);
+       struct pnv_phb *phb = hose->private_data;
+       struct pci_dev *bridge = bus->self;
+       struct resource *r, *w;
+       bool msi_region = false;
+       int i;
+
+       /* Check whether we need to apply the fixup to the bridge's windows */
+       if (!pci_is_root_bus(bridge->bus) &&
+           !pci_is_root_bus(bridge->bus->self->bus))
+               return;
+
+       /* Fixup the resources */
+       for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
+               r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
+               if (!r->flags || !r->parent)
+                       continue;
+
+               w = NULL;
+               if (r->flags & type & IORESOURCE_IO)
+                       w = &hose->io_resource;
+               else if (pnv_pci_is_m64(phb, r) &&
+                        (type & IORESOURCE_PREFETCH) &&
+                        phb->ioda.m64_segsize)
+                       w = &hose->mem_resources[1];
+               else if (r->flags & type & IORESOURCE_MEM) {
+                       w = &hose->mem_resources[0];
+                       msi_region = true;
+               }
+
+               r->start = w->start;
+               r->end = w->end;
+
+               /* The 64KB 32-bit MSI region shouldn't be included in
+                * the 32-bit bridge window. Otherwise we can see strange
+                * issues, one of them being an EEH error observed on Garrison.
+                *
+                * Exclude the top 1MB region, which is the minimal alignment
+                * of the 32-bit bridge window.
+                */
+               if (msi_region) {
+                       r->end += 0x10000;
+                       r->end -= 0x100000;
+               }
+       }
+}
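[Editor's note] If I read the msi_region adjustment above correctly, the bridge window is first extended by the 64KB MSI region sitting at the top of the hose's M32 resource and then trimmed by a full 1MB, so it ends 0xF0000 below the hose window; since bridge windows are 1MB aligned, that keeps the MSI region out of anything assigned below it. A toy calculation under that reading (the hose window end is made up):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	/* hypothetical M32 window that stops 64KB short of a 1MB boundary */
	uint64_t w_end = 0xbfffffffULL - 0x10000;
	uint64_t r_end = w_end;

	r_end += 0x10000;	/* take the 64KB MSI region into account ... */
	r_end -= 0x100000;	/* ... then drop the whole top 1MB */

	printf("hose window end   0x%" PRIx64 "\n", w_end);
	printf("bridge window end 0x%" PRIx64 " (0x%" PRIx64 " below)\n",
	       r_end, w_end - r_end);
	return 0;
}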
+
+static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
+{
+       struct pci_controller *hose = pci_bus_to_host(bus);
+       struct pnv_phb *phb = hose->private_data;
+       struct pci_dev *bridge = bus->self;
+       struct pnv_ioda_pe *pe;
+       bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
+
+       /* Extend bridge's windows if necessary */
+       pnv_pci_fixup_bridge_resources(bus, type);
+
+       /* The PE for the root bus should be realized before any others */
+       if (!phb->ioda.root_pe_populated) {
+               pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
+               if (pe) {
+                       phb->ioda.root_pe_idx = pe->pe_number;
+                       phb->ioda.root_pe_populated = true;
+               }
+       }
+
+       /* Don't assign a PE to a PCI bus that has no subordinate devices */
+       if (list_empty(&bus->devices))
+               return;
+
+       /* Reserve PEs according to used M64 resources */
+       if (phb->reserve_m64_pe)
+               phb->reserve_m64_pe(bus, NULL, all);
+
+       /*
+        * Assign the PE. We might get here because of partial hotplug,
+        * in which case we just pick up the existing PE and must not
+        * allocate resources again.
+        */
+       pe = pnv_ioda_setup_bus_PE(bus, all);
+       if (!pe)
+               return;
+
+       pnv_ioda_setup_pe_seg(pe);
+       switch (phb->type) {
+       case PNV_PHB_IODA1:
+               pnv_pci_ioda1_setup_dma_pe(phb, pe);
+               break;
+       case PNV_PHB_IODA2:
+               pnv_pci_ioda2_setup_dma_pe(phb, pe);
+               break;
+       default:
+               pr_warn("%s: No DMA for PHB#%d (type %d)\n",
+                       __func__, phb->hose->global_number, phb->type);
+       }
+}
+
 #ifdef CONFIG_PCI_IOV
 static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
                                                      int resno)
@@ -3174,7 +3251,7 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
 /* Prevent enabling devices for which we couldn't properly
  * assign a PE
  */
-static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
+bool pnv_pci_enable_device_hook(struct pci_dev *dev)
 {
        struct pci_controller *hose = pci_bus_to_host(dev->bus);
        struct pnv_phb *phb = hose->private_data;
@@ -3195,10 +3272,176 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
        return true;
 }
 
-static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
-                              u32 devfn)
+static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
+                                      int num)
+{
+       struct pnv_ioda_pe *pe = container_of(table_group,
+                                             struct pnv_ioda_pe, table_group);
+       struct pnv_phb *phb = pe->phb;
+       unsigned int idx;
+       long rc;
+
+       pe_info(pe, "Removing DMA window #%d\n", num);
+       for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
+               if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
+                       continue;
+
+               rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+                                               idx, 0, 0ul, 0ul, 0ul);
+               if (rc != OPAL_SUCCESS) {
+                       pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
+                               rc, idx);
+                       return rc;
+               }
+
+               phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
+       }
+
+       pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
+       return OPAL_SUCCESS;
+}
+
+static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
+{
+       unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
+       struct iommu_table *tbl = pe->table_group.tables[0];
+       int64_t rc;
+
+       if (!weight)
+               return;
+
+       rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
+       if (rc != OPAL_SUCCESS)
+               return;
+
+       pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
+       if (pe->table_group.group) {
+               iommu_group_put(pe->table_group.group);
+               WARN_ON(pe->table_group.group);
+       }
+
+       free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+       iommu_free_table(tbl, "pnv");
+}
+
+static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
+{
+       struct iommu_table *tbl = pe->table_group.tables[0];
+       unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
+#ifdef CONFIG_IOMMU_API
+       int64_t rc;
+#endif
+
+       if (!weight)
+               return;
+
+#ifdef CONFIG_IOMMU_API
+       rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+       if (rc)
+               pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
+#endif
+
+       pnv_pci_ioda2_set_bypass(pe, false);
+       if (pe->table_group.group) {
+               iommu_group_put(pe->table_group.group);
+               WARN_ON(pe->table_group.group);
+       }
+
+       pnv_pci_ioda2_table_free_pages(tbl);
+       iommu_free_table(tbl, "pnv");
+}
+
+static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
+                                unsigned short win,
+                                unsigned int *map)
 {
-       return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
+       struct pnv_phb *phb = pe->phb;
+       int idx;
+       int64_t rc;
+
+       for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
+               if (map[idx] != pe->pe_number)
+                       continue;
+
+               if (win == OPAL_M64_WINDOW_TYPE)
+                       rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+                                       phb->ioda.reserved_pe_idx, win,
+                                       idx / PNV_IODA1_M64_SEGS,
+                                       idx % PNV_IODA1_M64_SEGS);
+               else
+                       rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+                                       phb->ioda.reserved_pe_idx, win, 0, idx);
+
+               if (rc != OPAL_SUCCESS)
+                       pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
+                               rc, win, idx);
+
+               map[idx] = IODA_INVALID_PE;
+       }
+}
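[Editor's note] In the M64 branch above, the flat segment index is decomposed into an M64 BAR number and a sub-segment within that BAR (PNV_IODA1_M64_SEGS segments per BAR, 8 on IODA1), which is how the extra arguments to opal_pci_map_pe_mmio_window() are formed. A quick stand-alone illustration of that split:

#include <stdio.h>

#define M64_SEGS 8	/* mirrors PNV_IODA1_M64_SEGS */

int main(void)
{
	unsigned idx;

	for (idx = 0; idx < 20; idx++)
		printf("segment %2u -> M64 BAR %u, sub-segment %u\n",
		       idx, idx / M64_SEGS, idx % M64_SEGS);
	return 0;
}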
+
+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
+{
+       struct pnv_phb *phb = pe->phb;
+
+       if (phb->type == PNV_PHB_IODA1) {
+               pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
+                                    phb->ioda.io_segmap);
+               pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
+                                    phb->ioda.m32_segmap);
+               pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
+                                    phb->ioda.m64_segmap);
+       } else if (phb->type == PNV_PHB_IODA2) {
+               pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
+                                    phb->ioda.m32_segmap);
+       }
+}
+
+static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
+{
+       struct pnv_phb *phb = pe->phb;
+       struct pnv_ioda_pe *slave, *tmp;
+
+       /* Release slave PEs in compound PE */
+       if (pe->flags & PNV_IODA_PE_MASTER) {
+               list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
+                       pnv_ioda_release_pe(slave);
+       }
+
+       list_del(&pe->list);
+       switch (phb->type) {
+       case PNV_PHB_IODA1:
+               pnv_pci_ioda1_release_pe_dma(pe);
+               break;
+       case PNV_PHB_IODA2:
+               pnv_pci_ioda2_release_pe_dma(pe);
+               break;
+       default:
+               WARN_ON(1);
+       }
+
+       pnv_ioda_release_pe_seg(pe);
+       pnv_ioda_deconfigure_pe(pe->phb, pe);
+       pnv_ioda_free_pe(pe);
+}
+
+static void pnv_pci_release_device(struct pci_dev *pdev)
+{
+       struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+       struct pnv_phb *phb = hose->private_data;
+       struct pci_dn *pdn = pci_get_pdn(pdev);
+       struct pnv_ioda_pe *pe;
+
+       if (pdev->is_virtfn)
+               return;
+
+       if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+               return;
+
+       pe = &phb->ioda.pe_array[pdn->pe_number];
+       WARN_ON(--pe->device_count < 0);
+       if (pe->device_count == 0)
+               pnv_ioda_release_pe(pe);
 }
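[Editor's note] pnv_pci_release_device() above relies on a simple per-PE device count: each released device decrements it, and the PE itself is torn down once the last device is gone. A minimal stand-alone sketch of that pattern (the struct and helpers here are hypothetical, not the kernel's types):

#include <stdio.h>

struct pe { int device_count; };

static void release_pe(struct pe *pe)
{
	printf("releasing PE resources\n");
}

/* hypothetical helper mirroring the refcount logic above */
static void release_device(struct pe *pe)
{
	if (--pe->device_count < 0)
		printf("warning: unbalanced release\n");
	if (pe->device_count == 0)
		release_pe(pe);		/* last device gone: tear the PE down */
}

int main(void)
{
	struct pe pe = { .device_count = 2 };

	release_device(&pe);		/* PE still holds one device */
	release_device(&pe);		/* now the PE itself is released */
	return 0;
}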
 
 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
@@ -3210,47 +3453,84 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 }
 
 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
-       .dma_dev_setup = pnv_pci_dma_dev_setup,
-       .dma_bus_setup = pnv_pci_dma_bus_setup,
+       .dma_dev_setup          = pnv_pci_dma_dev_setup,
+       .dma_bus_setup          = pnv_pci_dma_bus_setup,
 #ifdef CONFIG_PCI_MSI
-       .setup_msi_irqs = pnv_setup_msi_irqs,
-       .teardown_msi_irqs = pnv_teardown_msi_irqs,
+       .setup_msi_irqs         = pnv_setup_msi_irqs,
+       .teardown_msi_irqs      = pnv_teardown_msi_irqs,
 #endif
-       .enable_device_hook = pnv_pci_enable_device_hook,
-       .window_alignment = pnv_pci_window_alignment,
-       .reset_secondary_bus = pnv_pci_reset_secondary_bus,
-       .dma_set_mask = pnv_pci_ioda_dma_set_mask,
-       .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
-       .shutdown = pnv_pci_ioda_shutdown,
+       .enable_device_hook     = pnv_pci_enable_device_hook,
+       .release_device         = pnv_pci_release_device,
+       .window_alignment       = pnv_pci_window_alignment,
+       .setup_bridge           = pnv_pci_setup_bridge,
+       .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
+       .dma_set_mask           = pnv_pci_ioda_dma_set_mask,
+       .dma_get_required_mask  = pnv_pci_ioda_dma_get_required_mask,
+       .shutdown               = pnv_pci_ioda_shutdown,
 };
 
+static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
+{
+       dev_err_once(&npdev->dev,
+                       "%s operation unsupported for NVLink devices\n",
+                       __func__);
+       return -EPERM;
+}
+
 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
-       .dma_dev_setup = pnv_pci_dma_dev_setup,
+       .dma_dev_setup          = pnv_pci_dma_dev_setup,
 #ifdef CONFIG_PCI_MSI
-       .setup_msi_irqs = pnv_setup_msi_irqs,
-       .teardown_msi_irqs = pnv_teardown_msi_irqs,
+       .setup_msi_irqs         = pnv_setup_msi_irqs,
+       .teardown_msi_irqs      = pnv_teardown_msi_irqs,
 #endif
-       .enable_device_hook = pnv_pci_enable_device_hook,
-       .window_alignment = pnv_pci_window_alignment,
-       .reset_secondary_bus = pnv_pci_reset_secondary_bus,
-       .dma_set_mask = pnv_npu_dma_set_mask,
-       .shutdown = pnv_pci_ioda_shutdown,
+       .enable_device_hook     = pnv_pci_enable_device_hook,
+       .window_alignment       = pnv_pci_window_alignment,
+       .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
+       .dma_set_mask           = pnv_npu_dma_set_mask,
+       .shutdown               = pnv_pci_ioda_shutdown,
 };
 
+#ifdef CONFIG_CXL_BASE
+const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops = {
+       .dma_dev_setup          = pnv_pci_dma_dev_setup,
+       .dma_bus_setup          = pnv_pci_dma_bus_setup,
+#ifdef CONFIG_PCI_MSI
+       .setup_msi_irqs         = pnv_cxl_cx4_setup_msi_irqs,
+       .teardown_msi_irqs      = pnv_cxl_cx4_teardown_msi_irqs,
+#endif
+       .enable_device_hook     = pnv_cxl_enable_device_hook,
+       .disable_device         = pnv_cxl_disable_device,
+       .release_device         = pnv_pci_release_device,
+       .window_alignment       = pnv_pci_window_alignment,
+       .setup_bridge           = pnv_pci_setup_bridge,
+       .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
+       .dma_set_mask           = pnv_pci_ioda_dma_set_mask,
+       .dma_get_required_mask  = pnv_pci_ioda_dma_get_required_mask,
+       .shutdown               = pnv_pci_ioda_shutdown,
+};
+#endif
+
 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
                                         u64 hub_id, int ioda_type)
 {
        struct pci_controller *hose;
        struct pnv_phb *phb;
-       unsigned long size, m32map_off, pemap_off, iomap_off = 0;
+       unsigned long size, m64map_off, m32map_off, pemap_off;
+       unsigned long iomap_off = 0, dma32map_off = 0;
+       struct resource r;
        const __be64 *prop64;
        const __be32 *prop32;
        int len;
+       unsigned int segno;
        u64 phb_id;
        void *aux;
        long rc;
 
-       pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name);
+       if (!of_device_is_available(np))
+               return;
+
+       pr_info("Initializing %s PHB (%s)\n",
+               pnv_phb_names[ioda_type], of_node_full_name(np));
 
        prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
        if (!prop64) {
@@ -3301,18 +3581,25 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
 
        /* Get registers */
-       phb->regs = of_iomap(np, 0);
-       if (phb->regs == NULL)
-               pr_err("  Failed to map registers !\n");
+       if (!of_address_to_resource(np, 0, &r)) {
+               phb->regs_phys = r.start;
+               phb->regs = ioremap(r.start, resource_size(&r));
+               if (phb->regs == NULL)
+                       pr_err("  Failed to map registers !\n");
+       }
 
        /* Initialize more IODA stuff */
-       phb->ioda.total_pe = 1;
+       phb->ioda.total_pe_num = 1;
        prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
        if (prop32)
-               phb->ioda.total_pe = be32_to_cpup(prop32);
+               phb->ioda.total_pe_num = be32_to_cpup(prop32);
        prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
        if (prop32)
-               phb->ioda.reserved_pe = be32_to_cpup(prop32);
+               phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
+
+       /* Invalidate RID to PE# mapping */
+       for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
+               phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;
 
        /* Parse 64-bit MMIO range */
        pnv_ioda_parse_m64_window(phb);
@@ -3321,36 +3608,73 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        /* FW Has already off top 64k of M32 space (MSI space) */
        phb->ioda.m32_size += 0x10000;
 
-       phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
+       phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
        phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
        phb->ioda.io_size = hose->pci_io_size;
-       phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
+       phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
        phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
 
+       /* Calculate how many 32-bit TCE segments we have */
+       phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+                               PNV_IODA1_DMA32_SEGSIZE;
+
        /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
-       size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+       size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
+                       sizeof(unsigned long));
+       m64map_off = size;
+       size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
        m32map_off = size;
-       size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
+       size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
        if (phb->type == PNV_PHB_IODA1) {
                iomap_off = size;
-               size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]);
+               size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
+               dma32map_off = size;
+               size += phb->ioda.dma32_count *
+                       sizeof(phb->ioda.dma32_segmap[0]);
        }
        pemap_off = size;
-       size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
+       size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
        aux = memblock_virt_alloc(size, 0);
        phb->ioda.pe_alloc = aux;
+       phb->ioda.m64_segmap = aux + m64map_off;
        phb->ioda.m32_segmap = aux + m32map_off;
-       if (phb->type == PNV_PHB_IODA1)
+       for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
+               phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
+               phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
+       }
+       if (phb->type == PNV_PHB_IODA1) {
                phb->ioda.io_segmap = aux + iomap_off;
+               for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
+                       phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
+
+               phb->ioda.dma32_segmap = aux + dma32map_off;
+               for (segno = 0; segno < phb->ioda.dma32_count; segno++)
+                       phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
+       }
        phb->ioda.pe_array = aux + pemap_off;
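[Editor's note] The setup above carves one contiguous allocation into the PE allocation bitmap, the m64/m32 segment maps (plus io and dma32 maps on IODA1), and the PE array, using the byte offsets computed just before. A compact stand-alone sketch of that offset layout with made-up counts (plain calloc stands in for memblock_virt_alloc, and the PE struct size is invented):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned total_pe = 256, dma32_count = 8;	/* made-up counts */
	size_t size, m64map_off, m32map_off, iomap_off, dma32map_off, pemap_off;
	unsigned char *aux;

	/* PE allocation bitmap, rounded up to an unsigned long boundary */
	size = (total_pe / 8 + sizeof(unsigned long) - 1) &
	       ~(sizeof(unsigned long) - 1);
	m64map_off = size;	size += total_pe * sizeof(int);
	m32map_off = size;	size += total_pe * sizeof(int);
	iomap_off = size;	size += total_pe * sizeof(int);
	dma32map_off = size;	size += dma32_count * sizeof(int);
	pemap_off = size;	size += total_pe * 64;	/* pretend a PE struct is 64 bytes */

	aux = calloc(1, size);
	if (!aux)
		return 1;
	printf("bitmap@0 m64@%zu m32@%zu io@%zu dma32@%zu pe_array@%zu total=%zu\n",
	       m64map_off, m32map_off, iomap_off, dma32map_off, pemap_off, size);
	free(aux);
	return 0;
}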
-       set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc);
 
-       INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
+       /*
+        * Choose a PE number for the root bus, which shouldn't have
+        * M64 resources consumed by its child devices. Pick the PE
+        * number adjacent to the reserved one if possible.
+        */
+       pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
+       if (phb->ioda.reserved_pe_idx == 0) {
+               phb->ioda.root_pe_idx = 1;
+               pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
+       } else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
+               phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
+               pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
+       } else {
+               phb->ioda.root_pe_idx = IODA_INVALID_PE;
+       }
+
        INIT_LIST_HEAD(&phb->ioda.pe_list);
        mutex_init(&phb->ioda.pe_list_mutex);
 
        /* Calculate how many 32-bit TCE segments we have */
-       phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
+       phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+                               PNV_IODA1_DMA32_SEGSIZE;
 
 #if 0 /* We should really do that ... */
        rc = opal_pci_set_phb_mem_window(opal->phb_id,
@@ -3362,7 +3686,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 #endif
 
        pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
-               phb->ioda.total_pe, phb->ioda.reserved_pe,
+               phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
                phb->ioda.m32_size, phb->ioda.m32_segsize);
        if (phb->ioda.m64_size)
                pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
@@ -3377,12 +3701,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        phb->freeze_pe = pnv_ioda_freeze_pe;
        phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
 
-       /* Setup RID -> PE mapping function */
-       phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
-
-       /* Setup TCEs */
-       phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
-
        /* Setup MSI support */
        pnv_pci_init_ioda_msis(phb);
 
@@ -3395,10 +3713,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
         */
        ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
 
-       if (phb->type == PNV_PHB_NPU)
+       if (phb->type == PNV_PHB_NPU) {
                hose->controller_ops = pnv_npu_ioda_controller_ops;
-       else
+       } else {
+               phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
                hose->controller_ops = pnv_pci_ioda_controller_ops;
+       }
 
 #ifdef CONFIG_PCI_IOV
        ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;