powerpc/powernv: Use PE instead of number during setup and release
[cascardo/linux.git] / arch / powerpc / platforms / powernv / pci-ioda.c
index 832b430..c762f38 100644 (file)
@@ -48,8 +48,9 @@
 #include "powernv.h"
 #include "pci.h"
 
-/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
-#define TCE32_TABLE_SIZE       ((0x10000000 / 0x1000) * 8)
+#define PNV_IODA1_M64_NUM      16      /* Number of M64 BARs   */
+#define PNV_IODA1_M64_SEGS     8       /* Segments per M64 BAR */
+#define PNV_IODA1_DMA32_SEGSIZE        0x10000000
 
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
 #define POWERNV_IOMMU_MAX_LEVELS       5
@@ -122,6 +123,14 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
                (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
 }
 
+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
+{
+       phb->ioda.pe_array[pe_no].phb = phb;
+       phb->ioda.pe_array[pe_no].pe_number = pe_no;
+
+       return &phb->ioda.pe_array[pe_no];
+}
+
 static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
 {
        if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
@@ -134,11 +143,10 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
                pr_debug("%s: PE %d was reserved on PHB#%x\n",
                         __func__, pe_no, phb->hose->global_number);
 
-       phb->ioda.pe_array[pe_no].phb = phb;
-       phb->ioda.pe_array[pe_no].pe_number = pe_no;
+       pnv_ioda_init_pe(phb, pe_no);
 }
 
-static unsigned int pnv_ioda_alloc_pe(struct pnv_phb *phb)
+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
 {
        unsigned long pe;
 
@@ -146,20 +154,20 @@ static unsigned int pnv_ioda_alloc_pe(struct pnv_phb *phb)
                pe = find_next_zero_bit(phb->ioda.pe_alloc,
                                        phb->ioda.total_pe_num, 0);
                if (pe >= phb->ioda.total_pe_num)
-                       return IODA_INVALID_PE;
+                       return NULL;
        } while(test_and_set_bit(pe, phb->ioda.pe_alloc));
 
-       phb->ioda.pe_array[pe].phb = phb;
-       phb->ioda.pe_array[pe].pe_number = pe;
-       return pe;
+       return pnv_ioda_init_pe(phb, pe);
 }
 
-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
+static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
 {
-       WARN_ON(phb->ioda.pe_array[pe].pdev);
+       struct pnv_phb *phb = pe->phb;
 
-       memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
-       clear_bit(pe, phb->ioda.pe_alloc);
+       WARN_ON(pe->pdev);
+
+       memset(pe, 0, sizeof(struct pnv_ioda_pe));
+       clear_bit(pe->pe_number, phb->ioda.pe_alloc);
 }
 
 /* The default M64 BAR is shared by all PEs */
@@ -219,7 +227,7 @@ fail:
        return -EIO;
 }
 
-static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
+static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
                                         unsigned long *pe_bitmap)
 {
        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
@@ -246,22 +254,80 @@ static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
        }
 }
 
-static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus,
-                                    unsigned long *pe_bitmap,
-                                    bool all)
+static int pnv_ioda1_init_m64(struct pnv_phb *phb)
+{
+       struct resource *r;
+       int index;
+
+       /*
+        * There are 16 M64 BARs, each of which has 8 segments
+        * (PNV_IODA1_M64_NUM * PNV_IODA1_M64_SEGS). So there are as
+        * many M64 segments as the maximum number of PEs, which is 128.
+        */
+       for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
+               unsigned long base, segsz = phb->ioda.m64_segsize;
+               int64_t rc;
+
+               base = phb->ioda.m64_base +
+                      index * PNV_IODA1_M64_SEGS * segsz;
+               rc = opal_pci_set_phb_mem_window(phb->opal_id,
+                               OPAL_M64_WINDOW_TYPE, index, base, 0,
+                               PNV_IODA1_M64_SEGS * segsz);
+               if (rc != OPAL_SUCCESS) {
+                       pr_warn("  Error %lld setting M64 PHB#%d-BAR#%d\n",
+                               rc, phb->hose->global_number, index);
+                       goto fail;
+               }
+
+               rc = opal_pci_phb_mmio_enable(phb->opal_id,
+                               OPAL_M64_WINDOW_TYPE, index,
+                               OPAL_ENABLE_M64_SPLIT);
+               if (rc != OPAL_SUCCESS) {
+                       pr_warn("  Error %lld enabling M64 PHB#%d-BAR#%d\n",
+                               rc, phb->hose->global_number, index);
+                       goto fail;
+               }
+       }
+
+       /*
+        * Exclude the segment used by the reserved PE, which is
+        * expected to be PE#0 or the last supported PE#.
+        */
+       r = &phb->hose->mem_resources[1];
+       if (phb->ioda.reserved_pe_idx == 0)
+               r->start += phb->ioda.m64_segsize;
+       else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
+               r->end -= phb->ioda.m64_segsize;
+       else
+               WARN(1, "Wrong reserved PE#%d on PHB#%d\n",
+                    phb->ioda.reserved_pe_idx, phb->hose->global_number);
+
+       return 0;
+
+fail:
+       for ( ; index >= 0; index--)
+               opal_pci_phb_mmio_enable(phb->opal_id,
+                       OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
+
+       return -EIO;
+}
+
+static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
+                                   unsigned long *pe_bitmap,
+                                   bool all)
 {
        struct pci_dev *pdev;
 
        list_for_each_entry(pdev, &bus->devices, bus_list) {
-               pnv_ioda2_reserve_dev_m64_pe(pdev, pe_bitmap);
+               pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
 
                if (all && pdev->subordinate)
-                       pnv_ioda2_reserve_m64_pe(pdev->subordinate,
-                                                pe_bitmap, all);
+                       pnv_ioda_reserve_m64_pe(pdev->subordinate,
+                                               pe_bitmap, all);
        }
 }
 
-static unsigned int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
 {
        struct pci_controller *hose = pci_bus_to_host(bus);
        struct pnv_phb *phb = hose->private_data;
@@ -271,7 +337,7 @@ static unsigned int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
 
        /* Root bus shouldn't use M64 */
        if (pci_is_root_bus(bus))
-               return IODA_INVALID_PE;
+               return NULL;
 
        /* Allocate bitmap */
        size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
@@ -279,11 +345,11 @@ static unsigned int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
        if (!pe_alloc) {
                pr_warn("%s: Out of memory !\n",
                        __func__);
-               return IODA_INVALID_PE;
+               return NULL;
        }
 
        /* Figure out reserved PE numbers by the PE */
-       pnv_ioda2_reserve_m64_pe(bus, pe_alloc, all);
+       pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
 
        /*
         * the current bus might not own M64 window and that's all
@@ -292,7 +358,7 @@ static unsigned int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
         */
        if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
                kfree(pe_alloc);
-               return IODA_INVALID_PE;
+               return NULL;
        }
 
        /*
@@ -315,10 +381,30 @@ static unsigned int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
                        pe->master = master_pe;
                        list_add_tail(&pe->list, &master_pe->slaves);
                }
+
+               /*
+                * P7IOC supports M64DT, which helps mapping M64 segment
+                * to one particular PE#. However, PHB3 has fixed mapping
+                * between M64 segment and PE#. In order to have same logic
+                * for P7IOC and PHB3, we enforce fixed mapping between M64
+                * segment and PE# on P7IOC.
+                */
+               if (phb->type == PNV_PHB_IODA1) {
+                       int64_t rc;
+
+                       rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+                                       pe->pe_number, OPAL_M64_WINDOW_TYPE,
+                                       pe->pe_number / PNV_IODA1_M64_SEGS,
+                                       pe->pe_number % PNV_IODA1_M64_SEGS);
+                       if (rc != OPAL_SUCCESS)
+                               pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n",
+                                       __func__, rc, phb->hose->global_number,
+                                       pe->pe_number);
+               }
        }
 
        kfree(pe_alloc);
-       return master_pe->pe_number;
+       return master_pe;
 }
 
 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
@@ -329,8 +415,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
        const u32 *r;
        u64 pci_addr;
 
-       /* FIXME: Support M64 for P7IOC */
-       if (phb->type != PNV_PHB_IODA2) {
+       if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
                pr_info("  Not support M64 window\n");
                return;
        }
@@ -364,9 +449,12 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
 
        /* Use last M64 BAR to cover M64 window */
        phb->ioda.m64_bar_idx = 15;
-       phb->init_m64 = pnv_ioda2_init_m64;
-       phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe;
-       phb->pick_m64_pe = pnv_ioda2_pick_m64_pe;
+       if (phb->type == PNV_PHB_IODA1)
+               phb->init_m64 = pnv_ioda1_init_m64;
+       else
+               phb->init_m64 = pnv_ioda2_init_m64;
+       phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
+       phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
 }
 
 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
@@ -809,44 +897,6 @@ out:
        return 0;
 }
 
-static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
-                                      struct pnv_ioda_pe *pe)
-{
-       struct pnv_ioda_pe *lpe;
-
-       list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) {
-               if (lpe->dma_weight < pe->dma_weight) {
-                       list_add_tail(&pe->dma_link, &lpe->dma_link);
-                       return;
-               }
-       }
-       list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list);
-}
-
-static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
-{
-       /* This is quite simplistic. The "base" weight of a device
-        * is 10. 0 means no DMA is to be accounted for it.
-        */
-
-       /* If it's a bridge, no DMA */
-       if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
-               return 0;
-
-       /* Reduce the weight of slow USB controllers */
-       if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
-           dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
-           dev->class == PCI_CLASS_SERIAL_USB_EHCI)
-               return 3;
-
-       /* Increase the weight of RAID (includes Obsidian) */
-       if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
-               return 15;
-
-       /* Default */
-       return 10;
-}
-
 #ifdef CONFIG_PCI_IOV
 static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 {
@@ -920,7 +970,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
        struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn = pci_get_pdn(dev);
        struct pnv_ioda_pe *pe;
-       unsigned int pe_num;
 
        if (!pdn) {
                pr_err("%s: Device tree node not associated properly\n",
@@ -930,8 +979,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
        if (pdn->pe_number != IODA_INVALID_PE)
                return NULL;
 
-       pe_num = pnv_ioda_alloc_pe(phb);
-       if (pe_num == IODA_INVALID_PE) {
+       pe = pnv_ioda_alloc_pe(phb);
+       if (!pe) {
                pr_warning("%s: Not enough PE# available, disabling device\n",
                           pci_name(dev));
                return NULL;
@@ -944,14 +993,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
         *
         * At some point we want to remove the PDN completely anyways
         */
-       pe = &phb->ioda.pe_array[pe_num];
        pci_dev_get(dev);
        pdn->pcidev = dev;
-       pdn->pe_number = pe_num;
+       pdn->pe_number = pe->pe_number;
        pe->flags = PNV_IODA_PE_DEV;
        pe->pdev = dev;
        pe->pbus = NULL;
-       pe->tce32_seg = -1;
        pe->mve_number = -1;
        pe->rid = dev->bus->number << 8 | pdn->devfn;
 
@@ -959,24 +1006,13 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 
        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
-               if (pe_num)
-                       pnv_ioda_free_pe(phb, pe_num);
+               pnv_ioda_free_pe(pe);
                pdn->pe_number = IODA_INVALID_PE;
                pe->pdev = NULL;
                pci_dev_put(dev);
                return NULL;
        }
 
-       /* Assign a DMA weight to the device */
-       pe->dma_weight = pnv_ioda_dma_weight(dev);
-       if (pe->dma_weight != 0) {
-               phb->ioda.dma_weight += pe->dma_weight;
-               phb->ioda.dma_pe_count++;
-       }
-
-       /* Link the PE */
-       pnv_ioda_link_pe_by_weight(phb, pe);
-
        return pe;
 }
 
@@ -994,7 +1030,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
                }
                pdn->pcidev = dev;
                pdn->pe_number = pe->pe_number;
-               pe->dma_weight += pnv_ioda_dma_weight(dev);
                if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
                        pnv_ioda_setup_same_PE(dev->subordinate, pe);
        }
@@ -1006,49 +1041,44 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
  * subordinate PCI devices and buses. The second type of PE is normally
  * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
  */
-static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
+static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 {
        struct pci_controller *hose = pci_bus_to_host(bus);
        struct pnv_phb *phb = hose->private_data;
-       struct pnv_ioda_pe *pe;
-       unsigned int pe_num = IODA_INVALID_PE;
+       struct pnv_ioda_pe *pe = NULL;
 
        /* Check if PE is determined by M64 */
        if (phb->pick_m64_pe)
-               pe_num = phb->pick_m64_pe(bus, all);
+               pe = phb->pick_m64_pe(bus, all);
 
        /* The PE number isn't pinned by M64 */
-       if (pe_num == IODA_INVALID_PE)
-               pe_num = pnv_ioda_alloc_pe(phb);
+       if (!pe)
+               pe = pnv_ioda_alloc_pe(phb);
 
-       if (pe_num == IODA_INVALID_PE) {
+       if (!pe) {
                pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
                        __func__, pci_domain_nr(bus), bus->number);
-               return;
+               return NULL;
        }
 
-       pe = &phb->ioda.pe_array[pe_num];
        pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
        pe->pbus = bus;
        pe->pdev = NULL;
-       pe->tce32_seg = -1;
        pe->mve_number = -1;
        pe->rid = bus->busn_res.start << 8;
-       pe->dma_weight = 0;
 
        if (all)
                pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
-                       bus->busn_res.start, bus->busn_res.end, pe_num);
+                       bus->busn_res.start, bus->busn_res.end, pe->pe_number);
        else
                pe_info(pe, "Secondary bus %d associated with PE#%d\n",
-                       bus->busn_res.start, pe_num);
+                       bus->busn_res.start, pe->pe_number);
 
        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
-               if (pe_num)
-                       pnv_ioda_free_pe(phb, pe_num);
+               pnv_ioda_free_pe(pe);
                pe->pbus = NULL;
-               return;
+               return NULL;
        }
 
        /* Associate it with all child devices */
@@ -1057,16 +1087,7 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
        /* Put PE to the list */
        list_add_tail(&pe->list, &phb->ioda.pe_list);
 
-       /* Account for one DMA PE if at least one DMA capable device exist
-        * below the bridge
-        */
-       if (pe->dma_weight != 0) {
-               phb->ioda.dma_weight += pe->dma_weight;
-               phb->ioda.dma_pe_count++;
-       }
-
-       /* Link the PE */
-       pnv_ioda_link_pe_by_weight(phb, pe);
+       return pe;
 }
 
 static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
@@ -1107,7 +1128,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
                        rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
                        npu_pdn->pcidev = npu_pdev;
                        npu_pdn->pe_number = pe_num;
-                       pe->dma_weight += pnv_ioda_dma_weight(npu_pdev);
                        phb->ioda.pe_rmap[rid] = pe->pe_number;
 
                        /* Map the PE to this link */
@@ -1379,7 +1399,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 
                pnv_ioda_deconfigure_pe(phb, pe);
 
-               pnv_ioda_free_pe(phb, pe->pe_number);
+               pnv_ioda_free_pe(pe);
        }
 }
 
@@ -1388,6 +1408,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
+       struct pnv_ioda_pe    *pe;
        struct pci_dn         *pdn;
        struct pci_sriov      *iov;
        u16                    num_vfs, i;
@@ -1412,8 +1433,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
                /* Release PE numbers */
                if (pdn->m64_single_mode) {
                        for (i = 0; i < num_vfs; i++) {
-                               if (pdn->pe_num_map[i] != IODA_INVALID_PE)
-                                       pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+                               if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+                                       continue;
+
+                               pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+                               pnv_ioda_free_pe(pe);
                        }
                } else
                        bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
@@ -1455,7 +1479,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
                pe->flags = PNV_IODA_PE_VF;
                pe->pbus = NULL;
                pe->parent_dev = pdev;
-               pe->tce32_seg = -1;
                pe->mve_number = -1;
                pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
                           pci_iov_virtfn_devfn(pdev, vf_index);
@@ -1467,8 +1490,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
                if (pnv_ioda_configure_pe(phb, pe)) {
                        /* XXX What do we do here ? */
-                       if (pe_num)
-                               pnv_ioda_free_pe(phb, pe_num);
+                       pnv_ioda_free_pe(pe);
                        pe->pdev = NULL;
                        continue;
                }
@@ -1487,6 +1509,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
+       struct pnv_ioda_pe    *pe;
        struct pci_dn         *pdn;
        int                    ret;
        u16                    i;
@@ -1529,11 +1552,13 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
                /* Calculate available PE for required VFs */
                if (pdn->m64_single_mode) {
                        for (i = 0; i < num_vfs; i++) {
-                               pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb);
-                               if (pdn->pe_num_map[i] == IODA_INVALID_PE) {
+                               pe = pnv_ioda_alloc_pe(phb);
+                               if (!pe) {
                                        ret = -EBUSY;
                                        goto m64_failed;
                                }
+
+                               pdn->pe_num_map[i] = pe->pe_number;
                        }
                } else {
                        mutex_lock(&phb->ioda.pe_alloc_mutex);
@@ -1578,8 +1603,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 m64_failed:
        if (pdn->m64_single_mode) {
                for (i = 0; i < num_vfs; i++) {
-                       if (pdn->pe_num_map[i] != IODA_INVALID_PE)
-                               pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+                       if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+                               continue;
+
+                       pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+                       pnv_ioda_free_pe(pe);
                }
        } else
                bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
@@ -1946,56 +1974,140 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
        .free = pnv_ioda2_table_free,
 };
 
-static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
-                                     struct pnv_ioda_pe *pe, unsigned int base,
-                                     unsigned int segs)
+static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
+{
+       unsigned int *weight = (unsigned int *)data;
+
+       /* This is quite simplistic. The "base" weight of a device
+        * is 10. 0 means no DMA is to be accounted for it.
+        */
+       if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+               return 0;
+
+       if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
+           dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
+           dev->class == PCI_CLASS_SERIAL_USB_EHCI)
+               *weight += 3;
+       else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
+               *weight += 15;
+       else
+               *weight += 10;
+
+       return 0;
+}
+
+static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
+{
+       unsigned int weight = 0;
+
+       /* SRIOV VF has same DMA32 weight as its PF */
+#ifdef CONFIG_PCI_IOV
+       if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
+               pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
+               return weight;
+       }
+#endif
+
+       if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
+               pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
+       } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
+               struct pci_dev *pdev;
+
+               list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
+                       pnv_pci_ioda_dev_dma_weight(pdev, &weight);
+       } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
+               pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
+       }
+
+       return weight;
+}
+
+static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
+                                      struct pnv_ioda_pe *pe)
 {
 
        struct page *tce_mem = NULL;
        struct iommu_table *tbl;
-       unsigned int i;
+       unsigned int weight, total_weight = 0;
+       unsigned int tce32_segsz, base, segs, avail, i;
        int64_t rc;
        void *addr;
 
        /* XXX FIXME: Handle 64-bit only DMA devices */
        /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
        /* XXX FIXME: Allocate multi-level tables on PHB3 */
+       weight = pnv_pci_ioda_pe_dma_weight(pe);
+       if (!weight)
+               return;
+
+       pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
+                    &total_weight);
+       segs = (weight * phb->ioda.dma32_count) / total_weight;
+       if (!segs)
+               segs = 1;
 
-       /* We shouldn't already have a 32-bit DMA associated */
-       if (WARN_ON(pe->tce32_seg >= 0))
+       /*
+        * Allocate contiguous DMA32 segments. We begin with the expected
+        * number of segments. On each failed attempt, the number of
+        * DMA32 segments to be allocated is decreased by one until one
+        * segment is allocated successfully.
+        */
+       do {
+               for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
+                       for (avail = 0, i = base; i < base + segs; i++) {
+                               if (phb->ioda.dma32_segmap[i] ==
+                                   IODA_INVALID_PE)
+                                       avail++;
+                       }
+
+                       if (avail == segs)
+                               goto found;
+               }
+       } while (--segs);
+
+       if (!segs) {
+               pe_warn(pe, "No available DMA32 segments\n");
                return;
+       }
 
+found:
        tbl = pnv_pci_table_alloc(phb->hose->node);
        iommu_register_group(&pe->table_group, phb->hose->global_number,
                        pe->pe_number);
        pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
        /* Grab a 32-bit TCE table */
-       pe->tce32_seg = base;
+       pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
+               weight, total_weight, base, segs);
        pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
-               (base << 28), ((base + segs) << 28) - 1);
+               base * PNV_IODA1_DMA32_SEGSIZE,
+               (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
 
        /* XXX Currently, we allocate one big contiguous table for the
         * TCEs. We only really need one chunk per 256M of TCE space
         * (ie per segment) but that's an optimization for later, it
         * requires some added smarts with our get/put_tce implementation
+        *
+        * Each TCE page is 4KB in size and each TCE entry occupies
+        * 8 bytes.
         */
+       tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
        tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
-                                  get_order(TCE32_TABLE_SIZE * segs));
+                                  get_order(tce32_segsz * segs));
        if (!tce_mem) {
                pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
                goto fail;
        }
        addr = page_address(tce_mem);
-       memset(addr, 0, TCE32_TABLE_SIZE * segs);
+       memset(addr, 0, tce32_segsz * segs);
 
        /* Configure HW */
        for (i = 0; i < segs; i++) {
                rc = opal_pci_map_pe_dma_window(phb->opal_id,
                                              pe->pe_number,
                                              base + i, 1,
-                                             __pa(addr) + TCE32_TABLE_SIZE * i,
-                                             TCE32_TABLE_SIZE, 0x1000);
+                                             __pa(addr) + tce32_segsz * i,
+                                             tce32_segsz, IOMMU_PAGE_SIZE_4K);
                if (rc) {
                        pe_err(pe, " Failed to configure 32-bit TCE table,"
                               " err %ld\n", rc);
@@ -2003,9 +2115,14 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
                }
        }
 
+       /* Setup DMA32 segment mapping */
+       for (i = base; i < base + segs; i++)
+               phb->ioda.dma32_segmap[i] = pe->pe_number;
+
        /* Setup linux iommu table */
-       pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
-                                 base << 28, IOMMU_PAGE_SHIFT_4K);
+       pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
+                                 base * PNV_IODA1_DMA32_SEGSIZE,
+                                 IOMMU_PAGE_SHIFT_4K);
 
        /* OPAL variant of P7IOC SW invalidated TCEs */
        if (phb->ioda.tce_inval_reg)
@@ -2032,10 +2149,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
        return;
  fail:
        /* XXX Failure: Try to fallback to 64-bit only ? */
-       if (pe->tce32_seg >= 0)
-               pe->tce32_seg = -1;
        if (tce_mem)
-               __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+               __free_pages(tce_mem, get_order(tce32_segsz * segs));
        if (tbl) {
                pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
                iommu_free_table(tbl, "pnv");
@@ -2444,10 +2559,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 {
        int64_t rc;
 
-       /* We shouldn't already have a 32-bit DMA associated */
-       if (WARN_ON(pe->tce32_seg >= 0))
-               return;
-
        /* TVE #1 is selected by PCI address bit 59 */
        pe->tce_bypass_base = 1ull << 59;
 
@@ -2455,7 +2566,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
                        pe->pe_number);
 
        /* The PE will reserve all possible 32-bits space */
-       pe->tce32_seg = 0;
        pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
                phb->ioda.m32_pci_base);
 
@@ -2471,11 +2581,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 #endif
 
        rc = pnv_pci_ioda2_setup_default_config(pe);
-       if (rc) {
-               if (pe->tce32_seg >= 0)
-                       pe->tce32_seg = -1;
+       if (rc)
                return;
-       }
 
        if (pe->flags & PNV_IODA_PE_DEV)
                iommu_add_device(&pe->pdev->dev);
@@ -2486,47 +2593,24 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 static void pnv_ioda_setup_dma(struct pnv_phb *phb)
 {
        struct pci_controller *hose = phb->hose;
-       unsigned int residual, remaining, segs, tw, base;
        struct pnv_ioda_pe *pe;
+       unsigned int weight;
 
        /* If we have more PE# than segments available, hand out one
         * per PE until we run out and let the rest fail. If not,
         * then we assign at least one segment per PE, plus more based
         * on the amount of devices under that PE
         */
-       if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
-               residual = 0;
-       else
-               residual = phb->ioda.tce32_count -
-                       phb->ioda.dma_pe_count;
-
-       pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
-               hose->global_number, phb->ioda.tce32_count);
-       pr_info("PCI: %d PE# for a total weight of %d\n",
-               phb->ioda.dma_pe_count, phb->ioda.dma_weight);
+       pr_info("PCI: Domain %04x has %d available 32-bit DMA segments\n",
+               hose->global_number, phb->ioda.dma32_count);
 
        pnv_pci_ioda_setup_opal_tce_kill(phb);
 
-       /* Walk our PE list and configure their DMA segments, hand them
-        * out one base segment plus any residual segments based on
-        * weight
-        */
-       remaining = phb->ioda.tce32_count;
-       tw = phb->ioda.dma_weight;
-       base = 0;
-       list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
-               if (!pe->dma_weight)
+       /* Walk our PE list and configure their DMA segments */
+       list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+               weight = pnv_pci_ioda_pe_dma_weight(pe);
+               if (!weight)
                        continue;
-               if (!remaining) {
-                       pe_warn(pe, "No DMA32 resources available\n");
-                       continue;
-               }
-               segs = 1;
-               if (residual) {
-                       segs += ((pe->dma_weight * residual)  + (tw / 2)) / tw;
-                       if (segs > remaining)
-                               segs = remaining;
-               }
 
                /*
                 * For IODA2 compliant PHB3, we needn't care about the weight.
@@ -2534,12 +2618,9 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
                 * the specific PE.
                 */
                if (phb->type == PNV_PHB_IODA1) {
-                       pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
-                               pe->dma_weight, segs);
-                       pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
+                       pnv_pci_ioda1_setup_dma_pe(phb, pe);
                } else if (phb->type == PNV_PHB_IODA2) {
                        pe_info(pe, "Assign DMA32 space\n");
-                       segs = 0;
                        pnv_pci_ioda2_setup_dma_pe(phb, pe);
                } else if (phb->type == PNV_PHB_NPU) {
                        /*
@@ -2549,9 +2630,6 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
                         * as the PHB3 TVT.
                         */
                }
-
-               remaining -= segs;
-               base += segs;
        }
 }
 
@@ -3079,13 +3157,18 @@ static void pnv_npu_ioda_fixup(void)
        struct pci_controller *hose, *tmp;
        struct pnv_phb *phb;
        struct pnv_ioda_pe *pe;
+       unsigned int weight;
 
        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
                phb = hose->private_data;
                if (phb->type != PNV_PHB_NPU)
                        continue;
 
-               list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
+               list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+                       weight = pnv_pci_ioda_pe_dma_weight(pe);
+                       if (WARN_ON(!weight))
+                               continue;
+
                        enable_bypass = dma_get_mask(&pe->pdev->dev) ==
                                DMA_BIT_MASK(64);
                        pnv_npu_init_dma_pe(pe);
@@ -3253,7 +3336,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 {
        struct pci_controller *hose;
        struct pnv_phb *phb;
-       unsigned long size, m64map_off, m32map_off, pemap_off, iomap_off = 0;
+       unsigned long size, m64map_off, m32map_off, pemap_off;
+       unsigned long iomap_off = 0, dma32map_off = 0;
        const __be64 *prop64;
        const __be32 *prop32;
        int len;
@@ -3339,6 +3423,10 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
        phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
 
+       /* Calculate how many 32-bit TCE segments we have */
+       phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+                               PNV_IODA1_DMA32_SEGSIZE;
+
        /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
        size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
        m64map_off = size;
@@ -3348,6 +3436,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        if (phb->type == PNV_PHB_IODA1) {
                iomap_off = size;
                size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
+               dma32map_off = size;
+               size += phb->ioda.dma32_count *
+                       sizeof(phb->ioda.dma32_segmap[0]);
        }
        pemap_off = size;
        size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
@@ -3363,16 +3454,20 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
                phb->ioda.io_segmap = aux + iomap_off;
                for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
                        phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
+
+               phb->ioda.dma32_segmap = aux + dma32map_off;
+               for (segno = 0; segno < phb->ioda.dma32_count; segno++)
+                       phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
        }
        phb->ioda.pe_array = aux + pemap_off;
        set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc);
 
-       INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
        INIT_LIST_HEAD(&phb->ioda.pe_list);
        mutex_init(&phb->ioda.pe_list_mutex);
 
        /* Calculate how many 32-bit TCE segments we have */
-       phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
+       phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+                               PNV_IODA1_DMA32_SEGSIZE;
 
 #if 0 /* We should really do that ... */
        rc = opal_pci_set_phb_mem_window(opal->phb_id,