powerpc/powernv: Dynamically release PE
[cascardo/linux.git] arch/powerpc/platforms/powernv/pci-ioda.c
1 /*
2  * Support PCI/PCIe on PowerNV platforms
3  *
4  * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11
12 #undef DEBUG
13
14 #include <linux/kernel.h>
15 #include <linux/pci.h>
16 #include <linux/crash_dump.h>
17 #include <linux/debugfs.h>
18 #include <linux/delay.h>
19 #include <linux/string.h>
20 #include <linux/init.h>
21 #include <linux/bootmem.h>
22 #include <linux/irq.h>
23 #include <linux/io.h>
24 #include <linux/msi.h>
25 #include <linux/memblock.h>
26 #include <linux/iommu.h>
27 #include <linux/rculist.h>
28 #include <linux/sizes.h>
29
30 #include <asm/sections.h>
31 #include <asm/io.h>
32 #include <asm/prom.h>
33 #include <asm/pci-bridge.h>
34 #include <asm/machdep.h>
35 #include <asm/msi_bitmap.h>
36 #include <asm/ppc-pci.h>
37 #include <asm/opal.h>
38 #include <asm/iommu.h>
39 #include <asm/tce.h>
40 #include <asm/xics.h>
41 #include <asm/debug.h>
42 #include <asm/firmware.h>
43 #include <asm/pnv-pci.h>
44 #include <asm/mmzone.h>
45
46 #include <misc/cxl-base.h>
47
48 #include "powernv.h"
49 #include "pci.h"
50
51 #define PNV_IODA1_M64_NUM       16      /* Number of M64 BARs   */
52 #define PNV_IODA1_M64_SEGS      8       /* Segments per M64 BAR */
53 #define PNV_IODA1_DMA32_SEGSIZE 0x10000000
54
55 #define POWERNV_IOMMU_DEFAULT_LEVELS    1
56 #define POWERNV_IOMMU_MAX_LEVELS        5
57
58 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
59
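/*
 * printk() helper that prefixes the message with the PE's PCI address
 * (device, bus or VF) and its PE number, at the given log level.
 */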
60 void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
61                             const char *fmt, ...)
62 {
63         struct va_format vaf;
64         va_list args;
65         char pfix[32];
66
67         va_start(args, fmt);
68
69         vaf.fmt = fmt;
70         vaf.va = &args;
71
72         if (pe->flags & PNV_IODA_PE_DEV)
73                 strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
74         else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
75                 sprintf(pfix, "%04x:%02x     ",
76                         pci_domain_nr(pe->pbus), pe->pbus->number);
77 #ifdef CONFIG_PCI_IOV
78         else if (pe->flags & PNV_IODA_PE_VF)
79                 sprintf(pfix, "%04x:%02x:%02x.%d",
80                         pci_domain_nr(pe->parent_dev->bus),
81                         (pe->rid & 0xff00) >> 8,
82                         PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
83 #endif /* CONFIG_PCI_IOV*/
84
85         printk("%spci %s: [PE# %.3d] %pV",
86                level, pfix, pe->pe_number, &vaf);
87
88         va_end(args);
89 }
90
91 static bool pnv_iommu_bypass_disabled __read_mostly;
92
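/* Handle the "iommu=" early parameter; "iommu=nobypass" disables the bypass window. */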
93 static int __init iommu_setup(char *str)
94 {
95         if (!str)
96                 return -EINVAL;
97
98         while (*str) {
99                 if (!strncmp(str, "nobypass", 8)) {
100                         pnv_iommu_bypass_disabled = true;
101                         pr_info("PowerNV: IOMMU bypass window disabled.\n");
102                         break;
103                 }
104                 str += strcspn(str, ",");
105                 if (*str == ',')
106                         str++;
107         }
108
109         return 0;
110 }
111 early_param("iommu", iommu_setup);
112
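/* True for a 64-bit prefetchable memory resource, i.e. an M64 window candidate. */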
113 static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
114 {
115         return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==
116                 (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
117 }
118
119 static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
120 {
121         phb->ioda.pe_array[pe_no].phb = phb;
122         phb->ioda.pe_array[pe_no].pe_number = pe_no;
123
124         return &phb->ioda.pe_array[pe_no];
125 }
126
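/* Mark a specific PE number as in use and initialise its PE structure. */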
127 static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
128 {
129         if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
130                 pr_warn("%s: Invalid PE %d on PHB#%x\n",
131                         __func__, pe_no, phb->hose->global_number);
132                 return;
133         }
134
135         if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
136                 pr_debug("%s: PE %d was reserved on PHB#%x\n",
137                          __func__, pe_no, phb->hose->global_number);
138
139         pnv_ioda_init_pe(phb, pe_no);
140 }
141
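/* Allocate an unused PE number, scanning from the top of the PE space downwards. */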
142 static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
143 {
144         long pe;
145
146         for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
147                 if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
148                         return pnv_ioda_init_pe(phb, pe);
149         }
150
151         return NULL;
152 }
153
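/* Return a PE to the allocator; it must no longer be associated with a device. */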
154 static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
155 {
156         struct pnv_phb *phb = pe->phb;
157         unsigned int pe_num = pe->pe_number;
158
159         WARN_ON(pe->pdev);
160         memset(pe, 0, sizeof(struct pnv_ioda_pe));
161         clear_bit(pe_num, phb->ioda.pe_alloc);
162 }
163
164 /* The default M64 BAR is shared by all PEs */
165 static int pnv_ioda2_init_m64(struct pnv_phb *phb)
166 {
167         const char *desc;
168         struct resource *r;
169         s64 rc;
170
171         /* Configure the default M64 BAR */
172         rc = opal_pci_set_phb_mem_window(phb->opal_id,
173                                          OPAL_M64_WINDOW_TYPE,
174                                          phb->ioda.m64_bar_idx,
175                                          phb->ioda.m64_base,
176                                          0, /* unused */
177                                          phb->ioda.m64_size);
178         if (rc != OPAL_SUCCESS) {
179                 desc = "configuring";
180                 goto fail;
181         }
182
183         /* Enable the default M64 BAR */
184         rc = opal_pci_phb_mmio_enable(phb->opal_id,
185                                       OPAL_M64_WINDOW_TYPE,
186                                       phb->ioda.m64_bar_idx,
187                                       OPAL_ENABLE_M64_SPLIT);
188         if (rc != OPAL_SUCCESS) {
189                 desc = "enabling";
190                 goto fail;
191         }
192
193         /* Mark the M64 BAR assigned */
194         set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc);
195
196         /*
197          * Exclude the segments used by the reserved PE and the root
198          * bus PE, which are either the first two or the last two PEs.
199          */
200         r = &phb->hose->mem_resources[1];
201         if (phb->ioda.reserved_pe_idx == 0)
202                 r->start += (2 * phb->ioda.m64_segsize);
203         else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
204                 r->end -= (2 * phb->ioda.m64_segsize);
205         else
206                 pr_warn("  Cannot strip M64 segment for reserved PE#%d\n",
207                         phb->ioda.reserved_pe_idx);
208
209         return 0;
210
211 fail:
212         pr_warn("  Failure %lld %s M64 BAR#%d\n",
213                 rc, desc, phb->ioda.m64_bar_idx);
214         opal_pci_phb_mmio_enable(phb->opal_id,
215                                  OPAL_M64_WINDOW_TYPE,
216                                  phb->ioda.m64_bar_idx,
217                                  OPAL_DISABLE_M64);
218         return -EIO;
219 }
220
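/*
 * Reserve the PE numbers backing the M64 segments covered by the device's
 * 64-bit prefetchable BARs, either in the given bitmap or in the PHB.
 */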
221 static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
222                                          unsigned long *pe_bitmap)
223 {
224         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
225         struct pnv_phb *phb = hose->private_data;
226         struct resource *r;
227         resource_size_t base, sgsz, start, end;
228         int segno, i;
229
230         base = phb->ioda.m64_base;
231         sgsz = phb->ioda.m64_segsize;
232         for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
233                 r = &pdev->resource[i];
234                 if (!r->parent || !pnv_pci_is_mem_pref_64(r->flags))
235                         continue;
236
237                 start = _ALIGN_DOWN(r->start - base, sgsz);
238                 end = _ALIGN_UP(r->end - base, sgsz);
239                 for (segno = start / sgsz; segno < end / sgsz; segno++) {
240                         if (pe_bitmap)
241                                 set_bit(segno, pe_bitmap);
242                         else
243                                 pnv_ioda_reserve_pe(phb, segno);
244                 }
245         }
246 }
247
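/* Configure and enable the 16 M64 BARs (8 segments each) on IODA1. */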
248 static int pnv_ioda1_init_m64(struct pnv_phb *phb)
249 {
250         struct resource *r;
251         int index;
252
253         /*
254          * There are 16 M64 BARs, each of which has 8 segments. So
255          * there are as many M64 segments as the maximum number of
256          * PEs, which is 128.
257          */
258         for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
259                 unsigned long base, segsz = phb->ioda.m64_segsize;
260                 int64_t rc;
261
262                 base = phb->ioda.m64_base +
263                        index * PNV_IODA1_M64_SEGS * segsz;
264                 rc = opal_pci_set_phb_mem_window(phb->opal_id,
265                                 OPAL_M64_WINDOW_TYPE, index, base, 0,
266                                 PNV_IODA1_M64_SEGS * segsz);
267                 if (rc != OPAL_SUCCESS) {
268                         pr_warn("  Error %lld setting M64 PHB#%d-BAR#%d\n",
269                                 rc, phb->hose->global_number, index);
270                         goto fail;
271                 }
272
273                 rc = opal_pci_phb_mmio_enable(phb->opal_id,
274                                 OPAL_M64_WINDOW_TYPE, index,
275                                 OPAL_ENABLE_M64_SPLIT);
276                 if (rc != OPAL_SUCCESS) {
277                         pr_warn("  Error %lld enabling M64 PHB#%d-BAR#%d\n",
278                                 rc, phb->hose->global_number, index);
279                         goto fail;
280                 }
281         }
282
283         /*
284          * Exclude the segments used by the reserved PE and the root
285          * bus PE, which are either the first two or the last two PEs.
286          */
287         r = &phb->hose->mem_resources[1];
288         if (phb->ioda.reserved_pe_idx == 0)
289                 r->start += (2 * phb->ioda.m64_segsize);
290         else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
291                 r->end -= (2 * phb->ioda.m64_segsize);
292         else
293                 WARN(1, "Wrong reserved PE#%d on PHB#%d\n",
294                      phb->ioda.reserved_pe_idx, phb->hose->global_number);
295
296         return 0;
297
298 fail:
299         for ( ; index >= 0; index--)
300                 opal_pci_phb_mmio_enable(phb->opal_id,
301                         OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
302
303         return -EIO;
304 }
305
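/*
 * Reserve M64-backed PE numbers for every device on the bus and, when
 * "all" is set, on its subordinate buses as well.
 */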
306 static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
307                                     unsigned long *pe_bitmap,
308                                     bool all)
309 {
310         struct pci_dev *pdev;
311
312         list_for_each_entry(pdev, &bus->devices, bus_list) {
313                 pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
314
315                 if (all && pdev->subordinate)
316                         pnv_ioda_reserve_m64_pe(pdev->subordinate,
317                                                 pe_bitmap, all);
318         }
319 }
320
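/*
 * Pick the master PE for a bus whose MMIO is backed by M64 segments and
 * chain the remaining segment PEs to it as slaves, forming a compound PE.
 */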
321 static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
322 {
323         struct pci_controller *hose = pci_bus_to_host(bus);
324         struct pnv_phb *phb = hose->private_data;
325         struct pnv_ioda_pe *master_pe, *pe;
326         unsigned long size, *pe_alloc;
327         int i;
328
329         /* Root bus shouldn't use M64 */
330         if (pci_is_root_bus(bus))
331                 return NULL;
332
333         /* Allocate bitmap */
334         size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
335         pe_alloc = kzalloc(size, GFP_KERNEL);
336         if (!pe_alloc) {
337                 pr_warn("%s: Out of memory !\n",
338                         __func__);
339                 return NULL;
340         }
341
342         /* Figure out the PE numbers reserved for the bus's M64 segments */
343         pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
344
345         /*
346          * The current bus might not own any M64 window itself; the
347          * M64 space may all be contributed by its child buses. In that
348          * case there is no M64-dependent PE# to pick.
349          */
350         if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
351                 kfree(pe_alloc);
352                 return NULL;
353         }
354
355         /*
356          * Figure out the master PE and add all slave PEs to the master
357          * PE's list to form a compound PE.
358          */
359         master_pe = NULL;
360         i = -1;
361         while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
362                 phb->ioda.total_pe_num) {
363                 pe = &phb->ioda.pe_array[i];
364
365                 phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
366                 if (!master_pe) {
367                         pe->flags |= PNV_IODA_PE_MASTER;
368                         INIT_LIST_HEAD(&pe->slaves);
369                         master_pe = pe;
370                 } else {
371                         pe->flags |= PNV_IODA_PE_SLAVE;
372                         pe->master = master_pe;
373                         list_add_tail(&pe->list, &master_pe->slaves);
374                 }
375
376                 /*
377                  * P7IOC supports M64DT, which helps mapping M64 segment
378                  * to one particular PE#. However, PHB3 has fixed mapping
379                  * between M64 segment and PE#. In order to have same logic
380                  * for P7IOC and PHB3, we enforce fixed mapping between M64
381                  * segment and PE# on P7IOC.
382                  */
383                 if (phb->type == PNV_PHB_IODA1) {
384                         int64_t rc;
385
386                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
387                                         pe->pe_number, OPAL_M64_WINDOW_TYPE,
388                                         pe->pe_number / PNV_IODA1_M64_SEGS,
389                                         pe->pe_number % PNV_IODA1_M64_SEGS);
390                         if (rc != OPAL_SUCCESS)
391                                 pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n",
392                                         __func__, rc, phb->hose->global_number,
393                                         pe->pe_number);
394                 }
395         }
396
397         kfree(pe_alloc);
398         return master_pe;
399 }
400
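/* Parse the "ibm,opal-m64-window" property and set up the PHB's M64 window state and callbacks. */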
401 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
402 {
403         struct pci_controller *hose = phb->hose;
404         struct device_node *dn = hose->dn;
405         struct resource *res;
406         const u32 *r;
407         u64 pci_addr;
408
409         if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
410                 pr_info("  M64 window not supported\n");
411                 return;
412         }
413
414         if (!firmware_has_feature(FW_FEATURE_OPAL)) {
415                 pr_info("  Firmware too old to support M64 window\n");
416                 return;
417         }
418
419         r = of_get_property(dn, "ibm,opal-m64-window", NULL);
420         if (!r) {
421                 pr_info("  No <ibm,opal-m64-window> on %s\n",
422                         dn->full_name);
423                 return;
424         }
425
426         res = &hose->mem_resources[1];
427         res->name = dn->full_name;
428         res->start = of_translate_address(dn, r + 2);
429         res->end = res->start + of_read_number(r + 4, 2) - 1;
430         res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
431         pci_addr = of_read_number(r, 2);
432         hose->mem_offset[1] = res->start - pci_addr;
433
434         phb->ioda.m64_size = resource_size(res);
435         phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
436         phb->ioda.m64_base = pci_addr;
437
438         pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n",
439                         res->start, res->end, pci_addr);
440
441         /* Use last M64 BAR to cover M64 window */
442         phb->ioda.m64_bar_idx = 15;
443         if (phb->type == PNV_PHB_IODA1)
444                 phb->init_m64 = pnv_ioda1_init_m64;
445         else
446                 phb->init_m64 = pnv_ioda2_init_m64;
447         phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
448         phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
449 }
450
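/* Force a (possibly compound) PE and all of its slaves into the EEH frozen state. */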
451 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
452 {
453         struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
454         struct pnv_ioda_pe *slave;
455         s64 rc;
456
457         /* Fetch master PE */
458         if (pe->flags & PNV_IODA_PE_SLAVE) {
459                 pe = pe->master;
460                 if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
461                         return;
462
463                 pe_no = pe->pe_number;
464         }
465
466         /* Freeze master PE */
467         rc = opal_pci_eeh_freeze_set(phb->opal_id,
468                                      pe_no,
469                                      OPAL_EEH_ACTION_SET_FREEZE_ALL);
470         if (rc != OPAL_SUCCESS) {
471                 pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
472                         __func__, rc, phb->hose->global_number, pe_no);
473                 return;
474         }
475
476         /* Freeze slave PEs */
477         if (!(pe->flags & PNV_IODA_PE_MASTER))
478                 return;
479
480         list_for_each_entry(slave, &pe->slaves, list) {
481                 rc = opal_pci_eeh_freeze_set(phb->opal_id,
482                                              slave->pe_number,
483                                              OPAL_EEH_ACTION_SET_FREEZE_ALL);
484                 if (rc != OPAL_SUCCESS)
485                         pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
486                                 __func__, rc, phb->hose->global_number,
487                                 slave->pe_number);
488         }
489 }
490
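/* Clear the frozen state on the master PE and all of its slaves. */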
491 static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
492 {
493         struct pnv_ioda_pe *pe, *slave;
494         s64 rc;
495
496         /* Find master PE */
497         pe = &phb->ioda.pe_array[pe_no];
498         if (pe->flags & PNV_IODA_PE_SLAVE) {
499                 pe = pe->master;
500                 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
501                 pe_no = pe->pe_number;
502         }
503
504         /* Clear frozen state for master PE */
505         rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
506         if (rc != OPAL_SUCCESS) {
507                 pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
508                         __func__, rc, opt, phb->hose->global_number, pe_no);
509                 return -EIO;
510         }
511
512         if (!(pe->flags & PNV_IODA_PE_MASTER))
513                 return 0;
514
515         /* Clear frozen state for slave PEs */
516         list_for_each_entry(slave, &pe->slaves, list) {
517                 rc = opal_pci_eeh_freeze_clear(phb->opal_id,
518                                              slave->pe_number,
519                                              opt);
520                 if (rc != OPAL_SUCCESS) {
521                         pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
522                                 __func__, rc, opt, phb->hose->global_number,
523                                 slave->pe_number);
524                         return -EIO;
525                 }
526         }
527
528         return 0;
529 }
530
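/* Return the combined frozen state of a (possibly compound) PE; the highest slave state wins. */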
531 static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
532 {
533         struct pnv_ioda_pe *slave, *pe;
534         u8 fstate, state;
535         __be16 pcierr;
536         s64 rc;
537
538         /* Sanity check on PE number */
539         if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
540                 return OPAL_EEH_STOPPED_PERM_UNAVAIL;
541
542         /*
543          * Fetch the master PE; note that the PE instance might not
544          * be initialized yet.
545          */
546         pe = &phb->ioda.pe_array[pe_no];
547         if (pe->flags & PNV_IODA_PE_SLAVE) {
548                 pe = pe->master;
549                 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
550                 pe_no = pe->pe_number;
551         }
552
553         /* Check the master PE */
554         rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
555                                         &state, &pcierr, NULL);
556         if (rc != OPAL_SUCCESS) {
557                 pr_warn("%s: Failure %lld getting "
558                         "PHB#%x-PE#%x state\n",
559                         __func__, rc,
560                         phb->hose->global_number, pe_no);
561                 return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
562         }
563
564         /* Check the slave PE */
565         if (!(pe->flags & PNV_IODA_PE_MASTER))
566                 return state;
567
568         list_for_each_entry(slave, &pe->slaves, list) {
569                 rc = opal_pci_eeh_freeze_status(phb->opal_id,
570                                                 slave->pe_number,
571                                                 &fstate,
572                                                 &pcierr,
573                                                 NULL);
574                 if (rc != OPAL_SUCCESS) {
575                         pr_warn("%s: Failure %lld getting "
576                                 "PHB#%x-PE#%x state\n",
577                                 __func__, rc,
578                                 phb->hose->global_number, slave->pe_number);
579                         return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
580                 }
581
582                 /*
583                  * Override the result with the slave's state if it has
584                  * a higher priority (larger value).
585                  */
586                 if (fstate > state)
587                         state = fstate;
588         }
589
590         return state;
591 }
592
593 /* Currently this is only used when MSIs are enabled. That will change,
594  * but in the meantime guard it to avoid unused-function warnings.
595  */
596 #ifdef CONFIG_PCI_MSI
597 static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
598 {
599         struct pci_controller *hose = pci_bus_to_host(dev->bus);
600         struct pnv_phb *phb = hose->private_data;
601         struct pci_dn *pdn = pci_get_pdn(dev);
602
603         if (!pdn)
604                 return NULL;
605         if (pdn->pe_number == IODA_INVALID_PE)
606                 return NULL;
607         return &phb->ioda.pe_array[pdn->pe_number];
608 }
609 #endif /* CONFIG_PCI_MSI */
610
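/* Add or remove one child PE (and its slaves) in the parent PE's PELT-V. */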
611 static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
612                                   struct pnv_ioda_pe *parent,
613                                   struct pnv_ioda_pe *child,
614                                   bool is_add)
615 {
616         const char *desc = is_add ? "adding" : "removing";
617         uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
618                               OPAL_REMOVE_PE_FROM_DOMAIN;
619         struct pnv_ioda_pe *slave;
620         long rc;
621
622         /* Parent PE affects child PE */
623         rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
624                                 child->pe_number, op);
625         if (rc != OPAL_SUCCESS) {
626                 pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
627                         rc, desc);
628                 return -ENXIO;
629         }
630
631         if (!(child->flags & PNV_IODA_PE_MASTER))
632                 return 0;
633
634         /* Compound case: parent PE affects slave PEs */
635         list_for_each_entry(slave, &child->slaves, list) {
636                 rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
637                                         slave->pe_number, op);
638                 if (rc != OPAL_SUCCESS) {
639                         pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
640                                 rc, desc);
641                         return -ENXIO;
642                 }
643         }
644
645         return 0;
646 }
647
648 static int pnv_ioda_set_peltv(struct pnv_phb *phb,
649                               struct pnv_ioda_pe *pe,
650                               bool is_add)
651 {
652         struct pnv_ioda_pe *slave;
653         struct pci_dev *pdev = NULL;
654         int ret;
655
656         /*
657          * Clear the PE frozen state. If this is a master PE, the
658          * slave PEs' frozen state needs to be cleared as well.
659          */
660         if (is_add) {
661                 opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
662                                           OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
663                 if (pe->flags & PNV_IODA_PE_MASTER) {
664                         list_for_each_entry(slave, &pe->slaves, list)
665                                 opal_pci_eeh_freeze_clear(phb->opal_id,
666                                                           slave->pe_number,
667                                                           OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
668                 }
669         }
670
671         /*
672          * Associate the PE in the PELT. The PE also needs to be
673          * added to the corresponding PELT-V; otherwise an error
674          * originating from this PE might spread to other
675          * PEs.
676          */
677         ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
678         if (ret)
679                 return ret;
680
681         /* For compound PEs, any one affects all of them */
682         if (pe->flags & PNV_IODA_PE_MASTER) {
683                 list_for_each_entry(slave, &pe->slaves, list) {
684                         ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
685                         if (ret)
686                                 return ret;
687                 }
688         }
689
690         if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
691                 pdev = pe->pbus->self;
692         else if (pe->flags & PNV_IODA_PE_DEV)
693                 pdev = pe->pdev->bus->self;
694 #ifdef CONFIG_PCI_IOV
695         else if (pe->flags & PNV_IODA_PE_VF)
696                 pdev = pe->parent_dev;
697 #endif /* CONFIG_PCI_IOV */
698         while (pdev) {
699                 struct pci_dn *pdn = pci_get_pdn(pdev);
700                 struct pnv_ioda_pe *parent;
701
702                 if (pdn && pdn->pe_number != IODA_INVALID_PE) {
703                         parent = &phb->ioda.pe_array[pdn->pe_number];
704                         ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
705                         if (ret)
706                                 return ret;
707                 }
708
709                 pdev = pdev->bus->self;
710         }
711
712         return 0;
713 }
714
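/*
 * Undo pnv_ioda_configure_pe(): clear the RID reverse map, drop the PE
 * from its parents' PELT-V and unmap it from the PELT.
 */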
715 static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
716 {
717         struct pci_dev *parent;
718         uint8_t bcomp, dcomp, fcomp;
719         int64_t rc;
720         long rid_end, rid;
721
722         /* Currently, we just deconfigure VF PEs. Bus PEs will always be there. */
723         if (pe->pbus) {
724                 int count;
725
726                 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
727                 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
728                 parent = pe->pbus->self;
729                 if (pe->flags & PNV_IODA_PE_BUS_ALL)
730                         count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
731                 else
732                         count = 1;
733
734                 switch(count) {
735                 case  1: bcomp = OpalPciBusAll;         break;
736                 case  2: bcomp = OpalPciBus7Bits;       break;
737                 case  4: bcomp = OpalPciBus6Bits;       break;
738                 case  8: bcomp = OpalPciBus5Bits;       break;
739                 case 16: bcomp = OpalPciBus4Bits;       break;
740                 case 32: bcomp = OpalPciBus3Bits;       break;
741                 default:
742                         dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
743                                 count);
744                         /* Do an exact match only */
745                         bcomp = OpalPciBusAll;
746                 }
747                 rid_end = pe->rid + (count << 8);
748         } else {
749 #ifdef CONFIG_PCI_IOV
750                 if (pe->flags & PNV_IODA_PE_VF)
751                         parent = pe->parent_dev;
752                 else
753 #endif
754                         parent = pe->pdev->bus->self;
755                 bcomp = OpalPciBusAll;
756                 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
757                 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
758                 rid_end = pe->rid + 1;
759         }
760
761         /* Clear the reverse map */
762         for (rid = pe->rid; rid < rid_end; rid++)
763                 phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
764
765         /* Release from all parents' PELT-V */
766         while (parent) {
767                 struct pci_dn *pdn = pci_get_pdn(parent);
768                 if (pdn && pdn->pe_number != IODA_INVALID_PE) {
769                         rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
770                                                 pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
771                         /* XXX What to do in case of error ? */
772                 }
773                 parent = parent->bus->self;
774         }
775
776         opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
777                                   OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
778
779         /* Disassociate PE in PELT */
780         rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
781                                 pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
782         if (rc)
783                 pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
784         rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
785                              bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
786         if (rc)
787                 pe_err(pe, "OPAL error %ld trying to unmap PE from PELT table\n", rc);
788
789         pe->pbus = NULL;
790         pe->pdev = NULL;
791 #ifdef CONFIG_PCI_IOV
792         pe->parent_dev = NULL;
793 #endif
794
795         return 0;
796 }
797
798 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
799 {
800         struct pci_dev *parent;
801         uint8_t bcomp, dcomp, fcomp;
802         long rc, rid_end, rid;
803
804         /* Bus validation ? */
805         if (pe->pbus) {
806                 int count;
807
808                 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
809                 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
810                 parent = pe->pbus->self;
811                 if (pe->flags & PNV_IODA_PE_BUS_ALL)
812                         count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
813                 else
814                         count = 1;
815
816                 switch(count) {
817                 case  1: bcomp = OpalPciBusAll;         break;
818                 case  2: bcomp = OpalPciBus7Bits;       break;
819                 case  4: bcomp = OpalPciBus6Bits;       break;
820                 case  8: bcomp = OpalPciBus5Bits;       break;
821                 case 16: bcomp = OpalPciBus4Bits;       break;
822                 case 32: bcomp = OpalPciBus3Bits;       break;
823                 default:
824                         dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
825                                 count);
826                         /* Do an exact match only */
827                         bcomp = OpalPciBusAll;
828                 }
829                 rid_end = pe->rid + (count << 8);
830         } else {
831 #ifdef CONFIG_PCI_IOV
832                 if (pe->flags & PNV_IODA_PE_VF)
833                         parent = pe->parent_dev;
834                 else
835 #endif /* CONFIG_PCI_IOV */
836                         parent = pe->pdev->bus->self;
837                 bcomp = OpalPciBusAll;
838                 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
839                 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
840                 rid_end = pe->rid + 1;
841         }
842
843         /*
844          * Associate the PE in the PELT. The PE also needs to be
845          * added to the corresponding PELT-V; otherwise an error
846          * originating from this PE might spread to other
847          * PEs.
848          */
849         rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
850                              bcomp, dcomp, fcomp, OPAL_MAP_PE);
851         if (rc) {
852                 pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
853                 return -ENXIO;
854         }
855
856         /*
857          * Configure PELTV. NPUs don't have a PELTV table so skip
858          * configuration on them.
859          */
860         if (phb->type != PNV_PHB_NPU)
861                 pnv_ioda_set_peltv(phb, pe, true);
862
863         /* Setup reverse map */
864         for (rid = pe->rid; rid < rid_end; rid++)
865                 phb->ioda.pe_rmap[rid] = pe->pe_number;
866
867         /* Set up one MVT on IODA1 */
868         if (phb->type != PNV_PHB_IODA1) {
869                 pe->mve_number = 0;
870                 goto out;
871         }
872
873         pe->mve_number = pe->pe_number;
874         rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
875         if (rc != OPAL_SUCCESS) {
876                 pe_err(pe, "OPAL error %ld setting up MVE %d\n",
877                        rc, pe->mve_number);
878                 pe->mve_number = -1;
879         } else {
880                 rc = opal_pci_set_mve_enable(phb->opal_id,
881                                              pe->mve_number, OPAL_ENABLE_MVE);
882                 if (rc) {
883                         pe_err(pe, "OPAL error %ld enabling MVE %d\n",
884                                rc, pe->mve_number);
885                         pe->mve_number = -1;
886                 }
887         }
888
889 out:
890         return 0;
891 }
892
893 #ifdef CONFIG_PCI_IOV
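/*
 * Shift the IOV BAR start addresses by "offset" VF-sized segments, moving
 * the VFs into a different range of PEs (see the explanation below).
 */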
894 static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
895 {
896         struct pci_dn *pdn = pci_get_pdn(dev);
897         int i;
898         struct resource *res, res2;
899         resource_size_t size;
900         u16 num_vfs;
901
902         if (!dev->is_physfn)
903                 return -EINVAL;
904
905         /*
906          * "offset" is in VFs.  The M64 windows are sized so that when they
907          * are segmented, each segment is the same size as the IOV BAR.
908          * Each segment is in a separate PE, and the high order bits of the
909          * address are the PE number.  Therefore, each VF's BAR is in a
910          * separate PE, and changing the IOV BAR start address changes the
911          * range of PEs the VFs are in.
912          */
913         num_vfs = pdn->num_vfs;
914         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
915                 res = &dev->resource[i + PCI_IOV_RESOURCES];
916                 if (!res->flags || !res->parent)
917                         continue;
918
919                 /*
920                  * The actual IOV BAR range is determined by the start address
921                  * and the actual size for num_vfs VFs BAR.  This check is to
922                  * make sure that after shifting, the range will not overlap
923                  * with another device.
924                  */
925                 size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
926                 res2.flags = res->flags;
927                 res2.start = res->start + (size * offset);
928                 res2.end = res2.start + (size * num_vfs) - 1;
929
930                 if (res2.end > res->end) {
931                         dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
932                                 i, &res2, res, num_vfs, offset);
933                         return -EBUSY;
934                 }
935         }
936
937         /*
938          * After doing so, there will be a "hole" in /proc/iomem when
939          * offset is positive. It looks as if the device returned some
940          * MMIO space to the system, even though nobody can actually use it.
941          */
942         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
943                 res = &dev->resource[i + PCI_IOV_RESOURCES];
944                 if (!res->flags || !res->parent)
945                         continue;
946
947                 size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
948                 res2 = *res;
949                 res->start += size * offset;
950
951                 dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
952                          i, &res2, res, (offset > 0) ? "En" : "Dis",
953                          num_vfs, offset);
954                 pci_update_resource(dev, i + PCI_IOV_RESOURCES);
955         }
956         return 0;
957 }
958 #endif /* CONFIG_PCI_IOV */
959
960 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
961 {
962         struct pci_controller *hose = pci_bus_to_host(dev->bus);
963         struct pnv_phb *phb = hose->private_data;
964         struct pci_dn *pdn = pci_get_pdn(dev);
965         struct pnv_ioda_pe *pe;
966
967         if (!pdn) {
968                 pr_err("%s: Device tree node not associated properly\n",
969                            pci_name(dev));
970                 return NULL;
971         }
972         if (pdn->pe_number != IODA_INVALID_PE)
973                 return NULL;
974
975         pe = pnv_ioda_alloc_pe(phb);
976         if (!pe) {
977                 pr_warning("%s: Not enough PE# available, disabling device\n",
978                            pci_name(dev));
979                 return NULL;
980         }
981
982         /* NOTE: We only get one ref to the pci_dev for the pdn, not for the
983          * pointer in the PE data structure; both should be destroyed at the
984          * same time. However, this needs to be looked at more closely again
985          * once we actually start removing things (hotplug, SR-IOV, ...)
986          *
987          * At some point we want to remove the PDN completely anyway.
988          */
988          */
989         pci_dev_get(dev);
990         pdn->pcidev = dev;
991         pdn->pe_number = pe->pe_number;
992         pe->flags = PNV_IODA_PE_DEV;
993         pe->pdev = dev;
994         pe->pbus = NULL;
995         pe->mve_number = -1;
996         pe->rid = dev->bus->number << 8 | pdn->devfn;
997
998         pe_info(pe, "Associated device to PE\n");
999
1000         if (pnv_ioda_configure_pe(phb, pe)) {
1001                 /* XXX What do we do here ? */
1002                 pnv_ioda_free_pe(pe);
1003                 pdn->pe_number = IODA_INVALID_PE;
1004                 pe->pdev = NULL;
1005                 pci_dev_put(dev);
1006                 return NULL;
1007         }
1008
1009         /* Put PE to the list */
1010         list_add_tail(&pe->list, &phb->ioda.pe_list);
1011
1012         return pe;
1013 }
1014
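/* Attach every device on the bus (and its subordinate buses for a "bus all" PE) to the PE. */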
1015 static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
1016 {
1017         struct pci_dev *dev;
1018
1019         list_for_each_entry(dev, &bus->devices, bus_list) {
1020                 struct pci_dn *pdn = pci_get_pdn(dev);
1021
1022                 if (pdn == NULL) {
1023                         pr_warn("%s: No device node associated with device !\n",
1024                                 pci_name(dev));
1025                         continue;
1026                 }
1027
1028                 /*
1029                  * In the partial hotplug case, the PCI device might still
1030                  * be associated with the PE and doesn't need to be
1031                  * attached to it again.
1032                  */
1033                 if (pdn->pe_number != IODA_INVALID_PE)
1034                         continue;
1035
1036                 pe->device_count++;
1037                 pdn->pcidev = dev;
1038                 pdn->pe_number = pe->pe_number;
1039                 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1040                         pnv_ioda_setup_same_PE(dev->subordinate, pe);
1041         }
1042 }
1043
1044 /*
1045  * There are two types of PCI-bus-sensitive PEs: one that comprises a
1046  * single PCI bus, and another that contains the primary PCI bus and its
1047  * subordinate PCI devices and buses. The second type of PE normally
1048  * originates from a PCIe-to-PCI bridge or a PLX switch downstream port.
1049  */
1050 static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
1051 {
1052         struct pci_controller *hose = pci_bus_to_host(bus);
1053         struct pnv_phb *phb = hose->private_data;
1054         struct pnv_ioda_pe *pe = NULL;
1055         unsigned int pe_num;
1056
1057         /*
1058          * In the partial hotplug case, the PE instance might still be
1059          * alive. Reuse it instead of allocating a new one.
1060          */
1061         pe_num = phb->ioda.pe_rmap[bus->number << 8];
1062         if (pe_num != IODA_INVALID_PE) {
1063                 pe = &phb->ioda.pe_array[pe_num];
1064                 pnv_ioda_setup_same_PE(bus, pe);
1065                 return NULL;
1066         }
1067
1068         /* PE number for root bus should have been reserved */
1069         if (pci_is_root_bus(bus) &&
1070             phb->ioda.root_pe_idx != IODA_INVALID_PE)
1071                 pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
1072
1073         /* Check if PE is determined by M64 */
1074         if (!pe && phb->pick_m64_pe)
1075                 pe = phb->pick_m64_pe(bus, all);
1076
1077         /* The PE number isn't pinned by M64 */
1078         if (!pe)
1079                 pe = pnv_ioda_alloc_pe(phb);
1080
1081         if (!pe) {
1082                 pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
1083                         __func__, pci_domain_nr(bus), bus->number);
1084                 return NULL;
1085         }
1086
1087         pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
1088         pe->pbus = bus;
1089         pe->pdev = NULL;
1090         pe->mve_number = -1;
1091         pe->rid = bus->busn_res.start << 8;
1092
1093         if (all)
1094                 pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
1095                         bus->busn_res.start, bus->busn_res.end, pe->pe_number);
1096         else
1097                 pe_info(pe, "Secondary bus %d associated with PE#%d\n",
1098                         bus->busn_res.start, pe->pe_number);
1099
1100         if (pnv_ioda_configure_pe(phb, pe)) {
1101                 /* XXX What do we do here ? */
1102                 pnv_ioda_free_pe(pe);
1103                 pe->pbus = NULL;
1104                 return NULL;
1105         }
1106
1107         /* Associate it with all child devices */
1108         pnv_ioda_setup_same_PE(bus, pe);
1109
1110         /* Put PE to the list */
1111         list_add_tail(&pe->list, &phb->ioda.pe_list);
1112
1113         return pe;
1114 }
1115
1116 static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
1117 {
1118         int pe_num, found_pe = false, rc;
1119         long rid;
1120         struct pnv_ioda_pe *pe;
1121         struct pci_dev *gpu_pdev;
1122         struct pci_dn *npu_pdn;
1123         struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
1124         struct pnv_phb *phb = hose->private_data;
1125
1126         /*
1127          * Due to a hardware errata PE#0 on the NPU is reserved for
1128          * error handling. This means we only have three PEs remaining
1129          * which need to be assigned to four links, implying some
1130          * links must share PEs.
1131          *
1132          * To achieve this we assign PEs such that NPUs linking the
1133          * same GPU get assigned the same PE.
1134          */
1135         gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
1136         for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
1137                 pe = &phb->ioda.pe_array[pe_num];
1138                 if (!pe->pdev)
1139                         continue;
1140
1141                 if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
1142                         /*
1143                          * This device has the same peer GPU so should
1144                          * be assigned the same PE as the existing
1145                          * peer NPU.
1146                          */
1147                         dev_info(&npu_pdev->dev,
1148                                 "Associating to existing PE %d\n", pe_num);
1149                         pci_dev_get(npu_pdev);
1150                         npu_pdn = pci_get_pdn(npu_pdev);
1151                         rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
1152                         npu_pdn->pcidev = npu_pdev;
1153                         npu_pdn->pe_number = pe_num;
1154                         phb->ioda.pe_rmap[rid] = pe->pe_number;
1155
1156                         /* Map the PE to this link */
1157                         rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
1158                                         OpalPciBusAll,
1159                                         OPAL_COMPARE_RID_DEVICE_NUMBER,
1160                                         OPAL_COMPARE_RID_FUNCTION_NUMBER,
1161                                         OPAL_MAP_PE);
1162                         WARN_ON(rc != OPAL_SUCCESS);
1163                         found_pe = true;
1164                         break;
1165                 }
1166         }
1167
1168         if (!found_pe)
1169                 /*
1170                  * Could not find an existing PE so allocate a new
1171                  * one.
1172                  */
1173                 return pnv_ioda_setup_dev_PE(npu_pdev);
1174         else
1175                 return pe;
1176 }
1177
1178 static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
1179 {
1180         struct pci_dev *pdev;
1181
1182         list_for_each_entry(pdev, &bus->devices, bus_list)
1183                 pnv_ioda_setup_npu_PE(pdev);
1184 }
1185
1186 static void pnv_pci_ioda_setup_PEs(void)
1187 {
1188         struct pci_controller *hose, *tmp;
1189         struct pnv_phb *phb;
1190
1191         list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
1192                 phb = hose->private_data;
1193                 if (phb->type == PNV_PHB_NPU) {
1194                         /* PE#0 is needed for error reporting */
1195                         pnv_ioda_reserve_pe(phb, 0);
1196                         pnv_ioda_setup_npu_PEs(hose->bus);
1197                 }
1198         }
1199 }
1200
1201 #ifdef CONFIG_PCI_IOV
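/* Disable and release the M64 BARs that back the PF's VF BARs. */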
1202 static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
1203 {
1204         struct pci_bus        *bus;
1205         struct pci_controller *hose;
1206         struct pnv_phb        *phb;
1207         struct pci_dn         *pdn;
1208         int                    i, j;
1209         int                    m64_bars;
1210
1211         bus = pdev->bus;
1212         hose = pci_bus_to_host(bus);
1213         phb = hose->private_data;
1214         pdn = pci_get_pdn(pdev);
1215
1216         if (pdn->m64_single_mode)
1217                 m64_bars = num_vfs;
1218         else
1219                 m64_bars = 1;
1220
1221         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
1222                 for (j = 0; j < m64_bars; j++) {
1223                         if (pdn->m64_map[j][i] == IODA_INVALID_M64)
1224                                 continue;
1225                         opal_pci_phb_mmio_enable(phb->opal_id,
1226                                 OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
1227                         clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
1228                         pdn->m64_map[j][i] = IODA_INVALID_M64;
1229                 }
1230
1231         kfree(pdn->m64_map);
1232         return 0;
1233 }
1234
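/*
 * Allocate and map M64 BARs to cover the IOV BARs: one shared BAR per IOV
 * resource, or one BAR per VF in single-PE mode.
 */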
1235 static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
1236 {
1237         struct pci_bus        *bus;
1238         struct pci_controller *hose;
1239         struct pnv_phb        *phb;
1240         struct pci_dn         *pdn;
1241         unsigned int           win;
1242         struct resource       *res;
1243         int                    i, j;
1244         int64_t                rc;
1245         int                    total_vfs;
1246         resource_size_t        size, start;
1247         int                    pe_num;
1248         int                    m64_bars;
1249
1250         bus = pdev->bus;
1251         hose = pci_bus_to_host(bus);
1252         phb = hose->private_data;
1253         pdn = pci_get_pdn(pdev);
1254         total_vfs = pci_sriov_get_totalvfs(pdev);
1255
1256         if (pdn->m64_single_mode)
1257                 m64_bars = num_vfs;
1258         else
1259                 m64_bars = 1;
1260
1261         pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
1262         if (!pdn->m64_map)
1263                 return -ENOMEM;
1264         /* Initialize the m64_map to IODA_INVALID_M64 */
1265         for (i = 0; i < m64_bars ; i++)
1266                 for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
1267                         pdn->m64_map[i][j] = IODA_INVALID_M64;
1268
1269
1270         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
1271                 res = &pdev->resource[i + PCI_IOV_RESOURCES];
1272                 if (!res->flags || !res->parent)
1273                         continue;
1274
1275                 for (j = 0; j < m64_bars; j++) {
1276                         do {
1277                                 win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
1278                                                 phb->ioda.m64_bar_idx + 1, 0);
1279
1280                                 if (win >= phb->ioda.m64_bar_idx + 1)
1281                                         goto m64_failed;
1282                         } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
1283
1284                         pdn->m64_map[j][i] = win;
1285
1286                         if (pdn->m64_single_mode) {
1287                                 size = pci_iov_resource_size(pdev,
1288                                                         PCI_IOV_RESOURCES + i);
1289                                 start = res->start + size * j;
1290                         } else {
1291                                 size = resource_size(res);
1292                                 start = res->start;
1293                         }
1294
1295                         /* Map the M64 here */
1296                         if (pdn->m64_single_mode) {
1297                                 pe_num = pdn->pe_num_map[j];
1298                                 rc = opal_pci_map_pe_mmio_window(phb->opal_id,
1299                                                 pe_num, OPAL_M64_WINDOW_TYPE,
1300                                                 pdn->m64_map[j][i], 0);
1301                         }
1302
1303                         rc = opal_pci_set_phb_mem_window(phb->opal_id,
1304                                                  OPAL_M64_WINDOW_TYPE,
1305                                                  pdn->m64_map[j][i],
1306                                                  start,
1307                                                  0, /* unused */
1308                                                  size);
1309
1310
1311                         if (rc != OPAL_SUCCESS) {
1312                                 dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
1313                                         win, rc);
1314                                 goto m64_failed;
1315                         }
1316
1317                         if (pdn->m64_single_mode)
1318                                 rc = opal_pci_phb_mmio_enable(phb->opal_id,
1319                                      OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
1320                         else
1321                                 rc = opal_pci_phb_mmio_enable(phb->opal_id,
1322                                      OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
1323
1324                         if (rc != OPAL_SUCCESS) {
1325                                 dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
1326                                         win, rc);
1327                                 goto m64_failed;
1328                         }
1329                 }
1330         }
1331         return 0;
1332
1333 m64_failed:
1334         pnv_pci_vf_release_m64(pdev, num_vfs);
1335         return -EBUSY;
1336 }
1337
1338 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
1339                 int num);
1340 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
1341
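/* Tear down the DMA window, bypass and IOMMU table of a VF PE before it is freed. */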
1342 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
1343 {
1344         struct iommu_table    *tbl;
1345         int64_t               rc;
1346
1347         tbl = pe->table_group.tables[0];
1348         rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
1349         if (rc)
1350                 pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
1351
1352         pnv_pci_ioda2_set_bypass(pe, false);
1353         if (pe->table_group.group) {
1354                 iommu_group_put(pe->table_group.group);
1355                 BUG_ON(pe->table_group.group);
1356         }
1357         pnv_pci_ioda2_table_free_pages(tbl);
1358         iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
1359 }
1360
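/* Release every VF PE hanging off the given PF: DMA teardown, deconfigure, free. */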
1361 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
1362 {
1363         struct pci_bus        *bus;
1364         struct pci_controller *hose;
1365         struct pnv_phb        *phb;
1366         struct pnv_ioda_pe    *pe, *pe_n;
1367         struct pci_dn         *pdn;
1368
1369         bus = pdev->bus;
1370         hose = pci_bus_to_host(bus);
1371         phb = hose->private_data;
1372         pdn = pci_get_pdn(pdev);
1373
1374         if (!pdev->is_physfn)
1375                 return;
1376
1377         list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
1378                 if (pe->parent_dev != pdev)
1379                         continue;
1380
1381                 pnv_pci_ioda2_release_dma_pe(pdev, pe);
1382
1383                 /* Remove from list */
1384                 mutex_lock(&phb->ioda.pe_list_mutex);
1385                 list_del(&pe->list);
1386                 mutex_unlock(&phb->ioda.pe_list_mutex);
1387
1388                 pnv_ioda_deconfigure_pe(phb, pe);
1389
1390                 pnv_ioda_free_pe(pe);
1391         }
1392 }
1393
1394 void pnv_pci_sriov_disable(struct pci_dev *pdev)
1395 {
1396         struct pci_bus        *bus;
1397         struct pci_controller *hose;
1398         struct pnv_phb        *phb;
1399         struct pnv_ioda_pe    *pe;
1400         struct pci_dn         *pdn;
1401         struct pci_sriov      *iov;
1402         u16                    num_vfs, i;
1403
1404         bus = pdev->bus;
1405         hose = pci_bus_to_host(bus);
1406         phb = hose->private_data;
1407         pdn = pci_get_pdn(pdev);
1408         iov = pdev->sriov;
1409         num_vfs = pdn->num_vfs;
1410
1411         /* Release VF PEs */
1412         pnv_ioda_release_vf_PE(pdev);
1413
1414         if (phb->type == PNV_PHB_IODA2) {
1415                 if (!pdn->m64_single_mode)
1416                         pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
1417
1418                 /* Release M64 windows */
1419                 pnv_pci_vf_release_m64(pdev, num_vfs);
1420
1421                 /* Release PE numbers */
1422                 if (pdn->m64_single_mode) {
1423                         for (i = 0; i < num_vfs; i++) {
1424                                 if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1425                                         continue;
1426
1427                                 pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1428                                 pnv_ioda_free_pe(pe);
1429                         }
1430                 } else
1431                         bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1432                 /* Releasing pe_num_map */
1433                 kfree(pdn->pe_num_map);
1434         }
1435 }
1436
1437 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
1438                                        struct pnv_ioda_pe *pe);
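/*
 * Create one PE per VF: fill in the PE fields, map the VF's RID onto the
 * PE number in hardware, add the PE to the PHB's PE list and set up its
 * 32-bit DMA window.
 */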
1439 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
1440 {
1441         struct pci_bus        *bus;
1442         struct pci_controller *hose;
1443         struct pnv_phb        *phb;
1444         struct pnv_ioda_pe    *pe;
1445         int                    pe_num;
1446         u16                    vf_index;
1447         struct pci_dn         *pdn;
1448
1449         bus = pdev->bus;
1450         hose = pci_bus_to_host(bus);
1451         phb = hose->private_data;
1452         pdn = pci_get_pdn(pdev);
1453
1454         if (!pdev->is_physfn)
1455                 return;
1456
1457         /* Reserve PE for each VF */
1458         for (vf_index = 0; vf_index < num_vfs; vf_index++) {
1459                 if (pdn->m64_single_mode)
1460                         pe_num = pdn->pe_num_map[vf_index];
1461                 else
1462                         pe_num = *pdn->pe_num_map + vf_index;
1463
1464                 pe = &phb->ioda.pe_array[pe_num];
1465                 pe->pe_number = pe_num;
1466                 pe->phb = phb;
1467                 pe->flags = PNV_IODA_PE_VF;
1468                 pe->pbus = NULL;
1469                 pe->parent_dev = pdev;
1470                 pe->mve_number = -1;
1471                 pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
1472                            pci_iov_virtfn_devfn(pdev, vf_index);
1473
1474                 pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
1475                         hose->global_number, pdev->bus->number,
1476                         PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
1477                         PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
1478
1479                 if (pnv_ioda_configure_pe(phb, pe)) {
1480                         /* XXX What do we do here ? */
1481                         pnv_ioda_free_pe(pe);
1482                         pe->pdev = NULL;
1483                         continue;
1484                 }
1485
1486                 /* Put PE to the list */
1487                 mutex_lock(&phb->ioda.pe_list_mutex);
1488                 list_add_tail(&pe->list, &phb->ioda.pe_list);
1489                 mutex_unlock(&phb->ioda.pe_list_mutex);
1490
1491                 pnv_pci_ioda2_setup_dma_pe(phb, pe);
1492         }
1493 }
1494
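/*
 * Enable num_vfs VFs on an IODA2 PHB: allocate PE numbers (one per VF in
 * M64 single mode, a contiguous range otherwise), assign M64 windows to
 * the IOV BARs, shift the IOV BAR resources when sharing one M64 BAR per
 * IOV BAR, and finally create the VF PEs.
 */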
1495 int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1496 {
1497         struct pci_bus        *bus;
1498         struct pci_controller *hose;
1499         struct pnv_phb        *phb;
1500         struct pnv_ioda_pe    *pe;
1501         struct pci_dn         *pdn;
1502         int                    ret;
1503         u16                    i;
1504
1505         bus = pdev->bus;
1506         hose = pci_bus_to_host(bus);
1507         phb = hose->private_data;
1508         pdn = pci_get_pdn(pdev);
1509
1510         if (phb->type == PNV_PHB_IODA2) {
1511                 if (!pdn->vfs_expanded) {
1512                         dev_info(&pdev->dev, "SRIOV is not supported on this device:"
1513                                 " the IOV BAR is not 64-bit prefetchable\n");
1514                         return -ENOSPC;
1515                 }
1516
1517                 /*
1518                  * When the M64 BARs function in Single PE mode, the number of VFs
1519                  * that can be enabled is limited by the number of M64 BARs.
1520                  */
1521                 if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
1522                         dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
1523                         return -EBUSY;
1524                 }
1525
1526                 /* Allocating pe_num_map */
1527                 if (pdn->m64_single_mode)
1528                         pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map) * num_vfs,
1529                                         GFP_KERNEL);
1530                 else
1531                         pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
1532
1533                 if (!pdn->pe_num_map)
1534                         return -ENOMEM;
1535
1536                 if (pdn->m64_single_mode)
1537                         for (i = 0; i < num_vfs; i++)
1538                                 pdn->pe_num_map[i] = IODA_INVALID_PE;
1539
1540                 /* Calculate available PE for required VFs */
1541                 if (pdn->m64_single_mode) {
1542                         for (i = 0; i < num_vfs; i++) {
1543                                 pe = pnv_ioda_alloc_pe(phb);
1544                                 if (!pe) {
1545                                         ret = -EBUSY;
1546                                         goto m64_failed;
1547                                 }
1548
1549                                 pdn->pe_num_map[i] = pe->pe_number;
1550                         }
1551                 } else {
1552                         mutex_lock(&phb->ioda.pe_alloc_mutex);
1553                         *pdn->pe_num_map = bitmap_find_next_zero_area(
1554                                 phb->ioda.pe_alloc, phb->ioda.total_pe_num,
1555                                 0, num_vfs, 0);
1556                         if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
1557                                 mutex_unlock(&phb->ioda.pe_alloc_mutex);
1558                                 dev_info(&pdev->dev, "Failed to enable %d VFs\n", num_vfs);
1559                                 kfree(pdn->pe_num_map);
1560                                 return -EBUSY;
1561                         }
1562                         bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1563                         mutex_unlock(&phb->ioda.pe_alloc_mutex);
1564                 }
1565                 pdn->num_vfs = num_vfs;
1566
1567                 /* Assign M64 window accordingly */
1568                 ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
1569                 if (ret) {
1570                         dev_info(&pdev->dev, "Not enough M64 window resources\n");
1571                         goto m64_failed;
1572                 }
1573
1574                 /*
1575                  * When using one M64 BAR to map one IOV BAR, we need to shift
1576                  * the IOV BAR according to the PE# allocated to the VFs.
1577                  * Otherwise, the PE# for the VF will conflict with others.
1578                  */
1579                 if (!pdn->m64_single_mode) {
1580                         ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
1581                         if (ret)
1582                                 goto m64_failed;
1583                 }
1584         }
1585
1586         /* Setup VF PEs */
1587         pnv_ioda_setup_vf_PE(pdev, num_vfs);
1588
1589         return 0;
1590
1591 m64_failed:
1592         if (pdn->m64_single_mode) {
1593                 for (i = 0; i < num_vfs; i++) {
1594                         if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1595                                 continue;
1596
1597                         pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1598                         pnv_ioda_free_pe(pe);
1599                 }
1600         } else
1601                 bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1602
1603         /* Releasing pe_num_map */
1604         kfree(pdn->pe_num_map);
1605
1606         return ret;
1607 }
1608
1609 int pcibios_sriov_disable(struct pci_dev *pdev)
1610 {
1611         pnv_pci_sriov_disable(pdev);
1612
1613         /* Release PCI data */
1614         remove_dev_pci_data(pdev);
1615         return 0;
1616 }
1617
1618 int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1619 {
1620         /* Allocate PCI data */
1621         add_dev_pci_data(pdev);
1622
1623         return pnv_pci_sriov_enable(pdev, num_vfs);
1624 }
1625 #endif /* CONFIG_PCI_IOV */
1626
1627 static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
1628 {
1629         struct pci_dn *pdn = pci_get_pdn(pdev);
1630         struct pnv_ioda_pe *pe;
1631
1632         /*
1633          * This function can be called before the
1634          * PE# has been assigned. Do nothing in
1635          * that case.
1636          */
1637         if (!pdn || pdn->pe_number == IODA_INVALID_PE)
1638                 return;
1639
1640         pe = &phb->ioda.pe_array[pdn->pe_number];
1641         WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
1642         set_dma_offset(&pdev->dev, pe->tce_bypass_base);
1643         set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
1644         /*
1645          * Note: iommu_add_device() will fail here as
1646          * for physical PE: the device is already added by now;
1647          * for virtual PE: sysfs entries are not ready yet and
1648          * tce_iommu_bus_notifier will add the device to a group later.
1649          */
1650 }
1651
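/*
 * Select the DMA ops for a device from its DMA mask: if the mask covers
 * the top of the 64-bit bypass window, use direct (bypass) DMA ops,
 * otherwise fall back to 32-bit DMA through the IOMMU (TCE) window.
 */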
1652 static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
1653 {
1654         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
1655         struct pnv_phb *phb = hose->private_data;
1656         struct pci_dn *pdn = pci_get_pdn(pdev);
1657         struct pnv_ioda_pe *pe;
1658         uint64_t top;
1659         bool bypass = false;
1660
1661         if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1662                 return -ENODEV;
1663
1664         pe = &phb->ioda.pe_array[pdn->pe_number];
1665         if (pe->tce_bypass_enabled) {
1666                 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
1667                 bypass = (dma_mask >= top);
1668         }
1669
1670         if (bypass) {
1671                 dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
1672                 set_dma_ops(&pdev->dev, &dma_direct_ops);
1673         } else {
1674                 dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
1675                 set_dma_ops(&pdev->dev, &dma_iommu_ops);
1676         }
1677         *pdev->dev.dma_mask = dma_mask;
1678
1679         /* Update peer npu devices */
1680         pnv_npu_try_dma_set_bypass(pdev, bypass);
1681
1682         return 0;
1683 }
1684
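/*
 * Report the DMA mask a device would need in order to use the bypass
 * window, i.e. enough bits to address the end of system memory above
 * tce_bypass_base; without bypass, defer to the generic calculation.
 */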
1685 static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
1686 {
1687         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
1688         struct pnv_phb *phb = hose->private_data;
1689         struct pci_dn *pdn = pci_get_pdn(pdev);
1690         struct pnv_ioda_pe *pe;
1691         u64 end, mask;
1692
1693         if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1694                 return 0;
1695
1696         pe = &phb->ioda.pe_array[pdn->pe_number];
1697         if (!pe->tce_bypass_enabled)
1698                 return __dma_get_required_mask(&pdev->dev);
1699
1700
1701         end = pe->tce_bypass_base + memblock_end_of_DRAM();
1702         mask = 1ULL << (fls64(end) - 1);
1703         mask += mask - 1;
1704
1705         return mask;
1706 }
1707
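/*
 * Propagate the PE's IOMMU table and DMA offset to every device on the
 * bus and, for "bus all" PEs, on its subordinate buses as well.
 */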
1708 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
1709                                    struct pci_bus *bus)
1710 {
1711         struct pci_dev *dev;
1712
1713         list_for_each_entry(dev, &bus->devices, bus_list) {
1714                 set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
1715                 set_dma_offset(&dev->dev, pe->tce_bypass_base);
1716                 iommu_add_device(&dev->dev);
1717
1718                 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1719                         pnv_ioda_setup_bus_dma(pe, dev->subordinate);
1720         }
1721 }
1722
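/*
 * Flush the IODA1 hardware TCE cache for a range of table entries by
 * writing the affected TCE addresses to the invalidation register,
 * using the real-mode physical mapping when @rm is set.
 */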
1723 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
1724                 unsigned long index, unsigned long npages, bool rm)
1725 {
1726         struct iommu_table_group_link *tgl = list_first_entry_or_null(
1727                         &tbl->it_group_list, struct iommu_table_group_link,
1728                         next);
1729         struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1730                         struct pnv_ioda_pe, table_group);
1731         __be64 __iomem *invalidate = rm ?
1732                 (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
1733                 pe->phb->ioda.tce_inval_reg;
1734         unsigned long start, end, inc;
1735         const unsigned shift = tbl->it_page_shift;
1736
1737         start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
1738         end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
1739                         npages - 1);
1740
1741         /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
1742         if (tbl->it_busno) {
1743                 start <<= shift;
1744                 end <<= shift;
1745                 inc = 128ull << shift;
1746                 start |= tbl->it_busno;
1747                 end |= tbl->it_busno;
1748         } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) {
1749                 /* p7ioc-style invalidation, 2 TCEs per write */
1750                 start |= (1ull << 63);
1751                 end |= (1ull << 63);
1752                 inc = 16;
1753         } else {
1754                 /* Default (older HW) */
1755                 inc = 128;
1756         }
1757
1758         end |= inc - 1; /* round up end to be different than start */
1759
1760         mb(); /* Ensure above stores are visible */
1761         while (start <= end) {
1762                 if (rm)
1763                         __raw_rm_writeq(cpu_to_be64(start), invalidate);
1764                 else
1765                         __raw_writeq(cpu_to_be64(start), invalidate);
1766                 start += inc;
1767         }
1768
1769         /*
1770          * The iommu layer will do another mb() for us on build()
1771          * and we don't care on free()
1772          */
1773 }
1774
1775 static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
1776                 long npages, unsigned long uaddr,
1777                 enum dma_data_direction direction,
1778                 struct dma_attrs *attrs)
1779 {
1780         int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1781                         attrs);
1782
1783         if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
1784                 pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
1785
1786         return ret;
1787 }
1788
1789 #ifdef CONFIG_IOMMU_API
1790 static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
1791                 unsigned long *hpa, enum dma_data_direction *direction)
1792 {
1793         long ret = pnv_tce_xchg(tbl, index, hpa, direction);
1794
1795         if (!ret && (tbl->it_type &
1796                         (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
1797                 pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false);
1798
1799         return ret;
1800 }
1801 #endif
1802
1803 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
1804                 long npages)
1805 {
1806         pnv_tce_free(tbl, index, npages);
1807
1808         if (tbl->it_type & TCE_PCI_SWINV_FREE)
1809                 pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
1810 }
1811
1812 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
1813         .set = pnv_ioda1_tce_build,
1814 #ifdef CONFIG_IOMMU_API
1815         .exchange = pnv_ioda1_tce_xchg,
1816 #endif
1817         .clear = pnv_ioda1_tce_free,
1818         .get = pnv_tce_get,
1819 };
1820
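/*
 * TCE kill register command bits (IBM bit numbering via PPC_BIT):
 * invalidate the whole cache, all TCEs of one PE, or individual TCE
 * addresses.
 */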
1821 #define TCE_KILL_INVAL_ALL  PPC_BIT(0)
1822 #define TCE_KILL_INVAL_PE   PPC_BIT(1)
1823 #define TCE_KILL_INVAL_TCE  PPC_BIT(2)
1824
1825 void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
1826 {
1827         const unsigned long val = TCE_KILL_INVAL_ALL;
1828
1829         mb(); /* Ensure previous TCE table stores are visible */
1830         if (rm)
1831                 __raw_rm_writeq(cpu_to_be64(val),
1832                                 (__be64 __iomem *)
1833                                 phb->ioda.tce_inval_reg_phys);
1834         else
1835                 __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
1836 }
1837
1838 static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
1839 {
1840         /* 01xb - invalidate TCEs that match the specified PE# */
1841         unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
1842         struct pnv_phb *phb = pe->phb;
1843
1844         if (!phb->ioda.tce_inval_reg)
1845                 return;
1846
1847         mb(); /* Ensure above stores are visible */
1848         __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
1849 }
1850
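/*
 * Invalidate a range of TCEs belonging to one PE on an IODA2 PHB by
 * issuing per-address TCE kill commands to the invalidation register.
 */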
1851 static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
1852                 __be64 __iomem *invalidate, unsigned shift,
1853                 unsigned long index, unsigned long npages)
1854 {
1855         unsigned long start, end, inc;
1856
1857         /* We'll invalidate DMA addresses in PE scope */
1858         start = TCE_KILL_INVAL_TCE;
1859         start |= (pe_number & 0xFF);
1860         end = start;
1861
1862         /* Figure out the start, end and step */
1863         start |= (index << shift);
1864         end |= ((index + npages - 1) << shift);
1865         inc = (0x1ull << shift);
1866         mb();
1867
1868         while (start <= end) {
1869                 if (rm)
1870                         __raw_rm_writeq(cpu_to_be64(start), invalidate);
1871                 else
1872                         __raw_writeq(cpu_to_be64(start), invalidate);
1873                 start += inc;
1874         }
1875 }
1876
1877 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
1878                 unsigned long index, unsigned long npages, bool rm)
1879 {
1880         struct iommu_table_group_link *tgl;
1881
1882         list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
1883                 struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1884                                 struct pnv_ioda_pe, table_group);
1885                 __be64 __iomem *invalidate = rm ?
1886                         (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
1887                         pe->phb->ioda.tce_inval_reg;
1888
1889                 if (pe->phb->type == PNV_PHB_NPU) {
1890                         /*
1891                          * The NVLink hardware does not support TCE kill
1892                          * per TCE entry so we have to invalidate
1893                          * the entire cache for it.
1894                          */
1895                         pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm);
1896                         continue;
1897                 }
1898                 pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
1899                         invalidate, tbl->it_page_shift,
1900                         index, npages);
1901         }
1902 }
1903
1904 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
1905                 long npages, unsigned long uaddr,
1906                 enum dma_data_direction direction,
1907                 struct dma_attrs *attrs)
1908 {
1909         int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1910                         attrs);
1911
1912         if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
1913                 pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
1914
1915         return ret;
1916 }
1917
1918 #ifdef CONFIG_IOMMU_API
1919 static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
1920                 unsigned long *hpa, enum dma_data_direction *direction)
1921 {
1922         long ret = pnv_tce_xchg(tbl, index, hpa, direction);
1923
1924         if (!ret && (tbl->it_type &
1925                         (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
1926                 pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
1927
1928         return ret;
1929 }
1930 #endif
1931
1932 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
1933                 long npages)
1934 {
1935         pnv_tce_free(tbl, index, npages);
1936
1937         if (tbl->it_type & TCE_PCI_SWINV_FREE)
1938                 pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
1939 }
1940
1941 static void pnv_ioda2_table_free(struct iommu_table *tbl)
1942 {
1943         pnv_pci_ioda2_table_free_pages(tbl);
1944         iommu_free_table(tbl, "pnv");
1945 }
1946
1947 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
1948         .set = pnv_ioda2_tce_build,
1949 #ifdef CONFIG_IOMMU_API
1950         .exchange = pnv_ioda2_tce_xchg,
1951 #endif
1952         .clear = pnv_ioda2_tce_free,
1953         .get = pnv_tce_get,
1954         .free = pnv_ioda2_table_free,
1955 };
1956
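/*
 * Accumulate a rough per-device DMA weight, used to split the shared
 * IODA1 DMA32 segment space between PEs in proportion to their expected
 * DMA activity.
 */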
1957 static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
1958 {
1959         unsigned int *weight = (unsigned int *)data;
1960
1961         /* This is quite simplistic. The "base" weight of a device
1962          * is 10. A weight of 0 means no DMA is accounted for the device.
1963          */
1964         if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
1965                 return 0;
1966
1967         if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
1968             dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
1969             dev->class == PCI_CLASS_SERIAL_USB_EHCI)
1970                 *weight += 3;
1971         else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
1972                 *weight += 15;
1973         else
1974                 *weight += 10;
1975
1976         return 0;
1977 }
1978
1979 static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
1980 {
1981         unsigned int weight = 0;
1982
1983         /* SRIOV VF has same DMA32 weight as its PF */
1984 #ifdef CONFIG_PCI_IOV
1985         if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
1986                 pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
1987                 return weight;
1988         }
1989 #endif
1990
1991         if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
1992                 pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
1993         } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
1994                 struct pci_dev *pdev;
1995
1996                 list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
1997                         pnv_pci_ioda_dev_dma_weight(pdev, &weight);
1998         } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
1999                 pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
2000         }
2001
2002         return weight;
2003 }
2004
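/*
 * Set up the 32-bit DMA window of an IODA1 PE: claim a share of DMA32
 * segments based on the PE's DMA weight, back them with one contiguous
 * TCE table and map each segment in hardware.
 */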
2005 static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
2006                                        struct pnv_ioda_pe *pe)
2007 {
2008
2009         struct page *tce_mem = NULL;
2010         struct iommu_table *tbl;
2011         unsigned int weight, total_weight = 0;
2012         unsigned int tce32_segsz, base, segs, avail, i;
2013         int64_t rc;
2014         void *addr;
2015
2016         /* XXX FIXME: Handle 64-bit only DMA devices */
2017         /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
2018         /* XXX FIXME: Allocate multi-level tables on PHB3 */
2019         weight = pnv_pci_ioda_pe_dma_weight(pe);
2020         if (!weight)
2021                 return;
2022
2023         pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
2024                      &total_weight);
2025         segs = (weight * phb->ioda.dma32_count) / total_weight;
2026         if (!segs)
2027                 segs = 1;
2028
2029         /*
2030          * Allocate contiguous DMA32 segments. We begin with the expected
2031          * number of segments. On each failed attempt, the number of DMA32
2032          * segments requested is decreased by one, until the allocation
2033          * succeeds or no segments are left.
2034          */
2035         do {
2036                 for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
2037                         for (avail = 0, i = base; i < base + segs; i++) {
2038                                 if (phb->ioda.dma32_segmap[i] ==
2039                                     IODA_INVALID_PE)
2040                                         avail++;
2041                         }
2042
2043                         if (avail == segs)
2044                                 goto found;
2045                 }
2046         } while (--segs);
2047
2048         if (!segs) {
2049                 pe_warn(pe, "No available DMA32 segments\n");
2050                 return;
2051         }
2052
2053 found:
2054         tbl = pnv_pci_table_alloc(phb->hose->node);
2055         iommu_register_group(&pe->table_group, phb->hose->global_number,
2056                         pe->pe_number);
2057         pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
2058
2059         /* Grab a 32-bit TCE table */
2060         pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
2061                 weight, total_weight, base, segs);
2062         pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
2063                 base * PNV_IODA1_DMA32_SEGSIZE,
2064                 (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
2065
2066         /* XXX Currently, we allocate one big contiguous table for the
2067          * TCEs. We only really need one chunk per 256M of TCE space
2068          * (i.e. per segment) but that's an optimization for later; it
2069          * requires some added smarts with our get/put_tce implementation.
2070          *
2071          * Each TCE page is 4KB in size and each TCE entry occupies 8
2072          * bytes
2073          */
2074         tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
2075         tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
2076                                    get_order(tce32_segsz * segs));
2077         if (!tce_mem) {
2078                 pe_err(pe, " Failed to allocate 32-bit TCE memory\n");
2079                 goto fail;
2080         }
2081         addr = page_address(tce_mem);
2082         memset(addr, 0, tce32_segsz * segs);
2083
2084         /* Configure HW */
2085         for (i = 0; i < segs; i++) {
2086                 rc = opal_pci_map_pe_dma_window(phb->opal_id,
2087                                               pe->pe_number,
2088                                               base + i, 1,
2089                                               __pa(addr) + tce32_segsz * i,
2090                                               tce32_segsz, IOMMU_PAGE_SIZE_4K);
2091                 if (rc) {
2092                         pe_err(pe, " Failed to configure 32-bit TCE table,"
2093                                " err %ld\n", rc);
2094                         goto fail;
2095                 }
2096         }
2097
2098         /* Setup DMA32 segment mapping */
2099         for (i = base; i < base + segs; i++)
2100                 phb->ioda.dma32_segmap[i] = pe->pe_number;
2101
2102         /* Setup linux iommu table */
2103         pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
2104                                   base * PNV_IODA1_DMA32_SEGSIZE,
2105                                   IOMMU_PAGE_SHIFT_4K);
2106
2107         /* OPAL variant of P7IOC SW invalidated TCEs */
2108         if (phb->ioda.tce_inval_reg)
2109                 tbl->it_type |= (TCE_PCI_SWINV_CREATE |
2110                                  TCE_PCI_SWINV_FREE   |
2111                                  TCE_PCI_SWINV_PAIR);
2112
2113         tbl->it_ops = &pnv_ioda1_iommu_ops;
2114         pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
2115         pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
2116         iommu_init_table(tbl, phb->hose->node);
2117
2118         if (pe->flags & PNV_IODA_PE_DEV) {
2119                 /*
2120                  * Setting table base here only for carrying iommu_group
2121                  * further down to let iommu_add_device() do the job.
2122                  * pnv_pci_ioda_dma_dev_setup will override it later anyway.
2123                  */
2124                 set_iommu_table_base(&pe->pdev->dev, tbl);
2125                 iommu_add_device(&pe->pdev->dev);
2126         } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2127                 pnv_ioda_setup_bus_dma(pe, pe->pbus);
2128
2129         return;
2130  fail:
2131         /* XXX Failure: Try to fallback to 64-bit only ? */
2132         if (tce_mem)
2133                 __free_pages(tce_mem, get_order(tce32_segsz * segs));
2134         if (tbl) {
2135                 pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
2136                 iommu_free_table(tbl, "pnv");
2137         }
2138 }
2139
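/*
 * Install the given TCE table as DMA window @num of this PE and link it
 * into the PE's table group; window 0 is the default 32-bit window.
 */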
2140 static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
2141                 int num, struct iommu_table *tbl)
2142 {
2143         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2144                         table_group);
2145         struct pnv_phb *phb = pe->phb;
2146         int64_t rc;
2147         const unsigned long size = tbl->it_indirect_levels ?
2148                         tbl->it_level_size : tbl->it_size;
2149         const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
2150         const __u64 win_size = tbl->it_size << tbl->it_page_shift;
2151
2152         pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
2153                         start_addr, start_addr + win_size - 1,
2154                         IOMMU_PAGE_SIZE(tbl));
2155
2156         /*
2157          * Map TCE table through TVT. The TVE index is the PE number
2158          * shifted left by 1 bit for the 32-bit DMA space.
2159          */
2160         rc = opal_pci_map_pe_dma_window(phb->opal_id,
2161                         pe->pe_number,
2162                         (pe->pe_number << 1) + num,
2163                         tbl->it_indirect_levels + 1,
2164                         __pa(tbl->it_base),
2165                         size << 3,
2166                         IOMMU_PAGE_SIZE(tbl));
2167         if (rc) {
2168                 pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
2169                 return rc;
2170         }
2171
2172         pnv_pci_link_table_and_group(phb->hose->node, num,
2173                         tbl, &pe->table_group);
2174         pnv_pci_ioda2_tce_invalidate_pe(pe);
2175
2176         return 0;
2177 }
2178
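/*
 * Enable or disable the 64-bit bypass window (TVE #1), which maps PCI
 * addresses above tce_bypass_base straight onto system memory.
 */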
2179 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
2180 {
2181         uint16_t window_id = (pe->pe_number << 1) + 1;
2182         int64_t rc;
2183
2184         pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
2185         if (enable) {
2186                 phys_addr_t top = memblock_end_of_DRAM();
2187
2188                 top = roundup_pow_of_two(top);
2189                 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
2190                                                      pe->pe_number,
2191                                                      window_id,
2192                                                      pe->tce_bypass_base,
2193                                                      top);
2194         } else {
2195                 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
2196                                                      pe->pe_number,
2197                                                      window_id,
2198                                                      pe->tce_bypass_base,
2199                                                      0);
2200         }
2201         if (rc)
2202                 pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
2203         else
2204                 pe->tce_bypass_enabled = enable;
2205 }
2206
2207 static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
2208                 __u32 page_shift, __u64 window_size, __u32 levels,
2209                 struct iommu_table *tbl);
2210
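/*
 * Allocate an iommu_table for a new DMA window of this PE and populate
 * its (possibly multi-level) TCE pages; the window is not mapped into
 * hardware until pnv_pci_ioda2_set_window() is called.
 */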
2211 static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
2212                 int num, __u32 page_shift, __u64 window_size, __u32 levels,
2213                 struct iommu_table **ptbl)
2214 {
2215         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2216                         table_group);
2217         int nid = pe->phb->hose->node;
2218         __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
2219         long ret;
2220         struct iommu_table *tbl;
2221
2222         tbl = pnv_pci_table_alloc(nid);
2223         if (!tbl)
2224                 return -ENOMEM;
2225
2226         ret = pnv_pci_ioda2_table_alloc_pages(nid,
2227                         bus_offset, page_shift, window_size,
2228                         levels, tbl);
2229         if (ret) {
2230                 iommu_free_table(tbl, "pnv");
2231                 return ret;
2232         }
2233
2234         tbl->it_ops = &pnv_ioda2_iommu_ops;
2235         if (pe->phb->ioda.tce_inval_reg)
2236                 tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
2237
2238         *ptbl = tbl;
2239
2240         return 0;
2241 }
2242
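/*
 * Create and install the default DMA configuration for an IODA2 PE: a
 * 4K-page 32-bit TCE window sized to fit available memory, plus the
 * 64-bit bypass window unless it was disabled on the command line.
 */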
2243 static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
2244 {
2245         struct iommu_table *tbl = NULL;
2246         long rc;
2247
2248         /*
2249          * crashkernel= specifies the kdump kernel's maximum memory at
2250          * some offset and there is no guarantee the result is a power
2251          * of 2, which will cause errors later.
2252          */
2253         const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
2254
2255         /*
2256          * In memory constrained environments, e.g. kdump kernel, the
2257          * DMA window can be larger than available memory, which will
2258          * cause errors later.
2259          */
2260         const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
2261
2262         rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
2263                         IOMMU_PAGE_SHIFT_4K,
2264                         window_size,
2265                         POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
2266         if (rc) {
2267                 pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
2268                                 rc);
2269                 return rc;
2270         }
2271
2272         iommu_init_table(tbl, pe->phb->hose->node);
2273
2274         rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
2275         if (rc) {
2276                 pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
2277                                 rc);
2278                 pnv_ioda2_table_free(tbl);
2279                 return rc;
2280         }
2281
2282         if (!pnv_iommu_bypass_disabled)
2283                 pnv_pci_ioda2_set_bypass(pe, true);
2284
2285         /* OPAL variant of PHB3 invalidated TCEs */
2286         if (pe->phb->ioda.tce_inval_reg)
2287                 tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
2288
2289         /*
2290          * Setting table base here only for carrying iommu_group
2291          * further down to let iommu_add_device() do the job.
2292          * pnv_pci_ioda_dma_dev_setup will override it later anyway.
2293          */
2294         if (pe->flags & PNV_IODA_PE_DEV)
2295                 set_iommu_table_base(&pe->pdev->dev, tbl);
2296
2297         return 0;
2298 }
2299
2300 #if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
2301 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
2302                 int num)
2303 {
2304         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2305                         table_group);
2306         struct pnv_phb *phb = pe->phb;
2307         long ret;
2308
2309         pe_info(pe, "Removing DMA window #%d\n", num);
2310
2311         ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
2312                         (pe->pe_number << 1) + num,
2313                         0/* levels */, 0/* table address */,
2314                         0/* table size */, 0/* page size */);
2315         if (ret)
2316                 pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
2317         else
2318                 pnv_pci_ioda2_tce_invalidate_pe(pe);
2319
2320         pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
2321
2322         return ret;
2323 }
2324 #endif
2325
2326 #ifdef CONFIG_IOMMU_API
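/*
 * Estimate how much memory a TCE table with the given geometry would
 * consume, including all intermediate levels, so callers can account
 * for it before actually creating the table.
 */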
2327 static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
2328                 __u64 window_size, __u32 levels)
2329 {
2330         unsigned long bytes = 0;
2331         const unsigned window_shift = ilog2(window_size);
2332         unsigned entries_shift = window_shift - page_shift;
2333         unsigned table_shift = entries_shift + 3;
2334         unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
2335         unsigned long direct_table_size;
2336
2337         if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
2338                         (window_size > memory_hotplug_max()) ||
2339                         !is_power_of_2(window_size))
2340                 return 0;
2341
2342         /* Calculate a direct table size from window_size and levels */
2343         entries_shift = (entries_shift + levels - 1) / levels;
2344         table_shift = entries_shift + 3;
2345         table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
2346         direct_table_size = 1UL << table_shift;
2347
2348         for ( ; levels; --levels) {
2349                 bytes += _ALIGN_UP(tce_table_size, direct_table_size);
2350
2351                 tce_table_size /= direct_table_size;
2352                 tce_table_size <<= 3;
2353                 tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
2354         }
2355
2356         return bytes;
2357 }
2358
2359 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
2360 {
2361         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2362                                                 table_group);
2363         /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
2364         struct iommu_table *tbl = pe->table_group.tables[0];
2365
2366         pnv_pci_ioda2_set_bypass(pe, false);
2367         pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2368         pnv_ioda2_table_free(tbl);
2369 }
2370
2371 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
2372 {
2373         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2374                                                 table_group);
2375
2376         pnv_pci_ioda2_setup_default_config(pe);
2377 }
2378
2379 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
2380         .get_table_size = pnv_pci_ioda2_get_table_size,
2381         .create_table = pnv_pci_ioda2_create_table,
2382         .set_window = pnv_pci_ioda2_set_window,
2383         .unset_window = pnv_pci_ioda2_unset_window,
2384         .take_ownership = pnv_ioda2_take_ownership,
2385         .release_ownership = pnv_ioda2_release_ownership,
2386 };
2387
2388 static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
2389 {
2390         struct pci_controller *hose;
2391         struct pnv_phb *phb;
2392         struct pnv_ioda_pe **ptmppe = opaque;
2393         struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
2394         struct pci_dn *pdn = pci_get_pdn(pdev);
2395
2396         if (!pdn || pdn->pe_number == IODA_INVALID_PE)
2397                 return 0;
2398
2399         hose = pci_bus_to_host(pdev->bus);
2400         phb = hose->private_data;
2401         if (phb->type != PNV_PHB_NPU)
2402                 return 0;
2403
2404         *ptmppe = &phb->ioda.pe_array[pdn->pe_number];
2405
2406         return 1;
2407 }
2408
2409 /*
2410  * This returns the PE of the associated NPU.
2411  * This assumes that the NPU is in the same IOMMU group as the GPU and that
2412  * there are no other PEs.
2413  */
2414 static struct pnv_ioda_pe *gpe_table_group_to_npe(
2415                 struct iommu_table_group *table_group)
2416 {
2417         struct pnv_ioda_pe *npe = NULL;
2418         int ret = iommu_group_for_each_dev(table_group->group, &npe,
2419                         gpe_table_group_to_npe_cb);
2420
2421         BUG_ON(!ret || !npe);
2422
2423         return npe;
2424 }
2425
2426 static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
2427                 int num, struct iommu_table *tbl)
2428 {
2429         long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
2430
2431         if (ret)
2432                 return ret;
2433
2434         ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl);
2435         if (ret)
2436                 pnv_pci_ioda2_unset_window(table_group, num);
2437
2438         return ret;
2439 }
2440
2441 static long pnv_pci_ioda2_npu_unset_window(
2442                 struct iommu_table_group *table_group,
2443                 int num)
2444 {
2445         long ret = pnv_pci_ioda2_unset_window(table_group, num);
2446
2447         if (ret)
2448                 return ret;
2449
2450         return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num);
2451 }
2452
2453 static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
2454 {
2455         /*
2456          * Detach NPU first as pnv_ioda2_take_ownership() will destroy
2457          * the iommu_table if 32bit DMA is enabled.
2458          */
2459         pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
2460         pnv_ioda2_take_ownership(table_group);
2461 }
2462
2463 static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
2464         .get_table_size = pnv_pci_ioda2_get_table_size,
2465         .create_table = pnv_pci_ioda2_create_table,
2466         .set_window = pnv_pci_ioda2_npu_set_window,
2467         .unset_window = pnv_pci_ioda2_npu_unset_window,
2468         .take_ownership = pnv_ioda2_npu_take_ownership,
2469         .release_ownership = pnv_ioda2_release_ownership,
2470 };
2471
2472 static void pnv_pci_ioda_setup_iommu_api(void)
2473 {
2474         struct pci_controller *hose, *tmp;
2475         struct pnv_phb *phb;
2476         struct pnv_ioda_pe *pe, *gpe;
2477
2478         /*
2479          * Now that all PHBs have been discovered, add NPU devices to
2480          * the corresponding IOMMU groups.
2481          */
2482         list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
2483                 phb = hose->private_data;
2484
2485                 if (phb->type != PNV_PHB_NPU)
2486                         continue;
2487
2488                 list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2489                         gpe = pnv_pci_npu_setup_iommu(pe);
2490                         if (gpe)
2491                                 gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
2492                 }
2493         }
2494 }
2495 #else /* !CONFIG_IOMMU_API */
2496 static void pnv_pci_ioda_setup_iommu_api(void) { }
2497 #endif
2498
2499 static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
2500 {
2501         const __be64 *swinvp;
2502
2503         /* OPAL variant of PHB3 invalidated TCEs */
2504         swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
2505         if (!swinvp)
2506                 return;
2507
2508         phb->ioda.tce_inval_reg_phys = be64_to_cpup(swinvp);
2509         phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
2510 }
2511
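/*
 * Recursively allocate the pages backing a (multi-level) TCE table:
 * each non-leaf level stores physical pointers to the next level, and
 * allocation stops once the requested table size has been covered.
 */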
2512 static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
2513                 unsigned levels, unsigned long limit,
2514                 unsigned long *current_offset, unsigned long *total_allocated)
2515 {
2516         struct page *tce_mem = NULL;
2517         __be64 *addr, *tmp;
2518         unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
2519         unsigned long allocated = 1UL << (order + PAGE_SHIFT);
2520         unsigned entries = 1UL << (shift - 3);
2521         long i;
2522
2523         tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
2524         if (!tce_mem) {
2525                 pr_err("Failed to allocate TCE memory, order=%d\n", order);
2526                 return NULL;
2527         }
2528         addr = page_address(tce_mem);
2529         memset(addr, 0, allocated);
2530         *total_allocated += allocated;
2531
2532         --levels;
2533         if (!levels) {
2534                 *current_offset += allocated;
2535                 return addr;
2536         }
2537
2538         for (i = 0; i < entries; ++i) {
2539                 tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
2540                                 levels, limit, current_offset, total_allocated);
2541                 if (!tmp)
2542                         break;
2543
2544                 addr[i] = cpu_to_be64(__pa(tmp) |
2545                                 TCE_PCI_READ | TCE_PCI_WRITE);
2546
2547                 if (*current_offset >= limit)
2548                         break;
2549         }
2550
2551         return addr;
2552 }
2553
2554 static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
2555                 unsigned long size, unsigned level);
2556
2557 static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
2558                 __u32 page_shift, __u64 window_size, __u32 levels,
2559                 struct iommu_table *tbl)
2560 {
2561         void *addr;
2562         unsigned long offset = 0, level_shift, total_allocated = 0;
2563         const unsigned window_shift = ilog2(window_size);
2564         unsigned entries_shift = window_shift - page_shift;
2565         unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
2566         const unsigned long tce_table_size = 1UL << table_shift;
2567
2568         if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
2569                 return -EINVAL;
2570
2571         if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
2572                 return -EINVAL;
2573
2574         /* Adjust direct table size from window_size and levels */
2575         entries_shift = (entries_shift + levels - 1) / levels;
2576         level_shift = entries_shift + 3;
2577         level_shift = max_t(unsigned, level_shift, PAGE_SHIFT);
2578
2579         /* Allocate TCE table */
2580         addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
2581                         levels, tce_table_size, &offset, &total_allocated);
2582
2583         /* addr==NULL means that the first level allocation failed */
2584         if (!addr)
2585                 return -ENOMEM;
2586
2587         /*
2588          * The first level was allocated but some lower level failed, so
2589          * we did not allocate as much as we wanted;
2590          * release the partially allocated table.
2591          */
2592         if (offset < tce_table_size) {
2593                 pnv_pci_ioda2_table_do_free_pages(addr,
2594                                 1ULL << (level_shift - 3), levels - 1);
2595                 return -ENOMEM;
2596         }
2597
2598         /* Setup linux iommu table */
2599         pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
2600                         page_shift);
2601         tbl->it_level_size = 1ULL << (level_shift - 3);
2602         tbl->it_indirect_levels = levels - 1;
2603         tbl->it_allocated_size = total_allocated;
2604
2605         pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
2606                         window_size, tce_table_size, bus_offset);
2607
2608         return 0;
2609 }
2610
2611 static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
2612                 unsigned long size, unsigned level)
2613 {
2614         const unsigned long addr_ul = (unsigned long) addr &
2615                         ~(TCE_PCI_READ | TCE_PCI_WRITE);
2616
2617         if (level) {
2618                 long i;
2619                 u64 *tmp = (u64 *) addr_ul;
2620
2621                 for (i = 0; i < size; ++i) {
2622                         unsigned long hpa = be64_to_cpu(tmp[i]);
2623
2624                         if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
2625                                 continue;
2626
2627                         pnv_pci_ioda2_table_do_free_pages(__va(hpa), size,
2628                                         level - 1);
2629                 }
2630         }
2631
2632         free_pages(addr_ul, get_order(size << 3));
2633 }
2634
2635 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
2636 {
2637         const unsigned long size = tbl->it_indirect_levels ?
2638                         tbl->it_level_size : tbl->it_size;
2639
2640         if (!tbl->it_size)
2641                 return;
2642
2643         pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
2644                         tbl->it_indirect_levels);
2645 }
2646
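/*
 * Initialise DMA for an IODA2 PE: describe the supported window
 * geometry in the table group, register the IOMMU group and install
 * the default 32-bit window and bypass configuration.
 */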
2647 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2648                                        struct pnv_ioda_pe *pe)
2649 {
2650         int64_t rc;
2651
2652         if (!pnv_pci_ioda_pe_dma_weight(pe))
2653                 return;
2654
2655         /* TVE #1 is selected by PCI address bit 59 */
2656         pe->tce_bypass_base = 1ull << 59;
2657
2658         iommu_register_group(&pe->table_group, phb->hose->global_number,
2659                         pe->pe_number);
2660
2661         /* The PE will reserve all possible 32-bit space */
2662         pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
2663                 phb->ioda.m32_pci_base);
2664
2665         /* Setup linux iommu table */
2666         pe->table_group.tce32_start = 0;
2667         pe->table_group.tce32_size = phb->ioda.m32_pci_base;
2668         pe->table_group.max_dynamic_windows_supported =
2669                         IOMMU_TABLE_GROUP_MAX_TABLES;
2670         pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
2671         pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M;
2672 #ifdef CONFIG_IOMMU_API
2673         pe->table_group.ops = &pnv_pci_ioda2_ops;
2674 #endif
2675
2676         rc = pnv_pci_ioda2_setup_default_config(pe);
2677         if (rc)
2678                 return;
2679
2680         if (pe->flags & PNV_IODA_PE_DEV)
2681                 iommu_add_device(&pe->pdev->dev);
2682         else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2683                 pnv_ioda_setup_bus_dma(pe, pe->pbus);
2684 }
2685
2686 #ifdef CONFIG_PCI_MSI
2687 static void pnv_ioda2_msi_eoi(struct irq_data *d)
2688 {
2689         unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
2690         struct irq_chip *chip = irq_data_get_irq_chip(d);
2691         struct pnv_phb *phb = container_of(chip, struct pnv_phb,
2692                                            ioda.irq_chip);
2693         int64_t rc;
2694
2695         rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
2696         WARN_ON_ONCE(rc);
2697
2698         icp_native_eoi(d);
2699 }
2700
2701
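/*
 * On IODA2 PHBs, MSI EOIs must go through OPAL, so clone the core
 * irq_chip once per PHB and override its EOI handler with
 * pnv_ioda2_msi_eoi().
 */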
2702 static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
2703 {
2704         struct irq_data *idata;
2705         struct irq_chip *ichip;
2706
2707         if (phb->type != PNV_PHB_IODA2)
2708                 return;
2709
2710         if (!phb->ioda.irq_chip_init) {
2711                 /*
2712                  * The first time we set up an MSI IRQ, we need to set up the
2713                  * corresponding IRQ chip so it routes correctly.
2714                  */
2715                 idata = irq_get_irq_data(virq);
2716                 ichip = irq_data_get_irq_chip(idata);
2717                 phb->ioda.irq_chip_init = 1;
2718                 phb->ioda.irq_chip = *ichip;
2719                 phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
2720         }
2721         irq_set_chip(virq, &phb->ioda.irq_chip);
2722 }
2723
2724 #ifdef CONFIG_CXL_BASE
2725
2726 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
2727 {
2728         struct pci_controller *hose = pci_bus_to_host(dev->bus);
2729
2730         return of_node_get(hose->dn);
2731 }
2732 EXPORT_SYMBOL(pnv_pci_get_phb_node);
2733
2734 int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
2735 {
2736         struct pci_controller *hose = pci_bus_to_host(dev->bus);
2737         struct pnv_phb *phb = hose->private_data;
2738         struct pnv_ioda_pe *pe;
2739         int rc;
2740
2741         pe = pnv_ioda_get_pe(dev);
2742         if (!pe)
2743                 return -ENODEV;
2744
2745         pe_info(pe, "Switching PHB to CXL\n");
2746
2747         rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number);
2748         if (rc == OPAL_UNSUPPORTED)
2749                 dev_err(&dev->dev, "Required cxl mode not supported by firmware - update skiboot\n");
2750         else if (rc)
2751                 dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc);
2752
2753         return rc;
2754 }
2755 EXPORT_SYMBOL(pnv_phb_to_cxl_mode);
2756
2757 /* Find the PHB for a cxl device and allocate MSI hwirqs.
2758  * Returns the absolute hardware IRQ number.
2759  */
2760 int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num)
2761 {
2762         struct pci_controller *hose = pci_bus_to_host(dev->bus);
2763         struct pnv_phb *phb = hose->private_data;
2764         int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num);
2765
2766         if (hwirq < 0) {
2767                 dev_warn(&dev->dev, "Failed to find a free MSI\n");
2768                 return -ENOSPC;
2769         }
2770
2771         return phb->msi_base + hwirq;
2772 }
2773 EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs);
2774
2775 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num)
2776 {
2777         struct pci_controller *hose = pci_bus_to_host(dev->bus);
2778         struct pnv_phb *phb = hose->private_data;
2779
2780         msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num);
2781 }
2782 EXPORT_SYMBOL(pnv_cxl_release_hwirqs);
2783
2784 void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs,
2785                                   struct pci_dev *dev)
2786 {
2787         struct pci_controller *hose = pci_bus_to_host(dev->bus);
2788         struct pnv_phb *phb = hose->private_data;
2789         int i, hwirq;
2790
2791         for (i = 1; i < CXL_IRQ_RANGES; i++) {
2792                 if (!irqs->range[i])
2793                         continue;
2794                 pr_devel("cxl release irq range 0x%x: offset: 0x%lx  limit: %ld\n",
2795                          i, irqs->offset[i],
2796                          irqs->range[i]);
2797                 hwirq = irqs->offset[i] - phb->msi_base;
2798                 msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq,
2799                                        irqs->range[i]);
2800         }
2801 }
2802 EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges);
2803
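/*
 * Allocate up to CXL_IRQ_RANGES - 1 ranges of MSI hwirqs for a cxl
 * device, halving the requested range size on failure until it fits;
 * range 0 is reserved for the multiplexed PSL DSI interrupt.
 */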
2804 int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
2805                                struct pci_dev *dev, int num)
2806 {
2807         struct pci_controller *hose = pci_bus_to_host(dev->bus);
2808         struct pnv_phb *phb = hose->private_data;
2809         int i, hwirq, try;
2810
2811         memset(irqs, 0, sizeof(struct cxl_irq_ranges));
2812
2813         /* 0 is reserved for the multiplexed PSL DSI interrupt */
2814         for (i = 1; i < CXL_IRQ_RANGES && num; i++) {
2815                 try = num;
2816                 while (try) {
2817                         hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try);
2818                         if (hwirq >= 0)
2819                                 break;
2820                         try /= 2;
2821                 }
2822                 if (!try)
2823                         goto fail;
2824
2825                 irqs->offset[i] = phb->msi_base + hwirq;
2826                 irqs->range[i] = try;
2827                 pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx  limit: %li\n",
2828                          i, irqs->offset[i], irqs->range[i]);
2829                 num -= try;
2830         }
2831         if (num)
2832                 goto fail;
2833
2834         return 0;
2835 fail:
2836         pnv_cxl_release_hwirq_ranges(irqs, dev);
2837         return -ENOSPC;
2838 }
2839 EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges);
2840
2841 int pnv_cxl_get_irq_count(struct pci_dev *dev)
2842 {
2843         struct pci_controller *hose = pci_bus_to_host(dev->bus);
2844         struct pnv_phb *phb = hose->private_data;
2845
2846         return phb->msi_bmp.irq_count;
2847 }
2848 EXPORT_SYMBOL(pnv_cxl_get_irq_count);
2849
2850 int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
2851                            unsigned int virq)
2852 {
2853         struct pci_controller *hose = pci_bus_to_host(dev->bus);
2854         struct pnv_phb *phb = hose->private_data;
2855         unsigned int xive_num = hwirq - phb->msi_base;
2856         struct pnv_ioda_pe *pe;
2857         int rc;
2858
2859         if (!(pe = pnv_ioda_get_pe(dev)))
2860                 return -ENODEV;
2861
2862         /* Assign XIVE to PE */
2863         rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2864         if (rc) {
2865                 pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x "
2866                         "hwirq 0x%x XIVE 0x%x PE\n",
2867                         pci_name(dev), rc, phb->msi_base, hwirq, xive_num);
2868                 return -EIO;
2869         }
2870         set_msi_irq_chip(phb, virq);
2871
2872         return 0;
2873 }
2874 EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup);
2875 #endif
2876
2877 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
2878                                   unsigned int hwirq, unsigned int virq,
2879                                   unsigned int is_64, struct msi_msg *msg)
2880 {
2881         struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
2882         unsigned int xive_num = hwirq - phb->msi_base;
2883         __be32 data;
2884         int rc;
2885
2886         /* No PE assigned ? bail out ... no MSI for you ! */
2887         if (pe == NULL)
2888                 return -ENXIO;
2889
2890         /* Check if we have an MVE */
2891         if (pe->mve_number < 0)
2892                 return -ENXIO;
2893
2894         /* Force 32-bit MSI on some broken devices */
2895         if (dev->no_64bit_msi)
2896                 is_64 = 0;
2897
2898         /* Assign XIVE to PE */
2899         rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2900         if (rc) {
2901                 pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
2902                         pci_name(dev), rc, xive_num);
2903                 return -EIO;
2904         }
2905
2906         if (is_64) {
2907                 __be64 addr64;
2908
2909                 rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
2910                                      &addr64, &data);
2911                 if (rc) {
2912                         pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
2913                                 pci_name(dev), rc);
2914                         return -EIO;
2915                 }
2916                 msg->address_hi = be64_to_cpu(addr64) >> 32;
2917                 msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
2918         } else {
2919                 __be32 addr32;
2920
2921                 rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
2922                                      &addr32, &data);
2923                 if (rc) {
2924                         pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
2925                                 pci_name(dev), rc);
2926                         return -EIO;
2927                 }
2928                 msg->address_hi = 0;
2929                 msg->address_lo = be32_to_cpu(addr32);
2930         }
2931         msg->data = be32_to_cpu(data);
2932
2933         set_msi_irq_chip(phb, virq);
2934
2935         pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
2936                  " address=%x_%08x data=%x PE# %d\n",
2937                  pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
2938                  msg->address_hi, msg->address_lo, data, pe->pe_number);
2939
2940         return 0;
2941 }
2942
2943 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
2944 {
2945         unsigned int count;
2946         const __be32 *prop = of_get_property(phb->hose->dn,
2947                                              "ibm,opal-msi-ranges", NULL);
2948         if (!prop) {
2949                 /* BML Fallback */
2950                 prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
2951         }
2952         if (!prop)
2953                 return;
2954
2955         phb->msi_base = be32_to_cpup(prop);
2956         count = be32_to_cpup(prop + 1);
2957         if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
2958                 pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
2959                        phb->hose->global_number);
2960                 return;
2961         }
2962
2963         phb->msi_setup = pnv_pci_ioda_msi_setup;
2964         phb->msi32_support = 1;
2965         pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
2966                 count, phb->msi_base);
2967 }
2968 #else
2969 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
2970 #endif /* CONFIG_PCI_MSI */
2971
2972 #ifdef CONFIG_PCI_IOV
2973 static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
2974 {
2975         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
2976         struct pnv_phb *phb = hose->private_data;
2977         const resource_size_t gate = phb->ioda.m64_segsize >> 2;
2978         struct resource *res;
2979         int i;
2980         resource_size_t size, total_vf_bar_sz;
2981         struct pci_dn *pdn;
2982         int mul, total_vfs;
2983
2984         if (!pdev->is_physfn || pdev->is_added)
2985                 return;
2986
2987         pdn = pci_get_pdn(pdev);
2988         pdn->vfs_expanded = 0;
2989         pdn->m64_single_mode = false;
2990
2991         total_vfs = pci_sriov_get_totalvfs(pdev);
2992         mul = phb->ioda.total_pe_num;
2993         total_vf_bar_sz = 0;
2994
2995         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2996                 res = &pdev->resource[i + PCI_IOV_RESOURCES];
2997                 if (!res->flags || res->parent)
2998                         continue;
2999                 if (!pnv_pci_is_mem_pref_64(res->flags)) {
3000                 dev_warn(&pdev->dev, "SR-IOV is not supported with"
3001                                 " non-M64 VF BAR%d: %pR\n",
3002                                  i, res);
3003                         goto truncate_iov;
3004                 }
3005
3006                 total_vf_bar_sz += pci_iov_resource_size(pdev,
3007                                 i + PCI_IOV_RESOURCES);
3008
3009                 /*
3010                  * If the total VF BAR size is bigger than a quarter of the M64
3011                  * segment size, just round the number of VFs up to a power of two.
3012                  *
3013                  * Generally, one M64 BAR maps one IOV BAR. To avoid conflicts with
3014                  * other devices, the IOV BAR size is expanded to
3015                  * (total_pe * VF_BAR_size). When VF_BAR_size is half of the M64
3016                  * segment size, the expanded size would equal half of the whole M64
3017                  * space, which would exhaust the M64 space and limit the system's
3018                  * flexibility. Hence the design decision to set the boundary at a
3019                  * quarter of the M64 segment size.
3020                  */
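                /*
                 * Illustrative numbers (hypothetical, not taken from any
                 * particular device): with total_pe_num = 256 and an M64
                 * segment size of 256MB the gate is 64MB, so a device whose
                 * per-VF BARs sum to 128MB trips the gate; the expansion
                 * factor then becomes roundup_pow_of_two(total_vfs) and
                 * single PE mode is used.
                 */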
3021                 if (total_vf_bar_sz > gate) {
3022                         mul = roundup_pow_of_two(total_vfs);
3023                         dev_info(&pdev->dev,
3024                                 "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
3025                                 total_vf_bar_sz, gate, mul);
3026                         pdn->m64_single_mode = true;
3027                         break;
3028                 }
3029         }
3030
3031         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
3032                 res = &pdev->resource[i + PCI_IOV_RESOURCES];
3033                 if (!res->flags || res->parent)
3034                         continue;
3035
3036                 size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
3037                 /*
3038                  * On PHB3, the minimum size alignment of M64 BAR in single
3039                  * mode is 32MB.
3040                  */
3041                 if (pdn->m64_single_mode && (size < SZ_32M))
3042                         goto truncate_iov;
3043                 dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
3044                 res->end = res->start + size * mul - 1;
3045                 dev_dbg(&pdev->dev, "                       %pR\n", res);
3046                 dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)\n",
3047                          i, res, mul);
3048         }
3049         pdn->vfs_expanded = mul;
3050
3051         return;
3052
3053 truncate_iov:
3054         /* To save MMIO space, truncate the IOV BARs: clear the flags and force a zero size. */
3055         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
3056                 res = &pdev->resource[i + PCI_IOV_RESOURCES];
3057                 res->flags = 0;
3058                 res->end = res->start - 1;
3059         }
3060 }
3061 #endif /* CONFIG_PCI_IOV */
3062
3063 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
3064                                   struct resource *res)
3065 {
3066         struct pnv_phb *phb = pe->phb;
3067         struct pci_bus_region region;
3068         int index;
3069         int64_t rc;
3070
3071         if (!res || !res->flags || res->start > res->end)
3072                 return;
3073
3074         if (res->flags & IORESOURCE_IO) {
3075                 region.start = res->start - phb->ioda.io_pci_base;
3076                 region.end   = res->end - phb->ioda.io_pci_base;
3077                 index = region.start / phb->ioda.io_segsize;
3078
3079                 while (index < phb->ioda.total_pe_num &&
3080                        region.start <= region.end) {
3081                         phb->ioda.io_segmap[index] = pe->pe_number;
3082                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3083                                 pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
3084                         if (rc != OPAL_SUCCESS) {
3085                                 pr_err("%s: Error %lld mapping IO segment#%d to PE#%d\n",
3086                                        __func__, rc, index, pe->pe_number);
3087                                 break;
3088                         }
3089
3090                         region.start += phb->ioda.io_segsize;
3091                         index++;
3092                 }
3093         } else if ((res->flags & IORESOURCE_MEM) &&
3094                    !pnv_pci_is_mem_pref_64(res->flags)) {
3095                 region.start = res->start -
3096                                phb->hose->mem_offset[0] -
3097                                phb->ioda.m32_pci_base;
3098                 region.end   = res->end -
3099                                phb->hose->mem_offset[0] -
3100                                phb->ioda.m32_pci_base;
3101                 index = region.start / phb->ioda.m32_segsize;
3102
3103                 while (index < phb->ioda.total_pe_num &&
3104                        region.start <= region.end) {
3105                         phb->ioda.m32_segmap[index] = pe->pe_number;
3106                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3107                                 pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
3108                         if (rc != OPAL_SUCCESS) {
3109                                 pr_err("%s: Error %lld mapping M32 segment#%d to PE#%d\n",
3110                                        __func__, rc, index, pe->pe_number);
3111                                 break;
3112                         }
3113
3114                         region.start += phb->ioda.m32_segsize;
3115                         index++;
3116                 }
3117         }
3118 }
3119
3120 /*
3121  * This function is supposed to be called on PEs from top to
3122  * bottom, so that the I/O or MMIO segments assigned to a parent
3123  * PE can be overridden by its child PEs if necessary.
3124  */
3125 static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
3126 {
3127         struct pci_dev *pdev;
3128         int i;
3129
3130         /*
3131          * NOTE: We only handle PCI-bus-based PEs for now. PEs based
3132          * on a PCI device, for example an SR-IOV VF, will be figured
3133          * out later.
3134          */
3135         BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
3136
3137         list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
3138                 for (i = 0; i <= PCI_ROM_RESOURCE; i++)
3139                         pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
3140
3141                 /*
3142                  * If the PE contains all subordinate PCI buses, the
3143                  * windows of the child bridges should be mapped to
3144                  * the PE as well.
3145                  */
3146                 if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
3147                         continue;
3148                 for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
3149                         pnv_ioda_setup_pe_res(pe,
3150                                 &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
3151         }
3152 }
3153
3154 static void pnv_pci_ioda_create_dbgfs(void)
3155 {
3156 #ifdef CONFIG_DEBUG_FS
3157         struct pci_controller *hose, *tmp;
3158         struct pnv_phb *phb;
3159         char name[16];
3160
3161         list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
3162                 phb = hose->private_data;
3163
3164                 /* Mark PHB initialization as done */
3165                 phb->initialized = 1;
3166
3167                 sprintf(name, "PCI%04x", hose->global_number);
3168                 phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
3169                 if (!phb->dbgfs)
3170                         pr_warning("%s: Error on creating debugfs on PHB#%x\n",
3171                                 __func__, hose->global_number);
3172         }
3173 #endif /* CONFIG_DEBUG_FS */
3174 }
3175
3176 static void pnv_pci_ioda_fixup(void)
3177 {
3178         pnv_pci_ioda_setup_PEs();
3179         pnv_pci_ioda_setup_iommu_api();
3180         pnv_pci_ioda_create_dbgfs();
3181
3182 #ifdef CONFIG_EEH
3183         eeh_init();
3184         eeh_addr_cache_build();
3185 #endif
3186 }
3187
3188 /*
3189  * Returns the alignment for I/O or memory windows of P2P
3190  * bridges. That actually depends on how PEs are segmented.
3191  * For now, we return the I/O or M32 segment size for
3192  * PE-sensitive P2P bridges. Otherwise, the default values
3193  * (4KiB for I/O, 1MiB for memory) are returned.
3194  *
3195  * The current PCI bus might be put into one PE, which was
3196  * created against the parent PCI bridge. In that case, we
3197  * needn't enlarge the alignment, which saves some
3198  * resources.
3199  */
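/*
 * For example (sizes are illustrative): a bus sitting behind two or more
 * PCIe-to-PCI bridges gets an alignment of 1 (no extra constraint), while
 * a prefetchable 64-bit window on a PHB with a 256MB m64_segsize is
 * aligned to 256MB, other memory windows to m32_segsize and I/O windows
 * to io_segsize.
 */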
3200 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
3201                                                 unsigned long type)
3202 {
3203         struct pci_dev *bridge;
3204         struct pci_controller *hose = pci_bus_to_host(bus);
3205         struct pnv_phb *phb = hose->private_data;
3206         int num_pci_bridges = 0;
3207
3208         bridge = bus->self;
3209         while (bridge) {
3210                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
3211                         num_pci_bridges++;
3212                         if (num_pci_bridges >= 2)
3213                                 return 1;
3214                 }
3215
3216                 bridge = bridge->bus->self;
3217         }
3218
3219         /* We fall back to M32 if M64 isn't supported */
3220         if (phb->ioda.m64_segsize &&
3221             pnv_pci_is_mem_pref_64(type))
3222                 return phb->ioda.m64_segsize;
3223         if (type & IORESOURCE_MEM)
3224                 return phb->ioda.m32_segsize;
3225
3226         return phb->ioda.io_segsize;
3227 }
3228
3229 /*
3230  * We update the root port, or the upstream port of the
3231  * bridge behind the root port, with the PHB's windows in
3232  * order to accommodate changes in the required resources
3233  * during PCI (slot) hotplug. The hotplug slot is connected
3234  * to either the root port or the downstream ports of a
3235  * PCIe switch behind the root port.
3236  */
3237 static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
3238                                            unsigned long type)
3239 {
3240         struct pci_controller *hose = pci_bus_to_host(bus);
3241         struct pnv_phb *phb = hose->private_data;
3242         struct pci_dev *bridge = bus->self;
3243         struct resource *r, *w;
3244         bool msi_region = false;
3245         int i;
3246
3247         /* Check if we need to apply the fixup to the bridge's windows */
3248         if (!pci_is_root_bus(bridge->bus) &&
3249             !pci_is_root_bus(bridge->bus->self->bus))
3250                 return;
3251
3252         /* Fixup the resources */
3253         for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
3254                 r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
3255                 if (!r->flags || !r->parent)
3256                         continue;
3257
3258                 w = NULL;
3259                 if (r->flags & type & IORESOURCE_IO)
3260                         w = &hose->io_resource;
3261                 else if (pnv_pci_is_mem_pref_64(r->flags) &&
3262                          (type & IORESOURCE_PREFETCH) &&
3263                          phb->ioda.m64_segsize)
3264                         w = &hose->mem_resources[1];
3265                 else if (r->flags & type & IORESOURCE_MEM) {
3266                         w = &hose->mem_resources[0];
3267                         msi_region = true;
3268                 }
3269
3270                 r->start = w->start;
3271                 r->end = w->end;
3272
3273                 /* The 64KB 32-bit MSI region shouldn't be included in the
3274                  * 32-bit bridge window; otherwise we can see strange issues,
3275                  * such as an EEH error observed on Garrison.
3276                  *
3277                  * Exclude the top 1MB region, the minimal alignment of the 32-bit
3278                  * bridge window: the net effect is trimming 0xF0000 off the end.
3279                  */
3280                 if (msi_region) {
3281                         r->end += 0x10000;
3282                         r->end -= 0x100000;
3283                 }
3284         }
3285 }
3286
3287 static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
3288 {
3289         struct pci_controller *hose = pci_bus_to_host(bus);
3290         struct pnv_phb *phb = hose->private_data;
3291         struct pci_dev *bridge = bus->self;
3292         struct pnv_ioda_pe *pe;
3293         bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
3294
3295         /* Extend bridge's windows if necessary */
3296         pnv_pci_fixup_bridge_resources(bus, type);
3297
3298         /* The PE for the root bus should be realized before any others */
3299         if (!phb->ioda.root_pe_populated) {
3300                 pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
3301                 if (pe) {
3302                         phb->ioda.root_pe_idx = pe->pe_number;
3303                         phb->ioda.root_pe_populated = true;
3304                 }
3305         }
3306
3307         /* Don't assign a PE to a PCI bus that doesn't have any subordinate devices */
3308         if (list_empty(&bus->devices))
3309                 return;
3310
3311         /* Reserve PEs according to used M64 resources */
3312         if (phb->reserve_m64_pe)
3313                 phb->reserve_m64_pe(bus, NULL, all);
3314
3315         /*
3316          * Assign a PE. We might get here because of partial hotplug;
3317          * in that case, we just pick up the existing PE and should
3318          * not allocate resources again.
3319          */
3320         pe = pnv_ioda_setup_bus_PE(bus, all);
3321         if (!pe)
3322                 return;
3323
3324         pnv_ioda_setup_pe_seg(pe);
3325         switch (phb->type) {
3326         case PNV_PHB_IODA1:
3327                 pnv_pci_ioda1_setup_dma_pe(phb, pe);
3328                 break;
3329         case PNV_PHB_IODA2:
3330                 pnv_pci_ioda2_setup_dma_pe(phb, pe);
3331                 break;
3332         default:
3333                 pr_warn("%s: No DMA for PHB#%d (type %d)\n",
3334                         __func__, phb->hose->global_number, phb->type);
3335         }
3336 }
3337
3338 #ifdef CONFIG_PCI_IOV
3339 static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
3340                                                       int resno)
3341 {
3342         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3343         struct pnv_phb *phb = hose->private_data;
3344         struct pci_dn *pdn = pci_get_pdn(pdev);
3345         resource_size_t align;
3346
3347         /*
3348          * On the PowerNV platform, an IOV BAR is mapped by an M64 BAR to
3349          * enable SR-IOV, and from the hardware's perspective the range
3350          * mapped by an M64 BAR must be size-aligned.
3351          *
3352          * When the IOV BAR is mapped by an M64 BAR in Single PE mode, that
3353          * powernv-specific hardware restriction is gone. But if we just use
3354          * the VF BAR size as the alignment, the PF BAR and a VF BAR may be
3355          * allocated within one segment of M64 #15, which introduces a PE
3356          * conflict between the PF and the VF. Based on this, the minimum
3357          * alignment of an IOV BAR is m64_segsize.
3358          *
3359          * This function returns the total IOV BAR size if the M64 BAR is in
3360          * Shared PE mode, or just the VF BAR size if not.
3361          * If the M64 BAR is in Single PE mode, it returns the VF BAR size,
3362          * or the M64 segment size if the IOV BAR size is smaller.
3363          */
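        /*
         * Illustrative example (hypothetical sizes): with vfs_expanded = 256
         * and a 1MB per-VF BAR, Shared PE mode yields a 256MB alignment; in
         * Single PE mode an 8MB per-VF BAR on a PHB whose m64_segsize is
         * 256MB is still aligned up to 256MB.
         */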
3364         align = pci_iov_resource_size(pdev, resno);
3365         if (!pdn->vfs_expanded)
3366                 return align;
3367         if (pdn->m64_single_mode)
3368                 return max(align, (resource_size_t)phb->ioda.m64_segsize);
3369
3370         return pdn->vfs_expanded * align;
3371 }
3372 #endif /* CONFIG_PCI_IOV */
3373
3374 /* Prevent enabling devices for which we couldn't properly
3375  * assign a PE
3376  */
3377 static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
3378 {
3379         struct pci_controller *hose = pci_bus_to_host(dev->bus);
3380         struct pnv_phb *phb = hose->private_data;
3381         struct pci_dn *pdn;
3382
3383         /* The function is probably called while the PEs have
3384          * not been created yet, for example during resource
3385          * reassignment at PCI probe time. We just skip the
3386          * check if the PEs aren't ready.
3387          */
3388         if (!phb->initialized)
3389                 return true;
3390
3391         pdn = pci_get_pdn(dev);
3392         if (!pdn || pdn->pe_number == IODA_INVALID_PE)
3393                 return false;
3394
3395         return true;
3396 }
3397
3398 static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
3399                                        int num)
3400 {
3401         struct pnv_ioda_pe *pe = container_of(table_group,
3402                                               struct pnv_ioda_pe, table_group);
3403         struct pnv_phb *phb = pe->phb;
3404         unsigned int idx;
3405         long rc;
3406
3407         pe_info(pe, "Removing DMA window #%d\n", num);
3408         for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
3409                 if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
3410                         continue;
3411
3412                 rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
3413                                                 idx, 0, 0ul, 0ul, 0ul);
3414                 if (rc != OPAL_SUCCESS) {
3415                         pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
3416                                 rc, idx);
3417                         return rc;
3418                 }
3419
3420                 phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
3421         }
3422
3423         pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
3424         return OPAL_SUCCESS;
3425 }
3426
3427 static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
3428 {
3429         unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
3430         struct iommu_table *tbl = pe->table_group.tables[0];
3431         int64_t rc;
3432
3433         if (!weight)
3434                 return;
3435
3436         rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
3437         if (rc != OPAL_SUCCESS)
3438                 return;
3439
3440         pnv_pci_ioda1_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
3441         if (pe->table_group.group) {
3442                 iommu_group_put(pe->table_group.group);
3443                 WARN_ON(pe->table_group.group);
3444         }
3445
3446         free_pages(tbl->it_base, get_order(tbl->it_size << 3));
3447         iommu_free_table(tbl, "pnv");
3448 }
3449
3450 static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
3451 {
3452         struct iommu_table *tbl = pe->table_group.tables[0];
3453         unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
3454 #ifdef CONFIG_IOMMU_API
3455         int64_t rc;
3456 #endif
3457
3458         if (!weight)
3459                 return;
3460
3461 #ifdef CONFIG_IOMMU_API
3462         rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
3463         if (rc)
3464                 pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
3465 #endif
3466
3467         pnv_pci_ioda2_set_bypass(pe, false);
3468         if (pe->table_group.group) {
3469                 iommu_group_put(pe->table_group.group);
3470                 WARN_ON(pe->table_group.group);
3471         }
3472
3473         pnv_pci_ioda2_table_free_pages(tbl);
3474         iommu_free_table(tbl, "pnv");
3475 }
3476
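/*
 * Remap every segment of the given window type that currently belongs to
 * this PE back to the reserved PE and invalidate the corresponding segmap
 * entries.
 */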
3477 static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
3478                                  unsigned short win,
3479                                  unsigned int *map)
3480 {
3481         struct pnv_phb *phb = pe->phb;
3482         int idx;
3483         int64_t rc;
3484
3485         for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
3486                 if (map[idx] != pe->pe_number)
3487                         continue;
3488
3489                 if (win == OPAL_M64_WINDOW_TYPE)
3490                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3491                                         phb->ioda.reserved_pe_idx, win,
3492                                         idx / PNV_IODA1_M64_SEGS,
3493                                         idx % PNV_IODA1_M64_SEGS);
3494                 else
3495                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3496                                         phb->ioda.reserved_pe_idx, win, 0, idx);
3497
3498                 if (rc != OPAL_SUCCESS)
3499                         pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
3500                                 rc, win, idx);
3501
3502                 map[idx] = IODA_INVALID_PE;
3503         }
3504 }
3505
3506 static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
3507 {
3508         struct pnv_phb *phb = pe->phb;
3509
3510         if (phb->type == PNV_PHB_IODA1) {
3511                 pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
3512                                      phb->ioda.io_segmap);
3513                 pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
3514                                      phb->ioda.m32_segmap);
3515                 pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
3516                                      phb->ioda.m64_segmap);
3517         } else if (phb->type == PNV_PHB_IODA2) {
3518                 pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
3519                                      phb->ioda.m32_segmap);
3520         }
3521 }
3522
3523 static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
3524 {
3525         struct pnv_phb *phb = pe->phb;
3526         struct pnv_ioda_pe *slave, *tmp;
3527
3528         /* Release slave PEs in compound PE */
3529         if (pe->flags & PNV_IODA_PE_MASTER) {
3530                 list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
3531                         pnv_ioda_release_pe(slave);
3532         }
3533
3534         list_del(&pe->list);
3535         switch (phb->type) {
3536         case PNV_PHB_IODA1:
3537                 pnv_pci_ioda1_release_pe_dma(pe);
3538                 break;
3539         case PNV_PHB_IODA2:
3540                 pnv_pci_ioda2_release_pe_dma(pe);
3541                 break;
3542         default:
3543                 WARN_ON(1);
3544         }
3545
3546         pnv_ioda_release_pe_seg(pe);
3547         pnv_ioda_deconfigure_pe(pe->phb, pe);
3548         pnv_ioda_free_pe(pe);
3549 }
3550
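/*
 * Called when a PCI device is released. When the last device associated
 * with a PE goes away, the PE itself is released dynamically; VFs are
 * skipped here as they are handled separately.
 */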
3551 static void pnv_pci_release_device(struct pci_dev *pdev)
3552 {
3553         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3554         struct pnv_phb *phb = hose->private_data;
3555         struct pci_dn *pdn = pci_get_pdn(pdev);
3556         struct pnv_ioda_pe *pe;
3557
3558         if (pdev->is_virtfn)
3559                 return;
3560
3561         if (!pdn || pdn->pe_number == IODA_INVALID_PE)
3562                 return;
3563
3564         pe = &phb->ioda.pe_array[pdn->pe_number];
3565         WARN_ON(--pe->device_count < 0);
3566         if (pe->device_count == 0)
3567                 pnv_ioda_release_pe(pe);
3568 }
3569
3570 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
3571 {
3572         struct pnv_phb *phb = hose->private_data;
3573
3574         opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
3575                        OPAL_ASSERT_RESET);
3576 }
3577
3578 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
3579         .dma_dev_setup          = pnv_pci_dma_dev_setup,
3580         .dma_bus_setup          = pnv_pci_dma_bus_setup,
3581 #ifdef CONFIG_PCI_MSI
3582         .setup_msi_irqs         = pnv_setup_msi_irqs,
3583         .teardown_msi_irqs      = pnv_teardown_msi_irqs,
3584 #endif
3585         .enable_device_hook     = pnv_pci_enable_device_hook,
3586         .release_device         = pnv_pci_release_device,
3587         .window_alignment       = pnv_pci_window_alignment,
3588         .setup_bridge           = pnv_pci_setup_bridge,
3589         .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
3590         .dma_set_mask           = pnv_pci_ioda_dma_set_mask,
3591         .dma_get_required_mask  = pnv_pci_ioda_dma_get_required_mask,
3592         .shutdown               = pnv_pci_ioda_shutdown,
3593 };
3594
3595 static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
3596 {
3597         dev_err_once(&npdev->dev,
3598                         "%s operation unsupported for NVLink devices\n",
3599                         __func__);
3600         return -EPERM;
3601 }
3602
3603 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
3604         .dma_dev_setup          = pnv_pci_dma_dev_setup,
3605 #ifdef CONFIG_PCI_MSI
3606         .setup_msi_irqs         = pnv_setup_msi_irqs,
3607         .teardown_msi_irqs      = pnv_teardown_msi_irqs,
3608 #endif
3609         .enable_device_hook     = pnv_pci_enable_device_hook,
3610         .window_alignment       = pnv_pci_window_alignment,
3611         .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
3612         .dma_set_mask           = pnv_npu_dma_set_mask,
3613         .shutdown               = pnv_pci_ioda_shutdown,
3614 };
3615
3616 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
3617                                          u64 hub_id, int ioda_type)
3618 {
3619         struct pci_controller *hose;
3620         struct pnv_phb *phb;
3621         unsigned long size, m64map_off, m32map_off, pemap_off;
3622         unsigned long iomap_off = 0, dma32map_off = 0;
3623         const __be64 *prop64;
3624         const __be32 *prop32;
3625         int len;
3626         unsigned int segno;
3627         u64 phb_id;
3628         void *aux;
3629         long rc;
3630
3631         pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name);
3632
3633         prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
3634         if (!prop64) {
3635                 pr_err("  Missing \"ibm,opal-phbid\" property !\n");
3636                 return;
3637         }
3638         phb_id = be64_to_cpup(prop64);
3639         pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
3640
3641         phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0);
3642
3643         /* Allocate PCI controller */
3644         phb->hose = hose = pcibios_alloc_controller(np);
3645         if (!phb->hose) {
3646                 pr_err("  Can't allocate PCI controller for %s\n",
3647                        np->full_name);
3648                 memblock_free(__pa(phb), sizeof(struct pnv_phb));
3649                 return;
3650         }
3651
3652         spin_lock_init(&phb->lock);
3653         prop32 = of_get_property(np, "bus-range", &len);
3654         if (prop32 && len == 8) {
3655                 hose->first_busno = be32_to_cpu(prop32[0]);
3656                 hose->last_busno = be32_to_cpu(prop32[1]);
3657         } else {
3658                 pr_warn("  Broken <bus-range> on %s\n", np->full_name);
3659                 hose->first_busno = 0;
3660                 hose->last_busno = 0xff;
3661         }
3662         hose->private_data = phb;
3663         phb->hub_id = hub_id;
3664         phb->opal_id = phb_id;
3665         phb->type = ioda_type;
3666         mutex_init(&phb->ioda.pe_alloc_mutex);
3667
3668         /* Detect specific models for error handling */
3669         if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
3670                 phb->model = PNV_PHB_MODEL_P7IOC;
3671         else if (of_device_is_compatible(np, "ibm,power8-pciex"))
3672                 phb->model = PNV_PHB_MODEL_PHB3;
3673         else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
3674                 phb->model = PNV_PHB_MODEL_NPU;
3675         else
3676                 phb->model = PNV_PHB_MODEL_UNKNOWN;
3677
3678         /* Parse 32-bit and IO ranges (if any) */
3679         pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
3680
3681         /* Get registers */
3682         phb->regs = of_iomap(np, 0);
3683         if (phb->regs == NULL)
3684                 pr_err("  Failed to map registers !\n");
3685
3686         /* Initialize TCE kill register */
3687         pnv_pci_ioda_setup_opal_tce_kill(phb);
3688
3689         /* Initialize more IODA stuff */
3690         phb->ioda.total_pe_num = 1;
3691         prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
3692         if (prop32)
3693                 phb->ioda.total_pe_num = be32_to_cpup(prop32);
3694         prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
3695         if (prop32)
3696                 phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
3697
3698         /* Invalidate RID to PE# mapping */
3699         for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
3700                 phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;
3701
3702         /* Parse 64-bit MMIO range */
3703         pnv_ioda_parse_m64_window(phb);
3704
3705         phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
3706         /* FW has already carved off the top 64k of M32 space (MSI space) */
3707         phb->ioda.m32_size += 0x10000;
3708
3709         phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
3710         phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
3711         phb->ioda.io_size = hose->pci_io_size;
3712         phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
3713         phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
3714
3715         /* Calculate how many 32-bit TCE segments we have */
3716         phb->ioda.dma32_count = phb->ioda.m32_pci_base /
3717                                 PNV_IODA1_DMA32_SEGSIZE;
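        /* For example (hypothetical size): a 2GB m32_pci_base with the 256MB
         * PNV_IODA1_DMA32_SEGSIZE yields 8 DMA32 segments. */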
3718
3719         /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
3720         size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
3721                         sizeof(unsigned long));
3722         m64map_off = size;
3723         size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
3724         m32map_off = size;
3725         size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
3726         if (phb->type == PNV_PHB_IODA1) {
3727                 iomap_off = size;
3728                 size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
3729                 dma32map_off = size;
3730                 size += phb->ioda.dma32_count *
3731                         sizeof(phb->ioda.dma32_segmap[0]);
3732         }
3733         pemap_off = size;
3734         size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
3735         aux = memblock_virt_alloc(size, 0);
3736         phb->ioda.pe_alloc = aux;
3737         phb->ioda.m64_segmap = aux + m64map_off;
3738         phb->ioda.m32_segmap = aux + m32map_off;
3739         for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
3740                 phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
3741                 phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
3742         }
3743         if (phb->type == PNV_PHB_IODA1) {
3744                 phb->ioda.io_segmap = aux + iomap_off;
3745                 for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
3746                         phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
3747
3748                 phb->ioda.dma32_segmap = aux + dma32map_off;
3749                 for (segno = 0; segno < phb->ioda.dma32_count; segno++)
3750                         phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
3751         }
3752         phb->ioda.pe_array = aux + pemap_off;
3753
3754         /*
3755          * Choose a PE number for the root bus, which shouldn't have
3756          * M64 resources consumed by its child devices. Try to pick
3757          * the PE number adjacent to the reserved one if possible.
3758          */
3759         pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
3760         if (phb->ioda.reserved_pe_idx == 0) {
3761                 phb->ioda.root_pe_idx = 1;
3762                 pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3763         } else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
3764                 phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
3765                 pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3766         } else {
3767                 phb->ioda.root_pe_idx = IODA_INVALID_PE;
3768         }
3769
3770         INIT_LIST_HEAD(&phb->ioda.pe_list);
3771         mutex_init(&phb->ioda.pe_list_mutex);
3772
3777 #if 0 /* We should really do that ... */
3778         rc = opal_pci_set_phb_mem_window(opal->phb_id,
3779                                          window_type,
3780                                          window_num,
3781                                          starting_real_address,
3782                                          starting_pci_address,
3783                                          segment_size);
3784 #endif
3785
3786         pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
3787                 phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
3788                 phb->ioda.m32_size, phb->ioda.m32_segsize);
3789         if (phb->ioda.m64_size)
3790                 pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
3791                         phb->ioda.m64_size, phb->ioda.m64_segsize);
3792         if (phb->ioda.io_size)
3793                 pr_info("                  IO: 0x%x [segment=0x%x]\n",
3794                         phb->ioda.io_size, phb->ioda.io_segsize);
3795
3796
3797         phb->hose->ops = &pnv_pci_ops;
3798         phb->get_pe_state = pnv_ioda_get_pe_state;
3799         phb->freeze_pe = pnv_ioda_freeze_pe;
3800         phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
3801
3802         /* Setup MSI support */
3803         pnv_pci_init_ioda_msis(phb);
3804
3805         /*
3806          * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
3807          * to let the PCI core do resource assignment. The PCI core
3808          * is expected to apply the correct I/O and MMIO alignment
3809          * for the P2P bridge BARs so that each PCI bus (excluding
3810          * the child P2P bridges) can form an individual PE.
3811          */
3812         ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
3813
3814         if (phb->type == PNV_PHB_NPU) {
3815                 hose->controller_ops = pnv_npu_ioda_controller_ops;
3816         } else {
3817                 phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
3818                 hose->controller_ops = pnv_pci_ioda_controller_ops;
3819         }
3820
3821 #ifdef CONFIG_PCI_IOV
3822         ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
3823         ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
3824 #endif
3825
3826         pci_add_flags(PCI_REASSIGN_ALL_RSRC);
3827
3828         /* Reset IODA tables to a clean state */
3829         rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
3830         if (rc)
3831                 pr_warning("  OPAL Error %ld performing IODA table reset !\n", rc);
3832
3833         /* If we're running in a kdump kernel, the previous kernel never
3834          * shut down PCI devices correctly. We already have the IODA table
3835          * cleaned out, so we have to issue a PHB reset to stop all PCI
3836          * transactions from the previous kernel.
3837          */
3838         if (is_kdump_kernel()) {
3839                 pr_info("  Issue PHB reset ...\n");
3840                 pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
3841                 pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
3842         }
3843
3844         /* Remove M64 resource if we can't configure it successfully */
3845         if (!phb->init_m64 || phb->init_m64(phb))
3846                 hose->mem_resources[1].flags = 0;
3847 }
3848
3849 void __init pnv_pci_init_ioda2_phb(struct device_node *np)
3850 {
3851         pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
3852 }
3853
3854 void __init pnv_pci_init_npu_phb(struct device_node *np)
3855 {
3856         pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU);
3857 }
3858
3859 void __init pnv_pci_init_ioda_hub(struct device_node *np)
3860 {
3861         struct device_node *phbn;
3862         const __be64 *prop64;
3863         u64 hub_id;
3864
3865         pr_info("Probing IODA IO-Hub %s\n", np->full_name);
3866
3867         prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
3868         if (!prop64) {
3869                 pr_err(" Missing \"ibm,opal-hubid\" property !\n");
3870                 return;
3871         }
3872         hub_id = be64_to_cpup(prop64);
3873         pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
3874
3875         /* Count child PHBs */
3876         for_each_child_of_node(np, phbn) {
3877                 /* Look for IODA1 PHBs */
3878                 if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
3879                         pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
3880         }
3881 }