net/mlx4_core: Enable device recovery flow with SRIOV
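
For orientation before the diff: the core idea of this change is to split driver state into a per-load part (wiped on every unload/reset) and a persistent part that must survive device recovery. The PCI device pointer, SR-IOV VF counts, saved port types, and the device/interface state move into struct mlx4_dev_persistent, reached through dev->persist and pci_get_drvdata(). Below is a minimal standalone sketch of that split in plain userspace C; dev_persistent, dev_volatile and clean_dev are hypothetical stand-ins and are not code from this patch (compare mlx4_clean_dev() in the diff).

	#include <stdlib.h>
	#include <string.h>

	struct dev_persistent {            /* survives unload/reset (cf. mlx4_dev_persistent) */
		void *pdev;                /* stands in for struct pci_dev *                  */
		int   num_vfs;
		int   interface_state;
	};

	struct dev_volatile {              /* rebuilt on every load (cf. mlx4_priv/mlx4_dev)  */
		struct dev_persistent *persist;
		unsigned long flags;
		int num_uars;              /* example of a per-load capability                */
	};

	/* Sketch of the mlx4_clean_dev() pattern: wipe the per-load state but
	 * keep the persistent pointer and a masked subset of flags across reset.
	 */
	static void clean_dev(struct dev_volatile *dev, unsigned long persist_mask)
	{
		struct dev_persistent *persist = dev->persist;
		unsigned long flags = dev->flags & persist_mask;

		memset(dev, 0, sizeof(*dev));
		dev->persist = persist;
		dev->flags = flags;
	}

	int main(void)
	{
		struct dev_persistent *persist = calloc(1, sizeof(*persist));
		struct dev_volatile dev = { .persist = persist, .flags = 0x3, .num_uars = 128 };

		/* keep only the SRIOV-like flag, as RESET_PERSIST_MASK_FLAGS does */
		clean_dev(&dev, 0x1);
		free(persist);
		return (dev.persist == persist && dev.flags == 0x1) ? 0 : 1;
	}
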
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 943cbd4..1baf1f1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -108,6 +108,8 @@ MODULE_PARM_DESC(enable_64b_cqe_eqe,
                                         MLX4_FUNC_CAP_EQE_CQE_STRIDE | \
                                         MLX4_FUNC_CAP_DMFS_A0_STATIC)
 
+#define RESET_PERSIST_MASK_FLAGS       (MLX4_FLAG_SRIOV)
+
 static char mlx4_version[] =
        DRV_NAME ": Mellanox ConnectX core driver v"
        DRV_VERSION " (" DRV_RELDATE ")\n";
@@ -318,10 +320,11 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
                return -ENODEV;
        }
 
-       if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) {
+       if (dev_cap->uar_size > pci_resource_len(dev->persist->pdev, 2)) {
                mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n",
                         dev_cap->uar_size,
-                        (unsigned long long) pci_resource_len(dev->pdev, 2));
+                        (unsigned long long)
+                        pci_resource_len(dev->persist->pdev, 2));
                return -ENODEV;
        }
 
@@ -541,8 +544,10 @@ static int mlx4_get_pcie_dev_link_caps(struct mlx4_dev *dev,
        *speed = PCI_SPEED_UNKNOWN;
        *width = PCIE_LNK_WIDTH_UNKNOWN;
 
-       err1 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP, &lnkcap1);
-       err2 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP2, &lnkcap2);
+       err1 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP,
+                                         &lnkcap1);
+       err2 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP2,
+                                         &lnkcap2);
        if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */
                if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB)
                        *speed = PCIE_SPEED_8_0GT;
@@ -587,7 +592,7 @@ static void mlx4_check_pcie_caps(struct mlx4_dev *dev)
                return;
        }
 
-       err = pcie_get_minimum_link(dev->pdev, &speed, &width);
+       err = pcie_get_minimum_link(dev->persist->pdev, &speed, &width);
        if (err || speed == PCI_SPEED_UNKNOWN ||
            width == PCIE_LNK_WIDTH_UNKNOWN) {
                mlx4_warn(dev,
@@ -837,10 +842,12 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 
        if (dev->caps.uar_page_size * (dev->caps.num_uars -
                                       dev->caps.reserved_uars) >
-                                      pci_resource_len(dev->pdev, 2)) {
+                                      pci_resource_len(dev->persist->pdev,
+                                                       2)) {
                mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n",
                         dev->caps.uar_page_size * dev->caps.num_uars,
-                        (unsigned long long) pci_resource_len(dev->pdev, 2));
+                        (unsigned long long)
+                        pci_resource_len(dev->persist->pdev, 2));
                goto err_mem;
        }
 
@@ -1492,9 +1499,9 @@ static int map_bf_area(struct mlx4_dev *dev)
        if (!dev->caps.bf_reg_size)
                return -ENXIO;
 
-       bf_start = pci_resource_start(dev->pdev, 2) +
+       bf_start = pci_resource_start(dev->persist->pdev, 2) +
                        (dev->caps.num_uars << PAGE_SHIFT);
-       bf_len = pci_resource_len(dev->pdev, 2) -
+       bf_len = pci_resource_len(dev->persist->pdev, 2) -
                        (dev->caps.num_uars << PAGE_SHIFT);
        priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len);
        if (!priv->bf_mapping)
@@ -1536,7 +1543,8 @@ static int map_internal_clock(struct mlx4_dev *dev)
        struct mlx4_priv *priv = mlx4_priv(dev);
 
        priv->clock_mapping =
-               ioremap(pci_resource_start(dev->pdev, priv->fw.clock_bar) +
+               ioremap(pci_resource_start(dev->persist->pdev,
+                                          priv->fw.clock_bar) +
                        priv->fw.clock_offset, MLX4_CLOCK_SIZE);
 
        if (!priv->clock_mapping)
@@ -1573,6 +1581,50 @@ static void mlx4_close_fw(struct mlx4_dev *dev)
        }
 }
 
+static int mlx4_comm_check_offline(struct mlx4_dev *dev)
+{
+#define COMM_CHAN_OFFLINE_OFFSET 0x09
+
+       u32 comm_flags;
+       u32 offline_bit;
+       unsigned long end;
+       struct mlx4_priv *priv = mlx4_priv(dev);
+
+       end = msecs_to_jiffies(MLX4_COMM_OFFLINE_TIME_OUT) + jiffies;
+       while (time_before(jiffies, end)) {
+               comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
+                                         MLX4_COMM_CHAN_FLAGS));
+               offline_bit = (comm_flags &
+                              (u32)(1 << COMM_CHAN_OFFLINE_OFFSET));
+               if (!offline_bit)
+                       return 0;
+               /* In some cases, as part of the AER/Reset flow, the PF needs
+                * around 100 msec to load. Sleep for 100 msec here so that
+                * other tasks can make use of this CPU during that time
+                * interval.
+                */
+               msleep(100);
+       }
+       mlx4_err(dev, "Communication channel is offline.\n");
+       return -EIO;
+}
+
+static void mlx4_reset_vf_support(struct mlx4_dev *dev)
+{
+#define COMM_CHAN_RST_OFFSET 0x1e
+
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       u32 comm_rst;
+       u32 comm_caps;
+
+       comm_caps = swab32(readl((__iomem char *)priv->mfunc.comm +
+                                MLX4_COMM_CHAN_CAPS));
+       comm_rst = (comm_caps & (u32)(1 << COMM_CHAN_RST_OFFSET));
+
+       if (comm_rst)
+               dev->caps.vf_caps |= MLX4_VF_CAP_FLAG_RESET;
+}
+
 static int mlx4_init_slave(struct mlx4_dev *dev)
 {
        struct mlx4_priv *priv = mlx4_priv(dev);
@@ -1588,6 +1640,12 @@ static int mlx4_init_slave(struct mlx4_dev *dev)
 
        mutex_lock(&priv->cmd.slave_cmd_mutex);
        priv->cmd.max_cmds = 1;
+       if (mlx4_comm_check_offline(dev)) {
+               mlx4_err(dev, "PF is not responsive, skipping initialization\n");
+               goto err_offline;
+       }
+
+       mlx4_reset_vf_support(dev);
        mlx4_warn(dev, "Sending reset\n");
        ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0,
                                       MLX4_COMM_TIME);
@@ -1631,6 +1689,7 @@ static int mlx4_init_slave(struct mlx4_dev *dev)
 
 err:
        mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0);
+err_offline:
        mutex_unlock(&priv->cmd.slave_cmd_mutex);
        return -EIO;
 }
@@ -1705,7 +1764,8 @@ static void choose_steering_mode(struct mlx4_dev *dev,
        if (mlx4_log_num_mgm_entry_size <= 0 &&
            dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN &&
            (!mlx4_is_mfunc(dev) ||
-            (dev_cap->fs_max_num_qp_per_entry >= (dev->num_vfs + 1))) &&
+            (dev_cap->fs_max_num_qp_per_entry >=
+            (dev->persist->num_vfs + 1))) &&
            choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >=
                MLX4_MIN_MGM_LOG_ENTRY_SIZE) {
                dev->oper_log_mgm_entry_size =
@@ -1829,7 +1889,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
                err = mlx4_dev_cap(dev, &dev_cap);
                if (err) {
                        mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n");
-                       goto err_stop_fw;
+                       return err;
                }
 
                choose_steering_mode(dev, &dev_cap);
@@ -1860,7 +1920,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
                                             &init_hca);
                if ((long long) icm_size < 0) {
                        err = icm_size;
-                       goto err_stop_fw;
+                       return err;
                }
 
                dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
@@ -1874,7 +1934,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 
                err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size);
                if (err)
-                       goto err_stop_fw;
+                       return err;
 
                err = mlx4_INIT_HCA(dev, &init_hca);
                if (err) {
@@ -1886,7 +1946,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
                        err = mlx4_query_func(dev, &dev_cap);
                        if (err < 0) {
                                mlx4_err(dev, "QUERY_FUNC command failed, aborting.\n");
-                               goto err_stop_fw;
+                               goto err_close;
                        } else if (err & MLX4_QUERY_FUNC_NUM_SYS_EQS) {
                                dev->caps.num_eqs = dev_cap.max_eqs;
                                dev->caps.reserved_eqs = dev_cap.reserved_eqs;
@@ -2006,11 +2066,6 @@ err_free_icm:
        if (!mlx4_is_slave(dev))
                mlx4_free_icms(dev);
 
-err_stop_fw:
-       if (!mlx4_is_slave(dev)) {
-               mlx4_UNMAP_FA(dev);
-               mlx4_free_icm(dev, priv->fw.fw_icm, 0);
-       }
        return err;
 }
 
@@ -2293,7 +2348,8 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
                for (i = 0; i < nreq; ++i)
                        entries[i].entry = i;
 
-               nreq = pci_enable_msix_range(dev->pdev, entries, 2, nreq);
+               nreq = pci_enable_msix_range(dev->persist->pdev, entries, 2,
+                                            nreq);
 
                if (nreq < 0) {
                        kfree(entries);
@@ -2321,7 +2377,7 @@ no_msi:
        dev->caps.comp_pool        = 0;
 
        for (i = 0; i < 2; ++i)
-               priv->eq_table.eq[i].irq = dev->pdev->irq;
+               priv->eq_table.eq[i].irq = dev->persist->pdev->irq;
 }
 
 static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
@@ -2349,7 +2405,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
        info->port_attr.show      = show_port_type;
        sysfs_attr_init(&info->port_attr.attr);
 
-       err = device_create_file(&dev->pdev->dev, &info->port_attr);
+       err = device_create_file(&dev->persist->pdev->dev, &info->port_attr);
        if (err) {
                mlx4_err(dev, "Failed to create file for port %d\n", port);
                info->port = -1;
@@ -2366,10 +2422,12 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
        info->port_mtu_attr.show      = show_port_ib_mtu;
        sysfs_attr_init(&info->port_mtu_attr.attr);
 
-       err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr);
+       err = device_create_file(&dev->persist->pdev->dev,
+                                &info->port_mtu_attr);
        if (err) {
                mlx4_err(dev, "Failed to create mtu file for port %d\n", port);
-               device_remove_file(&info->dev->pdev->dev, &info->port_attr);
+               device_remove_file(&info->dev->persist->pdev->dev,
+                                  &info->port_attr);
                info->port = -1;
        }
 
@@ -2381,8 +2439,9 @@ static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
        if (info->port < 0)
                return;
 
-       device_remove_file(&info->dev->pdev->dev, &info->port_attr);
-       device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr);
+       device_remove_file(&info->dev->persist->pdev->dev, &info->port_attr);
+       device_remove_file(&info->dev->persist->pdev->dev,
+                          &info->port_mtu_attr);
 }
 
 static int mlx4_init_steering(struct mlx4_dev *dev)
@@ -2449,10 +2508,11 @@ static int mlx4_get_ownership(struct mlx4_dev *dev)
        void __iomem *owner;
        u32 ret;
 
-       if (pci_channel_offline(dev->pdev))
+       if (pci_channel_offline(dev->persist->pdev))
                return -EIO;
 
-       owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
+       owner = ioremap(pci_resource_start(dev->persist->pdev, 0) +
+                       MLX4_OWNER_BASE,
                        MLX4_OWNER_SIZE);
        if (!owner) {
                mlx4_err(dev, "Failed to obtain ownership bit\n");
@@ -2468,10 +2528,11 @@ static void mlx4_free_ownership(struct mlx4_dev *dev)
 {
        void __iomem *owner;
 
-       if (pci_channel_offline(dev->pdev))
+       if (pci_channel_offline(dev->persist->pdev))
                return;
 
-       owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
+       owner = ioremap(pci_resource_start(dev->persist->pdev, 0) +
+                       MLX4_OWNER_BASE,
                        MLX4_OWNER_SIZE);
        if (!owner) {
                mlx4_err(dev, "Failed to obtain ownership bit\n");
@@ -2486,11 +2547,19 @@ static void mlx4_free_ownership(struct mlx4_dev *dev)
                                  !!((flags) & MLX4_FLAG_MASTER))
 
 static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev,
-                            u8 total_vfs, int existing_vfs)
+                            u8 total_vfs, int existing_vfs, int reset_flow)
 {
        u64 dev_flags = dev->flags;
        int err = 0;
 
+       if (reset_flow) {
+               dev->dev_vfs = kcalloc(total_vfs, sizeof(*dev->dev_vfs),
+                                      GFP_KERNEL);
+               if (!dev->dev_vfs)
+                       goto free_mem;
+               return dev_flags;
+       }
+
        atomic_inc(&pf_loading);
        if (dev->flags &  MLX4_FLAG_SRIOV) {
                if (existing_vfs != total_vfs) {
@@ -2519,13 +2588,14 @@ static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev,
                dev_flags |= MLX4_FLAG_SRIOV |
                        MLX4_FLAG_MASTER;
                dev_flags &= ~MLX4_FLAG_SLAVE;
-               dev->num_vfs = total_vfs;
+               dev->persist->num_vfs = total_vfs;
        }
        return dev_flags;
 
 disable_sriov:
        atomic_dec(&pf_loading);
-       dev->num_vfs = 0;
+free_mem:
+       dev->persist->num_vfs = 0;
        kfree(dev->dev_vfs);
        return dev_flags & ~MLX4_FLAG_MASTER;
 }
@@ -2549,7 +2619,8 @@ static int mlx4_check_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap
 }
 
 static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
-                        int total_vfs, int *nvfs, struct mlx4_priv *priv)
+                        int total_vfs, int *nvfs, struct mlx4_priv *priv,
+                        int reset_flow)
 {
        struct mlx4_dev *dev;
        unsigned sum = 0;
@@ -2612,10 +2683,15 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
                        existing_vfs = pci_num_vf(pdev);
                        if (existing_vfs)
                                dev->flags |= MLX4_FLAG_SRIOV;
-                       dev->num_vfs = total_vfs;
+                       dev->persist->num_vfs = total_vfs;
                }
        }
 
+       /* On load, remove any previous indication of internal error;
+        * the device is up.
+        */
+       dev->persist->state = MLX4_DEVICE_STATE_UP;
+
 slave_start:
        err = mlx4_cmd_init(dev);
        if (err) {
@@ -2666,8 +2742,10 @@ slave_start:
                                goto err_fw;
 
                        if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) {
-                               u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs,
-                                                                 existing_vfs);
+                               u64 dev_flags = mlx4_enable_sriov(dev, pdev,
+                                                                 total_vfs,
+                                                                 existing_vfs,
+                                                                 reset_flow);
 
                                mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL);
                                dev->flags = dev_flags;
@@ -2709,7 +2787,7 @@ slave_start:
                        if (dev->flags & MLX4_FLAG_SRIOV) {
                                if (!existing_vfs)
                                        pci_disable_sriov(pdev);
-                               if (mlx4_is_master(dev))
+                               if (mlx4_is_master(dev) && !reset_flow)
                                        atomic_dec(&pf_loading);
                                dev->flags &= ~MLX4_FLAG_SRIOV;
                        }
@@ -2723,7 +2801,8 @@ slave_start:
        }
 
        if (mlx4_is_master(dev) && (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) {
-               u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, existing_vfs);
+               u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs,
+                                                 existing_vfs, reset_flow);
 
                if ((dev->flags ^ dev_flags) & (MLX4_FLAG_MASTER | MLX4_FLAG_SLAVE)) {
                        mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_VHCR);
@@ -2776,12 +2855,14 @@ slave_start:
                                 dev->caps.num_ports);
                        goto err_close;
                }
-               memcpy(dev->nvfs, nvfs, sizeof(dev->nvfs));
+               memcpy(dev->persist->nvfs, nvfs, sizeof(dev->persist->nvfs));
 
-               for (i = 0; i < sizeof(dev->nvfs)/sizeof(dev->nvfs[0]); i++) {
+               for (i = 0;
+                    i < sizeof(dev->persist->nvfs)/
+                    sizeof(dev->persist->nvfs[0]); i++) {
                        unsigned j;
 
-                       for (j = 0; j < dev->nvfs[i]; ++sum, ++j) {
+                       for (j = 0; j < dev->persist->nvfs[i]; ++sum, ++j) {
                                dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1;
                                dev->dev_vfs[sum].n_ports = i < 2 ? 1 :
                                        dev->caps.num_ports;
@@ -2833,6 +2914,17 @@ slave_start:
                goto err_steer;
 
        mlx4_init_quotas(dev);
+       /* When the PF's resources are ready, arm its comm channel to
+        * enable receiving commands.
+        */
+       if (mlx4_is_master(dev)) {
+               err = mlx4_ARM_COMM_CHANNEL(dev);
+               if (err) {
+                       mlx4_err(dev, "Failed to arm comm channel eq: %x\n",
+                                err);
+                       goto err_steer;
+               }
+       }
 
        for (port = 1; port <= dev->caps.num_ports; port++) {
                err = mlx4_init_port_info(dev, port);
@@ -2851,7 +2943,7 @@ slave_start:
 
        priv->removed = 0;
 
-       if (mlx4_is_master(dev) && dev->num_vfs)
+       if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow)
                atomic_dec(&pf_loading);
 
        kfree(dev_cap);
@@ -2910,10 +3002,12 @@ err_cmd:
        mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL);
 
 err_sriov:
-       if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs)
+       if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) {
                pci_disable_sriov(pdev);
+               dev->flags &= ~MLX4_FLAG_SRIOV;
+       }
 
-       if (mlx4_is_master(dev) && dev->num_vfs)
+       if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow)
                atomic_dec(&pf_loading);
 
        kfree(priv->dev.dev_vfs);
@@ -3054,11 +3148,19 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data,
                }
        }
 
-       err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv);
+       err = mlx4_catas_init(&priv->dev);
        if (err)
                goto err_release_regions;
+
+       err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 0);
+       if (err)
+               goto err_catas;
+
        return 0;
 
+err_catas:
+       mlx4_catas_end(&priv->dev);
+
 err_release_regions:
        pci_release_regions(pdev);
 
@@ -3081,38 +3183,60 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
                return -ENOMEM;
 
        dev       = &priv->dev;
-       dev->pdev = pdev;
-       pci_set_drvdata(pdev, dev);
+       dev->persist = kzalloc(sizeof(*dev->persist), GFP_KERNEL);
+       if (!dev->persist) {
+               kfree(priv);
+               return -ENOMEM;
+       }
+       dev->persist->pdev = pdev;
+       dev->persist->dev = dev;
+       pci_set_drvdata(pdev, dev->persist);
        priv->pci_dev_data = id->driver_data;
+       mutex_init(&dev->persist->device_state_mutex);
+       mutex_init(&dev->persist->interface_state_mutex);
 
        ret =  __mlx4_init_one(pdev, id->driver_data, priv);
-       if (ret)
+       if (ret) {
+               kfree(dev->persist);
                kfree(priv);
+       } else {
+               pci_save_state(pdev);
+       }
 
        return ret;
 }
 
+static void mlx4_clean_dev(struct mlx4_dev *dev)
+{
+       struct mlx4_dev_persistent *persist = dev->persist;
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       unsigned long   flags = (dev->flags & RESET_PERSIST_MASK_FLAGS);
+
+       memset(priv, 0, sizeof(*priv));
+       priv->dev.persist = persist;
+       priv->dev.flags = flags;
+}
+
 static void mlx4_unload_one(struct pci_dev *pdev)
 {
-       struct mlx4_dev  *dev  = pci_get_drvdata(pdev);
+       struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+       struct mlx4_dev  *dev  = persist->dev;
        struct mlx4_priv *priv = mlx4_priv(dev);
        int               pci_dev_data;
-       int p;
-       int active_vfs = 0;
+       int p, i;
 
        if (priv->removed)
                return;
 
+       /* save the current port types for later restore */
+       for (i = 0; i < dev->caps.num_ports; i++) {
+               dev->persist->curr_port_type[i] = dev->caps.port_type[i + 1];
+               dev->persist->curr_port_poss_type[i] = dev->caps.
+                                                      possible_type[i + 1];
+       }
+
        pci_dev_data = priv->pci_dev_data;
 
-       /* Disabling SR-IOV is not allowed while there are active vf's */
-       if (mlx4_is_master(dev)) {
-               active_vfs = mlx4_how_many_lives_vf(dev);
-               if (active_vfs) {
-                       pr_warn("Removing PF when there are active VF's !!\n");
-                       pr_warn("Will not disable SR-IOV.\n");
-               }
-       }
        mlx4_stop_sense(dev);
        mlx4_unregister_device(dev);
 
@@ -3156,12 +3280,6 @@ static void mlx4_unload_one(struct pci_dev *pdev)
 
        if (dev->flags & MLX4_FLAG_MSI_X)
                pci_disable_msix(pdev);
-       if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) {
-               mlx4_warn(dev, "Disabling SR-IOV\n");
-               pci_disable_sriov(pdev);
-               dev->flags &= ~MLX4_FLAG_SRIOV;
-               dev->num_vfs = 0;
-       }
 
        if (!mlx4_is_slave(dev))
                mlx4_free_ownership(dev);
@@ -3173,42 +3291,96 @@ static void mlx4_unload_one(struct pci_dev *pdev)
        kfree(dev->caps.qp1_proxy);
        kfree(dev->dev_vfs);
 
-       memset(priv, 0, sizeof(*priv));
+       mlx4_clean_dev(dev);
        priv->pci_dev_data = pci_dev_data;
        priv->removed = 1;
 }
 
 static void mlx4_remove_one(struct pci_dev *pdev)
 {
-       struct mlx4_dev  *dev  = pci_get_drvdata(pdev);
+       struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+       struct mlx4_dev  *dev  = persist->dev;
        struct mlx4_priv *priv = mlx4_priv(dev);
+       int active_vfs = 0;
+
+       mutex_lock(&persist->interface_state_mutex);
+       persist->interface_state |= MLX4_INTERFACE_STATE_DELETION;
+       mutex_unlock(&persist->interface_state_mutex);
+
+       /* Disabling SR-IOV is not allowed while there are active vf's */
+       if (mlx4_is_master(dev) && dev->flags & MLX4_FLAG_SRIOV) {
+               active_vfs = mlx4_how_many_lives_vf(dev);
+               if (active_vfs) {
+                       pr_warn("Removing PF when there are active VF's !!\n");
+                       pr_warn("Will not disable SR-IOV.\n");
+               }
+       }
+
+       /* The device is marked as under deletion; continue now without
+        * the lock, letting other tasks terminate.
+        */
+       if (persist->interface_state & MLX4_INTERFACE_STATE_UP)
+               mlx4_unload_one(pdev);
+       else
+               mlx4_info(dev, "%s: interface is down\n", __func__);
+       mlx4_catas_end(dev);
+       if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) {
+               mlx4_warn(dev, "Disabling SR-IOV\n");
+               pci_disable_sriov(pdev);
+       }
 
-       mlx4_unload_one(pdev);
        pci_release_regions(pdev);
        pci_disable_device(pdev);
+       kfree(dev->persist);
        kfree(priv);
        pci_set_drvdata(pdev, NULL);
 }
 
+static int restore_current_port_types(struct mlx4_dev *dev,
+                                     enum mlx4_port_type *types,
+                                     enum mlx4_port_type *poss_types)
+{
+       struct mlx4_priv *priv = mlx4_priv(dev);
+       int err, i;
+
+       mlx4_stop_sense(dev);
+
+       mutex_lock(&priv->port_mutex);
+       for (i = 0; i < dev->caps.num_ports; i++)
+               dev->caps.possible_type[i + 1] = poss_types[i];
+       err = mlx4_change_port_types(dev, types);
+       mlx4_start_sense(dev);
+       mutex_unlock(&priv->port_mutex);
+
+       return err;
+}
+
 int mlx4_restart_one(struct pci_dev *pdev)
 {
-       struct mlx4_dev  *dev  = pci_get_drvdata(pdev);
+       struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+       struct mlx4_dev  *dev  = persist->dev;
        struct mlx4_priv *priv = mlx4_priv(dev);
        int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0};
        int pci_dev_data, err, total_vfs;
 
        pci_dev_data = priv->pci_dev_data;
-       total_vfs = dev->num_vfs;
-       memcpy(nvfs, dev->nvfs, sizeof(dev->nvfs));
+       total_vfs = dev->persist->num_vfs;
+       memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs));
 
        mlx4_unload_one(pdev);
-       err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv);
+       err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 1);
        if (err) {
                mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n",
                         __func__, pci_name(pdev), err);
                return err;
        }
 
+       err = restore_current_port_types(dev, dev->persist->curr_port_type,
+                                        dev->persist->curr_port_poss_type);
+       if (err)
+               mlx4_err(dev, "could not restore original port types (%d)\n",
+                        err);
+
        return err;
 }
 
@@ -3263,23 +3435,79 @@ MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
 static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev,
                                              pci_channel_state_t state)
 {
-       mlx4_unload_one(pdev);
+       struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+
+       mlx4_err(persist->dev, "mlx4_pci_err_detected was called\n");
+       mlx4_enter_error_state(persist);
 
-       return state == pci_channel_io_perm_failure ?
-               PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+       mutex_lock(&persist->interface_state_mutex);
+       if (persist->interface_state & MLX4_INTERFACE_STATE_UP)
+               mlx4_unload_one(pdev);
+
+       mutex_unlock(&persist->interface_state_mutex);
+       if (state == pci_channel_io_perm_failure)
+               return PCI_ERS_RESULT_DISCONNECT;
+
+       pci_disable_device(pdev);
+       return PCI_ERS_RESULT_NEED_RESET;
 }
 
 static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
 {
-       struct mlx4_dev  *dev  = pci_get_drvdata(pdev);
+       struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+       struct mlx4_dev  *dev  = persist->dev;
        struct mlx4_priv *priv = mlx4_priv(dev);
        int               ret;
+       int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+       int total_vfs;
+
+       mlx4_err(dev, "mlx4_pci_slot_reset was called\n");
+       ret = pci_enable_device(pdev);
+       if (ret) {
+               mlx4_err(dev, "Can not re-enable device, ret=%d\n", ret);
+               return PCI_ERS_RESULT_DISCONNECT;
+       }
+
+       pci_set_master(pdev);
+       pci_restore_state(pdev);
+       pci_save_state(pdev);
+
+       total_vfs = dev->persist->num_vfs;
+       memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs));
+
+       mutex_lock(&persist->interface_state_mutex);
+       if (!(persist->interface_state & MLX4_INTERFACE_STATE_UP)) {
+               ret = mlx4_load_one(pdev, priv->pci_dev_data, total_vfs, nvfs,
+                                   priv, 1);
+               if (ret) {
+                       mlx4_err(dev, "%s: mlx4_load_one failed, ret=%d\n",
+                                __func__,  ret);
+                       goto end;
+               }
 
-       ret = __mlx4_init_one(pdev, priv->pci_dev_data, priv);
+               ret = restore_current_port_types(dev, dev->persist->
+                                                curr_port_type, dev->persist->
+                                                curr_port_poss_type);
+               if (ret)
+                       mlx4_err(dev, "could not restore original port types (%d)\n", ret);
+       }
+end:
+       mutex_unlock(&persist->interface_state_mutex);
 
        return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
 }
 
+static void mlx4_shutdown(struct pci_dev *pdev)
+{
+       struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+
+       mlx4_info(persist->dev, "mlx4_shutdown was called\n");
+       mutex_lock(&persist->interface_state_mutex);
+       if (persist->interface_state & MLX4_INTERFACE_STATE_UP)
+               mlx4_unload_one(pdev);
+       mutex_unlock(&persist->interface_state_mutex);
+}
+
 static const struct pci_error_handlers mlx4_err_handler = {
        .error_detected = mlx4_pci_err_detected,
        .slot_reset     = mlx4_pci_slot_reset,
@@ -3289,7 +3517,7 @@ static struct pci_driver mlx4_driver = {
        .name           = DRV_NAME,
        .id_table       = mlx4_pci_table,
        .probe          = mlx4_init_one,
-       .shutdown       = mlx4_unload_one,
+       .shutdown       = mlx4_shutdown,
        .remove         = mlx4_remove_one,
        .err_handler    = &mlx4_err_handler,
 };
@@ -3341,7 +3569,6 @@ static int __init mlx4_init(void)
        if (mlx4_verify_params())
                return -EINVAL;
 
-       mlx4_catas_init();
 
        mlx4_wq = create_singlethread_workqueue("mlx4");
        if (!mlx4_wq)