net/mlx5_core: Add pci error handlers to mlx5_core driver
[cascardo/linux.git] / drivers / net / ethernet / mellanox / mlx5 / core / main.c
index b6edc58..a103a54 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/mlx5/srq.h>
 #include <linux/debugfs.h>
 #include <linux/kmod.h>
+#include <linux/delay.h>
 #include <linux/mlx5/mlx5_ifc.h>
 #include "mlx5_core.h"
 
@@ -181,6 +182,34 @@ static int set_dma_caps(struct pci_dev *pdev)
        return err;
 }
 
+static int mlx5_pci_enable_device(struct mlx5_core_dev *dev)
+{
+       struct pci_dev *pdev = dev->pdev;
+       int err = 0;
+
+       mutex_lock(&dev->pci_status_mutex);
+       if (dev->pci_status == MLX5_PCI_STATUS_DISABLED) {
+               err = pci_enable_device(pdev);
+               if (!err)
+                       dev->pci_status = MLX5_PCI_STATUS_ENABLED;
+       }
+       mutex_unlock(&dev->pci_status_mutex);
+
+       return err;
+}
+
+static void mlx5_pci_disable_device(struct mlx5_core_dev *dev)
+{
+       struct pci_dev *pdev = dev->pdev;
+
+       mutex_lock(&dev->pci_status_mutex);
+       if (dev->pci_status == MLX5_PCI_STATUS_ENABLED) {
+               pci_disable_device(pdev);
+               dev->pci_status = MLX5_PCI_STATUS_DISABLED;
+       }
+       mutex_unlock(&dev->pci_status_mutex);
+}
+
 static int request_bar(struct pci_dev *pdev)
 {
        int err = 0;
@@ -807,7 +836,7 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
        if (!priv->dbg_root)
                return -ENOMEM;
 
-       err = pci_enable_device(pdev);
+       err = mlx5_pci_enable_device(dev);
        if (err) {
                dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n");
                goto err_dbg;
@@ -841,7 +870,7 @@ err_clr_master:
        pci_clear_master(dev->pdev);
        release_bar(dev->pdev);
 err_disable:
-       pci_disable_device(dev->pdev);
+       mlx5_pci_disable_device(dev);
 
 err_dbg:
        debugfs_remove(priv->dbg_root);
@@ -853,7 +882,7 @@ static void mlx5_pci_close(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
        iounmap(dev->iseg);
        pci_clear_master(dev->pdev);
        release_bar(dev->pdev);
-       pci_disable_device(dev->pdev);
+       mlx5_pci_disable_device(dev);
        debugfs_remove(priv->dbg_root);
 }
 
@@ -863,13 +892,25 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
        struct pci_dev *pdev = dev->pdev;
        int err;
 
+       mutex_lock(&dev->intf_state_mutex);
+       if (dev->interface_state == MLX5_INTERFACE_STATE_UP) {
+               dev_warn(&dev->pdev->dev, "%s: interface is up, NOP\n",
+                        __func__);
+               goto out;
+       }
+
        dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
                 fw_rev_min(dev), fw_rev_sub(dev));
 
+       /* on load removing any previous indication of internal error, device is
+        * up
+        */
+       dev->state = MLX5_DEVICE_STATE_UP;
+
        err = mlx5_cmd_init(dev);
        if (err) {
                dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
-               return err;
+               goto out_err;
        }
 
        mlx5_pagealloc_init(dev);
@@ -994,6 +1035,10 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
        if (err)
                pr_info("failed request module on %s\n", MLX5_IB_MOD);
 
+       dev->interface_state = MLX5_INTERFACE_STATE_UP;
+out:
+       mutex_unlock(&dev->intf_state_mutex);
+
        return 0;
 
 err_reg_dev:
@@ -1024,7 +1069,7 @@ err_stop_poll:
        mlx5_stop_health_poll(dev);
        if (mlx5_cmd_teardown_hca(dev)) {
                dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
-               return err;
+               goto out_err;
        }
 
 err_pagealloc_stop:
@@ -1040,13 +1085,23 @@ err_pagealloc_cleanup:
        mlx5_pagealloc_cleanup(dev);
        mlx5_cmd_cleanup(dev);
 
+out_err:
+       dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+       mutex_unlock(&dev->intf_state_mutex);
+
        return err;
 }
 
 static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 {
-       int err;
+       int err = 0;
 
+       mutex_lock(&dev->intf_state_mutex);
+       if (dev->interface_state == MLX5_INTERFACE_STATE_DOWN) {
+               dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n",
+                        __func__);
+               goto out;
+       }
        mlx5_unregister_device(dev);
        mlx5_cleanup_mr_table(dev);
        mlx5_cleanup_srq_table(dev);
@@ -1072,10 +1127,12 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
        mlx5_cmd_cleanup(dev);
 
 out:
+       dev->interface_state = MLX5_INTERFACE_STATE_DOWN;
+       mutex_unlock(&dev->intf_state_mutex);
        return err;
 }
 
-static void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
+void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
                     unsigned long param)
 {
        struct mlx5_priv *priv = &dev->priv;
@@ -1125,6 +1182,8 @@ static int init_one(struct pci_dev *pdev,
 
        INIT_LIST_HEAD(&priv->ctx_list);
        spin_lock_init(&priv->ctx_lock);
+       mutex_init(&dev->pci_status_mutex);
+       mutex_init(&dev->intf_state_mutex);
        err = mlx5_pci_init(dev, priv);
        if (err) {
                dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
@@ -1172,6 +1231,112 @@ static void remove_one(struct pci_dev *pdev)
        kfree(dev);
 }
 
+static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
+                                             pci_channel_state_t state)
+{
+       struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+       struct mlx5_priv *priv = &dev->priv;
+
+       dev_info(&pdev->dev, "%s was called\n", __func__);
+       mlx5_enter_error_state(dev);
+       mlx5_unload_one(dev, priv);
+       mlx5_pci_disable_device(dev);
+       return state == pci_channel_io_perm_failure ?
+               PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
+{
+       struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+       int err = 0;
+
+       dev_info(&pdev->dev, "%s was called\n", __func__);
+
+       err = mlx5_pci_enable_device(dev);
+       if (err) {
+               dev_err(&pdev->dev, "%s: mlx5_pci_enable_device failed with error code: %d\n"
+                       , __func__, err);
+               return PCI_ERS_RESULT_DISCONNECT;
+       }
+       pci_set_master(pdev);
+       pci_set_power_state(pdev, PCI_D0);
+       pci_restore_state(pdev);
+
+       return err ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
+}
+
+void mlx5_disable_device(struct mlx5_core_dev *dev)
+{
+       mlx5_pci_err_detected(dev->pdev, 0);
+}
+
+/* wait for the device to show vital signs. For now we check
+ * that we can read the device ID and that the health buffer
+ * shows a non zero value which is different than 0xffffffff
+ */
+static void wait_vital(struct pci_dev *pdev)
+{
+       struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+       struct mlx5_core_health *health = &dev->priv.health;
+       const int niter = 100;
+       u32 count;
+       u16 did;
+       int i;
+
+       /* Wait for firmware to be ready after reset */
+       msleep(1000);
+       for (i = 0; i < niter; i++) {
+               if (pci_read_config_word(pdev, 2, &did)) {
+                       dev_warn(&pdev->dev, "failed reading config word\n");
+                       break;
+               }
+               if (did == pdev->device) {
+                       dev_info(&pdev->dev, "device ID correctly read after %d iterations\n", i);
+                       break;
+               }
+               msleep(50);
+       }
+       if (i == niter)
+               dev_warn(&pdev->dev, "%s-%d: could not read device ID\n", __func__, __LINE__);
+
+       for (i = 0; i < niter; i++) {
+               count = ioread32be(health->health_counter);
+               if (count && count != 0xffffffff) {
+                       dev_info(&pdev->dev, "Counter value 0x%x after %d iterations\n", count, i);
+                       break;
+               }
+               msleep(50);
+       }
+
+       if (i == niter)
+               dev_warn(&pdev->dev, "%s-%d: could not read device ID\n", __func__, __LINE__);
+}
+
+static void mlx5_pci_resume(struct pci_dev *pdev)
+{
+       struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+       struct mlx5_priv *priv = &dev->priv;
+       int err;
+
+       dev_info(&pdev->dev, "%s was called\n", __func__);
+
+       pci_save_state(pdev);
+       wait_vital(pdev);
+
+       err = mlx5_load_one(dev, priv);
+       if (err)
+               dev_err(&pdev->dev, "%s: mlx5_load_one failed with error code: %d\n"
+                       , __func__, err);
+       else
+               dev_info(&pdev->dev, "%s: device recovered\n", __func__);
+}
+
+static const struct pci_error_handlers mlx5_err_handler = {
+       .error_detected = mlx5_pci_err_detected,
+       .slot_reset     = mlx5_pci_slot_reset,
+       .resume         = mlx5_pci_resume
+};
+
 static const struct pci_device_id mlx5_core_pci_table[] = {
        { PCI_VDEVICE(MELLANOX, 0x1011) }, /* Connect-IB */
        { PCI_VDEVICE(MELLANOX, 0x1012) }, /* Connect-IB VF */
@@ -1188,7 +1353,8 @@ static struct pci_driver mlx5_core_driver = {
        .name           = DRIVER_NAME,
        .id_table       = mlx5_core_pci_table,
        .probe          = init_one,
-       .remove         = remove_one
+       .remove         = remove_one,
+       .err_handler    = &mlx5_err_handler
 };
 
 static int __init init(void)