net/mlx5_core: Add pci error handlers to mlx5_core driver
[cascardo/linux.git] / drivers / net / ethernet / mellanox / mlx5 / core / health.c
index f1eb686..f5deb64 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/vmalloc.h>
+#include <linux/hardirq.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
@@ -68,6 +69,29 @@ static u8 get_nic_interface(struct mlx5_core_dev *dev)
        return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
 }
 
+static void trigger_cmd_completions(struct mlx5_core_dev *dev)
+{      /* Hand every slot whose bit is clear in cmd.bitmask to mlx5_cmd_comp_handler(), tagged as a triggered completion. */
+       unsigned long flags; /* IRQ state saved by spin_lock_irqsave() */
+       u64 vector; /* bitmap passed to the completion handler */
+
+       /* wait for any in-progress handler of the command EQ interrupt to finish */
+       synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
+       spin_lock_irqsave(&dev->cmd.alloc_lock, flags); /* bitmask is read under alloc_lock */
+       vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); /* clear bits of bitmask within the low (1 << log_sz) bits; presumably the in-flight slots - confirm. NOTE(review): if log_sz can be 5, 1ul << 32 is undefined on 32-bit builds - verify */
+       if (!vector)
+               goto no_trig; /* nothing selected: only the lock needs releasing */
+
+       vector |= MLX5_TRIGGERED_CMD_COMP; /* add the triggered-completion flag (its meaning is defined outside this hunk) */
+       spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); /* lock is dropped before the handler is invoked */
+
+       mlx5_core_dbg(dev, "vector 0x%llx\n", vector);
+       mlx5_cmd_comp_handler(dev, vector);
+       return;
+
+no_trig:
+       spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+}
+
 static int in_fatal(struct mlx5_core_dev *dev)
 {
        struct mlx5_core_health *health = &dev->priv.health;
@@ -82,6 +106,43 @@ static int in_fatal(struct mlx5_core_dev *dev)
        return 0;
 }
 
+void mlx5_enter_error_state(struct mlx5_core_dev *dev)
+{      /* Mark the device as being in internal error when the PCI channel is offline or in_fatal() reports a fatal condition, then send a SYS_ERROR event. */
+       if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+               return; /* already in the error state: no log, no event */
+
+       mlx5_core_err(dev, "start\n");
+       if (pci_channel_offline(dev->pdev) || in_fatal(dev))
+               dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+
+       mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0); /* sent unconditionally, i.e. also when the state was left unchanged above */
+       mlx5_core_err(dev, "end\n");
+}
+
+static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
+{      /* Log which NIC interface mode the device reports, then call mlx5_disable_device() whatever the mode is. */
+       u8 nic_interface = get_nic_interface(dev); /* two-bit field read from the cmdq_addr_l_sz register */
+
+       switch (nic_interface) {
+       case MLX5_NIC_IFC_FULL:
+               mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n");
+               break;
+
+       case MLX5_NIC_IFC_DISABLED: /* the only mode that is not reported as unexpected */
+               mlx5_core_warn(dev, "starting teardown\n");
+               break;
+
+       case MLX5_NIC_IFC_NO_DRAM_NIC:
+               mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
+               break;
+       default: /* any value not matched by the cases above */
+               mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
+                              nic_interface);
+       }
+
+       mlx5_disable_device(dev); /* reached from every case, including the unexpected ones */
+}
+
 static void health_care(struct work_struct *work)
 {
        struct mlx5_core_health *health;
@@ -92,6 +153,7 @@ static void health_care(struct work_struct *work)
        priv = container_of(health, struct mlx5_priv, health);
        dev = container_of(priv, struct mlx5_core_dev, priv);
        mlx5_core_warn(dev, "handling bad device here\n");
+       mlx5_handle_bad_state(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -147,6 +209,10 @@ static void print_health_info(struct mlx5_core_dev *dev)
        u32 fw;
        int i;
 
+       /* If the syndrome is 0, the device is OK and there is no need to print the buffer */
+       if (!ioread8(&h->synd))
+               return;
+
        for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
                dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
 
@@ -178,6 +244,12 @@ static void poll_health(unsigned long data)
        struct mlx5_core_health *health = &dev->priv.health;
        u32 count;
 
+       if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               trigger_cmd_completions(dev);
+               mod_timer(&health->timer, get_next_poll_jiffies());
+               return;
+       }
+
        count = ioread32be(health->health_counter);
        if (count == health->prev)
                ++health->miss_counter;