net/mlx5_core: Fix internal error detection conditions
[cascardo/linux.git] / drivers / net / ethernet / mellanox / mlx5 / core / health.c
index 292d76f..f1eb686 100644 (file)
@@ -46,39 +46,52 @@ enum {
 enum {
        MLX5_HEALTH_SYNDR_FW_ERR                = 0x1,
        MLX5_HEALTH_SYNDR_IRISC_ERR             = 0x7,
+       MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR  = 0x8,  /* unrecoverable hardware error */
        MLX5_HEALTH_SYNDR_CRC_ERR               = 0x9,
        MLX5_HEALTH_SYNDR_FETCH_PCI_ERR         = 0xa,
        MLX5_HEALTH_SYNDR_HW_FTL_ERR            = 0xb,
        MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR  = 0xc,
        MLX5_HEALTH_SYNDR_EQ_ERR                = 0xd,
+       MLX5_HEALTH_SYNDR_EQ_INV                = 0xe,  /* invalid EQ referenced */
        MLX5_HEALTH_SYNDR_FFSER_ERR             = 0xf,
+       MLX5_HEALTH_SYNDR_HIGH_TEMP             = 0x10  /* high temperature */
 };
 
-static DEFINE_SPINLOCK(health_lock);
-static LIST_HEAD(health_list);
-static struct work_struct health_work;
+enum {  /* values compared against the 2-bit field returned by get_nic_interface() */
+       MLX5_NIC_IFC_FULL               = 0,
+       MLX5_NIC_IFC_DISABLED           = 1,    /* in_fatal() treats this value as fatal */
+       MLX5_NIC_IFC_NO_DRAM_NIC        = 2
+};
+
+static u8 get_nic_interface(struct mlx5_core_dev *dev)
+{
+       return (ioread32be(&dev->iseg->cmdq_addr_l_sz) & 0x300) >> 8; /* bits 9:8 of the word */
+}
+
+/* Tell whether the device has hit a fatal condition.
+ * Returns 1 when it has, 0 otherwise. */
+static int in_fatal(struct mlx5_core_dev *dev)
+{
+       struct health_buffer __iomem *hbuf = dev->priv.health.health;
+
+       /* fatal when the NIC interface field reads "disabled" ... */
+       if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED)
+               return 1;
+
+       /* ... or when the fw_ver word reads back as all ones */
+       return ioread32be(&hbuf->fw_ver) == 0xffffffff;
+}
 
 static void health_care(struct work_struct *work)
 {
-       struct mlx5_core_health *health, *n;
+       struct mlx5_core_health *health;
        struct mlx5_core_dev *dev;
        struct mlx5_priv *priv;
-       LIST_HEAD(tlist);
-
-       spin_lock_irq(&health_lock);
-       list_splice_init(&health_list, &tlist);
-
-       spin_unlock_irq(&health_lock);
 
-       list_for_each_entry_safe(health, n, &tlist, list) {
-               priv = container_of(health, struct mlx5_priv, health);
-               dev = container_of(priv, struct mlx5_core_dev, priv);
-               mlx5_core_warn(dev, "handling bad device here\n");
-               /* nothing yet */
-               spin_lock_irq(&health_lock);
-               list_del_init(&health->list);
-               spin_unlock_irq(&health_lock);
-       }
+       health = container_of(work, struct mlx5_core_health, work); /* work item is embedded in the health struct */
+       priv = container_of(health, struct mlx5_priv, health);
+       dev = container_of(priv, struct mlx5_core_dev, priv);
+       mlx5_core_warn(dev, "handling bad device here\n"); /* only logs; no recovery is attempted here */
 }
 
 static const char *hsynd_str(u8 synd)
@@ -88,6 +101,8 @@ static const char *hsynd_str(u8 synd)
                return "firmware internal error";
        case MLX5_HEALTH_SYNDR_IRISC_ERR:
                return "irisc not responding";
+       case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
+               return "unrecoverable hardware error";
        case MLX5_HEALTH_SYNDR_CRC_ERR:
                return "firmware CRC error";
        case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
@@ -98,46 +113,69 @@ static const char *hsynd_str(u8 synd)
                return "async EQ buffer overrun";
        case MLX5_HEALTH_SYNDR_EQ_ERR:
                return "EQ error";
+       case MLX5_HEALTH_SYNDR_EQ_INV:
+               return "Invalid EQ referenced";
        case MLX5_HEALTH_SYNDR_FFSER_ERR:
                return "FFSER error";
+       case MLX5_HEALTH_SYNDR_HIGH_TEMP:
+               return "High temperature";
        default:
                return "unrecognized error";
        }
 }
 
-static u16 read_be16(__be16 __iomem *p)
+static u16 get_maj(u32 fw)      /* major field: bits 31:28 of fw_ver */
+{
+       return (fw >> 28) & 0xf;
+}
+
+static u16 get_min(u32 fw)      /* minor field: bits 27:16 of fw_ver */
 {
-       return swab16(readl((__force u16 __iomem *) p));
+       return (fw >> 16) & 0xfff;
 }
 
-static u32 read_be32(__be32 __iomem *p)
+static u16 get_sub(u32 fw)      /* sub field: bits 15:0 of fw_ver */
 {
-       return swab32(readl((__force u32 __iomem *) p));
+       return fw & 0xffff;
 }
 
 static void print_health_info(struct mlx5_core_dev *dev)
 {
        struct mlx5_core_health *health = &dev->priv.health;
        struct health_buffer __iomem *h = health->health;
+       char fw_str[18];
+       u32 fw;
        int i;
 
        for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
-               pr_info("assert_var[%d] 0x%08x\n", i, read_be32(h->assert_var + i));
-
-       pr_info("assert_exit_ptr 0x%08x\n", read_be32(&h->assert_exit_ptr));
-       pr_info("assert_callra 0x%08x\n", read_be32(&h->assert_callra));
-       pr_info("fw_ver 0x%08x\n", read_be32(&h->fw_ver));
-       pr_info("hw_id 0x%08x\n", read_be32(&h->hw_id));
-       pr_info("irisc_index %d\n", readb(&h->irisc_index));
-       pr_info("synd 0x%x: %s\n", readb(&h->synd), hsynd_str(readb(&h->synd)));
-       pr_info("ext_sync 0x%04x\n", read_be16(&h->ext_sync));
+               dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
+
+       dev_err(&dev->pdev->dev, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr));
+       dev_err(&dev->pdev->dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra));
+       fw = ioread32be(&h->fw_ver);
+       sprintf(fw_str, "%d.%d.%d", get_maj(fw), get_min(fw), get_sub(fw));
+       dev_err(&dev->pdev->dev, "fw_ver %s\n", fw_str);
+       dev_err(&dev->pdev->dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
+       dev_err(&dev->pdev->dev, "irisc_index %d\n", ioread8(&h->irisc_index));
+       dev_err(&dev->pdev->dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
+       dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
+}
+
+/* Next poll time: MLX5_HEALTH_POLL_INTERVAL from now plus a random delay below HZ jiffies. */
+static unsigned long get_next_poll_jiffies(void)
+{
+       unsigned long jitter;
+
+       get_random_bytes(&jitter, sizeof(jitter));
+       jitter %= HZ;
+
+       return jiffies + MLX5_HEALTH_POLL_INTERVAL + jitter;
 }
 
 static void poll_health(unsigned long data)
 {
        struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
        struct mlx5_core_health *health = &dev->priv.health;
-       unsigned long next;
        u32 count;
 
        count = ioread32be(health->health_counter);
@@ -148,18 +186,16 @@ static void poll_health(unsigned long data)
 
        health->prev = count;
        if (health->miss_counter == MAX_MISSES) {
-               mlx5_core_err(dev, "device's health compromised\n");
+               dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
                print_health_info(dev);
-               spin_lock_irq(&health_lock);
-               list_add_tail(&health->list, &health_list);
-               spin_unlock_irq(&health_lock);
-
-               queue_work(mlx5_core_wq, &health_work);
        } else {
-               get_random_bytes(&next, sizeof(next));
-               next %= HZ;
-               next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
-               mod_timer(&health->timer, next);
+               mod_timer(&health->timer, get_next_poll_jiffies());
+       }
+
+       if (in_fatal(dev) && !health->sick) {
+               health->sick = true;
+               print_health_info(dev);
+               queue_work(health->wq, &health->work);
        }
 }
 
@@ -167,7 +203,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 {
        struct mlx5_core_health *health = &dev->priv.health;
 
-       INIT_LIST_HEAD(&health->list);
        init_timer(&health->timer);
        health->health = &dev->iseg->health;
        health->health_counter = &dev->iseg->health_counter;
@@ -183,18 +218,33 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
        struct mlx5_core_health *health = &dev->priv.health;
 
        del_timer_sync(&health->timer);
-
-       spin_lock_irq(&health_lock);
-       if (!list_empty(&health->list))
-               list_del_init(&health->list);
-       spin_unlock_irq(&health_lock);
 }
 
-void mlx5_health_cleanup(void)
+void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 {
+       /* Destroy the per-device workqueue that mlx5_health_init()
+        * stored in health->wq. */
+       destroy_workqueue(dev->priv.health.wq);
 }
 
-void  __init mlx5_health_init(void)
+int mlx5_health_init(struct mlx5_core_dev *dev)
 {
-       INIT_WORK(&health_work, health_care);
+       struct mlx5_core_health *health = &dev->priv.health;
+       char *name;
+
+       /* kasprintf() sizes the buffer to the formatted name, so a long
+        * device name cannot overrun it as strcpy()/strcat() into a fixed
+        * 64-byte allocation could.  Returns 0 or -ENOMEM. */
+       name = kasprintf(GFP_KERNEL, "mlx5_health%s", dev_name(&dev->pdev->dev));
+       if (!name)
+               return -ENOMEM;
+
+       health->wq = create_singlethread_workqueue(name);
+       kfree(name);
+       if (!health->wq)
+               return -ENOMEM;
+
+       INIT_WORK(&health->work, health_care);
+
+       return 0;
 }