net/mlx5: Report devlink health on FW fatal issues

author Moshe Shemesh <moshe@mellanox.com>
Sun, 27 Jan 2019 16:38:39 +0000 (18:38 +0200)
committer Saeed Mahameed <saeedm@mellanox.com>
Thu, 13 Jun 2019 20:23:19 +0000 (13:23 -0700)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c

index 4ef62c6c6424ddc506e70b78c3e5da332ce2a069..2fe6923f7ce086c3f6d357c00ce5bf41aca8a002 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -327,19 +327,6 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
         return 0;
  }
  
-static void health_recover_work(struct work_struct *work)
-{
-       struct mlx5_core_health *health;
-       struct mlx5_core_dev *dev;
-       struct mlx5_priv *priv;
-
-       health = container_of(work, struct mlx5_core_health, work);
-       priv = container_of(health, struct mlx5_priv, health);
-       dev = container_of(priv, struct mlx5_core_dev, priv);
-
-       mlx5_health_try_recover(dev);
-}
-
  static const char *hsynd_str(u8 synd)
  {
         switch (synd) {
@@ -614,6 +601,29 @@ free_data:
         return err;
  }
  
+static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
+{
+       struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+       struct mlx5_core_health *health;
+       struct mlx5_core_dev *dev;
+       struct mlx5_priv *priv;
+
+       health = container_of(work, struct mlx5_core_health, fatal_report_work);
+       priv = container_of(health, struct mlx5_priv, health);
+       dev = container_of(priv, struct mlx5_core_dev, priv);
+
+       mlx5_enter_error_state(dev, false);
+       if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
+               if (mlx5_health_try_recover(dev))
+                       mlx5_core_err(dev, "health recovery failed\n");
+               return;
+       }
+       fw_reporter_ctx.err_synd = health->synd;
+       fw_reporter_ctx.miss_counter = health->miss_counter;
+       devlink_health_report(health->fw_fatal_reporter,
+                             "FW fatal error reported", &fw_reporter_ctx);
+}
+
  static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
                 .name = "fw_fatal",
                 .recover = mlx5_fw_fatal_reporter_recover,
@@ -672,7 +682,7 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
  
         spin_lock_irqsave(&health->wq_lock, flags);
         if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
-               queue_work(health->wq, &health->work);
+               queue_work(health->wq, &health->fatal_report_work);
         else
                 mlx5_core_err(dev, "new health works are not permitted at this stage\n");
         spin_unlock_irqrestore(&health->wq_lock, flags);
@@ -758,7 +768,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
         set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
         spin_unlock_irqrestore(&health->wq_lock, flags);
         cancel_work_sync(&health->report_work);
-       cancel_work_sync(&health->work);
+       cancel_work_sync(&health->fatal_report_work);
  }
  
  void mlx5_health_flush(struct mlx5_core_dev *dev)
@@ -795,7 +805,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
         if (!health->wq)
                 goto out_err;
         spin_lock_init(&health->wq_lock);
-       INIT_WORK(&health->work, health_recover_work);
+       INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work);
         INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
  
         return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c

index ec5287c51825affa91fd27f69ec91e39d0d8a170..998eec938d3c9eb37862ad03b533c7943f6a14e0 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1363,11 +1363,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
         mlx5_enter_error_state(dev, false);
         mlx5_error_sw_reset(dev);
         mlx5_unload_one(dev, false);
-       /* In case of kernel call drain the health wq */
-       if (state) {
-               mlx5_drain_health_wq(dev);
-               mlx5_pci_disable_device(dev);
-       }
+       mlx5_drain_health_wq(dev);
+       mlx5_pci_disable_device(dev);
  
         return state == pci_channel_io_perm_failure ?
                 PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
@@ -1535,7 +1532,8 @@ MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table);
  
  void mlx5_disable_device(struct mlx5_core_dev *dev)
  {
-       mlx5_pci_err_detected(dev->pdev, 0);
+       mlx5_error_sw_reset(dev);
+       mlx5_unload_one(dev, false);
  }
  
  void mlx5_recover_device(struct mlx5_core_dev *dev)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h

index caac96bf9c0df655ac497c2d2fd480d8cc4d0d37..25847beabd3f53719eae8f59426eae21a7195e70 100644 (file)
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -442,7 +442,7 @@ struct mlx5_core_health {
         spinlock_t                      wq_lock;
         struct workqueue_struct        *wq;
         unsigned long                   flags;
-       struct work_struct              work;
+       struct work_struct              fatal_report_work;
         struct work_struct              report_work;
         struct delayed_work             recover_work;
         struct devlink_health_reporter *fw_reporter;
author	Moshe Shemesh <moshe@mellanox.com>
	Sun, 27 Jan 2019 16:38:39 +0000 (18:38 +0200)
committer	Saeed Mahameed <saeedm@mellanox.com>
	Thu, 13 Jun 2019 20:23:19 +0000 (13:23 -0700)
drivers/net/ethernet/mellanox/mlx5/core/health.c		patch | blob | history
drivers/net/ethernet/mellanox/mlx5/core/main.c		patch | blob | history
include/linux/mlx5/driver.h		patch | blob | history