net/mlx5: Add Crdump support
author: Alex Vesker <valex@mellanox.com>
Tue, 17 Jul 2018 08:18:26 +0000 (11:18 +0300)
committer: Saeed Mahameed <saeedm@mellanox.com>
Thu, 13 Jun 2019 20:23:17 +0000 (13:23 -0700)
Crdump allows the driver to retrieve a dump of the FW PCI crspace.
This is useful in case of catastrophic issues that may require a FW
reset. The crspace dump can then be used for later debugging.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
drivers/net/ethernet/mellanox/mlx5/core/Makefile
drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
drivers/net/ethernet/mellanox/mlx5/core/main.c
include/linux/mlx5/driver.h

index 8e07354faea1ade16dc6ee2df49153ecaed2a3ad..5fe2bf916c06dd397873081be559efe855a970e8 100644 (file)
@@ -16,7 +16,7 @@ mlx5_core-y :=        main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
                transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
                fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
                lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \
-               diag/fw_tracer.o devlink.o
+               diag/fw_tracer.o diag/crdump.o devlink.o
 
 #
 # Netdev basic
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
new file mode 100644 (file)
index 0000000..dfb3417
--- /dev/null
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "lib/pci_vsc.h"
+#include "lib/mlx5.h"
+
+#define BAD_ACCESS                     0xBADACCE5 /* word written to every dump slot before the read */
+#define MLX5_PROTECTED_CR_SCAN_CRSPACE 0x7 /* NOTE(review): not referenced in this file - confirm it is needed */
+
+/* Crdump counts as enabled once a non-zero crspace size has been recorded. */
+static bool mlx5_crdump_enabled(struct mlx5_core_dev *dev)
+{
+       return dev->priv.health.crdump_size != 0;
+}
+
+/*
+ * Pre-fill cr_data with the BAD_ACCESS pattern (crdump_size / 4 u32 words),
+ * then call mlx5_vsc_gw_read_block_fast() with crdump_size as the length.
+ *
+ * Returns 0 when the read's return value equals crdump_size, -EIO when the
+ * read returned zero, -EINVAL (after a warning) when it returned some other
+ * positive value, or the negative value returned by the read itself.
+ */
+static int mlx5_crdump_fill(struct mlx5_core_dev *dev, u32 *cr_data)
+{
+       u32 total = dev->priv.health.crdump_size;
+       u32 nwords = total / 4;
+       int nread;
+       int idx;
+
+       for (idx = 0; idx < nwords; idx++)
+               cr_data[idx] = BAD_ACCESS;
+
+       nread = mlx5_vsc_gw_read_block_fast(dev, cr_data, total);
+       if (nread < 0)
+               return nread;
+       if (!nread)
+               return -EIO;
+
+       if (total == nread)
+               return 0;
+
+       mlx5_core_warn(dev, "failed to read full dump, read %d out of %u\n",
+                      nread, total);
+       return -EINVAL;
+}
+
+/*
+ * Collect a crspace dump into cr_data.
+ *
+ * cr_data must have room for at least crdump_size bytes, since the fill
+ * step writes crdump_size / 4 u32 words into it.
+ *
+ * Returns -ENODEV when crdump is not enabled, the error from
+ * mlx5_vsc_gw_lock() (after a warning) when locking fails, the error from
+ * mlx5_vsc_gw_set_space() when selecting the space fails, and otherwise
+ * the result of mlx5_crdump_fill(). Once the lock has been taken it is
+ * released on every path.
+ */
+int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data)
+{
+       int err;
+
+       if (!mlx5_crdump_enabled(dev))
+               return -ENODEV;
+
+       err = mlx5_vsc_gw_lock(dev);
+       if (err) {
+               mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n",
+                              err);
+               return err;
+       }
+
+       /* Only attempt the read when the crspace scan space was selected */
+       err = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL);
+       if (!err)
+               err = mlx5_crdump_fill(dev, cr_data);
+
+       mlx5_vsc_gw_unlock(dev);
+       return err;
+}
+
+/*
+ * Probe the crspace scan space and, when it is available, record its size
+ * in dev->priv.health.crdump_size (which is what marks crdump as enabled).
+ *
+ * Returns 0 without doing anything when the device is not a PF, the VSC is
+ * not accessible, or crdump is already enabled. Also returns 0 when
+ * selecting the space fails. Returns -EINVAL when the reported size is
+ * zero, or the error from mlx5_vsc_gw_lock() / mlx5_vsc_gw_unlock().
+ */
+int mlx5_crdump_enable(struct mlx5_core_dev *dev)
+{
+       u32 size;
+       int err;
+
+       if (!mlx5_core_is_pf(dev))
+               return 0;
+       if (!mlx5_vsc_accessible(dev))
+               return 0;
+       if (mlx5_crdump_enabled(dev))
+               return 0;
+
+       err = mlx5_vsc_gw_lock(dev);
+       if (err)
+               return err;
+
+       /*
+        * Check if space is supported and get space size; a failure is
+        * masked (after unlocking) since the space is not supported.
+        */
+       if (mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, &size)) {
+               mlx5_vsc_gw_unlock(dev);
+               return 0;
+       }
+
+       if (size == 0) {
+               mlx5_core_warn(dev, "Invalid Crspace size, zero\n");
+               mlx5_vsc_gw_unlock(dev);
+               return -EINVAL;
+       }
+
+       err = mlx5_vsc_gw_unlock(dev);
+       if (err)
+               return err;
+
+       dev->priv.health.crdump_size = size;
+       return 0;
+}
+
+/* Turn crdump off by clearing the recorded crspace size. */
+void mlx5_crdump_disable(struct mlx5_core_dev *dev)
+{
+       struct mlx5_priv *priv = &dev->priv;
+
+       priv->health.crdump_size = 0;
+}
index 397a2847867a8cc42e719ce4b90f9b1ef74dd3a1..d918e44491f468a903186f77676695c9a89b2e6a 100644 (file)
@@ -41,6 +41,9 @@ int  mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count);
 void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count);
 int  mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index);
 void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index);
+int mlx5_crdump_enable(struct mlx5_core_dev *dev);
+void mlx5_crdump_disable(struct mlx5_core_dev *dev);
+int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data);
 
 /* TODO move to lib/events.h */
 
index 3adc09a1a3125eb1364a409fdfb07f44416411ca..c70e97071b872e4afdbe9f8b004c3947d2f470dc 100644 (file)
@@ -1313,6 +1313,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
        if (err)
                goto clean_load;
 
+       err = mlx5_crdump_enable(dev);
+       if (err)
+               dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);
+
        pci_save_state(pdev);
        return 0;
 
@@ -1334,6 +1338,7 @@ static void remove_one(struct pci_dev *pdev)
        struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
        struct devlink *devlink = priv_to_devlink(dev);
 
+       mlx5_crdump_disable(dev);
        mlx5_devlink_unregister(devlink);
        mlx5_unregister_device(dev);
 
index f732445bcbdb750a76f8388ff6ad7cc593cac4a4..4ae533b3da0748452a513feb65734f5aa6d86244 100644 (file)
@@ -435,6 +435,7 @@ struct mlx5_core_health {
        u32                             prev;
        int                             miss_counter;
        bool                            sick;
+       u32                             crdump_size;
        /* wq spinlock to synchronize draining */
        spinlock_t                      wq_lock;
        struct workqueue_struct        *wq;