diff mbox series

[for-next] RDMA/bnxt_re: Move device to error state upon device crash

Message ID 1615957789-30077-1-git-send-email-selvin.xavier@broadcom.com (mailing list archive)
State Superseded
Headers show
Series [for-next] RDMA/bnxt_re: Move device to error state upon device crash | expand

Commit Message

Selvin Xavier March 17, 2021, 5:09 a.m. UTC
When the L2 driver detects a device crash or a device reset,
it invokes the stop callback to recover from the error.
The current RoCE driver doesn't recover the device, so move
the device to the error state and dispatch fatal events to all QPs.
Release the MSI-X vectors to avoid a crash when the L2 driver
disables MSI-X.
Also, check the device state to avoid posting further
commands to the HW.

Signed-off-by: Naresh Kumar PBS <nareshkumar.pbs@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h    |  1 +
 drivers/infiniband/hw/bnxt_re/main.c       | 42 ++++++++++++++++++++++++++++++
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c |  4 +++
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.h |  2 ++
 4 files changed, 49 insertions(+)

Comments

kernel test robot March 17, 2021, 7:53 a.m. UTC | #1
Hi Selvin,

I love your patch! Perhaps something to improve:

[auto build test WARNING on rdma/for-next]
[also build test WARNING on v5.12-rc3 next-20210316]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Selvin-Xavier/RDMA-bnxt_re-Move-device-to-error-state-upon-device-crash/20210317-131222
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git for-next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/3e89320b3476467ccb141c6b5e46c1615f010b66
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Selvin-Xavier/RDMA-bnxt_re-Move-device-to-error-state-upon-device-crash/20210317-131222
        git checkout 3e89320b3476467ccb141c6b5e46c1615f010b66
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/infiniband/hw/bnxt_re/main.c: In function 'bnxt_re_stop':
>> drivers/infiniband/hw/bnxt_re/main.c:226:26: warning: variable 'rcfw' set but not used [-Wunused-but-set-variable]
     226 |  struct bnxt_qplib_rcfw *rcfw;
         |                          ^~~~


vim +/rcfw +226 drivers/infiniband/hw/bnxt_re/main.c

   221	
   222	/* for handling bnxt_en callbacks later */
   223	static void bnxt_re_stop(void *p)
   224	{
   225		struct bnxt_re_dev *rdev = p;
 > 226		struct bnxt_qplib_rcfw *rcfw;
   227		struct bnxt *bp;
   228	
   229		if (!rdev)
   230			return;
   231		ASSERT_RTNL();
   232	
   233		/* L2 driver invokes this callback during device error/crash or device
   234		 * reset. Current RoCE driver doesn't recover the device in case of
   235		 * error. Handle the error by dispatching fatal events to all qps
   236		 * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as
   237		 * L2 driver want to modify the MSIx table.
   238		 */
   239		bp = netdev_priv(rdev->netdev);
   240		rcfw = &rdev->rcfw;
   241	
   242		ibdev_info(&rdev->ibdev, "Handle device stop call from L2 driver");
   243		/* Check the current device state from L2 structure and move the
   244		 * device to detached state if FW_FATAL_COND is set.
   245		 * This prevents more commands to HW during clean-up,
   246		 * in case the device is already in error.
   247		 */
   248		if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state))
   249			set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
   250	
   251		bnxt_re_dev_stop(rdev);
   252		bnxt_re_stop_irq(rdev);
   253		/* Move the device states to detached and  avoid sending any more
   254		 * commands to HW
   255		 */
   256		set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags);
   257		set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
   258	}
   259	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index b930ea3..ba26d8e 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -138,6 +138,7 @@  struct bnxt_re_dev {
 #define BNXT_RE_FLAG_QOS_WORK_REG		5
 #define BNXT_RE_FLAG_RESOURCES_ALLOCATED	7
 #define BNXT_RE_FLAG_RESOURCES_INITIALIZED	8
+#define BNXT_RE_FLAG_ERR_DEVICE_DETACHED       17
 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS          29
 	struct net_device		*netdev;
 	unsigned int			version, major, minor;
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index fdb8c24..63e7433 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -81,6 +81,7 @@  static struct workqueue_struct *bnxt_re_wq;
 static void bnxt_re_remove_device(struct bnxt_re_dev *rdev);
 static void bnxt_re_dealloc_driver(struct ib_device *ib_dev);
 static void bnxt_re_stop_irq(void *handle);
+static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev);
 
 static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode)
 {
@@ -221,6 +222,39 @@  static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev)
 /* for handling bnxt_en callbacks later */
 static void bnxt_re_stop(void *p)
 {
+	struct bnxt_re_dev *rdev = p;
+	struct bnxt_qplib_rcfw *rcfw;
+	struct bnxt *bp;
+
+	if (!rdev)
+		return;
+	ASSERT_RTNL();
+
+	/* L2 driver invokes this callback during device error/crash or device
+	 * reset. Current RoCE driver doesn't recover the device in case of
+	 * error. Handle the error by dispatching fatal events to all qps
+	 * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as
+	 * L2 driver want to modify the MSIx table.
+	 */
+	bp = netdev_priv(rdev->netdev);
+	rcfw = &rdev->rcfw;
+
+	ibdev_info(&rdev->ibdev, "Handle device stop call from L2 driver");
+	/* Check the current device state from L2 structure and move the
+	 * device to detached state if FW_FATAL_COND is set.
+	 * This prevents more commands to HW during clean-up,
+	 * in case the device is already in error.
+	 */
+	if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state))
+		set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
+
+	bnxt_re_dev_stop(rdev);
+	bnxt_re_stop_irq(rdev);
+	/* Move the device states to detached and  avoid sending any more
+	 * commands to HW
+	 */
+	set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags);
+	set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
 }
 
 static void bnxt_re_start(void *p)
@@ -234,6 +268,8 @@  static void bnxt_re_sriov_config(void *p, int num_vfs)
 	if (!rdev)
 		return;
 
+	if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags))
+		return;
 	rdev->num_vfs = num_vfs;
 	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) {
 		bnxt_re_set_resource_limits(rdev);
@@ -427,6 +463,9 @@  static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev,
 	if (!en_dev)
 		return rc;
 
+	if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags))
+		return 0;
+
 	memset(&fw_msg, 0, sizeof(fw_msg));
 
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1);
@@ -489,6 +528,9 @@  static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev,
 	if (!en_dev)
 		return rc;
 
+	if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags))
+		return 0;
+
 	memset(&fw_msg, 0, sizeof(fw_msg));
 
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 441eb42..5d384de 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -212,6 +212,10 @@  int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
 	u8 opcode, retry_cnt = 0xFF;
 	int rc = 0;
 
+	/* Prevent posting if f/w is not in a state to process */
+	if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags))
+		return 0;
+
 	do {
 		opcode = req->opcode;
 		rc = __send_message(rcfw, req, resp, sb, is_block);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index 5f2f0a5..9474c00 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -138,6 +138,8 @@  struct bnxt_qplib_qp_node {
 #define FIRMWARE_INITIALIZED_FLAG	(0)
 #define FIRMWARE_FIRST_FLAG		(31)
 #define FIRMWARE_TIMED_OUT		(3)
+#define ERR_DEVICE_DETACHED             (4)
+
 struct bnxt_qplib_cmdq_mbox {
 	struct bnxt_qplib_reg_desc	reg;
 	void __iomem			*prod;