Message ID | 1615957789-30077-1-git-send-email-selvin.xavier@broadcom.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | [for-next] RDMA/bnxt_re: Move device to error state upon device crash | expand |
Hi Selvin,

I love your patch! Perhaps something to improve:

[auto build test WARNING on rdma/for-next]
[also build test WARNING on v5.12-rc3 next-20210316]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Selvin-Xavier/RDMA-bnxt_re-Move-device-to-error-state-upon-device-crash/20210317-131222
base: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git for-next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):

        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/3e89320b3476467ccb141c6b5e46c1615f010b66
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Selvin-Xavier/RDMA-bnxt_re-Move-device-to-error-state-upon-device-crash/20210317-131222
        git checkout 3e89320b3476467ccb141c6b5e46c1615f010b66
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

```
   drivers/infiniband/hw/bnxt_re/main.c: In function 'bnxt_re_stop':
>> drivers/infiniband/hw/bnxt_re/main.c:226:26: warning: variable 'rcfw' set but not used [-Wunused-but-set-variable]
     226 |         struct bnxt_qplib_rcfw *rcfw;
         |                                 ^~~~
```

vim +/rcfw +226 drivers/infiniband/hw/bnxt_re/main.c

```
   221
   222	/* for handling bnxt_en callbacks later */
   223	static void bnxt_re_stop(void *p)
   224	{
   225		struct bnxt_re_dev *rdev = p;
 > 226		struct bnxt_qplib_rcfw *rcfw;
   227		struct bnxt *bp;
   228
   229		if (!rdev)
   230			return;
   231		ASSERT_RTNL();
   232
   233		/* L2 driver invokes this callback during device error/crash or device
   234		 * reset. Current RoCE driver doesn't recover the device in case of
   235		 * error. Handle the error by dispatching fatal events to all qps
   236		 * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as
   237		 * L2 driver want to modify the MSIx table.
   238		 */
   239		bp = netdev_priv(rdev->netdev);
   240		rcfw = &rdev->rcfw;
   241
   242		ibdev_info(&rdev->ibdev, "Handle device stop call from L2 driver");
   243		/* Check the current device state from L2 structure and move the
   244		 * device to detached state if FW_FATAL_COND is set.
   245		 * This prevents more commands to HW during clean-up,
   246		 * in case the device is already in error.
   247		 */
   248		if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state))
   249			set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
   250
   251		bnxt_re_dev_stop(rdev);
   252		bnxt_re_stop_irq(rdev);
   253		/* Move the device states to detached and avoid sending any more
   254		 * commands to HW
   255		 */
   256		set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags);
   257		set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
   258	}
   259
```

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
```diff
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index b930ea3..ba26d8e 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -138,6 +138,7 @@ struct bnxt_re_dev {
 #define BNXT_RE_FLAG_QOS_WORK_REG	5
 #define BNXT_RE_FLAG_RESOURCES_ALLOCATED	7
 #define BNXT_RE_FLAG_RESOURCES_INITIALIZED	8
+#define BNXT_RE_FLAG_ERR_DEVICE_DETACHED	17
 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS	29
 	struct net_device	*netdev;
 	unsigned int	version, major, minor;
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index fdb8c24..63e7433 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -81,6 +81,7 @@ static struct workqueue_struct *bnxt_re_wq;
 static void bnxt_re_remove_device(struct bnxt_re_dev *rdev);
 static void bnxt_re_dealloc_driver(struct ib_device *ib_dev);
 static void bnxt_re_stop_irq(void *handle);
+static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev);
 
 static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode)
 {
@@ -221,6 +222,39 @@ static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev)
 /* for handling bnxt_en callbacks later */
 static void bnxt_re_stop(void *p)
 {
+	struct bnxt_re_dev *rdev = p;
+	struct bnxt_qplib_rcfw *rcfw;
+	struct bnxt *bp;
+
+	if (!rdev)
+		return;
+	ASSERT_RTNL();
+
+	/* L2 driver invokes this callback during device error/crash or device
+	 * reset. Current RoCE driver doesn't recover the device in case of
+	 * error. Handle the error by dispatching fatal events to all qps
+	 * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as
+	 * L2 driver want to modify the MSIx table.
+	 */
+	bp = netdev_priv(rdev->netdev);
+	rcfw = &rdev->rcfw;
+
+	ibdev_info(&rdev->ibdev, "Handle device stop call from L2 driver");
+	/* Check the current device state from L2 structure and move the
+	 * device to detached state if FW_FATAL_COND is set.
+	 * This prevents more commands to HW during clean-up,
+	 * in case the device is already in error.
+	 */
+	if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state))
+		set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
+
+	bnxt_re_dev_stop(rdev);
+	bnxt_re_stop_irq(rdev);
+	/* Move the device states to detached and avoid sending any more
+	 * commands to HW
+	 */
+	set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags);
+	set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
 }
 
 static void bnxt_re_start(void *p)
@@ -234,6 +268,8 @@ static void bnxt_re_sriov_config(void *p, int num_vfs)
 	if (!rdev)
 		return;
 
+	if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags))
+		return;
 	rdev->num_vfs = num_vfs;
 	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) {
 		bnxt_re_set_resource_limits(rdev);
@@ -427,6 +463,9 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev,
 	if (!en_dev)
 		return rc;
 
+	if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags))
+		return 0;
+
 	memset(&fw_msg, 0, sizeof(fw_msg));
 
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1);
@@ -489,6 +528,9 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev,
 	if (!en_dev)
 		return rc;
 
+	if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags))
+		return 0;
+
 	memset(&fw_msg, 0, sizeof(fw_msg));
 
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 441eb42..5d384de 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -212,6 +212,10 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
 	u8 opcode, retry_cnt = 0xFF;
 	int rc = 0;
 
+	/* Prevent posting if f/w is not in a state to process */
+	if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags))
+		return 0;
+
 	do {
 		opcode = req->opcode;
 		rc = __send_message(rcfw, req, resp, sb, is_block);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index 5f2f0a5..9474c00 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -138,6 +138,8 @@ struct bnxt_qplib_qp_node {
 #define FIRMWARE_INITIALIZED_FLAG	(0)
 #define FIRMWARE_FIRST_FLAG	(31)
 #define FIRMWARE_TIMED_OUT	(3)
+#define ERR_DEVICE_DETACHED	(4)
+
 struct bnxt_qplib_cmdq_mbox {
 	struct bnxt_qplib_reg_desc	reg;
 	void __iomem	*prod;
```