diff mbox series

[net] net/smc: Fix possible access to freed memory in link clear

Message ID 20220829145435.2756430-1-liuyacan@corp.netease.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net] net/smc: Fix possible access to freed memory in link clear | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net
netdev/fixes_present fail Series targets non-next tree, but doesn't contain any Fixes tags
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 0 this patch: 7
netdev/cc_maintainers success CCed 8 of 8 maintainers
netdev/build_clang fail Errors and warnings before: 0 this patch: 8
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 0 this patch: 7
netdev/checkpatch warning WARNING: line length of 87 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

liuyacan@corp.netease.com Aug. 29, 2022, 2:54 p.m. UTC
From: liuyacan <liuyacan@corp.netease.com>

After modifying the QP to the Error state, all RX WR would be
completed with WC in IB_WC_WR_FLUSH_ERR status. Current
implementation does not wait for it is done, but free the link
directly. So there is a risk that accessing the freed link in
tasklet context.

Here is a crash example:

 BUG: unable to handle page fault for address: ffffffff8f220860
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0002) - not-present page
 PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
 Oops: 0002 [#1] SMP PTI
 CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
 Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
 RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
 Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
 RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
 RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
 RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
 RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
 R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
 R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
 FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  <IRQ>
  _raw_spin_lock_irqsave+0x30/0x40
  mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
  smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
  tasklet_action_common.isra.21+0x66/0x100
  __do_softirq+0xd5/0x29c
  asm_call_irq_on_stack+0x12/0x20
  </IRQ>
  do_softirq_own_stack+0x37/0x40
  irq_exit_rcu+0x9d/0xa0
  sysvec_call_function_single+0x34/0x80
  asm_sysvec_call_function_single+0x12/0x20

Signed-off-by: liuyacan <liuyacan@corp.netease.com>
---
 net/smc/smc_core.c |  2 ++
 net/smc/smc_core.h |  2 ++
 net/smc/smc_wr.c   | 12 ++++++++++++
 net/smc/smc_wr.h   |  3 +++
 4 files changed, 19 insertions(+)

Comments

Tony Lu Aug. 29, 2022, 4:38 p.m. UTC | #1
On Mon, Aug 29, 2022 at 10:54:35PM +0800, liuyacan@corp.netease.com wrote:
> From: liuyacan <liuyacan@corp.netease.com>
> 
> After modifying the QP to the Error state, all RX WR would be
> completed with WC in IB_WC_WR_FLUSH_ERR status. Current
> implementation does not wait for it is done, but free the link
> directly. So there is a risk that accessing the freed link in
> tasklet context.
> 
> Here is a crash example:
> 
>  BUG: unable to handle page fault for address: ffffffff8f220860
>  #PF: supervisor write access in kernel mode
>  #PF: error_code(0x0002) - not-present page
>  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>  Oops: 0002 [#1] SMP PTI
>  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>  Call Trace:
>   <IRQ>
>   _raw_spin_lock_irqsave+0x30/0x40
>   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>   tasklet_action_common.isra.21+0x66/0x100
>   __do_softirq+0xd5/0x29c
>   asm_call_irq_on_stack+0x12/0x20
>   </IRQ>
>   do_softirq_own_stack+0x37/0x40
>   irq_exit_rcu+0x9d/0xa0
>   sysvec_call_function_single+0x34/0x80
>   asm_sysvec_call_function_single+0x12/0x20
> 
> Signed-off-by: liuyacan <liuyacan@corp.netease.com>
> ---
>  net/smc/smc_core.c |  2 ++
>  net/smc/smc_core.h |  2 ++
>  net/smc/smc_wr.c   | 12 ++++++++++++
>  net/smc/smc_wr.h   |  3 +++
>  4 files changed, 19 insertions(+)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index ff49a11f5..b632a33f1 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>  	atomic_inc(&lnk->smcibdev->lnk_cnt);
>  	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
>  	lnk->clearing = 0;
> +	lnk->rx_drained = 0;
>  	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
>  	lnk->link_id = smcr_next_link_id(lgr);
>  	lnk->lgr = lgr;
> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>  	smcr_buf_unmap_lgr(lnk);
>  	smcr_rtoken_clear_link(lnk);
>  	smc_ib_modify_qp_error(lnk);
> +	smc_wr_drain_cq(lnk);
>  	smc_wr_free_link(lnk);
>  	smc_ib_destroy_queue_pair(lnk);
>  	smc_ib_dealloc_protection_domain(lnk);
> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> index fe8b524ad..0a469a3e7 100644
> --- a/net/smc/smc_core.h
> +++ b/net/smc/smc_core.h
> @@ -117,6 +117,7 @@ struct smc_link {
>  	u64			wr_rx_id;	/* seq # of last recv WR */
>  	u32			wr_rx_cnt;	/* number of WR recv buffers */
>  	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> +	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
>  
>  	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>  	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> @@ -138,6 +139,7 @@ struct smc_link {
>  	u8			link_idx;	/* index in lgr link array */
>  	u8			link_is_asym;	/* is link asymmetric? */
>  	u8			clearing : 1;	/* link is being cleared */
> +	u8                      rx_drained : 1; /* link is drained */
>  	refcount_t		refcnt;		/* link reference count */
>  	struct smc_link_group	*lgr;		/* parent link group */
>  	struct work_struct	link_down_wrk;	/* wrk to bring link down */
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index 26f8f240d..f9992896a 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
> @@ -465,6 +465,10 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>  			case IB_WC_RNR_RETRY_EXC_ERR:
>  			case IB_WC_WR_FLUSH_ERR:
>  				smcr_link_down_cond_sched(link);
> +				if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
> +					link->rx_drained = 1;
> +					wake_up(&link->wr_rx_drain_wait);
> +				}

I am wondering if we should wait for all the wc comes back?

>  				break;
>  			default:
>  				smc_wr_rx_post(link); /* refill WR RX */
> @@ -631,6 +635,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>  	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>  }
>  
> +void smc_wr_drain_cq(struct smc_link *lnk)
> +{
> +	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
> +					 (lnk->drained == 1),
> +					 SMC_WR_RX_WAIT_DRAIN_TIME);
> +}

Should we wait for it with timeout? It should eventually be wake up
normally before freeing link. Waiting for SMC_WR_RX_WAIT_DRAIN_TIME (2s)
may also have this issue, although the probability of occurrence is
greatly reduced.

Cheers,
Tony Lu

> +
>  void smc_wr_free_link(struct smc_link *lnk)
>  {
>  	struct ib_device *ibdev;
> @@ -889,6 +900,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>  	atomic_set(&lnk->wr_tx_refcnt, 0);
>  	init_waitqueue_head(&lnk->wr_reg_wait);
>  	atomic_set(&lnk->wr_reg_refcnt, 0);
> +	init_waitqueue_head(&lnk->wr_rx_drain_wait);
>  	return rc;
>  
>  dma_unmap:
> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> index a54e90a11..2a7ebdba3 100644
> --- a/net/smc/smc_wr.h
> +++ b/net/smc/smc_wr.h
> @@ -27,6 +27,8 @@
>  
>  #define SMC_WR_TX_PEND_PRIV_SIZE 32
>  
> +#define SMC_WR_RX_WAIT_DRAIN_TIME       (2 * HZ)
> +
>  struct smc_wr_tx_pend_priv {
>  	u8			priv[SMC_WR_TX_PEND_PRIV_SIZE];
>  };
> @@ -101,6 +103,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>  int smc_wr_create_link(struct smc_link *lnk);
>  int smc_wr_alloc_link_mem(struct smc_link *lnk);
>  int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> +void smc_wr_drain_cq(struct smc_link *lnk);
>  void smc_wr_free_link(struct smc_link *lnk);
>  void smc_wr_free_link_mem(struct smc_link *lnk);
>  void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
> -- 
> 2.20.1
kernel test robot Aug. 30, 2022, 2:31 p.m. UTC | #2
Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net/master]

url:    https://github.com/intel-lab-lkp/linux/commits/liuyacan-corp-netease-com/net-smc-Fix-possible-access-to-freed-memory-in-link-clear/20220829-231821
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git cb10b0f91c5f76de981ef927e7dadec60c5a5d96
config: arm64-randconfig-r005-20220830 (https://download.01.org/0day-ci/archive/20220830/202208302233.4HlN35vT-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/f8be00c954c559c7ae24f34abade4faebc350ec9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review liuyacan-corp-netease-com/net-smc-Fix-possible-access-to-freed-memory-in-link-clear/20220829-231821
        git checkout f8be00c954c559c7ae24f34abade4faebc350ec9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arm64 SHELL=/bin/bash net/smc/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   net/smc/smc_wr.c: In function 'smc_wr_rx_process_cqes':
>> net/smc/smc_wr.c:468:60: error: invalid type argument of '->' (have 'struct ib_wc')
     468 |                                 if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
         |                                                            ^~
   In file included from net/smc/smc_wr.c:27:
   net/smc/smc_wr.c: In function 'smc_wr_drain_cq':
>> net/smc/smc_wr.c:641:48: error: 'struct smc_link' has no member named 'drained'; did you mean 'rx_drained'?
     641 |                                          (lnk->drained == 1),
         |                                                ^~~~~~~
   include/linux/wait.h:276:24: note: in definition of macro '___wait_cond_timeout'
     276 |         bool __cond = (condition);                                              \
         |                        ^~~~~~~~~
   net/smc/smc_wr.c:640:9: note: in expansion of macro 'wait_event_interruptible_timeout'
     640 |         wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> net/smc/smc_wr.c:641:48: error: 'struct smc_link' has no member named 'drained'; did you mean 'rx_drained'?
     641 |                                          (lnk->drained == 1),
         |                                                ^~~~~~~
   include/linux/wait.h:310:21: note: in definition of macro '___wait_event'
     310 |                 if (condition)                                                  \
         |                     ^~~~~~~~~
   include/linux/wait.h:506:32: note: in expansion of macro '___wait_cond_timeout'
     506 |         ___wait_event(wq_head, ___wait_cond_timeout(condition),                 \
         |                                ^~~~~~~~~~~~~~~~~~~~
   include/linux/wait.h:535:25: note: in expansion of macro '__wait_event_interruptible_timeout'
     535 |                 __ret = __wait_event_interruptible_timeout(wq_head,             \
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   net/smc/smc_wr.c:640:9: note: in expansion of macro 'wait_event_interruptible_timeout'
     640 |         wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +468 net/smc/smc_wr.c

   449	
   450	static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
   451	{
   452		struct smc_link *link;
   453		int i;
   454	
   455		for (i = 0; i < num; i++) {
   456			link = wc[i].qp->qp_context;
   457			if (wc[i].status == IB_WC_SUCCESS) {
   458				link->wr_rx_tstamp = jiffies;
   459				smc_wr_rx_demultiplex(&wc[i]);
   460				smc_wr_rx_post(link); /* refill WR RX */
   461			} else {
   462				/* handle status errors */
   463				switch (wc[i].status) {
   464				case IB_WC_RETRY_EXC_ERR:
   465				case IB_WC_RNR_RETRY_EXC_ERR:
   466				case IB_WC_WR_FLUSH_ERR:
   467					smcr_link_down_cond_sched(link);
 > 468					if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
   469						link->rx_drained = 1;
   470						wake_up(&link->wr_rx_drain_wait);
   471					}
   472					break;
   473				default:
   474					smc_wr_rx_post(link); /* refill WR RX */
   475					break;
   476				}
   477			}
   478		}
   479	}
   480	
   481	static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
   482	{
   483		struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
   484		struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
   485		int polled = 0;
   486		int rc;
   487	
   488	again:
   489		polled++;
   490		do {
   491			memset(&wc, 0, sizeof(wc));
   492			rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
   493			if (polled == 1) {
   494				ib_req_notify_cq(dev->roce_cq_recv,
   495						 IB_CQ_SOLICITED_MASK
   496						 | IB_CQ_REPORT_MISSED_EVENTS);
   497			}
   498			if (!rc)
   499				break;
   500			smc_wr_rx_process_cqes(&wc[0], rc);
   501		} while (rc > 0);
   502		if (polled == 1)
   503			goto again;
   504	}
   505	
   506	void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
   507	{
   508		struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
   509	
   510		tasklet_schedule(&dev->recv_tasklet);
   511	}
   512	
   513	int smc_wr_rx_post_init(struct smc_link *link)
   514	{
   515		u32 i;
   516		int rc = 0;
   517	
   518		for (i = 0; i < link->wr_rx_cnt; i++)
   519			rc = smc_wr_rx_post(link);
   520		return rc;
   521	}
   522	
   523	/***************************** init, exit, misc ******************************/
   524	
   525	void smc_wr_remember_qp_attr(struct smc_link *lnk)
   526	{
   527		struct ib_qp_attr *attr = &lnk->qp_attr;
   528		struct ib_qp_init_attr init_attr;
   529	
   530		memset(attr, 0, sizeof(*attr));
   531		memset(&init_attr, 0, sizeof(init_attr));
   532		ib_query_qp(lnk->roce_qp, attr,
   533			    IB_QP_STATE |
   534			    IB_QP_CUR_STATE |
   535			    IB_QP_PKEY_INDEX |
   536			    IB_QP_PORT |
   537			    IB_QP_QKEY |
   538			    IB_QP_AV |
   539			    IB_QP_PATH_MTU |
   540			    IB_QP_TIMEOUT |
   541			    IB_QP_RETRY_CNT |
   542			    IB_QP_RNR_RETRY |
   543			    IB_QP_RQ_PSN |
   544			    IB_QP_ALT_PATH |
   545			    IB_QP_MIN_RNR_TIMER |
   546			    IB_QP_SQ_PSN |
   547			    IB_QP_PATH_MIG_STATE |
   548			    IB_QP_CAP |
   549			    IB_QP_DEST_QPN,
   550			    &init_attr);
   551	
   552		lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
   553				       lnk->qp_attr.cap.max_send_wr);
   554		lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
   555				       lnk->qp_attr.cap.max_recv_wr);
   556	}
   557	
   558	static void smc_wr_init_sge(struct smc_link *lnk)
   559	{
   560		int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
   561		bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
   562		u32 i;
   563	
   564		for (i = 0; i < lnk->wr_tx_cnt; i++) {
   565			lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
   566				lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
   567			lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
   568			lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
   569			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
   570				lnk->roce_pd->local_dma_lkey;
   571			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
   572				lnk->roce_pd->local_dma_lkey;
   573			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
   574				lnk->roce_pd->local_dma_lkey;
   575			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
   576				lnk->roce_pd->local_dma_lkey;
   577			lnk->wr_tx_ibs[i].next = NULL;
   578			lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
   579			lnk->wr_tx_ibs[i].num_sge = 1;
   580			lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
   581			lnk->wr_tx_ibs[i].send_flags =
   582				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   583			if (send_inline)
   584				lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
   585			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
   586			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
   587			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
   588				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
   589			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
   590				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
   591		}
   592	
   593		if (lnk->lgr->smc_version == SMC_V2) {
   594			lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
   595			lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
   596			lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
   597	
   598			lnk->wr_tx_v2_ib->next = NULL;
   599			lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
   600			lnk->wr_tx_v2_ib->num_sge = 1;
   601			lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
   602			lnk->wr_tx_v2_ib->send_flags =
   603				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   604		}
   605	
   606		/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
   607		 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
   608		 * and the same buffer for all sges. When a larger message arrived then
   609		 * the content of the first small sge is copied to the beginning of
   610		 * the larger spillover buffer, allowing easy data mapping.
   611		 */
   612		for (i = 0; i < lnk->wr_rx_cnt; i++) {
   613			int x = i * sges_per_buf;
   614	
   615			lnk->wr_rx_sges[x].addr =
   616				lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
   617			lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
   618			lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
   619			if (lnk->lgr->smc_version == SMC_V2) {
   620				lnk->wr_rx_sges[x + 1].addr =
   621						lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
   622				lnk->wr_rx_sges[x + 1].length =
   623						SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
   624				lnk->wr_rx_sges[x + 1].lkey =
   625						lnk->roce_pd->local_dma_lkey;
   626			}
   627			lnk->wr_rx_ibs[i].next = NULL;
   628			lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
   629			lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
   630		}
   631		lnk->wr_reg.wr.next = NULL;
   632		lnk->wr_reg.wr.num_sge = 0;
   633		lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
   634		lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
   635		lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
   636	}
   637	
   638	void smc_wr_drain_cq(struct smc_link *lnk)
   639	{
   640		wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
 > 641						 (lnk->drained == 1),
   642						 SMC_WR_RX_WAIT_DRAIN_TIME);
   643	}
   644
kernel test robot Aug. 30, 2022, 4:10 p.m. UTC | #3
Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net/master]

url:    https://github.com/intel-lab-lkp/linux/commits/liuyacan-corp-netease-com/net-smc-Fix-possible-access-to-freed-memory-in-link-clear/20220829-231821
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git cb10b0f91c5f76de981ef927e7dadec60c5a5d96
config: i386-randconfig-a013 (https://download.01.org/0day-ci/archive/20220831/202208310048.0GtZuOsl-lkp@intel.com/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/f8be00c954c559c7ae24f34abade4faebc350ec9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review liuyacan-corp-netease-com/net-smc-Fix-possible-access-to-freed-memory-in-link-clear/20220829-231821
        git checkout f8be00c954c559c7ae24f34abade4faebc350ec9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash net/smc/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> net/smc/smc_wr.c:468:32: error: member reference type 'struct ib_wc' is not a pointer; did you mean to use '.'?
                                   if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
                                                         ~~~~~^~
                                                              .
>> net/smc/smc_wr.c:641:13: error: no member named 'drained' in 'struct smc_link'
                                            (lnk->drained == 1),
                                             ~~~  ^
   include/linux/wait.h:534:28: note: expanded from macro 'wait_event_interruptible_timeout'
           if (!___wait_cond_timeout(condition))                                   \
                                     ^~~~~~~~~
   include/linux/wait.h:276:17: note: expanded from macro '___wait_cond_timeout'
           bool __cond = (condition);                                              \
                          ^~~~~~~~~
>> net/smc/smc_wr.c:641:13: error: no member named 'drained' in 'struct smc_link'
                                            (lnk->drained == 1),
                                             ~~~  ^
   include/linux/wait.h:536:7: note: expanded from macro 'wait_event_interruptible_timeout'
                                                   condition, timeout);            \
                                                   ^~~~~~~~~
   include/linux/wait.h:506:46: note: expanded from macro '__wait_event_interruptible_timeout'
           ___wait_event(wq_head, ___wait_cond_timeout(condition),                 \
                                                       ^~~~~~~~~
   include/linux/wait.h:276:17: note: expanded from macro '___wait_cond_timeout'
           bool __cond = (condition);                                              \
                          ^~~~~~~~~
   include/linux/wait.h:310:7: note: expanded from macro '___wait_event'
                   if (condition)                                                  \
                       ^~~~~~~~~
   3 errors generated.


vim +468 net/smc/smc_wr.c

   449	
   450	static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
   451	{
   452		struct smc_link *link;
   453		int i;
   454	
   455		for (i = 0; i < num; i++) {
   456			link = wc[i].qp->qp_context;
   457			if (wc[i].status == IB_WC_SUCCESS) {
   458				link->wr_rx_tstamp = jiffies;
   459				smc_wr_rx_demultiplex(&wc[i]);
   460				smc_wr_rx_post(link); /* refill WR RX */
   461			} else {
   462				/* handle status errors */
   463				switch (wc[i].status) {
   464				case IB_WC_RETRY_EXC_ERR:
   465				case IB_WC_RNR_RETRY_EXC_ERR:
   466				case IB_WC_WR_FLUSH_ERR:
   467					smcr_link_down_cond_sched(link);
 > 468					if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
   469						link->rx_drained = 1;
   470						wake_up(&link->wr_rx_drain_wait);
   471					}
   472					break;
   473				default:
   474					smc_wr_rx_post(link); /* refill WR RX */
   475					break;
   476				}
   477			}
   478		}
   479	}
   480	
   481	static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
   482	{
   483		struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
   484		struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
   485		int polled = 0;
   486		int rc;
   487	
   488	again:
   489		polled++;
   490		do {
   491			memset(&wc, 0, sizeof(wc));
   492			rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
   493			if (polled == 1) {
   494				ib_req_notify_cq(dev->roce_cq_recv,
   495						 IB_CQ_SOLICITED_MASK
   496						 | IB_CQ_REPORT_MISSED_EVENTS);
   497			}
   498			if (!rc)
   499				break;
   500			smc_wr_rx_process_cqes(&wc[0], rc);
   501		} while (rc > 0);
   502		if (polled == 1)
   503			goto again;
   504	}
   505	
   506	void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
   507	{
   508		struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
   509	
   510		tasklet_schedule(&dev->recv_tasklet);
   511	}
   512	
   513	int smc_wr_rx_post_init(struct smc_link *link)
   514	{
   515		u32 i;
   516		int rc = 0;
   517	
   518		for (i = 0; i < link->wr_rx_cnt; i++)
   519			rc = smc_wr_rx_post(link);
   520		return rc;
   521	}
   522	
   523	/***************************** init, exit, misc ******************************/
   524	
   525	void smc_wr_remember_qp_attr(struct smc_link *lnk)
   526	{
   527		struct ib_qp_attr *attr = &lnk->qp_attr;
   528		struct ib_qp_init_attr init_attr;
   529	
   530		memset(attr, 0, sizeof(*attr));
   531		memset(&init_attr, 0, sizeof(init_attr));
   532		ib_query_qp(lnk->roce_qp, attr,
   533			    IB_QP_STATE |
   534			    IB_QP_CUR_STATE |
   535			    IB_QP_PKEY_INDEX |
   536			    IB_QP_PORT |
   537			    IB_QP_QKEY |
   538			    IB_QP_AV |
   539			    IB_QP_PATH_MTU |
   540			    IB_QP_TIMEOUT |
   541			    IB_QP_RETRY_CNT |
   542			    IB_QP_RNR_RETRY |
   543			    IB_QP_RQ_PSN |
   544			    IB_QP_ALT_PATH |
   545			    IB_QP_MIN_RNR_TIMER |
   546			    IB_QP_SQ_PSN |
   547			    IB_QP_PATH_MIG_STATE |
   548			    IB_QP_CAP |
   549			    IB_QP_DEST_QPN,
   550			    &init_attr);
   551	
   552		lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
   553				       lnk->qp_attr.cap.max_send_wr);
   554		lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
   555				       lnk->qp_attr.cap.max_recv_wr);
   556	}
   557	
   558	static void smc_wr_init_sge(struct smc_link *lnk)
   559	{
   560		int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
   561		bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
   562		u32 i;
   563	
   564		for (i = 0; i < lnk->wr_tx_cnt; i++) {
   565			lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
   566				lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
   567			lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
   568			lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
   569			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
   570				lnk->roce_pd->local_dma_lkey;
   571			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
   572				lnk->roce_pd->local_dma_lkey;
   573			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
   574				lnk->roce_pd->local_dma_lkey;
   575			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
   576				lnk->roce_pd->local_dma_lkey;
   577			lnk->wr_tx_ibs[i].next = NULL;
   578			lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
   579			lnk->wr_tx_ibs[i].num_sge = 1;
   580			lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
   581			lnk->wr_tx_ibs[i].send_flags =
   582				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   583			if (send_inline)
   584				lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
   585			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
   586			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
   587			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
   588				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
   589			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
   590				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
   591		}
   592	
   593		if (lnk->lgr->smc_version == SMC_V2) {
   594			lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
   595			lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
   596			lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
   597	
   598			lnk->wr_tx_v2_ib->next = NULL;
   599			lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
   600			lnk->wr_tx_v2_ib->num_sge = 1;
   601			lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
   602			lnk->wr_tx_v2_ib->send_flags =
   603				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   604		}
   605	
   606		/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
   607		 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
   608		 * and the same buffer for all sges. When a larger message arrived then
   609		 * the content of the first small sge is copied to the beginning of
   610		 * the larger spillover buffer, allowing easy data mapping.
   611		 */
   612		for (i = 0; i < lnk->wr_rx_cnt; i++) {
   613			int x = i * sges_per_buf;
   614	
   615			lnk->wr_rx_sges[x].addr =
   616				lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
   617			lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
   618			lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
   619			if (lnk->lgr->smc_version == SMC_V2) {
   620				lnk->wr_rx_sges[x + 1].addr =
   621						lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
   622				lnk->wr_rx_sges[x + 1].length =
   623						SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
   624				lnk->wr_rx_sges[x + 1].lkey =
   625						lnk->roce_pd->local_dma_lkey;
   626			}
   627			lnk->wr_rx_ibs[i].next = NULL;
   628			lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
   629			lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
   630		}
   631		lnk->wr_reg.wr.next = NULL;
   632		lnk->wr_reg.wr.num_sge = 0;
   633		lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
   634		lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
   635		lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
   636	}
   637	
   638	void smc_wr_drain_cq(struct smc_link *lnk)
   639	{
   640		wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
 > 641						 (lnk->drained == 1),
   642						 SMC_WR_RX_WAIT_DRAIN_TIME);
   643	}
   644
diff mbox series

Patch

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ff49a11f5..b632a33f1 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -752,6 +752,7 @@  int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	atomic_inc(&lnk->smcibdev->lnk_cnt);
 	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
 	lnk->clearing = 0;
+	lnk->rx_drained = 0;
 	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
 	lnk->link_id = smcr_next_link_id(lgr);
 	lnk->lgr = lgr;
@@ -1269,6 +1270,7 @@  void smcr_link_clear(struct smc_link *lnk, bool log)
 	smcr_buf_unmap_lgr(lnk);
 	smcr_rtoken_clear_link(lnk);
 	smc_ib_modify_qp_error(lnk);
+	smc_wr_drain_cq(lnk);
 	smc_wr_free_link(lnk);
 	smc_ib_destroy_queue_pair(lnk);
 	smc_ib_dealloc_protection_domain(lnk);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index fe8b524ad..0a469a3e7 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -117,6 +117,7 @@  struct smc_link {
 	u64			wr_rx_id;	/* seq # of last recv WR */
 	u32			wr_rx_cnt;	/* number of WR recv buffers */
 	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
+	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
 
 	struct ib_reg_wr	wr_reg;		/* WR register memory region */
 	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
@@ -138,6 +139,7 @@  struct smc_link {
 	u8			link_idx;	/* index in lgr link array */
 	u8			link_is_asym;	/* is link asymmetric? */
 	u8			clearing : 1;	/* link is being cleared */
+	u8                      rx_drained : 1; /* link is drained */
 	refcount_t		refcnt;		/* link reference count */
 	struct smc_link_group	*lgr;		/* parent link group */
 	struct work_struct	link_down_wrk;	/* wrk to bring link down */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 26f8f240d..f9992896a 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -465,6 +465,10 @@  static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 			case IB_WC_RNR_RETRY_EXC_ERR:
 			case IB_WC_WR_FLUSH_ERR:
 				smcr_link_down_cond_sched(link);
+				if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
+					link->rx_drained = 1;
+					wake_up(&link->wr_rx_drain_wait);
+				}
 				break;
 			default:
 				smc_wr_rx_post(link); /* refill WR RX */
@@ -631,6 +635,13 @@  static void smc_wr_init_sge(struct smc_link *lnk)
 	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
 }
 
+void smc_wr_drain_cq(struct smc_link *lnk)
+{
+	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
+					 (lnk->drained == 1),
+					 SMC_WR_RX_WAIT_DRAIN_TIME);
+}
+
 void smc_wr_free_link(struct smc_link *lnk)
 {
 	struct ib_device *ibdev;
@@ -889,6 +900,7 @@  int smc_wr_create_link(struct smc_link *lnk)
 	atomic_set(&lnk->wr_tx_refcnt, 0);
 	init_waitqueue_head(&lnk->wr_reg_wait);
 	atomic_set(&lnk->wr_reg_refcnt, 0);
+	init_waitqueue_head(&lnk->wr_rx_drain_wait);
 	return rc;
 
 dma_unmap:
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index a54e90a11..2a7ebdba3 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -27,6 +27,8 @@ 
 
 #define SMC_WR_TX_PEND_PRIV_SIZE 32
 
+#define SMC_WR_RX_WAIT_DRAIN_TIME       (2 * HZ)
+
 struct smc_wr_tx_pend_priv {
 	u8			priv[SMC_WR_TX_PEND_PRIV_SIZE];
 };
@@ -101,6 +103,7 @@  static inline int smc_wr_rx_post(struct smc_link *link)
 int smc_wr_create_link(struct smc_link *lnk);
 int smc_wr_alloc_link_mem(struct smc_link *lnk);
 int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
+void smc_wr_drain_cq(struct smc_link *lnk);
 void smc_wr_free_link(struct smc_link *lnk);
 void smc_wr_free_link_mem(struct smc_link *lnk);
 void smc_wr_free_lgr_mem(struct smc_link_group *lgr);