diff mbox series

[net,v4] net/smc: Fix possible access to freed memory in link clear

Message ID 20220831155303.1758868-1-liuyacan@corp.netease.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net,v4] net/smc: Fix possible access to freed memory in link clear | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net
netdev/fixes_present success Fixes tag present in non-next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 64 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

liuyacan@corp.netease.com Aug. 31, 2022, 3:53 p.m. UTC
From: Yacan Liu <liuyacan@corp.netease.com>

After modifying the QP to the Error state, all RX WR would be completed
with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
wait for it is done, but destroy the QP and free the link group directly.
So there is a risk that accessing the freed memory in tasklet context.

Here is a crash example:

 BUG: unable to handle page fault for address: ffffffff8f220860
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0002) - not-present page
 PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
 Oops: 0002 [#1] SMP PTI
 CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
 Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
 RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
 Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
 RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
 RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
 RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
 RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
 R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
 R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
 FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  <IRQ>
  _raw_spin_lock_irqsave+0x30/0x40
  mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
  smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
  tasklet_action_common.isra.21+0x66/0x100
  __do_softirq+0xd5/0x29c
  asm_call_irq_on_stack+0x12/0x20
  </IRQ>
  do_softirq_own_stack+0x37/0x40
  irq_exit_rcu+0x9d/0xa0
  sysvec_call_function_single+0x34/0x80
  asm_sysvec_call_function_single+0x12/0x20

Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>

---
Chagen in v4:
  -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
  -- Remove timeout.
Change in v3:
  -- Tune commit message (Signed-Off tag, Fixes tag).
     Tune code to avoid column length exceeding.
Change in v2:
  -- Fix some compile warnings and errors.
---
 net/smc/smc_core.c | 2 ++
 net/smc/smc_core.h | 2 ++
 net/smc/smc_wr.c   | 9 +++++++++
 net/smc/smc_wr.h   | 1 +
 4 files changed, 14 insertions(+)

Comments

Tony Lu Sept. 1, 2022, 6:51 a.m. UTC | #1
On Wed, Aug 31, 2022 at 11:53:03PM +0800, liuyacan@corp.netease.com wrote:
> From: Yacan Liu <liuyacan@corp.netease.com>
> 
> After modifying the QP to the Error state, all RX WR would be completed
> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> wait for it is done, but destroy the QP and free the link group directly.
> So there is a risk that accessing the freed memory in tasklet context.
> 
> Here is a crash example:
> 
>  BUG: unable to handle page fault for address: ffffffff8f220860
>  #PF: supervisor write access in kernel mode
>  #PF: error_code(0x0002) - not-present page
>  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>  Oops: 0002 [#1] SMP PTI
>  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>  Call Trace:
>   <IRQ>
>   _raw_spin_lock_irqsave+0x30/0x40
>   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>   tasklet_action_common.isra.21+0x66/0x100
>   __do_softirq+0xd5/0x29c
>   asm_call_irq_on_stack+0x12/0x20
>   </IRQ>
>   do_softirq_own_stack+0x37/0x40
>   irq_exit_rcu+0x9d/0xa0
>   sysvec_call_function_single+0x34/0x80
>   asm_sysvec_call_function_single+0x12/0x20
> 
> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> 

Thanks for this fixes. I will test it in our environment.

Cheers,
Tony Lu
Wenjia Zhang Sept. 1, 2022, 9:33 a.m. UTC | #2
On 31.08.22 17:53, liuyacan@corp.netease.com wrote:
> From: Yacan Liu <liuyacan@corp.netease.com>
> 
> After modifying the QP to the Error state, all RX WR would be completed
> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> wait for it is done, but destroy the QP and free the link group directly.
> So there is a risk that accessing the freed memory in tasklet context.
> 
> Here is a crash example:
> 
>   BUG: unable to handle page fault for address: ffffffff8f220860
>   #PF: supervisor write access in kernel mode
>   #PF: error_code(0x0002) - not-present page
>   PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>   Oops: 0002 [#1] SMP PTI
>   CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>   Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>   RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>   Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>   RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>   RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>   RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>   RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>   R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>   R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>   FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>   CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>   DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>   DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>   Call Trace:
>    <IRQ>
>    _raw_spin_lock_irqsave+0x30/0x40
>    mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>    smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>    tasklet_action_common.isra.21+0x66/0x100
>    __do_softirq+0xd5/0x29c
>    asm_call_irq_on_stack+0x12/0x20
>    </IRQ>
>    do_softirq_own_stack+0x37/0x40
>    irq_exit_rcu+0x9d/0xa0
>    sysvec_call_function_single+0x34/0x80
>    asm_sysvec_call_function_single+0x12/0x20
> 
> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> 
> ---
> Chagen in v4:
>    -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>    -- Remove timeout.
> Change in v3:
>    -- Tune commit message (Signed-Off tag, Fixes tag).
>       Tune code to avoid column length exceeding.
> Change in v2:
>    -- Fix some compile warnings and errors.
> ---
>   net/smc/smc_core.c | 2 ++
>   net/smc/smc_core.h | 2 ++
>   net/smc/smc_wr.c   | 9 +++++++++
>   net/smc/smc_wr.h   | 1 +
>   4 files changed, 14 insertions(+)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index ff49a11f5..f92a916e9 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>   	lnk->lgr = lgr;
>   	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>   	lnk->link_idx = link_idx;
> +	lnk->wr_rx_id_compl = 0;
>   	smc_ibdev_cnt_inc(lnk);
>   	smcr_copy_dev_info_to_link(lnk);
>   	atomic_set(&lnk->conn_cnt, 0);
> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>   	smcr_buf_unmap_lgr(lnk);
>   	smcr_rtoken_clear_link(lnk);
>   	smc_ib_modify_qp_error(lnk);
> +	smc_wr_drain_cq(lnk);
>   	smc_wr_free_link(lnk);
>   	smc_ib_destroy_queue_pair(lnk);
>   	smc_ib_dealloc_protection_domain(lnk);
> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> index fe8b524ad..285f9bd8e 100644
> --- a/net/smc/smc_core.h
> +++ b/net/smc/smc_core.h
> @@ -115,8 +115,10 @@ struct smc_link {
>   	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
>   	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
>   	u64			wr_rx_id;	/* seq # of last recv WR */
> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
>   	u32			wr_rx_cnt;	/* number of WR recv buffers */
>   	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
>   
>   	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>   	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index 26f8f240d..bc8793803 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>   
>   	for (i = 0; i < num; i++) {
>   		link = wc[i].qp->qp_context;
> +		link->wr_rx_id_compl = wc[i].wr_id;
>   		if (wc[i].status == IB_WC_SUCCESS) {
>   			link->wr_rx_tstamp = jiffies;
>   			smc_wr_rx_demultiplex(&wc[i]);
> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>   			case IB_WC_RNR_RETRY_EXC_ERR:
>   			case IB_WC_WR_FLUSH_ERR:
>   				smcr_link_down_cond_sched(link);
> +				if (link->wr_rx_id_compl == link->wr_rx_id)
> +					wake_up(&link->wr_rx_empty_wait);
>   				break;
>   			default:
>   				smc_wr_rx_post(link); /* refill WR RX */
> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>   	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>   }
>   
> +void smc_wr_drain_cq(struct smc_link *lnk)
> +{
> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
> +}
> +
>   void smc_wr_free_link(struct smc_link *lnk)
>   {
>   	struct ib_device *ibdev;
> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>   	atomic_set(&lnk->wr_tx_refcnt, 0);
>   	init_waitqueue_head(&lnk->wr_reg_wait);
>   	atomic_set(&lnk->wr_reg_refcnt, 0);
> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
>   	return rc;
>   
>   dma_unmap:
> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> index a54e90a11..5ca5086ae 100644
> --- a/net/smc/smc_wr.h
> +++ b/net/smc/smc_wr.h
> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>   int smc_wr_create_link(struct smc_link *lnk);
>   int smc_wr_alloc_link_mem(struct smc_link *lnk);
>   int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> +void smc_wr_drain_cq(struct smc_link *lnk);
>   void smc_wr_free_link(struct smc_link *lnk);
>   void smc_wr_free_link_mem(struct smc_link *lnk);
>   void smc_wr_free_lgr_mem(struct smc_link_group *lgr);

Thank you @Yacan for the effort to improve our code! And Thank you @Tony 
for such valuable suggestions and testing!
I like the modification of this version. However, this is not a fix 
patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize 
the parallelism of SMC-R connections" are still not applied. My 
sugguestions:
- Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of 
those patches I mentioned above, and ask if he can take your patch as a 
part of the patch serie
- Fix patches should go to net-next
- Please send always send your new version separately, rather than as 
reply to your previous version. That makes people confused.
liuyacan@corp.netease.com Sept. 1, 2022, 12:26 p.m. UTC | #3
> > From: Yacan Liu <liuyacan@corp.netease.com>
> > 
> > After modifying the QP to the Error state, all RX WR would be completed
> > with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> > wait for it is done, but destroy the QP and free the link group directly.
> > So there is a risk that accessing the freed memory in tasklet context.
> > 
> > Here is a crash example:
> > 
> >   BUG: unable to handle page fault for address: ffffffff8f220860
> >   #PF: supervisor write access in kernel mode
> >   #PF: error_code(0x0002) - not-present page
> >   PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >   Oops: 0002 [#1] SMP PTI
> >   CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >   Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >   RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >   Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >   RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >   RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >   RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >   RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >   R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >   R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >   FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >   CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >   DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >   DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >   Call Trace:
> >    <IRQ>
> >    _raw_spin_lock_irqsave+0x30/0x40
> >    mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >    smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >    tasklet_action_common.isra.21+0x66/0x100
> >    __do_softirq+0xd5/0x29c
> >    asm_call_irq_on_stack+0x12/0x20
> >    </IRQ>
> >    do_softirq_own_stack+0x37/0x40
> >    irq_exit_rcu+0x9d/0xa0
> >    sysvec_call_function_single+0x34/0x80
> >    asm_sysvec_call_function_single+0x12/0x20
> > 
> > Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> > Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> > 
> > ---
> > Chagen in v4:
> >    -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
> >    -- Remove timeout.
> > Change in v3:
> >    -- Tune commit message (Signed-Off tag, Fixes tag).
> >       Tune code to avoid column length exceeding.
> > Change in v2:
> >    -- Fix some compile warnings and errors.
> > ---
> >   net/smc/smc_core.c | 2 ++
> >   net/smc/smc_core.h | 2 ++
> >   net/smc/smc_wr.c   | 9 +++++++++
> >   net/smc/smc_wr.h   | 1 +
> >   4 files changed, 14 insertions(+)
> > 
> > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > index ff49a11f5..f92a916e9 100644
> > --- a/net/smc/smc_core.c
> > +++ b/net/smc/smc_core.c
> > @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >   	lnk->lgr = lgr;
> >   	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
> >   	lnk->link_idx = link_idx;
> > +	lnk->wr_rx_id_compl = 0;
> >   	smc_ibdev_cnt_inc(lnk);
> >   	smcr_copy_dev_info_to_link(lnk);
> >   	atomic_set(&lnk->conn_cnt, 0);
> > @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >   	smcr_buf_unmap_lgr(lnk);
> >   	smcr_rtoken_clear_link(lnk);
> >   	smc_ib_modify_qp_error(lnk);
> > +	smc_wr_drain_cq(lnk);
> >   	smc_wr_free_link(lnk);
> >   	smc_ib_destroy_queue_pair(lnk);
> >   	smc_ib_dealloc_protection_domain(lnk);
> > diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> > index fe8b524ad..285f9bd8e 100644
> > --- a/net/smc/smc_core.h
> > +++ b/net/smc/smc_core.h
> > @@ -115,8 +115,10 @@ struct smc_link {
> >   	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
> >   	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
> >   	u64			wr_rx_id;	/* seq # of last recv WR */
> > +	u64			wr_rx_id_compl; /* seq # of last completed WR */
> >   	u32			wr_rx_cnt;	/* number of WR recv buffers */
> >   	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> > +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
> >   
> >   	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> >   	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> > diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> > index 26f8f240d..bc8793803 100644
> > --- a/net/smc/smc_wr.c
> > +++ b/net/smc/smc_wr.c
> > @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >   
> >   	for (i = 0; i < num; i++) {
> >   		link = wc[i].qp->qp_context;
> > +		link->wr_rx_id_compl = wc[i].wr_id;
> >   		if (wc[i].status == IB_WC_SUCCESS) {
> >   			link->wr_rx_tstamp = jiffies;
> >   			smc_wr_rx_demultiplex(&wc[i]);
> > @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >   			case IB_WC_RNR_RETRY_EXC_ERR:
> >   			case IB_WC_WR_FLUSH_ERR:
> >   				smcr_link_down_cond_sched(link);
> > +				if (link->wr_rx_id_compl == link->wr_rx_id)
> > +					wake_up(&link->wr_rx_empty_wait);
> >   				break;
> >   			default:
> >   				smc_wr_rx_post(link); /* refill WR RX */
> > @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> >   	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> >   }
> >   
> > +void smc_wr_drain_cq(struct smc_link *lnk)
> > +{
> > +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
> > +}
> > +
> >   void smc_wr_free_link(struct smc_link *lnk)
> >   {
> >   	struct ib_device *ibdev;
> > @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
> >   	atomic_set(&lnk->wr_tx_refcnt, 0);
> >   	init_waitqueue_head(&lnk->wr_reg_wait);
> >   	atomic_set(&lnk->wr_reg_refcnt, 0);
> > +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
> >   	return rc;
> >   
> >   dma_unmap:
> > diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> > index a54e90a11..5ca5086ae 100644
> > --- a/net/smc/smc_wr.h
> > +++ b/net/smc/smc_wr.h
> > @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
> >   int smc_wr_create_link(struct smc_link *lnk);
> >   int smc_wr_alloc_link_mem(struct smc_link *lnk);
> >   int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> > +void smc_wr_drain_cq(struct smc_link *lnk);
> >   void smc_wr_free_link(struct smc_link *lnk);
> >   void smc_wr_free_link_mem(struct smc_link *lnk);
> >   void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
> 
> Thank you @Yacan for the effort to improve our code! And Thank you @Tony 
> for such valuable suggestions and testing!
> I like the modification of this version. However, this is not a fix 
> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize 
> the parallelism of SMC-R connections" are still not applied. My 
> sugguestions:
> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of 
> those patches I mentioned above, and ask if he can take your patch as a 
> part of the patch serie
> - Fix patches should go to net-next
> - Please send always send your new version separately, rather than as 
> reply to your previous version. That makes people confused.

@Wenjia, Thanks a lot for your suggestions and guidance ! 

@D. Wythe, Can you include this patch in your series of patches if it is 
convenient?

Regards,
Yacan
Wenjia Zhang Sept. 1, 2022, 12:45 p.m. UTC | #4
On 01.09.22 14:26, liuyacan@corp.netease.com wrote:
>>> From: Yacan Liu <liuyacan@corp.netease.com>
>>>
>>> After modifying the QP to the Error state, all RX WR would be completed
>>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
>>> wait for it is done, but destroy the QP and free the link group directly.
>>> So there is a risk that accessing the freed memory in tasklet context.
>>>
>>> Here is a crash example:
>>>
>>>    BUG: unable to handle page fault for address: ffffffff8f220860
>>>    #PF: supervisor write access in kernel mode
>>>    #PF: error_code(0x0002) - not-present page
>>>    PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>>>    Oops: 0002 [#1] SMP PTI
>>>    CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>>>    Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>>>    RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>>>    Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>>>    RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>>>    RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>>>    RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>>>    RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>>>    R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>>>    R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>>>    FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>>>    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>    CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>>>    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>    Call Trace:
>>>     <IRQ>
>>>     _raw_spin_lock_irqsave+0x30/0x40
>>>     mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>>>     smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>>>     tasklet_action_common.isra.21+0x66/0x100
>>>     __do_softirq+0xd5/0x29c
>>>     asm_call_irq_on_stack+0x12/0x20
>>>     </IRQ>
>>>     do_softirq_own_stack+0x37/0x40
>>>     irq_exit_rcu+0x9d/0xa0
>>>     sysvec_call_function_single+0x34/0x80
>>>     asm_sysvec_call_function_single+0x12/0x20
>>>
>>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
>>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
>>>
>>> ---
>>> Chagen in v4:
>>>     -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>>>     -- Remove timeout.
>>> Change in v3:
>>>     -- Tune commit message (Signed-Off tag, Fixes tag).
>>>        Tune code to avoid column length exceeding.
>>> Change in v2:
>>>     -- Fix some compile warnings and errors.
>>> ---
>>>    net/smc/smc_core.c | 2 ++
>>>    net/smc/smc_core.h | 2 ++
>>>    net/smc/smc_wr.c   | 9 +++++++++
>>>    net/smc/smc_wr.h   | 1 +
>>>    4 files changed, 14 insertions(+)
>>>
>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>> index ff49a11f5..f92a916e9 100644
>>> --- a/net/smc/smc_core.c
>>> +++ b/net/smc/smc_core.c
>>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>>>    	lnk->lgr = lgr;
>>>    	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>>>    	lnk->link_idx = link_idx;
>>> +	lnk->wr_rx_id_compl = 0;
>>>    	smc_ibdev_cnt_inc(lnk);
>>>    	smcr_copy_dev_info_to_link(lnk);
>>>    	atomic_set(&lnk->conn_cnt, 0);
>>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>>>    	smcr_buf_unmap_lgr(lnk);
>>>    	smcr_rtoken_clear_link(lnk);
>>>    	smc_ib_modify_qp_error(lnk);
>>> +	smc_wr_drain_cq(lnk);
>>>    	smc_wr_free_link(lnk);
>>>    	smc_ib_destroy_queue_pair(lnk);
>>>    	smc_ib_dealloc_protection_domain(lnk);
>>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
>>> index fe8b524ad..285f9bd8e 100644
>>> --- a/net/smc/smc_core.h
>>> +++ b/net/smc/smc_core.h
>>> @@ -115,8 +115,10 @@ struct smc_link {
>>>    	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
>>>    	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
>>>    	u64			wr_rx_id;	/* seq # of last recv WR */
>>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
>>>    	u32			wr_rx_cnt;	/* number of WR recv buffers */
>>>    	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
>>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
>>>    
>>>    	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>>>    	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
>>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
>>> index 26f8f240d..bc8793803 100644
>>> --- a/net/smc/smc_wr.c
>>> +++ b/net/smc/smc_wr.c
>>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>    
>>>    	for (i = 0; i < num; i++) {
>>>    		link = wc[i].qp->qp_context;
>>> +		link->wr_rx_id_compl = wc[i].wr_id;
>>>    		if (wc[i].status == IB_WC_SUCCESS) {
>>>    			link->wr_rx_tstamp = jiffies;
>>>    			smc_wr_rx_demultiplex(&wc[i]);
>>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>    			case IB_WC_RNR_RETRY_EXC_ERR:
>>>    			case IB_WC_WR_FLUSH_ERR:
>>>    				smcr_link_down_cond_sched(link);
>>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
>>> +					wake_up(&link->wr_rx_empty_wait);
>>>    				break;
>>>    			default:
>>>    				smc_wr_rx_post(link); /* refill WR RX */
>>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>>>    	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>>>    }
>>>    
>>> +void smc_wr_drain_cq(struct smc_link *lnk)
>>> +{
>>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
>>> +}
>>> +
>>>    void smc_wr_free_link(struct smc_link *lnk)
>>>    {
>>>    	struct ib_device *ibdev;
>>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>>>    	atomic_set(&lnk->wr_tx_refcnt, 0);
>>>    	init_waitqueue_head(&lnk->wr_reg_wait);
>>>    	atomic_set(&lnk->wr_reg_refcnt, 0);
>>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
>>>    	return rc;
>>>    
>>>    dma_unmap:
>>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
>>> index a54e90a11..5ca5086ae 100644
>>> --- a/net/smc/smc_wr.h
>>> +++ b/net/smc/smc_wr.h
>>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>>>    int smc_wr_create_link(struct smc_link *lnk);
>>>    int smc_wr_alloc_link_mem(struct smc_link *lnk);
>>>    int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
>>> +void smc_wr_drain_cq(struct smc_link *lnk);
>>>    void smc_wr_free_link(struct smc_link *lnk);
>>>    void smc_wr_free_link_mem(struct smc_link *lnk);
>>>    void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
>>
>> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
>> for such valuable suggestions and testing!
>> I like the modification of this version. However, this is not a fix
>> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
>> the parallelism of SMC-R connections" are still not applied. My
>> sugguestions:
>> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
>> those patches I mentioned above, and ask if he can take your patch as a
>> part of the patch serie
>> - Fix patches should go to net-next
>> - Please send always send your new version separately, rather than as
>> reply to your previous version. That makes people confused.
> 
> @Wenjia, Thanks a lot for your suggestions and guidance !
> 
> @D. Wythe, Can you include this patch in your series of patches if it is
> convenient?
> 
> Regards,
> Yacan
> 
One point I was confused, fixes should goto net, sorry!
liuyacan@corp.netease.com Sept. 1, 2022, 12:54 p.m. UTC | #5
> >>> From: Yacan Liu <liuyacan@corp.netease.com>
> >>>
> >>> After modifying the QP to the Error state, all RX WR would be completed
> >>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> >>> wait for it is done, but destroy the QP and free the link group directly.
> >>> So there is a risk that accessing the freed memory in tasklet context.
> >>>
> >>> Here is a crash example:
> >>>
> >>>    BUG: unable to handle page fault for address: ffffffff8f220860
> >>>    #PF: supervisor write access in kernel mode
> >>>    #PF: error_code(0x0002) - not-present page
> >>>    PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >>>    Oops: 0002 [#1] SMP PTI
> >>>    CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >>>    Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >>>    RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >>>    Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >>>    RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >>>    RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >>>    RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >>>    RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >>>    R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >>>    R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >>>    FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >>>    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >>>    CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >>>    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >>>    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >>>    Call Trace:
> >>>     <IRQ>
> >>>     _raw_spin_lock_irqsave+0x30/0x40
> >>>     mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >>>     smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >>>     tasklet_action_common.isra.21+0x66/0x100
> >>>     __do_softirq+0xd5/0x29c
> >>>     asm_call_irq_on_stack+0x12/0x20
> >>>     </IRQ>
> >>>     do_softirq_own_stack+0x37/0x40
> >>>     irq_exit_rcu+0x9d/0xa0
> >>>     sysvec_call_function_single+0x34/0x80
> >>>     asm_sysvec_call_function_single+0x12/0x20
> >>>
> >>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> >>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> >>>
> >>> ---
> >>> Chagen in v4:
> >>>     -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
> >>>     -- Remove timeout.
> >>> Change in v3:
> >>>     -- Tune commit message (Signed-Off tag, Fixes tag).
> >>>        Tune code to avoid column length exceeding.
> >>> Change in v2:
> >>>     -- Fix some compile warnings and errors.
> >>> ---
> >>>    net/smc/smc_core.c | 2 ++
> >>>    net/smc/smc_core.h | 2 ++
> >>>    net/smc/smc_wr.c   | 9 +++++++++
> >>>    net/smc/smc_wr.h   | 1 +
> >>>    4 files changed, 14 insertions(+)
> >>>
> >>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> >>> index ff49a11f5..f92a916e9 100644
> >>> --- a/net/smc/smc_core.c
> >>> +++ b/net/smc/smc_core.c
> >>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >>>    	lnk->lgr = lgr;
> >>>    	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
> >>>    	lnk->link_idx = link_idx;
> >>> +	lnk->wr_rx_id_compl = 0;
> >>>    	smc_ibdev_cnt_inc(lnk);
> >>>    	smcr_copy_dev_info_to_link(lnk);
> >>>    	atomic_set(&lnk->conn_cnt, 0);
> >>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >>>    	smcr_buf_unmap_lgr(lnk);
> >>>    	smcr_rtoken_clear_link(lnk);
> >>>    	smc_ib_modify_qp_error(lnk);
> >>> +	smc_wr_drain_cq(lnk);
> >>>    	smc_wr_free_link(lnk);
> >>>    	smc_ib_destroy_queue_pair(lnk);
> >>>    	smc_ib_dealloc_protection_domain(lnk);
> >>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> >>> index fe8b524ad..285f9bd8e 100644
> >>> --- a/net/smc/smc_core.h
> >>> +++ b/net/smc/smc_core.h
> >>> @@ -115,8 +115,10 @@ struct smc_link {
> >>>    	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
> >>>    	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
> >>>    	u64			wr_rx_id;	/* seq # of last recv WR */
> >>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
> >>>    	u32			wr_rx_cnt;	/* number of WR recv buffers */
> >>>    	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> >>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
> >>>    
> >>>    	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> >>>    	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> >>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> >>> index 26f8f240d..bc8793803 100644
> >>> --- a/net/smc/smc_wr.c
> >>> +++ b/net/smc/smc_wr.c
> >>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >>>    
> >>>    	for (i = 0; i < num; i++) {
> >>>    		link = wc[i].qp->qp_context;
> >>> +		link->wr_rx_id_compl = wc[i].wr_id;
> >>>    		if (wc[i].status == IB_WC_SUCCESS) {
> >>>    			link->wr_rx_tstamp = jiffies;
> >>>    			smc_wr_rx_demultiplex(&wc[i]);
> >>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >>>    			case IB_WC_RNR_RETRY_EXC_ERR:
> >>>    			case IB_WC_WR_FLUSH_ERR:
> >>>    				smcr_link_down_cond_sched(link);
> >>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
> >>> +					wake_up(&link->wr_rx_empty_wait);
> >>>    				break;
> >>>    			default:
> >>>    				smc_wr_rx_post(link); /* refill WR RX */
> >>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> >>>    	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> >>>    }
> >>>    
> >>> +void smc_wr_drain_cq(struct smc_link *lnk)
> >>> +{
> >>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
> >>> +}
> >>> +
> >>>    void smc_wr_free_link(struct smc_link *lnk)
> >>>    {
> >>>    	struct ib_device *ibdev;
> >>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
> >>>    	atomic_set(&lnk->wr_tx_refcnt, 0);
> >>>    	init_waitqueue_head(&lnk->wr_reg_wait);
> >>>    	atomic_set(&lnk->wr_reg_refcnt, 0);
> >>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
> >>>    	return rc;
> >>>    
> >>>    dma_unmap:
> >>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> >>> index a54e90a11..5ca5086ae 100644
> >>> --- a/net/smc/smc_wr.h
> >>> +++ b/net/smc/smc_wr.h
> >>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
> >>>    int smc_wr_create_link(struct smc_link *lnk);
> >>>    int smc_wr_alloc_link_mem(struct smc_link *lnk);
> >>>    int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> >>> +void smc_wr_drain_cq(struct smc_link *lnk);
> >>>    void smc_wr_free_link(struct smc_link *lnk);
> >>>    void smc_wr_free_link_mem(struct smc_link *lnk);
> >>>    void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
> >>
> >> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
> >> for such valuable suggestions and testing!
> >> I like the modification of this version. However, this is not a fix
> >> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
> >> the parallelism of SMC-R connections" are still not applied. My
> >> sugguestions:
> >> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
> >> those patches I mentioned above, and ask if he can take your patch as a
> >> part of the patch serie
> >> - Fix patches should go to net-next
> >> - Please send always send your new version separately, rather than as
> >> reply to your previous version. That makes people confused.
> > 
> > @Wenjia, Thanks a lot for your suggestions and guidance !
> > 
> > @D. Wythe, Can you include this patch in your series of patches if it is
> > convenient?
> > 
> > Regards,
> > Yacan
> > 
> One point I was confused, fixes should goto net, sorry!

Well, @D. Wythe, please ignore the above emails, sorry!

Regards,
Yacan
Wenjia Zhang Sept. 1, 2022, 7:16 p.m. UTC | #6
On 01.09.22 14:54, liuyacan@corp.netease.com wrote:
>>>>> From: Yacan Liu <liuyacan@corp.netease.com>
>>>>>
>>>>> After modifying the QP to the Error state, all RX WR would be completed
>>>>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
>>>>> wait for it is done, but destroy the QP and free the link group directly.
>>>>> So there is a risk that accessing the freed memory in tasklet context.
>>>>>
>>>>> Here is a crash example:
>>>>>
>>>>>     BUG: unable to handle page fault for address: ffffffff8f220860
>>>>>     #PF: supervisor write access in kernel mode
>>>>>     #PF: error_code(0x0002) - not-present page
>>>>>     PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>>>>>     Oops: 0002 [#1] SMP PTI
>>>>>     CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>>>>>     Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>>>>>     RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>>>>>     Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>>>>>     RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>>>>>     RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>>>>>     RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>>>>>     RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>>>>>     R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>>>>>     R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>>>>>     FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>>>>>     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>>>     CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>>>>>     DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>>>     DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>>>     Call Trace:
>>>>>      <IRQ>
>>>>>      _raw_spin_lock_irqsave+0x30/0x40
>>>>>      mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>>>>>      smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>>>>>      tasklet_action_common.isra.21+0x66/0x100
>>>>>      __do_softirq+0xd5/0x29c
>>>>>      asm_call_irq_on_stack+0x12/0x20
>>>>>      </IRQ>
>>>>>      do_softirq_own_stack+0x37/0x40
>>>>>      irq_exit_rcu+0x9d/0xa0
>>>>>      sysvec_call_function_single+0x34/0x80
>>>>>      asm_sysvec_call_function_single+0x12/0x20
>>>>>
>>>>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
>>>>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
>>>>>
>>>>> ---
>>>>> Chagen in v4:
>>>>>      -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>>>>>      -- Remove timeout.
>>>>> Change in v3:
>>>>>      -- Tune commit message (Signed-Off tag, Fixes tag).
>>>>>         Tune code to avoid column length exceeding.
>>>>> Change in v2:
>>>>>      -- Fix some compile warnings and errors.
>>>>> ---
>>>>>     net/smc/smc_core.c | 2 ++
>>>>>     net/smc/smc_core.h | 2 ++
>>>>>     net/smc/smc_wr.c   | 9 +++++++++
>>>>>     net/smc/smc_wr.h   | 1 +
>>>>>     4 files changed, 14 insertions(+)
>>>>>
>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>>> index ff49a11f5..f92a916e9 100644
>>>>> --- a/net/smc/smc_core.c
>>>>> +++ b/net/smc/smc_core.c
>>>>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>>>>>     	lnk->lgr = lgr;
>>>>>     	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>>>>>     	lnk->link_idx = link_idx;
>>>>> +	lnk->wr_rx_id_compl = 0;
>>>>>     	smc_ibdev_cnt_inc(lnk);
>>>>>     	smcr_copy_dev_info_to_link(lnk);
>>>>>     	atomic_set(&lnk->conn_cnt, 0);
>>>>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>>>>>     	smcr_buf_unmap_lgr(lnk);
>>>>>     	smcr_rtoken_clear_link(lnk);
>>>>>     	smc_ib_modify_qp_error(lnk);
>>>>> +	smc_wr_drain_cq(lnk);
>>>>>     	smc_wr_free_link(lnk);
>>>>>     	smc_ib_destroy_queue_pair(lnk);
>>>>>     	smc_ib_dealloc_protection_domain(lnk);
>>>>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
>>>>> index fe8b524ad..285f9bd8e 100644
>>>>> --- a/net/smc/smc_core.h
>>>>> +++ b/net/smc/smc_core.h
>>>>> @@ -115,8 +115,10 @@ struct smc_link {
>>>>>     	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
>>>>>     	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
>>>>>     	u64			wr_rx_id;	/* seq # of last recv WR */
>>>>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
>>>>>     	u32			wr_rx_cnt;	/* number of WR recv buffers */
>>>>>     	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
>>>>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
>>>>>     
>>>>>     	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>>>>>     	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
>>>>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
>>>>> index 26f8f240d..bc8793803 100644
>>>>> --- a/net/smc/smc_wr.c
>>>>> +++ b/net/smc/smc_wr.c
>>>>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>>>     
>>>>>     	for (i = 0; i < num; i++) {
>>>>>     		link = wc[i].qp->qp_context;
>>>>> +		link->wr_rx_id_compl = wc[i].wr_id;
>>>>>     		if (wc[i].status == IB_WC_SUCCESS) {
>>>>>     			link->wr_rx_tstamp = jiffies;
>>>>>     			smc_wr_rx_demultiplex(&wc[i]);
>>>>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>>>     			case IB_WC_RNR_RETRY_EXC_ERR:
>>>>>     			case IB_WC_WR_FLUSH_ERR:
>>>>>     				smcr_link_down_cond_sched(link);
>>>>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
>>>>> +					wake_up(&link->wr_rx_empty_wait);
>>>>>     				break;
>>>>>     			default:
>>>>>     				smc_wr_rx_post(link); /* refill WR RX */
>>>>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>>>>>     	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>>>>>     }
>>>>>     
>>>>> +void smc_wr_drain_cq(struct smc_link *lnk)
>>>>> +{
>>>>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
>>>>> +}
>>>>> +
>>>>>     void smc_wr_free_link(struct smc_link *lnk)
>>>>>     {
>>>>>     	struct ib_device *ibdev;
>>>>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>>>>>     	atomic_set(&lnk->wr_tx_refcnt, 0);
>>>>>     	init_waitqueue_head(&lnk->wr_reg_wait);
>>>>>     	atomic_set(&lnk->wr_reg_refcnt, 0);
>>>>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
>>>>>     	return rc;
>>>>>     
>>>>>     dma_unmap:
>>>>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
>>>>> index a54e90a11..5ca5086ae 100644
>>>>> --- a/net/smc/smc_wr.h
>>>>> +++ b/net/smc/smc_wr.h
>>>>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>>>>>     int smc_wr_create_link(struct smc_link *lnk);
>>>>>     int smc_wr_alloc_link_mem(struct smc_link *lnk);
>>>>>     int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
>>>>> +void smc_wr_drain_cq(struct smc_link *lnk);
>>>>>     void smc_wr_free_link(struct smc_link *lnk);
>>>>>     void smc_wr_free_link_mem(struct smc_link *lnk);
>>>>>     void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
>>>>
>>>> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
>>>> for such valuable suggestions and testing!
>>>> I like the modification of this version. However, this is not a fix
>>>> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
>>>> the parallelism of SMC-R connections" are still not applied. My
>>>> sugguestions:
>>>> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
>>>> those patches I mentioned above, and ask if he can take your patch as a
>>>> part of the patch serie
>>>> - Fix patches should go to net-next
>>>> - Please send always send your new version separately, rather than as
>>>> reply to your previous version. That makes people confused.
>>>
>>> @Wenjia, Thanks a lot for your suggestions and guidance !
>>>
>>> @D. Wythe, Can you include this patch in your series of patches if it is
>>> convenient?
>>>
>>> Regards,
>>> Yacan
>>>
>> One point I was confused, fixes should goto net, sorry!
> 
> Well, @D. Wythe, please ignore the above emails, sorry!
> 
> Regards,
> Yacan
> 
oh no, I didn't mean that. I think I didn't say clearly. What I mean is 
that the patch should go to net as a seperate patch if the patch serie 
from D. Wythe is already applied. But now the patch serie is still not 
applied, so you can still ask D. Wythe to take your patch as a part of 
this serie. (Just a suggestion)
liuyacan@corp.netease.com Sept. 2, 2022, 2:16 a.m. UTC | #7
> >>>>> From: Yacan Liu <liuyacan@corp.netease.com>
> >>>>>
> >>>>> After modifying the QP to the Error state, all RX WR would be completed
> >>>>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> >>>>> wait for it is done, but destroy the QP and free the link group directly.
> >>>>> So there is a risk that accessing the freed memory in tasklet context.
> >>>>>
> >>>>> Here is a crash example:
> >>>>>
> >>>>>     BUG: unable to handle page fault for address: ffffffff8f220860
> >>>>>     #PF: supervisor write access in kernel mode
> >>>>>     #PF: error_code(0x0002) - not-present page
> >>>>>     PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >>>>>     Oops: 0002 [#1] SMP PTI
> >>>>>     CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >>>>>     Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >>>>>     RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >>>>>     Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >>>>>     RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >>>>>     RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >>>>>     RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >>>>>     RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >>>>>     R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >>>>>     R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >>>>>     FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >>>>>     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >>>>>     CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >>>>>     DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >>>>>     DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >>>>>     Call Trace:
> >>>>>      <IRQ>
> >>>>>      _raw_spin_lock_irqsave+0x30/0x40
> >>>>>      mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >>>>>      smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >>>>>      tasklet_action_common.isra.21+0x66/0x100
> >>>>>      __do_softirq+0xd5/0x29c
> >>>>>      asm_call_irq_on_stack+0x12/0x20
> >>>>>      </IRQ>
> >>>>>      do_softirq_own_stack+0x37/0x40
> >>>>>      irq_exit_rcu+0x9d/0xa0
> >>>>>      sysvec_call_function_single+0x34/0x80
> >>>>>      asm_sysvec_call_function_single+0x12/0x20
> >>>>>
> >>>>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> >>>>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> >>>>>
> >>>>> ---
> >>>>> Chagen in v4:
> >>>>>      -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
> >>>>>      -- Remove timeout.
> >>>>> Change in v3:
> >>>>>      -- Tune commit message (Signed-Off tag, Fixes tag).
> >>>>>         Tune code to avoid column length exceeding.
> >>>>> Change in v2:
> >>>>>      -- Fix some compile warnings and errors.
> >>>>> ---
> >>>>>     net/smc/smc_core.c | 2 ++
> >>>>>     net/smc/smc_core.h | 2 ++
> >>>>>     net/smc/smc_wr.c   | 9 +++++++++
> >>>>>     net/smc/smc_wr.h   | 1 +
> >>>>>     4 files changed, 14 insertions(+)
> >>>>>
> >>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> >>>>> index ff49a11f5..f92a916e9 100644
> >>>>> --- a/net/smc/smc_core.c
> >>>>> +++ b/net/smc/smc_core.c
> >>>>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >>>>>     	lnk->lgr = lgr;
> >>>>>     	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
> >>>>>     	lnk->link_idx = link_idx;
> >>>>> +	lnk->wr_rx_id_compl = 0;
> >>>>>     	smc_ibdev_cnt_inc(lnk);
> >>>>>     	smcr_copy_dev_info_to_link(lnk);
> >>>>>     	atomic_set(&lnk->conn_cnt, 0);
> >>>>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >>>>>     	smcr_buf_unmap_lgr(lnk);
> >>>>>     	smcr_rtoken_clear_link(lnk);
> >>>>>     	smc_ib_modify_qp_error(lnk);
> >>>>> +	smc_wr_drain_cq(lnk);
> >>>>>     	smc_wr_free_link(lnk);
> >>>>>     	smc_ib_destroy_queue_pair(lnk);
> >>>>>     	smc_ib_dealloc_protection_domain(lnk);
> >>>>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> >>>>> index fe8b524ad..285f9bd8e 100644
> >>>>> --- a/net/smc/smc_core.h
> >>>>> +++ b/net/smc/smc_core.h
> >>>>> @@ -115,8 +115,10 @@ struct smc_link {
> >>>>>     	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
> >>>>>     	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
> >>>>>     	u64			wr_rx_id;	/* seq # of last recv WR */
> >>>>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
> >>>>>     	u32			wr_rx_cnt;	/* number of WR recv buffers */
> >>>>>     	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> >>>>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
> >>>>>     
> >>>>>     	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> >>>>>     	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> >>>>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> >>>>> index 26f8f240d..bc8793803 100644
> >>>>> --- a/net/smc/smc_wr.c
> >>>>> +++ b/net/smc/smc_wr.c
> >>>>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >>>>>     
> >>>>>     	for (i = 0; i < num; i++) {
> >>>>>     		link = wc[i].qp->qp_context;
> >>>>> +		link->wr_rx_id_compl = wc[i].wr_id;
> >>>>>     		if (wc[i].status == IB_WC_SUCCESS) {
> >>>>>     			link->wr_rx_tstamp = jiffies;
> >>>>>     			smc_wr_rx_demultiplex(&wc[i]);
> >>>>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >>>>>     			case IB_WC_RNR_RETRY_EXC_ERR:
> >>>>>     			case IB_WC_WR_FLUSH_ERR:
> >>>>>     				smcr_link_down_cond_sched(link);
> >>>>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
> >>>>> +					wake_up(&link->wr_rx_empty_wait);
> >>>>>     				break;
> >>>>>     			default:
> >>>>>     				smc_wr_rx_post(link); /* refill WR RX */
> >>>>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> >>>>>     	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> >>>>>     }
> >>>>>     
> >>>>> +void smc_wr_drain_cq(struct smc_link *lnk)
> >>>>> +{
> >>>>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
> >>>>> +}
> >>>>> +
> >>>>>     void smc_wr_free_link(struct smc_link *lnk)
> >>>>>     {
> >>>>>     	struct ib_device *ibdev;
> >>>>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
> >>>>>     	atomic_set(&lnk->wr_tx_refcnt, 0);
> >>>>>     	init_waitqueue_head(&lnk->wr_reg_wait);
> >>>>>     	atomic_set(&lnk->wr_reg_refcnt, 0);
> >>>>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
> >>>>>     	return rc;
> >>>>>     
> >>>>>     dma_unmap:
> >>>>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> >>>>> index a54e90a11..5ca5086ae 100644
> >>>>> --- a/net/smc/smc_wr.h
> >>>>> +++ b/net/smc/smc_wr.h
> >>>>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
> >>>>>     int smc_wr_create_link(struct smc_link *lnk);
> >>>>>     int smc_wr_alloc_link_mem(struct smc_link *lnk);
> >>>>>     int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> >>>>> +void smc_wr_drain_cq(struct smc_link *lnk);
> >>>>>     void smc_wr_free_link(struct smc_link *lnk);
> >>>>>     void smc_wr_free_link_mem(struct smc_link *lnk);
> >>>>>     void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
> >>>>
> >>>> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
> >>>> for such valuable suggestions and testing!
> >>>> I like the modification of this version. However, this is not a fix
> >>>> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
> >>>> the parallelism of SMC-R connections" are still not applied. My
> >>>> sugguestions:
> >>>> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
> >>>> those patches I mentioned above, and ask if he can take your patch as a
> >>>> part of the patch serie
> >>>> - Fix patches should go to net-next
> >>>> - Please send always send your new version separately, rather than as
> >>>> reply to your previous version. That makes people confused.
> >>>
> >>> @Wenjia, Thanks a lot for your suggestions and guidance !
> >>>
> >>> @D. Wythe, Can you include this patch in your series of patches if it is
> >>> convenient?
> >>>
> >>> Regards,
> >>> Yacan
> >>>
> >> One point I was confused, fixes should goto net, sorry!
> > 
> > Well, @D. Wythe, please ignore the above emails, sorry!
> > 
> > Regards,
> > Yacan
> > 
> oh no, I didn't mean that. I think I didn't say clearly. What I mean is 
> that the patch should go to net as a seperate patch if the patch serie 
> from D. Wythe is already applied. But now the patch serie is still not 
> applied, so you can still ask D. Wythe to take your patch as a part of 
> this serie. (Just a suggestion)

Well, I misunderstood. What I'm not sure about is that the patch serie 
from D. Wythe is going to the net-next tree, but mine is going to the net. 
Will this be a problem ?

Regards,
Yacan
Wenjia Zhang Sept. 2, 2022, 5:55 a.m. UTC | #8
On 02.09.22 04:16, liuyacan@corp.netease.com wrote:
>>>>>>> From: Yacan Liu <liuyacan@corp.netease.com>
>>>>>>>
>>>>>>> After modifying the QP to the Error state, all RX WR would be completed
>>>>>>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
>>>>>>> wait for it is done, but destroy the QP and free the link group directly.
>>>>>>> So there is a risk that accessing the freed memory in tasklet context.
>>>>>>>
>>>>>>> Here is a crash example:
>>>>>>>
>>>>>>>      BUG: unable to handle page fault for address: ffffffff8f220860
>>>>>>>      #PF: supervisor write access in kernel mode
>>>>>>>      #PF: error_code(0x0002) - not-present page
>>>>>>>      PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>>>>>>>      Oops: 0002 [#1] SMP PTI
>>>>>>>      CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>>>>>>>      Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>>>>>>>      RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>>>>>>>      Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>>>>>>>      RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>>>>>>>      RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>>>>>>>      RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>>>>>>>      RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>>>>>>>      R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>>>>>>>      R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>>>>>>>      FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>>>>>>>      CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>>>>>      CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>>>>>>>      DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>>>>>      DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>>>>>      Call Trace:
>>>>>>>       <IRQ>
>>>>>>>       _raw_spin_lock_irqsave+0x30/0x40
>>>>>>>       mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>>>>>>>       smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>>>>>>>       tasklet_action_common.isra.21+0x66/0x100
>>>>>>>       __do_softirq+0xd5/0x29c
>>>>>>>       asm_call_irq_on_stack+0x12/0x20
>>>>>>>       </IRQ>
>>>>>>>       do_softirq_own_stack+0x37/0x40
>>>>>>>       irq_exit_rcu+0x9d/0xa0
>>>>>>>       sysvec_call_function_single+0x34/0x80
>>>>>>>       asm_sysvec_call_function_single+0x12/0x20
>>>>>>>
>>>>>>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
>>>>>>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
>>>>>>>
>>>>>>> ---
>>>>>>> Chagen in v4:
>>>>>>>       -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>>>>>>>       -- Remove timeout.
>>>>>>> Change in v3:
>>>>>>>       -- Tune commit message (Signed-Off tag, Fixes tag).
>>>>>>>          Tune code to avoid column length exceeding.
>>>>>>> Change in v2:
>>>>>>>       -- Fix some compile warnings and errors.
>>>>>>> ---
>>>>>>>      net/smc/smc_core.c | 2 ++
>>>>>>>      net/smc/smc_core.h | 2 ++
>>>>>>>      net/smc/smc_wr.c   | 9 +++++++++
>>>>>>>      net/smc/smc_wr.h   | 1 +
>>>>>>>      4 files changed, 14 insertions(+)
>>>>>>>
>>>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>>>>> index ff49a11f5..f92a916e9 100644
>>>>>>> --- a/net/smc/smc_core.c
>>>>>>> +++ b/net/smc/smc_core.c
>>>>>>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>>>>>>>      	lnk->lgr = lgr;
>>>>>>>      	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>>>>>>>      	lnk->link_idx = link_idx;
>>>>>>> +	lnk->wr_rx_id_compl = 0;
>>>>>>>      	smc_ibdev_cnt_inc(lnk);
>>>>>>>      	smcr_copy_dev_info_to_link(lnk);
>>>>>>>      	atomic_set(&lnk->conn_cnt, 0);
>>>>>>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>>>>>>>      	smcr_buf_unmap_lgr(lnk);
>>>>>>>      	smcr_rtoken_clear_link(lnk);
>>>>>>>      	smc_ib_modify_qp_error(lnk);
>>>>>>> +	smc_wr_drain_cq(lnk);
>>>>>>>      	smc_wr_free_link(lnk);
>>>>>>>      	smc_ib_destroy_queue_pair(lnk);
>>>>>>>      	smc_ib_dealloc_protection_domain(lnk);
>>>>>>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
>>>>>>> index fe8b524ad..285f9bd8e 100644
>>>>>>> --- a/net/smc/smc_core.h
>>>>>>> +++ b/net/smc/smc_core.h
>>>>>>> @@ -115,8 +115,10 @@ struct smc_link {
>>>>>>>      	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
>>>>>>>      	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
>>>>>>>      	u64			wr_rx_id;	/* seq # of last recv WR */
>>>>>>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
>>>>>>>      	u32			wr_rx_cnt;	/* number of WR recv buffers */
>>>>>>>      	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
>>>>>>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
>>>>>>>      
>>>>>>>      	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>>>>>>>      	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
>>>>>>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
>>>>>>> index 26f8f240d..bc8793803 100644
>>>>>>> --- a/net/smc/smc_wr.c
>>>>>>> +++ b/net/smc/smc_wr.c
>>>>>>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>>>>>      
>>>>>>>      	for (i = 0; i < num; i++) {
>>>>>>>      		link = wc[i].qp->qp_context;
>>>>>>> +		link->wr_rx_id_compl = wc[i].wr_id;
>>>>>>>      		if (wc[i].status == IB_WC_SUCCESS) {
>>>>>>>      			link->wr_rx_tstamp = jiffies;
>>>>>>>      			smc_wr_rx_demultiplex(&wc[i]);
>>>>>>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>>>>>      			case IB_WC_RNR_RETRY_EXC_ERR:
>>>>>>>      			case IB_WC_WR_FLUSH_ERR:
>>>>>>>      				smcr_link_down_cond_sched(link);
>>>>>>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
>>>>>>> +					wake_up(&link->wr_rx_empty_wait);
>>>>>>>      				break;
>>>>>>>      			default:
>>>>>>>      				smc_wr_rx_post(link); /* refill WR RX */
>>>>>>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>>>>>>>      	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>>>>>>>      }
>>>>>>>      
>>>>>>> +void smc_wr_drain_cq(struct smc_link *lnk)
>>>>>>> +{
>>>>>>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
>>>>>>> +}
>>>>>>> +
>>>>>>>      void smc_wr_free_link(struct smc_link *lnk)
>>>>>>>      {
>>>>>>>      	struct ib_device *ibdev;
>>>>>>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>>>>>>>      	atomic_set(&lnk->wr_tx_refcnt, 0);
>>>>>>>      	init_waitqueue_head(&lnk->wr_reg_wait);
>>>>>>>      	atomic_set(&lnk->wr_reg_refcnt, 0);
>>>>>>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
>>>>>>>      	return rc;
>>>>>>>      
>>>>>>>      dma_unmap:
>>>>>>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
>>>>>>> index a54e90a11..5ca5086ae 100644
>>>>>>> --- a/net/smc/smc_wr.h
>>>>>>> +++ b/net/smc/smc_wr.h
>>>>>>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>>>>>>>      int smc_wr_create_link(struct smc_link *lnk);
>>>>>>>      int smc_wr_alloc_link_mem(struct smc_link *lnk);
>>>>>>>      int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
>>>>>>> +void smc_wr_drain_cq(struct smc_link *lnk);
>>>>>>>      void smc_wr_free_link(struct smc_link *lnk);
>>>>>>>      void smc_wr_free_link_mem(struct smc_link *lnk);
>>>>>>>      void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
>>>>>>
>>>>>> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
>>>>>> for such valuable suggestions and testing!
>>>>>> I like the modification of this version. However, this is not a fix
>>>>>> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
>>>>>> the parallelism of SMC-R connections" are still not applied. My
>>>>>> sugguestions:
>>>>>> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
>>>>>> those patches I mentioned above, and ask if he can take your patch as a
>>>>>> part of the patch serie
>>>>>> - Fix patches should go to net-next
>>>>>> - Please send always send your new version separately, rather than as
>>>>>> reply to your previous version. That makes people confused.
>>>>>
>>>>> @Wenjia, Thanks a lot for your suggestions and guidance !
>>>>>
>>>>> @D. Wythe, Can you include this patch in your series of patches if it is
>>>>> convenient?
>>>>>
>>>>> Regards,
>>>>> Yacan
>>>>>
>>>> One point I was confused, fixes should goto net, sorry!
>>>
>>> Well, @D. Wythe, please ignore the above emails, sorry!
>>>
>>> Regards,
>>> Yacan
>>>
>> oh no, I didn't mean that. I think I didn't say clearly. What I mean is
>> that the patch should go to net as a seperate patch if the patch serie
>> from D. Wythe is already applied. But now the patch serie is still not
>> applied, so you can still ask D. Wythe to take your patch as a part of
>> this serie. (Just a suggestion)
> 
> Well, I misunderstood. What I'm not sure about is that the patch serie
> from D. Wythe is going to the net-next tree, but mine is going to the net.
> Will this be a problem ?
> 
> Regards,
> Yacan
> I don't think that would be a problem in this situation.
Tony Lu Sept. 4, 2022, 3:58 p.m. UTC | #9
On Wed, Aug 31, 2022 at 11:53:03PM +0800, liuyacan@corp.netease.com wrote:
> From: Yacan Liu <liuyacan@corp.netease.com>
> 
> After modifying the QP to the Error state, all RX WR would be completed
> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> wait for it is done, but destroy the QP and free the link group directly.
> So there is a risk that accessing the freed memory in tasklet context.
> 
> Here is a crash example:
> 
>  BUG: unable to handle page fault for address: ffffffff8f220860
>  #PF: supervisor write access in kernel mode
>  #PF: error_code(0x0002) - not-present page
>  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>  Oops: 0002 [#1] SMP PTI
>  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>  Call Trace:
>   <IRQ>
>   _raw_spin_lock_irqsave+0x30/0x40
>   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>   tasklet_action_common.isra.21+0x66/0x100
>   __do_softirq+0xd5/0x29c
>   asm_call_irq_on_stack+0x12/0x20
>   </IRQ>
>   do_softirq_own_stack+0x37/0x40
>   irq_exit_rcu+0x9d/0xa0
>   sysvec_call_function_single+0x34/0x80
>   asm_sysvec_call_function_single+0x12/0x20
> 
> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> 
> ---
> Chagen in v4:
>   -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>   -- Remove timeout.
> Change in v3:
>   -- Tune commit message (Signed-Off tag, Fixes tag).
>      Tune code to avoid column length exceeding.
> Change in v2:
>   -- Fix some compile warnings and errors.
> ---
>  net/smc/smc_core.c | 2 ++
>  net/smc/smc_core.h | 2 ++
>  net/smc/smc_wr.c   | 9 +++++++++
>  net/smc/smc_wr.h   | 1 +
>  4 files changed, 14 insertions(+)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index ff49a11f5..f92a916e9 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>  	lnk->lgr = lgr;
>  	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>  	lnk->link_idx = link_idx;
> +	lnk->wr_rx_id_compl = 0;
>  	smc_ibdev_cnt_inc(lnk);
>  	smcr_copy_dev_info_to_link(lnk);
>  	atomic_set(&lnk->conn_cnt, 0);
> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>  	smcr_buf_unmap_lgr(lnk);
>  	smcr_rtoken_clear_link(lnk);
>  	smc_ib_modify_qp_error(lnk);
> +	smc_wr_drain_cq(lnk);

smc_wr_drain_cq() can put into smc_wr_free_link(). Since
smc_wr_free_link() do the same things together, such as waiting for
resources cleaning up about wr.

>  	smc_wr_free_link(lnk);
>  	smc_ib_destroy_queue_pair(lnk);
>  	smc_ib_dealloc_protection_domain(lnk);

<snip>

The patch tested good in our environment. If you are going to send the
v5 patch, you can go with my review.

Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>

Cheers,
Tony Lu
liuyacan@corp.netease.com Sept. 5, 2022, 7:18 a.m. UTC | #10
> > From: Yacan Liu <liuyacan@corp.netease.com>
> > 
> > After modifying the QP to the Error state, all RX WR would be completed
> > with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> > wait for it is done, but destroy the QP and free the link group directly.
> > So there is a risk that accessing the freed memory in tasklet context.
> > 
> > Here is a crash example:
> > 
> >  BUG: unable to handle page fault for address: ffffffff8f220860
> >  #PF: supervisor write access in kernel mode
> >  #PF: error_code(0x0002) - not-present page
> >  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >  Oops: 0002 [#1] SMP PTI
> >  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >  Call Trace:
> >   <IRQ>
> >   _raw_spin_lock_irqsave+0x30/0x40
> >   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >   tasklet_action_common.isra.21+0x66/0x100
> >   __do_softirq+0xd5/0x29c
> >   asm_call_irq_on_stack+0x12/0x20
> >   </IRQ>
> >   do_softirq_own_stack+0x37/0x40
> >   irq_exit_rcu+0x9d/0xa0
> >   sysvec_call_function_single+0x34/0x80
> >   asm_sysvec_call_function_single+0x12/0x20
> > 
> > Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> > Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> > 
> > ---
> > Chagen in v4:
> >   -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
> >   -- Remove timeout.
> > Change in v3:
> >   -- Tune commit message (Signed-Off tag, Fixes tag).
> >      Tune code to avoid column length exceeding.
> > Change in v2:
> >   -- Fix some compile warnings and errors.
> > ---
> >  net/smc/smc_core.c | 2 ++
> >  net/smc/smc_core.h | 2 ++
> >  net/smc/smc_wr.c   | 9 +++++++++
> >  net/smc/smc_wr.h   | 1 +
> >  4 files changed, 14 insertions(+)
> > 
> > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > index ff49a11f5..f92a916e9 100644
> > --- a/net/smc/smc_core.c
> > +++ b/net/smc/smc_core.c
> > @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >  	lnk->lgr = lgr;
> >  	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
> >  	lnk->link_idx = link_idx;
> > +	lnk->wr_rx_id_compl = 0;
> >  	smc_ibdev_cnt_inc(lnk);
> >  	smcr_copy_dev_info_to_link(lnk);
> >  	atomic_set(&lnk->conn_cnt, 0);
> > @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >  	smcr_buf_unmap_lgr(lnk);
> >  	smcr_rtoken_clear_link(lnk);
> >  	smc_ib_modify_qp_error(lnk);
> > +	smc_wr_drain_cq(lnk);
> 
> smc_wr_drain_cq() can put into smc_wr_free_link(). Since
> smc_wr_free_link() do the same things together, such as waiting for
> resources cleaning up about wr.
> 
> >  	smc_wr_free_link(lnk);
> >  	smc_ib_destroy_queue_pair(lnk);
> >  	smc_ib_dealloc_protection_domain(lnk);
> 
> <snip>
> 
> The patch tested good in our environment. If you are going to send the
> v5 patch, you can go with my review.
> 
> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
> 
> Cheers,
> Tony Lu

Thank you very much for your verification and suggestion!

Regards,
Yacan
diff mbox series

Patch

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ff49a11f5..f92a916e9 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -757,6 +757,7 @@  int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	lnk->lgr = lgr;
 	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
 	lnk->link_idx = link_idx;
+	lnk->wr_rx_id_compl = 0;
 	smc_ibdev_cnt_inc(lnk);
 	smcr_copy_dev_info_to_link(lnk);
 	atomic_set(&lnk->conn_cnt, 0);
@@ -1269,6 +1270,7 @@  void smcr_link_clear(struct smc_link *lnk, bool log)
 	smcr_buf_unmap_lgr(lnk);
 	smcr_rtoken_clear_link(lnk);
 	smc_ib_modify_qp_error(lnk);
+	smc_wr_drain_cq(lnk);
 	smc_wr_free_link(lnk);
 	smc_ib_destroy_queue_pair(lnk);
 	smc_ib_dealloc_protection_domain(lnk);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index fe8b524ad..285f9bd8e 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -115,8 +115,10 @@  struct smc_link {
 	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
 	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
 	u64			wr_rx_id;	/* seq # of last recv WR */
+	u64			wr_rx_id_compl; /* seq # of last completed WR */
 	u32			wr_rx_cnt;	/* number of WR recv buffers */
 	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
+	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
 
 	struct ib_reg_wr	wr_reg;		/* WR register memory region */
 	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 26f8f240d..bc8793803 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -454,6 +454,7 @@  static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 
 	for (i = 0; i < num; i++) {
 		link = wc[i].qp->qp_context;
+		link->wr_rx_id_compl = wc[i].wr_id;
 		if (wc[i].status == IB_WC_SUCCESS) {
 			link->wr_rx_tstamp = jiffies;
 			smc_wr_rx_demultiplex(&wc[i]);
@@ -465,6 +466,8 @@  static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 			case IB_WC_RNR_RETRY_EXC_ERR:
 			case IB_WC_WR_FLUSH_ERR:
 				smcr_link_down_cond_sched(link);
+				if (link->wr_rx_id_compl == link->wr_rx_id)
+					wake_up(&link->wr_rx_empty_wait);
 				break;
 			default:
 				smc_wr_rx_post(link); /* refill WR RX */
@@ -631,6 +634,11 @@  static void smc_wr_init_sge(struct smc_link *lnk)
 	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
 }
 
+void smc_wr_drain_cq(struct smc_link *lnk)
+{
+	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
+}
+
 void smc_wr_free_link(struct smc_link *lnk)
 {
 	struct ib_device *ibdev;
@@ -889,6 +897,7 @@  int smc_wr_create_link(struct smc_link *lnk)
 	atomic_set(&lnk->wr_tx_refcnt, 0);
 	init_waitqueue_head(&lnk->wr_reg_wait);
 	atomic_set(&lnk->wr_reg_refcnt, 0);
+	init_waitqueue_head(&lnk->wr_rx_empty_wait);
 	return rc;
 
 dma_unmap:
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index a54e90a11..5ca5086ae 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -101,6 +101,7 @@  static inline int smc_wr_rx_post(struct smc_link *link)
 int smc_wr_create_link(struct smc_link *lnk);
 int smc_wr_alloc_link_mem(struct smc_link *lnk);
 int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
+void smc_wr_drain_cq(struct smc_link *lnk);
 void smc_wr_free_link(struct smc_link *lnk);
 void smc_wr_free_link_mem(struct smc_link *lnk);
 void smc_wr_free_lgr_mem(struct smc_link_group *lgr);