diff mbox series

rdma/i40iw: Add a reference when accepting a connection to avoid panic

Message ID 20190327134254.1740-1-andrew.boyer@dell.com (mailing list archive)
State Changes Requested
Headers show
Series rdma/i40iw: Add a reference when accepting a connection to avoid panic | expand

Commit Message

Andrew Boyer March 27, 2019, 1:42 p.m. UTC
When a CONNECT_REQUEST is received on the listening side, a
new cm_node is created. A pointer to the cm_node is put into an iw_cm
event message, which is put on a workqueue and then sent to i40iw_accept().

The driver needs to add a reference to go with the iw_cm event so that the
cm_node cannot be destroyed before the workqueue item is processed.

Note that i40iw_accept() already releases a reference in two error paths;
these appear to be incorrect since there was no associated reference taken.

Backtrace:
[436732.936866] general protection fault: 0000 [#1] SMP NOPTI
[436732.937891] Modules linked in: ...
[436732.966395] CPU: 0 PID: 14062 Comm: CMIB Tainted: P           OE   4.14.19-coreos-r9999.1533000047-442 #1
[436732.970042] task: ffff8bd589113c80 task.stack: ffff99c047710000
[436732.971123] RIP: 0010:i40iw_accept+0x2d0/0x4c0 [i40iw]
[436732.972065] RSP: 0018:ffff99c047713b28 EFLAGS: 00010046
[436732.973022] RAX: 0000000000000296 RBX: ffff8bcf356a1800 RCX: ffff8bcf356a34c0
[436732.974314] RDX: dead000000000200 RSI: ffff8bd53818b1c0 RDI: dead000000000100
[436732.975607] RBP: ffff99c047713c68 R08: 0000000000000000 R09: ffff8bd53818dc40
[436732.976902] R10: ffff99c047713a08 R11: 0000000000000004 R12: ffff8bd538188018
[436732.978192] R13: ffff8bd53818b220 R14: ffff8bd648826800 R15: ffff8bcf356a3400
[436732.979480] FS:  00007fc6ceba2700(0000) GS:ffff8bd674400000(0000) knlGS:0000000000000000
[436732.980937] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[436732.981983] CR2: 00007faa0ea26270 CR3: 00000016fa6ce003 CR4: 00000000003606f0
[436732.983312] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[436732.984602] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[436732.985893] Call Trace:
[436732.986368]  iw_cm_accept+0x8d/0x550 [iw_cm]
[436732.987159]  rdma_accept+0x1e8/0x260 [rdma_cm]
[436732.987982]  0xffffffffc0ad1141
[436732.988574]  0xffffffffc0ad14cd
[436732.989168]  __vfs_write+0x33/0x150
[436732.989824]  ? __inode_security_revalidate+0x4a/0x70
[436732.990734]  ? selinux_file_permission+0xdd/0x130
[436732.991600]  ? security_file_permission+0x36/0xb0
[436732.992466]  vfs_write+0xb3/0x1a0
[436732.993088]  SyS_write+0x52/0xc0
[436732.993698]  do_syscall_64+0x66/0x1d0
[436732.994384]  entry_SYSCALL_64_after_hwframe+0x21/0x86
[436732.995311] RIP: 0033:0x7fc79f7676ad
[436732.995981] RSP: 002b:00007fc76d371040 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
[436732.997355] RAX: ffffffffffffffda RBX: 0000000028c80950 RCX: 00007fc79f7676ad
[436732.998646] RDX: 0000000000000128 RSI: 00007fc76d371050 RDI: 000000000000005c
[436732.999934] RBP: 00007fc76d371050 R08: 0000000000000000 R09: 0000000028cc2400
[436733.001221] R10: 0000000000000009 R11: 0000000000000293 R12: 00007fc76d3711d0
[436733.002508] R13: 0000000028c80950 R14: 0000000028cc0950 R15: 000000002796b010
[436733.003798] Code: ...
[436733.007166] RIP: i40iw_accept+0x2d0/0x4c0 [i40iw] RSP: ffff99c047713b28

Fixes: f27b4746f378e ("i40iw: add connection management code"
Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
---
 drivers/infiniband/hw/i40iw/i40iw_cm.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

Comments

Shiraz Saleem March 29, 2019, 11:29 p.m. UTC | #1
>Subject: [PATCH] rdma/i40iw: Add a reference when accepting a connection to
>avoid panic
>
>When a CONNECT_REQUEST is received on the listening side, a new cm_node is
>created. A pointer to the cm_node is put into an iw_cm event message, which is put
>on a workqueue and then sent to i40iw_accept().
>
>The driver needs to add a reference to go with the iw_cm event so that the cm_node
>cannot be destroyed before the workqueue item is processed.
>
>Note that i40iw_accept() already releases a reference in two error paths; these
>appear to be incorrect since there was no associated reference taken.
>
>Backtrace:
>[436732.936866] general protection fault: 0000 [#1] SMP NOPTI [436732.937891]
>Modules linked in: ...
>[436732.966395] CPU: 0 PID: 14062 Comm: CMIB Tainted: P           OE   4.14.19-
>coreos-r9999.1533000047-442 #1
>[436732.970042] task: ffff8bd589113c80 task.stack: ffff99c047710000
>[436732.971123] RIP: 0010:i40iw_accept+0x2d0/0x4c0 [i40iw] [436732.972065] RSP:
>0018:ffff99c047713b28 EFLAGS: 00010046 [436732.973022] RAX:
>0000000000000296 RBX: ffff8bcf356a1800 RCX: ffff8bcf356a34c0 [436732.974314]
>RDX: dead000000000200 RSI: ffff8bd53818b1c0 RDI: dead000000000100
>[436732.975607] RBP: ffff99c047713c68 R08: 0000000000000000 R09:
>ffff8bd53818dc40 [436732.976902] R10: ffff99c047713a08 R11: 0000000000000004
>R12: ffff8bd538188018 [436732.978192] R13: ffff8bd53818b220 R14:
>ffff8bd648826800 R15: ffff8bcf356a3400 [436732.979480] FS:
>00007fc6ceba2700(0000) GS:ffff8bd674400000(0000) knlGS:0000000000000000
>[436732.980937] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>[436732.981983] CR2: 00007faa0ea26270 CR3: 00000016fa6ce003 CR4:
>00000000003606f0 [436732.983312] DR0: 0000000000000000 DR1:
>0000000000000000 DR2: 0000000000000000 [436732.984602] DR3:
>0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>[436732.985893] Call Trace:
>[436732.986368]  iw_cm_accept+0x8d/0x550 [iw_cm] [436732.987159]
>rdma_accept+0x1e8/0x260 [rdma_cm] [436732.987982]  0xffffffffc0ad1141
>[436732.988574]  0xffffffffc0ad14cd [436732.989168]  __vfs_write+0x33/0x150
>[436732.989824]  ? __inode_security_revalidate+0x4a/0x70
>[436732.990734]  ? selinux_file_permission+0xdd/0x130
>[436732.991600]  ? security_file_permission+0x36/0xb0
>[436732.992466]  vfs_write+0xb3/0x1a0
>[436732.993088]  SyS_write+0x52/0xc0
>[436732.993698]  do_syscall_64+0x66/0x1d0 [436732.994384]
>entry_SYSCALL_64_after_hwframe+0x21/0x86
>[436732.995311] RIP: 0033:0x7fc79f7676ad [436732.995981] RSP:
>002b:00007fc76d371040 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
>[436732.997355] RAX: ffffffffffffffda RBX: 0000000028c80950 RCX:
>00007fc79f7676ad [436732.998646] RDX: 0000000000000128 RSI:
>00007fc76d371050 RDI: 000000000000005c [436732.999934] RBP:
>00007fc76d371050 R08: 0000000000000000 R09: 0000000028cc2400
>[436733.001221] R10: 0000000000000009 R11: 0000000000000293 R12:
>00007fc76d3711d0 [436733.002508] R13: 0000000028c80950 R14:
>0000000028cc0950 R15: 000000002796b010 [436733.003798] Code: ...
>[436733.007166] RIP: i40iw_accept+0x2d0/0x4c0 [i40iw] RSP: ffff99c047713b28
>
>Fixes: f27b4746f378e ("i40iw: add connection management code"
>Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
>---
> drivers/infiniband/hw/i40iw/i40iw_cm.c | 19 +++++++++++++++----
> 1 file changed, 15 insertions(+), 4 deletions(-)
>
>diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>index 206cfb0016f8..28e92a68c178 100644
>--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>@@ -272,6 +272,9 @@ static int i40iw_send_cm_event(struct i40iw_cm_node
>*cm_node,
> 		event.private_data = (void *)cm_node->pdata_buf;
> 		event.private_data_len = (u8)cm_node->pdata.size;
> 		event.ird = cm_node->ird_size;
>+
>+		/* Take a reference to go with the iw_cm event */
>+		atomic_inc(&cm_node->ref_count);
> 		break;

Maybe I am missing something here, but i40iw_cm_post_event() should have bumped
the cm_node ref count so it is not deleted till event worker completes. So, I am not
entirely convinced this is the right root cause and fix.

It would be useful if we could get the call trace on i40iw_rem_ref_cm_node() when cm_node
goes away. I can assist providing a debug patch.

Shiraz
Andrew Boyer April 5, 2019, 2:50 p.m. UTC | #2
> On Mar 29, 2019, at 7:29 PM, Saleem, Shiraz <shiraz.saleem@intel.com> wrote:
> 
>> Subject: [PATCH] rdma/i40iw: Add a reference when accepting a connection to
>> avoid panic
>> 
>> When a CONNECT_REQUEST is received on the listening side, a new cm_node is
>> created. A pointer to the cm_node is put into an iw_cm event message, which is put
>> on a workqueue and then sent to i40iw_accept().
>> 
>> The driver needs to add a reference to go with the iw_cm event so that the cm_node
>> cannot be destroyed before the workqueue item is processed.
>> 
>> Note that i40iw_accept() already releases a reference in two error paths; these
>> appear to be incorrect since there was no associated reference taken.
>> 
>> Backtrace:
>> [436732.936866] general protection fault: 0000 [#1] SMP NOPTI [436732.937891]
>> Modules linked in: ...
>> [436732.966395] CPU: 0 PID: 14062 Comm: CMIB Tainted: P           OE   4.14.19-
>> coreos-r9999.1533000047-442 #1
>> [436732.970042] task: ffff8bd589113c80 task.stack: ffff99c047710000
>> [436732.971123] RIP: 0010:i40iw_accept+0x2d0/0x4c0 [i40iw] [436732.972065] RSP:
>> 0018:ffff99c047713b28 EFLAGS: 00010046 [436732.973022] RAX:
>> 0000000000000296 RBX: ffff8bcf356a1800 RCX: ffff8bcf356a34c0 [436732.974314]
>> RDX: dead000000000200 RSI: ffff8bd53818b1c0 RDI: dead000000000100
>> [436732.975607] RBP: ffff99c047713c68 R08: 0000000000000000 R09:
>> ffff8bd53818dc40 [436732.976902] R10: ffff99c047713a08 R11: 0000000000000004
>> R12: ffff8bd538188018 [436732.978192] R13: ffff8bd53818b220 R14:
>> ffff8bd648826800 R15: ffff8bcf356a3400 [436732.979480] FS:
>> 00007fc6ceba2700(0000) GS:ffff8bd674400000(0000) knlGS:0000000000000000
>> [436732.980937] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [436732.981983] CR2: 00007faa0ea26270 CR3: 00000016fa6ce003 CR4:
>> 00000000003606f0 [436732.983312] DR0: 0000000000000000 DR1:
>> 0000000000000000 DR2: 0000000000000000 [436732.984602] DR3:
>> 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> [436732.985893] Call Trace:
>> [436732.986368]  iw_cm_accept+0x8d/0x550 [iw_cm] [436732.987159]
>> rdma_accept+0x1e8/0x260 [rdma_cm] [436732.987982]  0xffffffffc0ad1141
>> [436732.988574]  0xffffffffc0ad14cd [436732.989168]  __vfs_write+0x33/0x150
>> [436732.989824]  ? __inode_security_revalidate+0x4a/0x70
>> [436732.990734]  ? selinux_file_permission+0xdd/0x130
>> [436732.991600]  ? security_file_permission+0x36/0xb0
>> [436732.992466]  vfs_write+0xb3/0x1a0
>> [436732.993088]  SyS_write+0x52/0xc0
>> [436732.993698]  do_syscall_64+0x66/0x1d0 [436732.994384]
>> entry_SYSCALL_64_after_hwframe+0x21/0x86
>> [436732.995311] RIP: 0033:0x7fc79f7676ad [436732.995981] RSP:
>> 002b:00007fc76d371040 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
>> [436732.997355] RAX: ffffffffffffffda RBX: 0000000028c80950 RCX:
>> 00007fc79f7676ad [436732.998646] RDX: 0000000000000128 RSI:
>> 00007fc76d371050 RDI: 000000000000005c [436732.999934] RBP:
>> 00007fc76d371050 R08: 0000000000000000 R09: 0000000028cc2400
>> [436733.001221] R10: 0000000000000009 R11: 0000000000000293 R12:
>> 00007fc76d3711d0 [436733.002508] R13: 0000000028c80950 R14:
>> 0000000028cc0950 R15: 000000002796b010 [436733.003798] Code: ...
>> [436733.007166] RIP: i40iw_accept+0x2d0/0x4c0 [i40iw] RSP: ffff99c047713b28
>> 
>> Fixes: f27b4746f378e ("i40iw: add connection management code"
>> Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
>> ---
>> drivers/infiniband/hw/i40iw/i40iw_cm.c | 19 +++++++++++++++----
>> 1 file changed, 15 insertions(+), 4 deletions(-)
>> 
>> diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> index 206cfb0016f8..28e92a68c178 100644
>> --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> @@ -272,6 +272,9 @@ static int i40iw_send_cm_event(struct i40iw_cm_node
>> *cm_node,
>> 		event.private_data = (void *)cm_node->pdata_buf;
>> 		event.private_data_len = (u8)cm_node->pdata.size;
>> 		event.ird = cm_node->ird_size;
>> +
>> +		/* Take a reference to go with the iw_cm event */
>> +		atomic_inc(&cm_node->ref_count);
>> 		break;
> 
> Maybe I am missing something here, but i40iw_cm_post_event() should have bumped
> the cm_node ref count so it is not deleted till event worker completes. So, I am not
> entirely convinced this is the right root cause and fix.
> 
> It would be useful if we could get the call trace on i40iw_rem_ref_cm_node() when cm_node
> goes away. I can assist providing a debug patch.
> 
> Shiraz

There are two distinct events put onto two different workqueues. Event A is of type struct i40iw_cm_event. Event B is of type struct iw_cm_event.

i40iw_cm_post_event() bumps the refcount and then posts event A.
Event A gets sent to i40iw_cm_event_handler().
i40iw_cm_event_handler() calls i40iw_send_cm_event().
i40iw_send_cm_event() posts event B.
Then i40iw_cm_event_handler() drops the refcount associated with event A.

Event B gets sent to cm_id->event_handler() which I believe is cm_event_handler() in iwcm.c.

There is nothing to prevent the refcount associated with event A from getting dropped before cm_event_handler() is able to process event B.

-Andrew
Shiraz Saleem April 8, 2019, 4:16 p.m. UTC | #3
>Subject: Re: [PATCH] rdma/i40iw: Add a reference when accepting a connection to
>avoid panic
>
>
>> On Mar 29, 2019, at 7:29 PM, Saleem, Shiraz <shiraz.saleem@intel.com> wrote:
>>
>>> Subject: [PATCH] rdma/i40iw: Add a reference when accepting a
>>> connection to avoid panic
>>>
>>> When a CONNECT_REQUEST is received on the listening side, a new
>>> cm_node is created. A pointer to the cm_node is put into an iw_cm
>>> event message, which is put on a workqueue and then sent to i40iw_accept().
>>>
>>> The driver needs to add a reference to go with the iw_cm event so
>>> that the cm_node cannot be destroyed before the workqueue item is processed.
>>>
>>> Note that i40iw_accept() already releases a reference in two error
>>> paths; these appear to be incorrect since there was no associated reference
>taken.
>>>
>>> Backtrace:
>>> [436732.936866] general protection fault: 0000 [#1] SMP NOPTI
>>> [436732.937891] Modules linked in: ...
>>> [436732.966395] CPU: 0 PID: 14062 Comm: CMIB Tainted: P           OE   4.14.19-
>>> coreos-r9999.1533000047-442 #1
>>> [436732.970042] task: ffff8bd589113c80 task.stack: ffff99c047710000
>>> [436732.971123] RIP: 0010:i40iw_accept+0x2d0/0x4c0 [i40iw] [436732.972065]
>RSP:
>>> 0018:ffff99c047713b28 EFLAGS: 00010046 [436732.973022] RAX:
>>> 0000000000000296 RBX: ffff8bcf356a1800 RCX: ffff8bcf356a34c0
>>> [436732.974314]
>>> RDX: dead000000000200 RSI: ffff8bd53818b1c0 RDI: dead000000000100
>>> [436732.975607] RBP: ffff99c047713c68 R08: 0000000000000000 R09:
>>> ffff8bd53818dc40 [436732.976902] R10: ffff99c047713a08 R11:
>>> 0000000000000004
>>> R12: ffff8bd538188018 [436732.978192] R13: ffff8bd53818b220 R14:
>>> ffff8bd648826800 R15: ffff8bcf356a3400 [436732.979480] FS:
>>> 00007fc6ceba2700(0000) GS:ffff8bd674400000(0000)
>>> knlGS:0000000000000000 [436732.980937] CS:  0010 DS: 0000 ES: 0000
>>> CR0: 0000000080050033 [436732.981983] CR2: 00007faa0ea26270 CR3:
>00000016fa6ce003 CR4:
>>> 00000000003606f0 [436732.983312] DR0: 0000000000000000 DR1:
>>> 0000000000000000 DR2: 0000000000000000 [436732.984602] DR3:
>>> 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>> [436732.985893] Call Trace:
>>> [436732.986368]  iw_cm_accept+0x8d/0x550 [iw_cm] [436732.987159]
>>> rdma_accept+0x1e8/0x260 [rdma_cm] [436732.987982]  0xffffffffc0ad1141
>>> [436732.988574]  0xffffffffc0ad14cd [436732.989168]
>>> __vfs_write+0x33/0x150 [436732.989824]  ?
>>> __inode_security_revalidate+0x4a/0x70
>>> [436732.990734]  ? selinux_file_permission+0xdd/0x130
>>> [436732.991600]  ? security_file_permission+0x36/0xb0
>>> [436732.992466]  vfs_write+0xb3/0x1a0 [436732.993088]
>>> SyS_write+0x52/0xc0 [436732.993698]  do_syscall_64+0x66/0x1d0
>>> [436732.994384]
>>> entry_SYSCALL_64_after_hwframe+0x21/0x86
>>> [436732.995311] RIP: 0033:0x7fc79f7676ad [436732.995981] RSP:
>>> 002b:00007fc76d371040 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
>>> [436732.997355] RAX: ffffffffffffffda RBX: 0000000028c80950 RCX:
>>> 00007fc79f7676ad [436732.998646] RDX: 0000000000000128 RSI:
>>> 00007fc76d371050 RDI: 000000000000005c [436732.999934] RBP:
>>> 00007fc76d371050 R08: 0000000000000000 R09: 0000000028cc2400
>>> [436733.001221] R10: 0000000000000009 R11: 0000000000000293 R12:
>>> 00007fc76d3711d0 [436733.002508] R13: 0000000028c80950 R14:
>>> 0000000028cc0950 R15: 000000002796b010 [436733.003798] Code: ...
>>> [436733.007166] RIP: i40iw_accept+0x2d0/0x4c0 [i40iw] RSP:
>>> ffff99c047713b28
>>>
>>> Fixes: f27b4746f378e ("i40iw: add connection management code"
>>> Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
>>> ---
>>> drivers/infiniband/hw/i40iw/i40iw_cm.c | 19 +++++++++++++++----
>>> 1 file changed, 15 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>>> b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>>> index 206cfb0016f8..28e92a68c178 100644
>>> --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>>> +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>>> @@ -272,6 +272,9 @@ static int i40iw_send_cm_event(struct
>>> i40iw_cm_node *cm_node,
>>> 		event.private_data = (void *)cm_node->pdata_buf;
>>> 		event.private_data_len = (u8)cm_node->pdata.size;
>>> 		event.ird = cm_node->ird_size;
>>> +
>>> +		/* Take a reference to go with the iw_cm event */
>>> +		atomic_inc(&cm_node->ref_count);
>>> 		break;
>>
>> Maybe I am missing something here, but i40iw_cm_post_event() should
>> have bumped the cm_node ref count so it is not deleted till event
>> worker completes. So, I am not entirely convinced this is the right root cause and
>fix.
>>
>> It would be useful if we could get the call trace on
>> i40iw_rem_ref_cm_node() when cm_node goes away. I can assist providing a
>debug patch.
>>
>> Shiraz
>
>There are two distinct events put onto two different workqueues. Event A is of type
>struct i40iw_cm_event. Event B is of type struct iw_cm_event.
>
>i40iw_cm_post_event() bumps the refcount and then posts event A.
>Event A gets sent to i40iw_cm_event_handler().
>i40iw_cm_event_handler() calls i40iw_send_cm_event().
>i40iw_send_cm_event() posts event B.
>Then i40iw_cm_event_handler() drops the refcount associated with event A.
>
>Event B gets sent to cm_id->event_handler() which I believe is cm_event_handler()
>in iwcm.c.
>
>There is nothing to prevent the refcount associated with event A from getting
>dropped before cm_event_handler() is able to process event B.
>
Sure. But, there is a refcnt for the cm_node when its created that I would have expected to exist in i40iw_accept().
Where did that get dropped? Something like this that shows the sequence of callers dropping the refcnt on the cm_node
leading up to the problem would be good.

diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 8233f5a..9d01d9d 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -2288,6 +2288,7 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node)
        struct i40iw_cm_info nfo;
        unsigned long flags;
 
+       pr_info("%s: cm_node %px ref_cnt %d caller: %pS \n", __func__, cm_node, atomic_read(&cm_node->ref_count),__builtin_return_address(0));
        spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags);
        if (atomic_dec_return(&cm_node->ref_count)) {
                spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
@@ -3664,6 +3665,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
        dev = &iwdev->sc_dev;
        cm_core = &iwdev->cm_core;
        cm_node = (struct i40iw_cm_node *)cm_id->provider_data;
+       pr_info("%s: cm_node %px\n", __func__, cm_node);
 
        if (((struct sockaddr_in *)&cm_id->local_addr)->sin_family == AF_INET) {
                cm_node->ipv4 = true;
Doug Ledford May 1, 2019, 2:50 p.m. UTC | #4
On Mon, 2019-04-08 at 16:16 +0000, Saleem, Shiraz wrote:
> > Subject: Re: [PATCH] rdma/i40iw: Add a reference when accepting a connection to
> > avoid panic
> > 
> > 
> > > On Mar 29, 2019, at 7:29 PM, Saleem, Shiraz <shiraz.saleem@intel.com> wrote:
> > > 
> > > > Subject: [PATCH] rdma/i40iw: Add a reference when accepting a
> > > > connection to avoid panic
> > > > 
> > > > When a CONNECT_REQUEST is received on the listening side, a new
> > > > cm_node is created. A pointer to the cm_node is put into an iw_cm
> > > > event message, which is put on a workqueue and then sent to i40iw_accept().
> > > > 
> > > > The driver needs to add a reference to go with the iw_cm event so
> > > > that the cm_node cannot be destroyed before the workqueue item is processed.
> > > > 
> > > > Note that i40iw_accept() already releases a reference in two error
> > > > paths; these appear to be incorrect since there was no associated reference
> > taken.
> > > > Backtrace:
> > > > [436732.936866] general protection fault: 0000 [#1] SMP NOPTI
> > > > [436732.937891] Modules linked in: ...
> > > > [436732.966395] CPU: 0 PID: 14062 Comm: CMIB Tainted: P           OE   4.14.19-
> > > > coreos-r9999.1533000047-442 #1
> > > > [436732.970042] task: ffff8bd589113c80 task.stack: ffff99c047710000
> > > > [436732.971123] RIP: 0010:i40iw_accept+0x2d0/0x4c0 [i40iw] [436732.972065]
> > RSP:
> > > > 0018:ffff99c047713b28 EFLAGS: 00010046 [436732.973022] RAX:
> > > > 0000000000000296 RBX: ffff8bcf356a1800 RCX: ffff8bcf356a34c0
> > > > [436732.974314]
> > > > RDX: dead000000000200 RSI: ffff8bd53818b1c0 RDI: dead000000000100
> > > > [436732.975607] RBP: ffff99c047713c68 R08: 0000000000000000 R09:
> > > > ffff8bd53818dc40 [436732.976902] R10: ffff99c047713a08 R11:
> > > > 0000000000000004
> > > > R12: ffff8bd538188018 [436732.978192] R13: ffff8bd53818b220 R14:
> > > > ffff8bd648826800 R15: ffff8bcf356a3400 [436732.979480] FS:
> > > > 00007fc6ceba2700(0000) GS:ffff8bd674400000(0000)
> > > > knlGS:0000000000000000 [436732.980937] CS:  0010 DS: 0000 ES: 0000
> > > > CR0: 0000000080050033 [436732.981983] CR2: 00007faa0ea26270 CR3:
> > 00000016fa6ce003 CR4:
> > > > 00000000003606f0 [436732.983312] DR0: 0000000000000000 DR1:
> > > > 0000000000000000 DR2: 0000000000000000 [436732.984602] DR3:
> > > > 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > > > [436732.985893] Call Trace:
> > > > [436732.986368]  iw_cm_accept+0x8d/0x550 [iw_cm] [436732.987159]
> > > > rdma_accept+0x1e8/0x260 [rdma_cm] [436732.987982]  0xffffffffc0ad1141
> > > > [436732.988574]  0xffffffffc0ad14cd [436732.989168]
> > > > __vfs_write+0x33/0x150 [436732.989824]  ?
> > > > __inode_security_revalidate+0x4a/0x70
> > > > [436732.990734]  ? selinux_file_permission+0xdd/0x130
> > > > [436732.991600]  ? security_file_permission+0x36/0xb0
> > > > [436732.992466]  vfs_write+0xb3/0x1a0 [436732.993088]
> > > > SyS_write+0x52/0xc0 [436732.993698]  do_syscall_64+0x66/0x1d0
> > > > [436732.994384]
> > > > entry_SYSCALL_64_after_hwframe+0x21/0x86
> > > > [436732.995311] RIP: 0033:0x7fc79f7676ad [436732.995981] RSP:
> > > > 002b:00007fc76d371040 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
> > > > [436732.997355] RAX: ffffffffffffffda RBX: 0000000028c80950 RCX:
> > > > 00007fc79f7676ad [436732.998646] RDX: 0000000000000128 RSI:
> > > > 00007fc76d371050 RDI: 000000000000005c [436732.999934] RBP:
> > > > 00007fc76d371050 R08: 0000000000000000 R09: 0000000028cc2400
> > > > [436733.001221] R10: 0000000000000009 R11: 0000000000000293 R12:
> > > > 00007fc76d3711d0 [436733.002508] R13: 0000000028c80950 R14:
> > > > 0000000028cc0950 R15: 000000002796b010 [436733.003798] Code: ...
> > > > [436733.007166] RIP: i40iw_accept+0x2d0/0x4c0 [i40iw] RSP:
> > > > ffff99c047713b28
> > > > 
> > > > Fixes: f27b4746f378e ("i40iw: add connection management code"
> > > > Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
> > > > ---
> > > > drivers/infiniband/hw/i40iw/i40iw_cm.c | 19 +++++++++++++++----
> > > > 1 file changed, 15 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c
> > > > b/drivers/infiniband/hw/i40iw/i40iw_cm.c
> > > > index 206cfb0016f8..28e92a68c178 100644
> > > > --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
> > > > +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
> > > > @@ -272,6 +272,9 @@ static int i40iw_send_cm_event(struct
> > > > i40iw_cm_node *cm_node,
> > > > 		event.private_data = (void *)cm_node->pdata_buf;
> > > > 		event.private_data_len = (u8)cm_node->pdata.size;
> > > > 		event.ird = cm_node->ird_size;
> > > > +
> > > > +		/* Take a reference to go with the iw_cm event */
> > > > +		atomic_inc(&cm_node->ref_count);
> > > > 		break;
> > > 
> > > Maybe I am missing something here, but i40iw_cm_post_event() should
> > > have bumped the cm_node ref count so it is not deleted till event
> > > worker completes. So, I am not entirely convinced this is the right root cause and
> > fix.
> > > It would be useful if we could get the call trace on
> > > i40iw_rem_ref_cm_node() when cm_node goes away. I can assist providing a
> > debug patch.
> > > Shiraz
> > 
> > There are two distinct events put onto two different workqueues. Event A is of type
> > struct i40iw_cm_event. Event B is of type struct iw_cm_event.
> > 
> > i40iw_cm_post_event() bumps the refcount and then posts event A.
> > Event A gets sent to i40iw_cm_event_handler().
> > i40iw_cm_event_handler() calls i40iw_send_cm_event().
> > i40iw_send_cm_event() posts event B.
> > Then i40iw_cm_event_handler() drops the refcount associated with event A.
> > 
> > Event B gets sent to cm_id->event_handler() which I believe is cm_event_handler()
> > in iwcm.c.
> > 
> > There is nothing to prevent the refcount associated with event A from getting
> > dropped before cm_event_handler() is able to process event B.
> > 
> Sure. But, there is a refcnt for the cm_node when its created that I would have expected to exist in i40iw_accept().
> Where did that get dropped? Something like this that shows the sequence of callers dropping the refcnt on the cm_node
> leading up to the problem would be good.
> 
> diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
> index 8233f5a..9d01d9d 100644
> --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
> +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
> @@ -2288,6 +2288,7 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node)
>         struct i40iw_cm_info nfo;
>         unsigned long flags;
>  
> +       pr_info("%s: cm_node %px ref_cnt %d caller: %pS \n", __func__, cm_node, atomic_read(&cm_node->ref_count),__builtin_return_address(0));
>         spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags);
>         if (atomic_dec_return(&cm_node->ref_count)) {
>                 spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
> @@ -3664,6 +3665,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
>         dev = &iwdev->sc_dev;
>         cm_core = &iwdev->cm_core;
>         cm_node = (struct i40iw_cm_node *)cm_id->provider_data;
> +       pr_info("%s: cm_node %px\n", __func__, cm_node);
>  
>         if (((struct sockaddr_in *)&cm_id->local_addr)->sin_family == AF_INET) {
>                 cm_node->ipv4 = true;
> 

This patch is now over a month old with no resolution.  Can we get a
final answer on this please?
Shiraz Saleem May 2, 2019, 1:28 p.m. UTC | #5
>Subject: Re: [PATCH] rdma/i40iw: Add a reference when accepting a connection
>to avoid panic
>
>On Mon, 2019-04-08 at 16:16 +0000, Saleem, Shiraz wrote:
>> > Subject: Re: [PATCH] rdma/i40iw: Add a reference when accepting a
>> > connection to avoid panic
>> >
>> >
>> > > On Mar 29, 2019, at 7:29 PM, Saleem, Shiraz <shiraz.saleem@intel.com>
>wrote:
>> > >
>> > > > Subject: [PATCH] rdma/i40iw: Add a reference when accepting a
>> > > > connection to avoid panic
>> > > >
>> > > > When a CONNECT_REQUEST is received on the listening side, a new
>> > > > cm_node is created. A pointer to the cm_node is put into an
>> > > > iw_cm event message, which is put on a workqueue and then sent to
>i40iw_accept().
>> > > >
>> > > > The driver needs to add a reference to go with the iw_cm event
>> > > > so that the cm_node cannot be destroyed before the workqueue item is
>processed.
>> > > >
>> > > > Note that i40iw_accept() already releases a reference in two
>> > > > error paths; these appear to be incorrect since there was no
>> > > > associated reference
>> > taken.
>> > > > Backtrace:
>> > > > [436732.936866] general protection fault: 0000 [#1] SMP NOPTI
>> > > > [436732.937891] Modules linked in: ...
>> > > > [436732.966395] CPU: 0 PID: 14062 Comm: CMIB Tainted: P           OE
>4.14.19-
>> > > > coreos-r9999.1533000047-442 #1
>> > > > [436732.970042] task: ffff8bd589113c80 task.stack:
>> > > > ffff99c047710000 [436732.971123] RIP:
>> > > > 0010:i40iw_accept+0x2d0/0x4c0 [i40iw] [436732.972065]
>> > RSP:
>> > > > 0018:ffff99c047713b28 EFLAGS: 00010046 [436732.973022] RAX:
>> > > > 0000000000000296 RBX: ffff8bcf356a1800 RCX: ffff8bcf356a34c0
>> > > > [436732.974314]
>> > > > RDX: dead000000000200 RSI: ffff8bd53818b1c0 RDI:
>> > > > dead000000000100 [436732.975607] RBP: ffff99c047713c68 R08:
>0000000000000000 R09:
>> > > > ffff8bd53818dc40 [436732.976902] R10: ffff99c047713a08 R11:
>> > > > 0000000000000004
>> > > > R12: ffff8bd538188018 [436732.978192] R13: ffff8bd53818b220 R14:
>> > > > ffff8bd648826800 R15: ffff8bcf356a3400 [436732.979480] FS:
>> > > > 00007fc6ceba2700(0000) GS:ffff8bd674400000(0000)
>> > > > knlGS:0000000000000000 [436732.980937] CS:  0010 DS: 0000 ES:
>> > > > 0000
>> > > > CR0: 0000000080050033 [436732.981983] CR2: 00007faa0ea26270
>CR3:
>> > 00000016fa6ce003 CR4:
>> > > > 00000000003606f0 [436732.983312] DR0: 0000000000000000 DR1:
>> > > > 0000000000000000 DR2: 0000000000000000 [436732.984602] DR3:
>> > > > 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> > > > [436732.985893] Call Trace:
>> > > > [436732.986368]  iw_cm_accept+0x8d/0x550 [iw_cm] [436732.987159]
>> > > > rdma_accept+0x1e8/0x260 [rdma_cm] [436732.987982]
>> > > > 0xffffffffc0ad1141 [436732.988574]  0xffffffffc0ad14cd
>> > > > [436732.989168]
>> > > > __vfs_write+0x33/0x150 [436732.989824]  ?
>> > > > __inode_security_revalidate+0x4a/0x70
>> > > > [436732.990734]  ? selinux_file_permission+0xdd/0x130
>> > > > [436732.991600]  ? security_file_permission+0x36/0xb0
>> > > > [436732.992466]  vfs_write+0xb3/0x1a0 [436732.993088]
>> > > > SyS_write+0x52/0xc0 [436732.993698]  do_syscall_64+0x66/0x1d0
>> > > > [436732.994384]
>> > > > entry_SYSCALL_64_after_hwframe+0x21/0x86
>> > > > [436732.995311] RIP: 0033:0x7fc79f7676ad [436732.995981] RSP:
>> > > > 002b:00007fc76d371040 EFLAGS: 00000293 ORIG_RAX:
>> > > > 0000000000000001 [436732.997355] RAX: ffffffffffffffda RBX:
>0000000028c80950 RCX:
>> > > > 00007fc79f7676ad [436732.998646] RDX: 0000000000000128 RSI:
>> > > > 00007fc76d371050 RDI: 000000000000005c [436732.999934] RBP:
>> > > > 00007fc76d371050 R08: 0000000000000000 R09: 0000000028cc2400
>> > > > [436733.001221] R10: 0000000000000009 R11: 0000000000000293
>R12:
>> > > > 00007fc76d3711d0 [436733.002508] R13: 0000000028c80950 R14:
>> > > > 0000000028cc0950 R15: 000000002796b010 [436733.003798] Code: ...
>> > > > [436733.007166] RIP: i40iw_accept+0x2d0/0x4c0 [i40iw] RSP:
>> > > > ffff99c047713b28
>> > > >
>> > > > Fixes: f27b4746f378e ("i40iw: add connection management code"
>> > > > Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
>> > > > ---
>> > > > drivers/infiniband/hw/i40iw/i40iw_cm.c | 19 +++++++++++++++----
>> > > > 1 file changed, 15 insertions(+), 4 deletions(-)
>> > > >
>> > > > diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> > > > b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> > > > index 206cfb0016f8..28e92a68c178 100644
>> > > > --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> > > > +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> > > > @@ -272,6 +272,9 @@ static int i40iw_send_cm_event(struct
>> > > > i40iw_cm_node *cm_node,
>> > > > 		event.private_data = (void *)cm_node->pdata_buf;
>> > > > 		event.private_data_len = (u8)cm_node->pdata.size;
>> > > > 		event.ird = cm_node->ird_size;
>> > > > +
>> > > > +		/* Take a reference to go with the iw_cm event */
>> > > > +		atomic_inc(&cm_node->ref_count);
>> > > > 		break;
>> > >
>> > > Maybe I am missing something here, but i40iw_cm_post_event()
>> > > should have bumped the cm_node ref count so it is not deleted till
>> > > event worker completes. So, I am not entirely convinced this is
>> > > the right root cause and
>> > fix.
>> > > It would be useful if we could get the call trace on
>> > > i40iw_rem_ref_cm_node() when cm_node goes away. I can assist
>> > > providing a
>> > debug patch.
>> > > Shiraz
>> >
>> > There are two distinct events put onto two different workqueues.
>> > Event A is of type struct i40iw_cm_event. Event B is of type struct
>iw_cm_event.
>> >
>> > i40iw_cm_post_event() bumps the refcount and then posts event A.
>> > Event A gets sent to i40iw_cm_event_handler().
>> > i40iw_cm_event_handler() calls i40iw_send_cm_event().
>> > i40iw_send_cm_event() posts event B.
>> > Then i40iw_cm_event_handler() drops the refcount associated with event A.
>> >
>> > Event B gets sent to cm_id->event_handler() which I believe is
>> > cm_event_handler() in iwcm.c.
>> >
>> > There is nothing to prevent the refcount associated with event A
>> > from getting dropped before cm_event_handler() is able to process event B.
>> >
>> Sure. But, there is a refcnt for the cm_node when its created that I would have
>expected to exist in i40iw_accept().
>> Where did that get dropped? Something like this that shows the
>> sequence of callers dropping the refcnt on the cm_node leading up to the
>problem would be good.
>>
>> diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> index 8233f5a..9d01d9d 100644
>> --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
>> @@ -2288,6 +2288,7 @@ static void i40iw_rem_ref_cm_node(struct
>i40iw_cm_node *cm_node)
>>         struct i40iw_cm_info nfo;
>>         unsigned long flags;
>>
>> +       pr_info("%s: cm_node %px ref_cnt %d caller: %pS \n", __func__,
>> + cm_node,
>> + atomic_read(&cm_node->ref_count),__builtin_return_address(0));
>>         spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags);
>>         if (atomic_dec_return(&cm_node->ref_count)) {
>>                 spin_unlock_irqrestore(&cm_node->cm_core->ht_lock,
>> flags); @@ -3664,6 +3665,7 @@ int i40iw_accept(struct iw_cm_id *cm_id,
>struct iw_cm_conn_param *conn_param)
>>         dev = &iwdev->sc_dev;
>>         cm_core = &iwdev->cm_core;
>>         cm_node = (struct i40iw_cm_node *)cm_id->provider_data;
>> +       pr_info("%s: cm_node %px\n", __func__, cm_node);
>>
>>         if (((struct sockaddr_in *)&cm_id->local_addr)->sin_family == AF_INET) {
>>                 cm_node->ipv4 = true;
>>
>
>This patch is now over a month old with no resolution.  Can we get a final answer
>on this please?
>

Need some further debug data on the problem before a fix can be proposed.
Parvathi - Is this something you can help with or at least provide instructions to repro?

Shiraz
Jason Gunthorpe June 7, 2019, 6:53 p.m. UTC | #6
On Wed, Mar 27, 2019 at 09:42:54AM -0400, Andrew Boyer wrote:
> When a CONNECT_REQUEST is received on the listening side, a
> new cm_node is created. A pointer to the cm_node is put into an iw_cm
> event message, which is put on a workqueue and then sent to i40iw_accept().
> 
> The driver needs to add a reference to go with the iw_cm event so that the
> cm_node cannot be destroyed before the workqueue item is processed.
> 
> Note that i40iw_accept() already releases a reference in two error paths;
> these appear to be incorrect since there was no associated reference taken.
> 
> Backtrace:
> [436732.936866] general protection fault: 0000 [#1] SMP NOPTI
> [436732.937891] Modules linked in: ...
> [436732.966395] CPU: 0 PID: 14062 Comm: CMIB Tainted: P           OE   4.14.19-coreos-r9999.1533000047-442 #1
> [436732.970042] task: ffff8bd589113c80 task.stack: ffff99c047710000
> [436732.971123] RIP: 0010:i40iw_accept+0x2d0/0x4c0 [i40iw]
> [436732.972065] RSP: 0018:ffff99c047713b28 EFLAGS: 00010046
> [436732.973022] RAX: 0000000000000296 RBX: ffff8bcf356a1800 RCX: ffff8bcf356a34c0
> [436732.974314] RDX: dead000000000200 RSI: ffff8bd53818b1c0 RDI: dead000000000100
> [436732.975607] RBP: ffff99c047713c68 R08: 0000000000000000 R09: ffff8bd53818dc40
> [436732.976902] R10: ffff99c047713a08 R11: 0000000000000004 R12: ffff8bd538188018
> [436732.978192] R13: ffff8bd53818b220 R14: ffff8bd648826800 R15: ffff8bcf356a3400
> [436732.979480] FS:  00007fc6ceba2700(0000) GS:ffff8bd674400000(0000) knlGS:0000000000000000
> [436732.980937] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [436732.981983] CR2: 00007faa0ea26270 CR3: 00000016fa6ce003 CR4: 00000000003606f0
> [436732.983312] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [436732.984602] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [436732.985893] Call Trace:
> [436732.986368]  iw_cm_accept+0x8d/0x550 [iw_cm]
> [436732.987159]  rdma_accept+0x1e8/0x260 [rdma_cm]
> [436732.987982]  0xffffffffc0ad1141
> [436732.988574]  0xffffffffc0ad14cd
> [436732.989168]  __vfs_write+0x33/0x150
> [436732.989824]  ? __inode_security_revalidate+0x4a/0x70
> [436732.990734]  ? selinux_file_permission+0xdd/0x130
> [436732.991600]  ? security_file_permission+0x36/0xb0
> [436732.992466]  vfs_write+0xb3/0x1a0
> [436732.993088]  SyS_write+0x52/0xc0
> [436732.993698]  do_syscall_64+0x66/0x1d0
> [436732.994384]  entry_SYSCALL_64_after_hwframe+0x21/0x86
> [436732.995311] RIP: 0033:0x7fc79f7676ad
> [436732.995981] RSP: 002b:00007fc76d371040 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
> [436732.997355] RAX: ffffffffffffffda RBX: 0000000028c80950 RCX: 00007fc79f7676ad
> [436732.998646] RDX: 0000000000000128 RSI: 00007fc76d371050 RDI: 000000000000005c
> [436732.999934] RBP: 00007fc76d371050 R08: 0000000000000000 R09: 0000000028cc2400
> [436733.001221] R10: 0000000000000009 R11: 0000000000000293 R12: 00007fc76d3711d0
> [436733.002508] R13: 0000000028c80950 R14: 0000000028cc0950 R15: 000000002796b010
> [436733.003798] Code: ...
> [436733.007166] RIP: i40iw_accept+0x2d0/0x4c0 [i40iw] RSP: ffff99c047713b28
> 
> Fixes: f27b4746f378e ("i40iw: add connection management code"
> Signed-off-by: Andrew Boyer <andrew.boyer@dell.com>
> ---
>  drivers/infiniband/hw/i40iw/i40iw_cm.c | 19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)

This has been quite for a while, it needs Shiraz's approval to move
ahead, so I'm dropping it off patchworks, please work with Shriaz.

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 206cfb0016f8..28e92a68c178 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -272,6 +272,9 @@  static int i40iw_send_cm_event(struct i40iw_cm_node *cm_node,
 		event.private_data = (void *)cm_node->pdata_buf;
 		event.private_data_len = (u8)cm_node->pdata.size;
 		event.ird = cm_node->ird_size;
+
+		/* Take a reference to go with the iw_cm event */
+		atomic_inc(&cm_node->ref_count);
 		break;
 	case IW_CM_EVENT_CONNECT_REPLY:
 		i40iw_get_cmevent_info(cm_node, cm_id, &event);
@@ -3642,15 +3645,19 @@  int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	unsigned long flags;
 
 	memset(&attr, 0, sizeof(attr));
+
+	cm_node = (struct i40iw_cm_node *)cm_id->provider_data;
+
 	ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn);
-	if (!ibqp)
+	if (!ibqp) {
+		i40iw_rem_ref_cm_node(cm_node);
 		return -EINVAL;
+	}
 
 	iwqp = to_iwqp(ibqp);
 	iwdev = iwqp->iwdev;
 	dev = &iwdev->sc_dev;
 	cm_core = &iwdev->cm_core;
-	cm_node = (struct i40iw_cm_node *)cm_id->provider_data;
 
 	if (((struct sockaddr_in *)&cm_id->local_addr)->sin_family == AF_INET) {
 		cm_node->ipv4 = true;
@@ -3683,9 +3690,11 @@  int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	buf_len = conn_param->private_data_len + I40IW_MAX_IETF_SIZE;
 
 	status = i40iw_allocate_dma_mem(dev->hw, &iwqp->ietf_mem, buf_len, 1);
-
-	if (status)
+	if (status) {
+		i40iw_rem_ref_cm_node(cm_node);
 		return -ENOMEM;
+	}
+
 	cm_node->pdata.size = conn_param->private_data_len;
 	accept.addr = iwqp->ietf_mem.va;
 	accept.size = i40iw_cm_build_mpa_frame(cm_node, &accept, MPA_KEY_REPLY);
@@ -3706,6 +3715,7 @@  int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 					 &tagged_offset);
 		if (IS_ERR(ibmr)) {
 			i40iw_free_dma_mem(dev->hw, &iwqp->ietf_mem);
+			i40iw_rem_ref_cm_node(cm_node);
 			return -ENOMEM;
 		}
 
@@ -3767,6 +3777,7 @@  int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 		atomic_dec(&cm_node->listener->pend_accepts_cnt);
 		cm_node->accept_pend = 0;
 	}
+	i40iw_rem_ref_cm_node(cm_node);
 	return 0;
 }