Message ID | 1350683896-3928-1-git-send-email-klebers@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | Rejected |
Headers | show |
Hi, Any comments about this one? Thank you, Kleber On 10/19/2012 06:58 PM, Kleber Sacilotto de Souza wrote: > During PCI error recovery, the calls to wait_for_completion() in the > infiniband core path can hang waiting for some tasks that will never > complete, since the hardware is nonfunctional. > > INFO: task eehd:16029 blocked for more than 120 seconds. > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this > message. > eehd D 0000000000000000 9664 16029 3093 0x00000080 > Call Trace: > [c0000000af8430e0] [c0000000be05e200] 0xc0000000be05e200 (unreliable) > [c0000000af8432b0] [c000000000014070] .__switch_to+0x130/0x250 > [c0000000af843360] [c000000000777c5c] .__schedule+0x40c/0x920 > [c0000000af843600] [c000000000775a10] .schedule_timeout+0x210/0x260 > [c0000000af8436e0] [c000000000777658] .wait_for_common+0xf8/0x210 > [c0000000af8437c0] [d000000004113538] .ib_unregister_mad_agent+0x498/0x690 [ib_mad] > [c0000000af8438c0] [d0000000042c01f4] .ib_sa_remove_one+0xe4/0x180 [ib_sa] > [c0000000af843970] [d0000000040952e8] .ib_unregister_device+0x78/0x170 [ib_core] > [c0000000af843a10] [d000000004183290] .mlx4_ib_remove+0x40/0x1f0 [mlx4_ib] > [c0000000af843ab0] [d000000003f6ab54] .mlx4_remove_device+0xd4/0x110 [mlx4_core] > [c0000000af843b40] [d000000003f6abfc] .mlx4_unregister_device+0x6c/0xf0 [mlx4_core] > [c0000000af843be0] [d000000003f6f19c] .mlx4_remove_one+0x10c/0x3a0 [mlx4_core] > [c0000000af843c80] [d000000003f6f448] .mlx4_pci_err_detected+0x18/0x40 [mlx4_core] > [c0000000af843d00] [c000000000058a50] .eeh_report_error+0x70/0xe0 > [c0000000af843d90] [c0000000003e4d94] .pci_walk_bus+0xa4/0x140 > [c0000000af843e50] [c000000000058628] .handle_eeh_events+0x1f8/0x480 > [c0000000af843f00] [c000000000058dfc] .eeh_event_handler+0x13c/0x1e0 > [c0000000af843f90] [c00000000002031c] .kernel_thread+0x54/0x70 > > This patch fixes the issue by replacing the calls to > wait_for_completion() by wait_for_completion_timeout(), providing a > timeout of 5 seconds to wait for the normal completion of the task. > > Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> > --- > drivers/infiniband/core/addr.c | 2 +- > drivers/infiniband/core/cm.c | 2 +- > drivers/infiniband/core/cma.c | 4 ++-- > drivers/infiniband/core/iwcm.c | 2 +- > drivers/infiniband/core/mad.c | 4 ++-- > drivers/infiniband/core/mad_rmpp.c | 2 +- > drivers/infiniband/core/multicast.c | 4 ++-- > drivers/infiniband/core/sa_query.c | 2 +- > drivers/infiniband/core/ucm.c | 2 +- > drivers/infiniband/core/ucma.c | 2 +- > drivers/infiniband/core/uverbs_main.c | 4 ++-- > 11 files changed, 15 insertions(+), 15 deletions(-) > > diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c > index eaec8d7..be42113 100644 > --- a/drivers/infiniband/core/addr.c > +++ b/drivers/infiniband/core/addr.c > @@ -86,7 +86,7 @@ static inline void put_client(struct rdma_addr_client *client) > void rdma_addr_unregister_client(struct rdma_addr_client *client) > { > put_client(client); > - wait_for_completion(&client->comp); > + wait_for_completion_timeout(&client->comp, 5*HZ); > } > EXPORT_SYMBOL(rdma_addr_unregister_client); > > diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c > index 394fea2..062a90b 100644 > --- a/drivers/infiniband/core/cm.c > +++ b/drivers/infiniband/core/cm.c > @@ -910,7 +910,7 @@ retest: > > cm_free_id(cm_id->local_id); > cm_deref_id(cm_id_priv); > - wait_for_completion(&cm_id_priv->comp); > + wait_for_completion_timeout(&cm_id_priv->comp, 5*HZ); > while ((work = cm_dequeue_work(cm_id_priv)) != NULL) > cm_free_work(work); > kfree(cm_id_priv->compare_data); > diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c > index a7568c3..c85d078 100644 > --- a/drivers/infiniband/core/cma.c > +++ b/drivers/infiniband/core/cma.c > @@ -944,7 +944,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) > > cma_release_port(id_priv); > cma_deref_id(id_priv); > - wait_for_completion(&id_priv->comp); > + wait_for_completion_timeout(&id_priv->comp, 5*HZ); > > if (id_priv->internal_id) > cma_deref_id(id_priv->id.context); > @@ -3388,7 +3388,7 @@ static void cma_process_remove(struct cma_device *cma_dev) > mutex_unlock(&lock); > > cma_deref_dev(cma_dev); > - wait_for_completion(&cma_dev->comp); > + wait_for_completion_timeout(&cma_dev->comp, 5*HZ); > } > > static void cma_remove_one(struct ib_device *device) > diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c > index 0bb99bb..7457195 100644 > --- a/drivers/infiniband/core/iwcm.c > +++ b/drivers/infiniband/core/iwcm.c > @@ -399,7 +399,7 @@ void iw_destroy_cm_id(struct iw_cm_id *cm_id) > > destroy_cm_id(cm_id); > > - wait_for_completion(&cm_id_priv->destroy_comp); > + wait_for_completion_timeout(&cm_id_priv->destroy_comp, 5*HZ); > > free_cm_id(cm_id_priv); > } > diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c > index dc3fd1e..bf94a67 100644 > --- a/drivers/infiniband/core/mad.c > +++ b/drivers/infiniband/core/mad.c > @@ -539,7 +539,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) > ib_cancel_rmpp_recvs(mad_agent_priv); > > deref_mad_agent(mad_agent_priv); > - wait_for_completion(&mad_agent_priv->comp); > + wait_for_completion_timeout(&mad_agent_priv->comp, 5*HZ); > > kfree(mad_agent_priv->reg_req); > ib_dereg_mr(mad_agent_priv->agent.mr); > @@ -558,7 +558,7 @@ static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) > spin_unlock_irqrestore(&qp_info->snoop_lock, flags); > > deref_snoop_agent(mad_snoop_priv); > - wait_for_completion(&mad_snoop_priv->comp); > + wait_for_completion_timeout(&mad_snoop_priv->comp, 5*HZ); > > kfree(mad_snoop_priv); > } > diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c > index f37878c..562bf84 100644 > --- a/drivers/infiniband/core/mad_rmpp.c > +++ b/drivers/infiniband/core/mad_rmpp.c > @@ -78,7 +78,7 @@ static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) > static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) > { > deref_rmpp_recv(rmpp_recv); > - wait_for_completion(&rmpp_recv->comp); > + wait_for_completion_timeout(&rmpp_recv->comp, 5*HZ); > ib_destroy_ah(rmpp_recv->ah); > kfree(rmpp_recv); > } > diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c > index d2360a8..0cba0c6 100644 > --- a/drivers/infiniband/core/multicast.c > +++ b/drivers/infiniband/core/multicast.c > @@ -684,7 +684,7 @@ void ib_sa_free_multicast(struct ib_sa_multicast *multicast) > } > > deref_member(member); > - wait_for_completion(&member->comp); > + wait_for_completion_timeout(&member->comp, 5*HZ); > ib_sa_client_put(member->client); > kfree(member); > } > @@ -862,7 +862,7 @@ static void mcast_remove_one(struct ib_device *device) > IB_LINK_LAYER_INFINIBAND) { > port = &dev->port[i]; > deref_port(port); > - wait_for_completion(&port->comp); > + wait_for_completion_timeout(&port->comp, 5*HZ); > } > } > > diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c > index a8905ab..25547fe 100644 > --- a/drivers/infiniband/core/sa_query.c > +++ b/drivers/infiniband/core/sa_query.c > @@ -474,7 +474,7 @@ EXPORT_SYMBOL(ib_sa_register_client); > void ib_sa_unregister_client(struct ib_sa_client *client) > { > ib_sa_client_put(client); > - wait_for_completion(&client->comp); > + wait_for_completion_timeout(&client->comp, 5*HZ); > } > EXPORT_SYMBOL(ib_sa_unregister_client); > > diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c > index 49b15ac..6358941 100644 > --- a/drivers/infiniband/core/ucm.c > +++ b/drivers/infiniband/core/ucm.c > @@ -550,7 +550,7 @@ static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, > return PTR_ERR(ctx); > > ib_ucm_ctx_put(ctx); > - wait_for_completion(&ctx->comp); > + wait_for_completion_timeout(&ctx->comp, 5*HZ); > > /* No new events will be generated after destroying the cm_id. */ > ib_destroy_cm_id(ctx->cm_id); > diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c > index 2709ff5..4b4eb29 100644 > --- a/drivers/infiniband/core/ucma.c > +++ b/drivers/infiniband/core/ucma.c > @@ -516,7 +516,7 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, > return PTR_ERR(ctx); > > ucma_put_ctx(ctx); > - wait_for_completion(&ctx->comp); > + wait_for_completion_timeout(&ctx->comp, 5*HZ); > resp.events_reported = ucma_free_ctx(ctx); > > if (copy_to_user((void __user *)(unsigned long)cmd.response, > diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c > index 6f2ce6f..0824b92 100644 > --- a/drivers/infiniband/core/uverbs_main.c > +++ b/drivers/infiniband/core/uverbs_main.c > @@ -819,7 +819,7 @@ err_cdev: > > err: > kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); > - wait_for_completion(&uverbs_dev->comp); > + wait_for_completion_timeout(&uverbs_dev->comp, 5*HZ); > kfree(uverbs_dev); > return; > } > @@ -841,7 +841,7 @@ static void ib_uverbs_remove_one(struct ib_device *device) > clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); > > kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); > - wait_for_completion(&uverbs_dev->comp); > + wait_for_completion_timeout(&uverbs_dev->comp, 5*HZ); > kfree(uverbs_dev); > } >
On 19/10/2012 23:58, Kleber Sacilotto de Souza wrote: > During PCI error recovery, the calls to wait_for_completion() in the > infiniband core path can hang waiting for some tasks that will never > complete, since the hardware is nonfunctional. > > INFO: task eehd:16029 blocked for more than 120 seconds. > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this > message. > eehd D 0000000000000000 9664 16029 3093 0x00000080 > Call Trace: > [c0000000af8430e0] [c0000000be05e200] 0xc0000000be05e200 (unreliable) > [c0000000af8432b0] [c000000000014070] .__switch_to+0x130/0x250 > [c0000000af843360] [c000000000777c5c] .__schedule+0x40c/0x920 > [c0000000af843600] [c000000000775a10] .schedule_timeout+0x210/0x260 > [c0000000af8436e0] [c000000000777658] .wait_for_common+0xf8/0x210 > [c0000000af8437c0] [d000000004113538] .ib_unregister_mad_agent+0x498/0x690 [ib_mad] > [c0000000af8438c0] [d0000000042c01f4] .ib_sa_remove_one+0xe4/0x180 [ib_sa] > [c0000000af843970] [d0000000040952e8] .ib_unregister_device+0x78/0x170 [ib_core] > [c0000000af843a10] [d000000004183290] .mlx4_ib_remove+0x40/0x1f0 [mlx4_ib] > [c0000000af843ab0] [d000000003f6ab54] .mlx4_remove_device+0xd4/0x110 [mlx4_core] > [c0000000af843b40] [d000000003f6abfc] .mlx4_unregister_device+0x6c/0xf0 [mlx4_core] > [c0000000af843be0] [d000000003f6f19c] .mlx4_remove_one+0x10c/0x3a0 [mlx4_core] > [c0000000af843c80] [d000000003f6f448] .mlx4_pci_err_detected+0x18/0x40 [mlx4_core] > [c0000000af843d00] [c000000000058a50] .eeh_report_error+0x70/0xe0 > [c0000000af843d90] [c0000000003e4d94] .pci_walk_bus+0xa4/0x140 > [c0000000af843e50] [c000000000058628] .handle_eeh_events+0x1f8/0x480 > [c0000000af843f00] [c000000000058dfc] .eeh_event_handler+0x13c/0x1e0 > [c0000000af843f90] [c00000000002031c] .kernel_thread+0x54/0x70 > > This patch fixes the issue by replacing the calls to > wait_for_completion() by wait_for_completion_timeout(), providing a > timeout of 5 seconds to wait for the normal completion of the task. > > Signed-off-by: Kleber Sacilotto de Souza<klebers@linux.vnet.ibm.com> > --- > drivers/infiniband/core/addr.c | 2 +- > drivers/infiniband/core/cm.c | 2 +- > drivers/infiniband/core/cma.c | 4 ++-- > drivers/infiniband/core/iwcm.c | 2 +- > drivers/infiniband/core/mad.c | 4 ++-- > drivers/infiniband/core/mad_rmpp.c | 2 +- > drivers/infiniband/core/multicast.c | 4 ++-- > drivers/infiniband/core/sa_query.c | 2 +- for all the above files, your patch makes sense, since the completion/refcount logic relate to waiting for event that needs to come from the device. > drivers/infiniband/core/ucm.c | 2 +- > drivers/infiniband/core/ucma.c | 2 +- On these files, as far as I understand this code from quick looking, I'm not sure on what exactly the completion objects protects, Sean? Or. > drivers/infiniband/core/uverbs_main.c | 4 ++-- -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/1/2012 11:12 AM, Or Gerlitz wrote: > On 19/10/2012 23:58, Kleber Sacilotto de Souza wrote: >> During PCI error recovery, the calls to wait_for_completion() in the >> infiniband core path can hang waiting for some tasks that will never >> complete, since the hardware is nonfunctional. >> >> INFO: task eehd:16029 blocked for more than 120 seconds. >> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this >> message. >> eehd D 0000000000000000 9664 16029 3093 0x00000080 >> Call Trace: >> [c0000000af8430e0] [c0000000be05e200] 0xc0000000be05e200 (unreliable) >> [c0000000af8432b0] [c000000000014070] .__switch_to+0x130/0x250 >> [c0000000af843360] [c000000000777c5c] .__schedule+0x40c/0x920 >> [c0000000af843600] [c000000000775a10] .schedule_timeout+0x210/0x260 >> [c0000000af8436e0] [c000000000777658] .wait_for_common+0xf8/0x210 >> [c0000000af8437c0] [d000000004113538] >> .ib_unregister_mad_agent+0x498/0x690 [ib_mad] >> [c0000000af8438c0] [d0000000042c01f4] .ib_sa_remove_one+0xe4/0x180 >> [ib_sa] >> [c0000000af843970] [d0000000040952e8] .ib_unregister_device+0x78/0x170 >> [ib_core] >> [c0000000af843a10] [d000000004183290] .mlx4_ib_remove+0x40/0x1f0 >> [mlx4_ib] >> [c0000000af843ab0] [d000000003f6ab54] .mlx4_remove_device+0xd4/0x110 >> [mlx4_core] >> [c0000000af843b40] [d000000003f6abfc] >> .mlx4_unregister_device+0x6c/0xf0 [mlx4_core] >> [c0000000af843be0] [d000000003f6f19c] .mlx4_remove_one+0x10c/0x3a0 >> [mlx4_core] >> [c0000000af843c80] [d000000003f6f448] .mlx4_pci_err_detected+0x18/0x40 >> [mlx4_core] >> [c0000000af843d00] [c000000000058a50] .eeh_report_error+0x70/0xe0 >> [c0000000af843d90] [c0000000003e4d94] .pci_walk_bus+0xa4/0x140 >> [c0000000af843e50] [c000000000058628] .handle_eeh_events+0x1f8/0x480 >> [c0000000af843f00] [c000000000058dfc] .eeh_event_handler+0x13c/0x1e0 >> [c0000000af843f90] [c00000000002031c] .kernel_thread+0x54/0x70 >> >> This patch fixes the issue by replacing the calls to >> wait_for_completion() by wait_for_completion_timeout(), providing a >> timeout of 5 seconds to wait for the normal completion of the task. >> >> Signed-off-by: Kleber Sacilotto de Souza<klebers@linux.vnet.ibm.com> >> --- >> drivers/infiniband/core/addr.c | 2 +- >> drivers/infiniband/core/cm.c | 2 +- >> drivers/infiniband/core/cma.c | 4 ++-- >> drivers/infiniband/core/iwcm.c | 2 +- >> drivers/infiniband/core/mad.c | 4 ++-- >> drivers/infiniband/core/mad_rmpp.c | 2 +- >> drivers/infiniband/core/multicast.c | 4 ++-- >> drivers/infiniband/core/sa_query.c | 2 +- > > for all the above files, your patch makes sense, since the > completion/refcount logic > relate to waiting for event that needs to come from the device. Is this a fix or a workaround for some other underlying issue which needs fixing ? What happens when modules are reloaded after this ? Do things still work ? Are resources lost ? -- Hal > >> drivers/infiniband/core/ucm.c | 2 +- >> drivers/infiniband/core/ucma.c | 2 +- > > On these files, as far as I understand this code from quick looking, I'm > not sure on what > exactly the completion objects protects, Sean? > > Or. > >> drivers/infiniband/core/uverbs_main.c | 4 ++-- > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/01/2012 01:30 PM, Hal Rosenstock wrote: > On 11/1/2012 11:12 AM, Or Gerlitz wrote: >> On 19/10/2012 23:58, Kleber Sacilotto de Souza wrote: >>> During PCI error recovery, the calls to wait_for_completion() in the >>> infiniband core path can hang waiting for some tasks that will never >>> complete, since the hardware is nonfunctional. >>> >>> INFO: task eehd:16029 blocked for more than 120 seconds. >>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this >>> message. >>> eehd D 0000000000000000 9664 16029 3093 0x00000080 >>> Call Trace: >>> [c0000000af8430e0] [c0000000be05e200] 0xc0000000be05e200 (unreliable) >>> [c0000000af8432b0] [c000000000014070] .__switch_to+0x130/0x250 >>> [c0000000af843360] [c000000000777c5c] .__schedule+0x40c/0x920 >>> [c0000000af843600] [c000000000775a10] .schedule_timeout+0x210/0x260 >>> [c0000000af8436e0] [c000000000777658] .wait_for_common+0xf8/0x210 >>> [c0000000af8437c0] [d000000004113538] >>> .ib_unregister_mad_agent+0x498/0x690 [ib_mad] >>> [c0000000af8438c0] [d0000000042c01f4] .ib_sa_remove_one+0xe4/0x180 >>> [ib_sa] >>> [c0000000af843970] [d0000000040952e8] .ib_unregister_device+0x78/0x170 >>> [ib_core] >>> [c0000000af843a10] [d000000004183290] .mlx4_ib_remove+0x40/0x1f0 >>> [mlx4_ib] >>> [c0000000af843ab0] [d000000003f6ab54] .mlx4_remove_device+0xd4/0x110 >>> [mlx4_core] >>> [c0000000af843b40] [d000000003f6abfc] >>> .mlx4_unregister_device+0x6c/0xf0 [mlx4_core] >>> [c0000000af843be0] [d000000003f6f19c] .mlx4_remove_one+0x10c/0x3a0 >>> [mlx4_core] >>> [c0000000af843c80] [d000000003f6f448] .mlx4_pci_err_detected+0x18/0x40 >>> [mlx4_core] >>> [c0000000af843d00] [c000000000058a50] .eeh_report_error+0x70/0xe0 >>> [c0000000af843d90] [c0000000003e4d94] .pci_walk_bus+0xa4/0x140 >>> [c0000000af843e50] [c000000000058628] .handle_eeh_events+0x1f8/0x480 >>> [c0000000af843f00] [c000000000058dfc] .eeh_event_handler+0x13c/0x1e0 >>> [c0000000af843f90] [c00000000002031c] .kernel_thread+0x54/0x70 >>> >>> This patch fixes the issue by replacing the calls to >>> wait_for_completion() by wait_for_completion_timeout(), providing a >>> timeout of 5 seconds to wait for the normal completion of the task. >>> >>> Signed-off-by: Kleber Sacilotto de Souza<klebers@linux.vnet.ibm.com> >>> --- >>> drivers/infiniband/core/addr.c | 2 +- >>> drivers/infiniband/core/cm.c | 2 +- >>> drivers/infiniband/core/cma.c | 4 ++-- >>> drivers/infiniband/core/iwcm.c | 2 +- >>> drivers/infiniband/core/mad.c | 4 ++-- >>> drivers/infiniband/core/mad_rmpp.c | 2 +- >>> drivers/infiniband/core/multicast.c | 4 ++-- >>> drivers/infiniband/core/sa_query.c | 2 +- >> for all the above files, your patch makes sense, since the >> completion/refcount logic >> relate to waiting for event that needs to come from the device. > Is this a fix or a workaround for some other underlying issue which > needs fixing ? > > What happens when modules are reloaded after this ? Do things still work > ? Are resources lost ? After the error is recovered, the IB devices resume their normal operations, and the modules can be reloaded successfully. > > -- Hal > >>> drivers/infiniband/core/ucm.c | 2 +- >>> drivers/infiniband/core/ucma.c | 2 +- >> On these files, as far as I understand this code from quick looking, I'm >> not sure on what >> exactly the completion objects protects, Sean? >> >> Or. >> >>> drivers/infiniband/core/uverbs_main.c | 4 ++-- >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html >>
On 11/1/2012 1:53 PM, Kleber Sacilotto de Souza wrote: > On 11/01/2012 01:30 PM, Hal Rosenstock wrote: >> On 11/1/2012 11:12 AM, Or Gerlitz wrote: >>> On 19/10/2012 23:58, Kleber Sacilotto de Souza wrote: >>>> During PCI error recovery, the calls to wait_for_completion() in the >>>> infiniband core path can hang waiting for some tasks that will never >>>> complete, since the hardware is nonfunctional. >>>> >>>> INFO: task eehd:16029 blocked for more than 120 seconds. >>>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this >>>> message. >>>> eehd D 0000000000000000 9664 16029 3093 0x00000080 >>>> Call Trace: >>>> [c0000000af8430e0] [c0000000be05e200] 0xc0000000be05e200 (unreliable) >>>> [c0000000af8432b0] [c000000000014070] .__switch_to+0x130/0x250 >>>> [c0000000af843360] [c000000000777c5c] .__schedule+0x40c/0x920 >>>> [c0000000af843600] [c000000000775a10] .schedule_timeout+0x210/0x260 >>>> [c0000000af8436e0] [c000000000777658] .wait_for_common+0xf8/0x210 >>>> [c0000000af8437c0] [d000000004113538] >>>> .ib_unregister_mad_agent+0x498/0x690 [ib_mad] >>>> [c0000000af8438c0] [d0000000042c01f4] .ib_sa_remove_one+0xe4/0x180 >>>> [ib_sa] >>>> [c0000000af843970] [d0000000040952e8] .ib_unregister_device+0x78/0x170 >>>> [ib_core] >>>> [c0000000af843a10] [d000000004183290] .mlx4_ib_remove+0x40/0x1f0 >>>> [mlx4_ib] >>>> [c0000000af843ab0] [d000000003f6ab54] .mlx4_remove_device+0xd4/0x110 >>>> [mlx4_core] >>>> [c0000000af843b40] [d000000003f6abfc] >>>> .mlx4_unregister_device+0x6c/0xf0 [mlx4_core] >>>> [c0000000af843be0] [d000000003f6f19c] .mlx4_remove_one+0x10c/0x3a0 >>>> [mlx4_core] >>>> [c0000000af843c80] [d000000003f6f448] .mlx4_pci_err_detected+0x18/0x40 >>>> [mlx4_core] >>>> [c0000000af843d00] [c000000000058a50] .eeh_report_error+0x70/0xe0 >>>> [c0000000af843d90] [c0000000003e4d94] .pci_walk_bus+0xa4/0x140 >>>> [c0000000af843e50] [c000000000058628] .handle_eeh_events+0x1f8/0x480 >>>> [c0000000af843f00] [c000000000058dfc] .eeh_event_handler+0x13c/0x1e0 >>>> [c0000000af843f90] [c00000000002031c] .kernel_thread+0x54/0x70 >>>> >>>> This patch fixes the issue by replacing the calls to >>>> wait_for_completion() by wait_for_completion_timeout(), providing a >>>> timeout of 5 seconds to wait for the normal completion of the task. >>>> >>>> Signed-off-by: Kleber Sacilotto de Souza<klebers@linux.vnet.ibm.com> >>>> --- >>>> drivers/infiniband/core/addr.c | 2 +- >>>> drivers/infiniband/core/cm.c | 2 +- >>>> drivers/infiniband/core/cma.c | 4 ++-- >>>> drivers/infiniband/core/iwcm.c | 2 +- >>>> drivers/infiniband/core/mad.c | 4 ++-- >>>> drivers/infiniband/core/mad_rmpp.c | 2 +- >>>> drivers/infiniband/core/multicast.c | 4 ++-- >>>> drivers/infiniband/core/sa_query.c | 2 +- >>> for all the above files, your patch makes sense, since the >>> completion/refcount logic >>> relate to waiting for event that needs to come from the device. >> Is this a fix or a workaround for some other underlying issue which >> needs fixing ? >> >> What happens when modules are reloaded after this ? Do things still work >> ? Are resources lost ? > > After the error is recovered, the IB devices resume their normal operations, > and the modules can be reloaded successfully. Isn't this paving over some lower layer issue where the required completions aren't issued ? Shouldn't that be found and fixed ? -- Hal > >> >> -- Hal >> >>>> drivers/infiniband/core/ucm.c | 2 +- >>>> drivers/infiniband/core/ucma.c | 2 +- >>> On these files, as far as I understand this code from quick looking, I'm >>> not sure on what >>> exactly the completion objects protects, Sean? >>> >>> Or. >>> >>>> drivers/infiniband/core/uverbs_main.c | 4 ++-- >>> >>> -- >>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >>> the body of a message to majordomo@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html >>> > > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Oct 19, 2012 at 2:58 PM, Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> wrote: > void rdma_addr_unregister_client(struct rdma_addr_client *client) > { > put_client(client); > - wait_for_completion(&client->comp); > + wait_for_completion_timeout(&client->comp, 5*HZ); > } > EXPORT_SYMBOL(rdma_addr_unregister_client); This type of change looks pretty risky to me. Aren't we just letting things continue with a potential use-after-free hanging around? Where does 5 seconds as a timeout come from? Can we be confident that nothing will complete client->comp after 5 seconds? -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > drivers/infiniband/core/ucm.c | 2 +- > > drivers/infiniband/core/ucma.c | 2 +- > > On these files, as far as I understand this code from quick looking, I'm > not sure on what > exactly the completion objects protects, Sean? These protect against the caller trying to destroy an object that is in use. For example, it handles the case where the user calls 'connect' on a given context, then calls 'destroy' on that same context from another thread. We shouldn't need to deal with any timeouts at this level. - Sean -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Nov 5, 2012 at 7:21 PM, Hefty, Sean <sean.hefty@intel.com> wrote: > > > > drivers/infiniband/core/ucm.c | 2 +- > > > drivers/infiniband/core/ucma.c | 2 +- > > > > On these files, as far as I understand this code from quick looking, I'm > > not sure on what > > exactly the completion objects protects, Sean? > > These protect against the caller trying to destroy an object that is in > use. For example, it handles the case where the user calls 'connect' on a > given context, then calls 'destroy' on that same context from another > thread. > > We shouldn't need to deal with any timeouts at this level. > OK, thanks, this was my thought as well -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/5/2012 1:21 PM, Hefty, Sean wrote: >>> drivers/infiniband/core/ucm.c | 2 +- >>> drivers/infiniband/core/ucma.c | 2 +- >> >> On these files, as far as I understand this code from quick looking, I'm >> not sure on what >> exactly the completion objects protects, Sean? > > These protect against the caller trying to destroy an object that is in use. For example, it handles the case where the user calls 'connect' on a given context, then calls 'destroy' on that same context from another thread. > > We shouldn't need to deal with any timeouts at this level. I believe there's some lower level issue which needs to be found and fixed where some driver is not returning the completions for some reason. -- Hal > > - Sean > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/05/2012 05:15 PM, Hal Rosenstock wrote: > On 11/5/2012 1:21 PM, Hefty, Sean wrote: >>>> drivers/infiniband/core/ucm.c | 2 +- >>>> drivers/infiniband/core/ucma.c | 2 +- >>> On these files, as far as I understand this code from quick looking, I'm >>> not sure on what >>> exactly the completion objects protects, Sean? >> These protect against the caller trying to destroy an object that is in use. For example, it handles the case where the user calls 'connect' on a given context, then calls 'destroy' on that same context from another thread. >> >> We shouldn't need to deal with any timeouts at this level. > I believe there's some lower level issue which needs to be found and > fixed where some driver is not returning the completions for some reason. The driver is not returning the completions because during EEH (Extended Error Handling) recovery on powerpc systems the PCI slot is frozen, and we are not going to receive any interrupt from the adapter until we remove it completely and reset it. Kleber > > -- Hal > >> - Sean >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html >>
> On Mon, Nov 5, 2012 at 9:54 PM, Kleber Sacilotto de Souza > <klebers@linux.vnet.ibm.com> wrote: >> >> >> The driver is not returning the completions because during EEH (Extended >> Error Handling) recovery on powerpc systems the PCI slot is frozen, and >> we are not going to receive any interrupt from the adapter until we >> remove it completely and reset it. >> maybe, but per Sean's comment this possible problem is irrelevant for ucm/ucma -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/06/2012 03:30 AM, Or Gerlitz wrote: >> On Mon, Nov 5, 2012 at 9:54 PM, Kleber Sacilotto de Souza >> <klebers@linux.vnet.ibm.com> wrote: >>> >>> The driver is not returning the completions because during EEH (Extended >>> Error Handling) recovery on powerpc systems the PCI slot is frozen, and >>> we are not going to receive any interrupt from the adapter until we >>> remove it completely and reset it. >>> > maybe, but per Sean's comment this possible problem is irrelevant for ucm/ucma > During my tests I've seen the wait_for_completion() call hanging on different parts of the code, but not on ucm/ucma. So would it be OK to change the other calls and leave the ucm/ucma as it is? Thank you.
On Tue, Nov 6, 2012 at 11:58 AM, Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> wrote: > During my tests I've seen the wait_for_completion() call hanging on > different parts of the code, but not on ucm/ucma. So would it be OK to > change the other calls and leave the ucm/ucma as it is? ucm/ucma you should remove in V1, I'm not fully sure on the uverbs part, it needs more review, the other parts are OK to me. Or. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Nov 6, 2012 at 6:44 PM, Or Gerlitz <or.gerlitz@gmail.com> wrote:
> the other parts are OK to me.
I wanted to say that you stepped on real problem and provided a
solution, Roland's wondering is ofcourse
correct, how do we avoid use after free in very slow non pci hotunplug cases?!
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index eaec8d7..be42113 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -86,7 +86,7 @@ static inline void put_client(struct rdma_addr_client *client) void rdma_addr_unregister_client(struct rdma_addr_client *client) { put_client(client); - wait_for_completion(&client->comp); + wait_for_completion_timeout(&client->comp, 5*HZ); } EXPORT_SYMBOL(rdma_addr_unregister_client); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 394fea2..062a90b 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -910,7 +910,7 @@ retest: cm_free_id(cm_id->local_id); cm_deref_id(cm_id_priv); - wait_for_completion(&cm_id_priv->comp); + wait_for_completion_timeout(&cm_id_priv->comp, 5*HZ); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); kfree(cm_id_priv->compare_data); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a7568c3..c85d078 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -944,7 +944,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) cma_release_port(id_priv); cma_deref_id(id_priv); - wait_for_completion(&id_priv->comp); + wait_for_completion_timeout(&id_priv->comp, 5*HZ); if (id_priv->internal_id) cma_deref_id(id_priv->id.context); @@ -3388,7 +3388,7 @@ static void cma_process_remove(struct cma_device *cma_dev) mutex_unlock(&lock); cma_deref_dev(cma_dev); - wait_for_completion(&cma_dev->comp); + wait_for_completion_timeout(&cma_dev->comp, 5*HZ); } static void cma_remove_one(struct ib_device *device) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 0bb99bb..7457195 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -399,7 +399,7 @@ void iw_destroy_cm_id(struct iw_cm_id *cm_id) destroy_cm_id(cm_id); - wait_for_completion(&cm_id_priv->destroy_comp); + wait_for_completion_timeout(&cm_id_priv->destroy_comp, 5*HZ); free_cm_id(cm_id_priv); } diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index dc3fd1e..bf94a67 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -539,7 +539,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) ib_cancel_rmpp_recvs(mad_agent_priv); deref_mad_agent(mad_agent_priv); - wait_for_completion(&mad_agent_priv->comp); + wait_for_completion_timeout(&mad_agent_priv->comp, 5*HZ); kfree(mad_agent_priv->reg_req); ib_dereg_mr(mad_agent_priv->agent.mr); @@ -558,7 +558,7 @@ static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) spin_unlock_irqrestore(&qp_info->snoop_lock, flags); deref_snoop_agent(mad_snoop_priv); - wait_for_completion(&mad_snoop_priv->comp); + wait_for_completion_timeout(&mad_snoop_priv->comp, 5*HZ); kfree(mad_snoop_priv); } diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index f37878c..562bf84 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -78,7 +78,7 @@ static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) { deref_rmpp_recv(rmpp_recv); - wait_for_completion(&rmpp_recv->comp); + wait_for_completion_timeout(&rmpp_recv->comp, 5*HZ); ib_destroy_ah(rmpp_recv->ah); kfree(rmpp_recv); } diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index d2360a8..0cba0c6 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -684,7 +684,7 @@ void ib_sa_free_multicast(struct ib_sa_multicast *multicast) } deref_member(member); - wait_for_completion(&member->comp); + wait_for_completion_timeout(&member->comp, 5*HZ); ib_sa_client_put(member->client); kfree(member); } @@ -862,7 +862,7 @@ static void mcast_remove_one(struct ib_device *device) IB_LINK_LAYER_INFINIBAND) { port = &dev->port[i]; deref_port(port); - wait_for_completion(&port->comp); + wait_for_completion_timeout(&port->comp, 5*HZ); } } diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a8905ab..25547fe 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -474,7 +474,7 @@ EXPORT_SYMBOL(ib_sa_register_client); void ib_sa_unregister_client(struct ib_sa_client *client) { ib_sa_client_put(client); - wait_for_completion(&client->comp); + wait_for_completion_timeout(&client->comp, 5*HZ); } EXPORT_SYMBOL(ib_sa_unregister_client); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 49b15ac..6358941 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -550,7 +550,7 @@ static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, return PTR_ERR(ctx); ib_ucm_ctx_put(ctx); - wait_for_completion(&ctx->comp); + wait_for_completion_timeout(&ctx->comp, 5*HZ); /* No new events will be generated after destroying the cm_id. */ ib_destroy_cm_id(ctx->cm_id); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 2709ff5..4b4eb29 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -516,7 +516,7 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, return PTR_ERR(ctx); ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); + wait_for_completion_timeout(&ctx->comp, 5*HZ); resp.events_reported = ucma_free_ctx(ctx); if (copy_to_user((void __user *)(unsigned long)cmd.response, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 6f2ce6f..0824b92 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -819,7 +819,7 @@ err_cdev: err: kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); - wait_for_completion(&uverbs_dev->comp); + wait_for_completion_timeout(&uverbs_dev->comp, 5*HZ); kfree(uverbs_dev); return; } @@ -841,7 +841,7 @@ static void ib_uverbs_remove_one(struct ib_device *device) clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); - wait_for_completion(&uverbs_dev->comp); + wait_for_completion_timeout(&uverbs_dev->comp, 5*HZ); kfree(uverbs_dev); }
During PCI error recovery, the calls to wait_for_completion() in the infiniband core path can hang waiting for some tasks that will never complete, since the hardware is nonfunctional. INFO: task eehd:16029 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. eehd D 0000000000000000 9664 16029 3093 0x00000080 Call Trace: [c0000000af8430e0] [c0000000be05e200] 0xc0000000be05e200 (unreliable) [c0000000af8432b0] [c000000000014070] .__switch_to+0x130/0x250 [c0000000af843360] [c000000000777c5c] .__schedule+0x40c/0x920 [c0000000af843600] [c000000000775a10] .schedule_timeout+0x210/0x260 [c0000000af8436e0] [c000000000777658] .wait_for_common+0xf8/0x210 [c0000000af8437c0] [d000000004113538] .ib_unregister_mad_agent+0x498/0x690 [ib_mad] [c0000000af8438c0] [d0000000042c01f4] .ib_sa_remove_one+0xe4/0x180 [ib_sa] [c0000000af843970] [d0000000040952e8] .ib_unregister_device+0x78/0x170 [ib_core] [c0000000af843a10] [d000000004183290] .mlx4_ib_remove+0x40/0x1f0 [mlx4_ib] [c0000000af843ab0] [d000000003f6ab54] .mlx4_remove_device+0xd4/0x110 [mlx4_core] [c0000000af843b40] [d000000003f6abfc] .mlx4_unregister_device+0x6c/0xf0 [mlx4_core] [c0000000af843be0] [d000000003f6f19c] .mlx4_remove_one+0x10c/0x3a0 [mlx4_core] [c0000000af843c80] [d000000003f6f448] .mlx4_pci_err_detected+0x18/0x40 [mlx4_core] [c0000000af843d00] [c000000000058a50] .eeh_report_error+0x70/0xe0 [c0000000af843d90] [c0000000003e4d94] .pci_walk_bus+0xa4/0x140 [c0000000af843e50] [c000000000058628] .handle_eeh_events+0x1f8/0x480 [c0000000af843f00] [c000000000058dfc] .eeh_event_handler+0x13c/0x1e0 [c0000000af843f90] [c00000000002031c] .kernel_thread+0x54/0x70 This patch fixes the issue by replacing the calls to wait_for_completion() by wait_for_completion_timeout(), providing a timeout of 5 seconds to wait for the normal completion of the task. Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/cm.c | 2 +- drivers/infiniband/core/cma.c | 4 ++-- drivers/infiniband/core/iwcm.c | 2 +- drivers/infiniband/core/mad.c | 4 ++-- drivers/infiniband/core/mad_rmpp.c | 2 +- drivers/infiniband/core/multicast.c | 4 ++-- drivers/infiniband/core/sa_query.c | 2 +- drivers/infiniband/core/ucm.c | 2 +- drivers/infiniband/core/ucma.c | 2 +- drivers/infiniband/core/uverbs_main.c | 4 ++-- 11 files changed, 15 insertions(+), 15 deletions(-)