diff mbox series

[for-next,3/9] RDMA/hns: Completely release qp resources when hw err

Message ID 1565343666-73193-4-git-send-email-oulijun@huawei.com (mailing list archive)
State Rejected
Headers show
Series Bugfixes for 5.3-rc2 | expand

Commit Message

Lijun Ou Aug. 9, 2019, 9:41 a.m. UTC
From: Yangyang Li <liyangyang20@huawei.com>

Even if there is no response from the hardware, make sure that QP-related
resources are completely released.

Signed-off-by: Yangyang Li <liyangyang20@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

Comments

Doug Ledford Aug. 12, 2019, 3:29 p.m. UTC | #1
On Fri, 2019-08-09 at 17:41 +0800, Lijun Ou wrote:
> From: Yangyang Li <liyangyang20@huawei.com>
> 
> Even if no response from hardware, make sure that qp related
> resources are completely released.
> 
> Signed-off-by: Yangyang Li <liyangyang20@huawei.com>
> ---
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 ++++--------
>  1 file changed, 4 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> index 7a14f0b..0409851 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> @@ -4562,16 +4562,14 @@ static int
> hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
>  {
>  	struct hns_roce_cq *send_cq, *recv_cq;
>  	struct ib_device *ibdev = &hr_dev->ib_dev;
> -	int ret;
> +	int ret = 0;
>  
>  	if (hr_qp->ibqp.qp_type == IB_QPT_RC && hr_qp->state !=
> IB_QPS_RESET) {
>  		/* Modify qp to reset before destroying qp */
>  		ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
>  					    hr_qp->state, IB_QPS_RESET);
> -		if (ret) {
> +		if (ret)
>  			ibdev_err(ibdev, "modify QP to Reset
> failed.\n");
> -			return ret;
> -		}
>  	}
>  
>  	send_cq = to_hr_cq(hr_qp->ibqp.send_cq);
> @@ -4627,7 +4625,7 @@ static int hns_roce_v2_destroy_qp_common(struct
> hns_roce_dev *hr_dev,
>  		kfree(hr_qp->rq_inl_buf.wqe_list);
>  	}
>  
> -	return 0;
> +	return ret;
>  }
>  
>  static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata
> *udata)
> @@ -4637,11 +4635,9 @@ static int hns_roce_v2_destroy_qp(struct ib_qp
> *ibqp, struct ib_udata *udata)
>  	int ret;
>  
>  	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
> -	if (ret) {
> +	if (ret)
>  		ibdev_err(&hr_dev->ib_dev, "Destroy qp 0x%06lx
> failed(%d)\n",
>  			  hr_qp->qpn, ret);
> -		return ret;
> -	}
>  
>  	if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
>  		kfree(hr_to_hr_sqp(hr_qp));

I don't know your hardware, but this patch sounds wrong/dangerous to me.
As long as the resources this card might access are allocated by the
kernel, you can't get random data corruption by the card writing to
memory used elsewhere in the kernel.  So if your card is not responding
to your requests to free the resources, it would seem safer to leak
those resources permanently than to free them and risk the card coming
back to life long enough to corrupt memory reallocated to some other
task.

Only if you can guarantee me that there is no way your commands to the
card will fail and then the card start working again later would I
consider this patch safe.  And if it's possible for the card to hang
like this, should that be triggering a reset of the device?
Yangyang Li Aug. 14, 2019, 6:02 a.m. UTC | #2
Hi, Doug
Thanks a lot for your reply.

在 2019/8/12 23:29, Doug Ledford 写道:
> On Fri, 2019-08-09 at 17:41 +0800, Lijun Ou wrote:
>> From: Yangyang Li <liyangyang20@huawei.com>
>>
>> Even if no response from hardware, make sure that qp related
>> resources are completely released.
>>
>> Signed-off-by: Yangyang Li <liyangyang20@huawei.com>
>> ---
>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 ++++--------
>>  1 file changed, 4 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> index 7a14f0b..0409851 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> @@ -4562,16 +4562,14 @@ static int
>> hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
>>  {
>>  	struct hns_roce_cq *send_cq, *recv_cq;
>>  	struct ib_device *ibdev = &hr_dev->ib_dev;
>> -	int ret;
>> +	int ret = 0;
>>  
>>  	if (hr_qp->ibqp.qp_type == IB_QPT_RC && hr_qp->state !=
>> IB_QPS_RESET) {
>>  		/* Modify qp to reset before destroying qp */
>>  		ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
>>  					    hr_qp->state, IB_QPS_RESET);
>> -		if (ret) {
>> +		if (ret)
>>  			ibdev_err(ibdev, "modify QP to Reset
>> failed.\n");
>> -			return ret;
>> -		}
>>  	}
>>  
>>  	send_cq = to_hr_cq(hr_qp->ibqp.send_cq);
>> @@ -4627,7 +4625,7 @@ static int hns_roce_v2_destroy_qp_common(struct
>> hns_roce_dev *hr_dev,
>>  		kfree(hr_qp->rq_inl_buf.wqe_list);
>>  	}
>>  
>> -	return 0;
>> +	return ret;
>>  }
>>  
>>  static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata
>> *udata)
>> @@ -4637,11 +4635,9 @@ static int hns_roce_v2_destroy_qp(struct ib_qp
>> *ibqp, struct ib_udata *udata)
>>  	int ret;
>>  
>>  	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
>> -	if (ret) {
>> +	if (ret)
>>  		ibdev_err(&hr_dev->ib_dev, "Destroy qp 0x%06lx
>> failed(%d)\n",
>>  			  hr_qp->qpn, ret);
>> -		return ret;
>> -	}
>>  
>>  	if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
>>  		kfree(hr_to_hr_sqp(hr_qp));
> 
> I don't know your hardware, but this patch sounds wrong/dangerous to me.
> As long as the resources this card might access are allocated by the
> kernel, you can't get random data corruption by the card writing to
> memory used elsewhere in the kernel.  So if your card is not responding
> to your requests to free the resources, it would seem safer to leak
> those resources permanently than to free them and risk the card coming
> back to life long enough to corrupt memory reallocated to some other
> task.
> 
> Only if you can guarantee me that there is no way your commands to the
> card will fail and then the card start working again later would I
> consider this patch safe.  And if it's possible for the card to hang
> like this, should that be triggering a reset of the device?
> 

Thanks for your suggestion. I agree with you: it would seem safer to leak
those resources permanently than to free them. I will abandon this change
and consider cleaning up these leaked resources during uninstallation or reset.

Thanks
Doug Ledford Aug. 14, 2019, 3:05 p.m. UTC | #3
On Wed, 2019-08-14 at 14:02 +0800, Yangyang Li wrote:
> > I don't know your hardware, but this patch sounds wrong/dangerous to
> > me.
> > As long as the resources this card might access are allocated by the
> > kernel, you can't get random data corruption by the card writing to
> > memory used elsewhere in the kernel.  So if your card is not
> > responding
> > to your requests to free the resources, it would seem safer to leak
> > those resources permanently than to free them and risk the card
> > coming
> > back to life long enough to corrupt memory reallocated to some other
> > task.
> > 
> > Only if you can guarantee me that there is no way your commands to
> > the
> > card will fail and then the card start working again later would I
> > consider this patch safe.  And if it's possible for the card to hang
> > like this, should that be triggering a reset of the device?
> > 
> 
> Thanks for your suggestion, I agree with you, it would seem safer to
> leak
> those resources permanently than to free them. I will abandon this
> change
> and consider cleaning up these leaked resources during uninstallation
> or reset.

Ok, patch dropped from patchworks, thanks.
Leon Romanovsky Aug. 14, 2019, 6:47 p.m. UTC | #4
On Wed, Aug 14, 2019 at 11:05:04AM -0400, Doug Ledford wrote:
> On Wed, 2019-08-14 at 14:02 +0800, Yangyang Li wrote:
> > > I don't know your hardware, but this patch sounds wrong/dangerous to
> > > me.
> > > As long as the resources this card might access are allocated by the
> > > kernel, you can't get random data corruption by the card writing to
> > > memory used elsewhere in the kernel.  So if your card is not
> > > responding
> > > to your requests to free the resources, it would seem safer to leak
> > > those resources permanently than to free them and risk the card
> > > coming
> > > back to life long enough to corrupt memory reallocated to some other
> > > task.
> > >
> > > Only if you can guarantee me that there is no way your commands to
> > > the
> > > card will fail and then the card start working again later would I
> > > consider this patch safe.  And if it's possible for the card to hang
> > > like this, should that be triggering a reset of the device?
> > >
> >
> > Thanks for your suggestion, I agree with you, it would seem safer to
> > leak
> > those resources permanently than to free them. I will abandon this
> > change
> > and consider cleaning up these leaked resources during uninstallation
> > or reset.
>
> Ok, patch dropped from patchworks, thanks.

Sorry for being late, but I don't like the idea of having leaked memory.

All my allocation patches are actually trying to avoid such a situation
by ensuring that no driver does crazy stuff like this. It means that
once I have time to work on QP allocations, I'll ensure that memory
is freed, so it is better to free such memory now.

Thanks

>
> --
> Doug Ledford <dledford@redhat.com>
>     GPG KeyID: B826A3330E572FDD
>     Fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD
Doug Ledford Aug. 19, 2019, 5:39 p.m. UTC | #5
On Wed, 2019-08-14 at 21:47 +0300, Leon Romanovsky wrote:
> On Wed, Aug 14, 2019 at 11:05:04AM -0400, Doug Ledford wrote:
> > On Wed, 2019-08-14 at 14:02 +0800, Yangyang Li wrote:
> > > > I don't know your hardware, but this patch sounds
> > > > wrong/dangerous to
> > > > me.
> > > > As long as the resources this card might access are allocated by
> > > > the
> > > > kernel, you can't get random data corruption by the card writing
> > > > to
> > > > memory used elsewhere in the kernel.  So if your card is not
> > > > responding
> > > > to your requests to free the resources, it would seem safer to
> > > > leak
> > > > those resources permanently than to free them and risk the card
> > > > coming
> > > > back to life long enough to corrupt memory reallocated to some
> > > > other
> > > > task.
> > > > 
> > > > Only if you can guarantee me that there is no way your commands
> > > > to
> > > > the
> > > > card will fail and then the card start working again later would
> > > > I
> > > > consider this patch safe.  And if it's possible for the card to
> > > > hang
> > > > like this, should that be triggering a reset of the device?
> > > > 
> > > 
> > > Thanks for your suggestion, I agree with you, it would seem safer
> > > to
> > > leak
> > > those resources permanently than to free them. I will abandon this
> > > change
> > > and consider cleaning up these leaked resources during
> > > uninstallation
> > > or reset.
> > 
> > Ok, patch dropped from patchworks, thanks.
> 
> Sorry for being late, but I don't like the idea of having leaked
> memory.
> 
> All my allocation patches are actually trying to avoid such situation
> by ensuring that no driver does crazy stuff like this. It means that
> once I'll have time to work on QP allocations, I'll ensure that memory
> is freed, so it is better to free such memory now.

You can't free something if the card might still access it.  A leak is
always preferable to data corruption.
Weihang Li Oct. 8, 2019, 8:43 a.m. UTC | #6
> -----Original Message-----
> From: Linuxarm [mailto:linuxarm-bounces@huawei.com] On Behalf Of Doug
> Ledford
> Sent: Tuesday, August 20, 2019 1:40 AM
> To: Leon Romanovsky <leon@kernel.org>
> Cc: linux-rdma@vger.kernel.org; Linuxarm <linuxarm@huawei.com>;
> jgg@ziepe.ca
> Subject: Re: [PATCH for-next 3/9] RDMA/hns: Completely release qp
> resources when hw err
> 
> On Wed, 2019-08-14 at 21:47 +0300, Leon Romanovsky wrote:
> > On Wed, Aug 14, 2019 at 11:05:04AM -0400, Doug Ledford wrote:
> > > On Wed, 2019-08-14 at 14:02 +0800, Yangyang Li wrote:
> > > > > I don't know your hardware, but this patch sounds
> > > > > wrong/dangerous to me.
> > > > > As long as the resources this card might access are allocated by
> > > > > the kernel, you can't get random data corruption by the card
> > > > > writing to memory used elsewhere in the kernel.  So if your card
> > > > > is not responding to your requests to free the resources, it
> > > > > would seem safer to leak those resources permanently than to
> > > > > free them and risk the card coming back to life long enough to
> > > > > corrupt memory reallocated to some other task.
> > > > >
> > > > > Only if you can guarantee me that there is no way your commands
> > > > > to the card will fail and then the card start working again
> > > > > later would I consider this patch safe.  And if it's possible
> > > > > for the card to hang like this, should that be triggering a
> > > > > reset of the device?
> > > > >
> > > >
> > > > Thanks for your suggestion, I agree with you, it would seem safer
> > > > to leak those resources permanently than to free them. I will
> > > > abandon this change and consider cleaning up these leaked
> > > > resources during uninstallation or reset.
> > >
> > > Ok, patch dropped from patchworks, thanks.
> >
> > Sorry for being late, but I don't like the idea of having leaked
> > memory.
> >
> > All my allocation patches are actually trying to avoid such situation
> > by ensuring that no driver does crazy stuff like this. It means that
> > once I'll have time to work on QP allocations, I'll ensure that memory
> > is freed, so it is better to free such memory now.
> 
> You can't free something if the card might still access it.  A leak is always
> preferable to data corruption.
> 

Hi Doug,

After considering all possible situations, we can confirm that hip08 hardware
won't start working again once a destroy command has failed. So it is safe to
free the QP resources even if the destroy command fails to execute. I will
resend this patch to avoid memory leaks, as Jason asked.

Thanks,
Weihang

> --
> Doug Ledford <dledford@redhat.com>
>     GPG KeyID: B826A3330E572FDD
>     Fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 7a14f0b..0409851 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -4562,16 +4562,14 @@  static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 {
 	struct hns_roce_cq *send_cq, *recv_cq;
 	struct ib_device *ibdev = &hr_dev->ib_dev;
-	int ret;
+	int ret = 0;
 
 	if (hr_qp->ibqp.qp_type == IB_QPT_RC && hr_qp->state != IB_QPS_RESET) {
 		/* Modify qp to reset before destroying qp */
 		ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
 					    hr_qp->state, IB_QPS_RESET);
-		if (ret) {
+		if (ret)
 			ibdev_err(ibdev, "modify QP to Reset failed.\n");
-			return ret;
-		}
 	}
 
 	send_cq = to_hr_cq(hr_qp->ibqp.send_cq);
@@ -4627,7 +4625,7 @@  static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 		kfree(hr_qp->rq_inl_buf.wqe_list);
 	}
 
-	return 0;
+	return ret;
 }
 
 static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
@@ -4637,11 +4635,9 @@  static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 	int ret;
 
 	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
-	if (ret) {
+	if (ret)
 		ibdev_err(&hr_dev->ib_dev, "Destroy qp 0x%06lx failed(%d)\n",
 			  hr_qp->qpn, ret);
-		return ret;
-	}
 
 	if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
 		kfree(hr_to_hr_sqp(hr_qp));