diff mbox series

[net-next,v2,10/10] net/smc: fix application data exception

Message ID e590ca91e24d002608df29d100d4139977d0bcb6.1661407821.git.alibuda@linux.alibaba.com (mailing list archive)
State Superseded
Headers show
Series optimize the parallelism of SMC-R connections | expand

Commit Message

D. Wythe Aug. 26, 2022, 9:51 a.m. UTC
From: "D. Wythe" <alibuda@linux.alibaba.com>

After we optimize the parallel capability of SMC-R connection
establishment, There is a certain probability that following
exceptions will occur in the wrk benchmark test:

Running 10s test @ http://11.213.45.6:80
  8 threads and 64 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     3.72ms   13.94ms 245.33ms   94.17%
    Req/Sec     1.96k   713.67     5.41k    75.16%
  155262 requests in 10.10s, 23.10MB read
Non-2xx or 3xx responses: 3

We will find that the error is HTTP 400 error, which is a serious
exception in our test, which means the application data was
corrupted.

Consider the following scenarios:

CPU0                            CPU1

buf_desc->used = 0;
                                cmpxchg(buf_desc->used, 0, 1)
                                deal_with(buf_desc)

memset(buf_desc->cpu_addr,0);

This will cause the data received by a victim connection to be cleared,
thus triggering an HTTP 400 error in the server.

This patch exchange the order between clear used and memset, add
barrier to ensure memory consistency.

Fixes: 1c5526968e27 ("net/smc: Clear memory when release and reuse buffer")
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
---
 net/smc/smc_core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

Comments

Wen Gu Sept. 8, 2022, 9:37 a.m. UTC | #1
On 2022/8/26 17:51, D. Wythe wrote:

> From: "D. Wythe" <alibuda@linux.alibaba.com>
> 
> After we optimize the parallel capability of SMC-R connection
> establishment, There is a certain probability that following
> exceptions will occur in the wrk benchmark test:
> 
> Running 10s test @ http://11.213.45.6:80
>    8 threads and 64 connections
>    Thread Stats   Avg      Stdev     Max   +/- Stdev
>      Latency     3.72ms   13.94ms 245.33ms   94.17%
>      Req/Sec     1.96k   713.67     5.41k    75.16%
>    155262 requests in 10.10s, 23.10MB read
> Non-2xx or 3xx responses: 3
> 
> We will find that the error is HTTP 400 error, which is a serious
> exception in our test, which means the application data was
> corrupted.
> 
> Consider the following scenarios:
> 
> CPU0                            CPU1
> 
> buf_desc->used = 0;
>                                  cmpxchg(buf_desc->used, 0, 1)
>                                  deal_with(buf_desc)
> 
> memset(buf_desc->cpu_addr,0);
> 
> This will cause the data received by a victim connection to be cleared,
> thus triggering an HTTP 400 error in the server.
> 
> This patch exchange the order between clear used and memset, add
> barrier to ensure memory consistency.
> 
> Fixes: 1c5526968e27 ("net/smc: Clear memory when release and reuse buffer")
> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
> ---
>   net/smc/smc_core.c | 5 +++--
>   1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index 84bf84c..fdad953 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -1380,8 +1380,9 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
>   
>   		smc_buf_free(lgr, is_rmb, buf_desc);
>   	} else {
> -		buf_desc->used = 0;
> -		memset(buf_desc->cpu_addr, 0, buf_desc->len);
> +		/* memzero_explicit provides potential memory barrier semantics */
> +		memzero_explicit(buf_desc->cpu_addr, buf_desc->len);
> +		WRITE_ONCE(buf_desc->used, 0);
>   	}
>   }
>   

It seems that the same issue exists in smc_buf_unuse(), Maybe it also needs to be fixed?


static void smc_buf_unuse(struct smc_connection *conn,
			  struct smc_link_group *lgr)
{
	if (conn->sndbuf_desc) {
		if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) {
			smcr_buf_unuse(conn->sndbuf_desc, false, lgr);
		} else {
			conn->sndbuf_desc->used = 0;
			memset(conn->sndbuf_desc->cpu_addr, 0,
			       conn->sndbuf_desc->len);
                         ^...................
		}
	}
	if (conn->rmb_desc) {
		if (!lgr->is_smcd) {
			smcr_buf_unuse(conn->rmb_desc, true, lgr);
		} else {
			conn->rmb_desc->used = 0;
			memset(conn->rmb_desc->cpu_addr, 0,
			       conn->rmb_desc->len +
			       sizeof(struct smcd_cdc_msg));
                         ^...................
		}
	}
}

Thanks,
Wen Gu
D. Wythe Sept. 16, 2022, 5:24 a.m. UTC | #2
Hi, Wen Gu

This is indeed same issues, I will fix it in the next version.

Thanks
D. Wythe


On 9/8/22 5:37 PM, Wen Gu wrote:
> 
> 
> On 2022/8/26 17:51, D. Wythe wrote:
> 
>> From: "D. Wythe" <alibuda@linux.alibaba.com>
>>
>> After we optimize the parallel capability of SMC-R connection
>> establishment, There is a certain probability that following
>> exceptions will occur in the wrk benchmark test:
>>
>> Running 10s test @ http://11.213.45.6:80
>>    8 threads and 64 connections
>>    Thread Stats   Avg      Stdev     Max   +/- Stdev
>>      Latency     3.72ms   13.94ms 245.33ms   94.17%
>>      Req/Sec     1.96k   713.67     5.41k    75.16%
>>    155262 requests in 10.10s, 23.10MB read
>> Non-2xx or 3xx responses: 3
>>
>> We will find that the error is HTTP 400 error, which is a serious
>> exception in our test, which means the application data was
>> corrupted.
>>
>> Consider the following scenarios:
>>
>> CPU0                            CPU1
>>
>> buf_desc->used = 0;
>>                                  cmpxchg(buf_desc->used, 0, 1)
>>                                  deal_with(buf_desc)
>>
>> memset(buf_desc->cpu_addr,0);
>>
>> This will cause the data received by a victim connection to be cleared,
>> thus triggering an HTTP 400 error in the server.
>>
>> This patch exchange the order between clear used and memset, add
>> barrier to ensure memory consistency.
>>
>> Fixes: 1c5526968e27 ("net/smc: Clear memory when release and reuse buffer")
>> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
>> ---
>>   net/smc/smc_core.c | 5 +++--
>>   1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>> index 84bf84c..fdad953 100644
>> --- a/net/smc/smc_core.c
>> +++ b/net/smc/smc_core.c
>> @@ -1380,8 +1380,9 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
>>           smc_buf_free(lgr, is_rmb, buf_desc);
>>       } else {
>> -        buf_desc->used = 0;
>> -        memset(buf_desc->cpu_addr, 0, buf_desc->len);
>> +        /* memzero_explicit provides potential memory barrier semantics */
>> +        memzero_explicit(buf_desc->cpu_addr, buf_desc->len);
>> +        WRITE_ONCE(buf_desc->used, 0);
>>       }
>>   }
> 
> It seems that the same issue exists in smc_buf_unuse(), Maybe it also needs to be fixed?
> 
> 
> static void smc_buf_unuse(struct smc_connection *conn,
>                struct smc_link_group *lgr)
> {
>      if (conn->sndbuf_desc) {
>          if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) {
>              smcr_buf_unuse(conn->sndbuf_desc, false, lgr);
>          } else {
>              conn->sndbuf_desc->used = 0;
>              memset(conn->sndbuf_desc->cpu_addr, 0,
>                     conn->sndbuf_desc->len);
>                          ^...................
>          }
>      }
>      if (conn->rmb_desc) {
>          if (!lgr->is_smcd) {
>              smcr_buf_unuse(conn->rmb_desc, true, lgr);
>          } else {
>              conn->rmb_desc->used = 0;
>              memset(conn->rmb_desc->cpu_addr, 0,
>                     conn->rmb_desc->len +
>                     sizeof(struct smcd_cdc_msg));
>                          ^...................
>          }
>      }
> }
> 
> Thanks,
> Wen Gu
diff mbox series

Patch

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 84bf84c..fdad953 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1380,8 +1380,9 @@  static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
 
 		smc_buf_free(lgr, is_rmb, buf_desc);
 	} else {
-		buf_desc->used = 0;
-		memset(buf_desc->cpu_addr, 0, buf_desc->len);
+		/* memzero_explicit provides potential memory barrier semantics */
+		memzero_explicit(buf_desc->cpu_addr, buf_desc->len);
+		WRITE_ONCE(buf_desc->used, 0);
 	}
 }