diff mbox series

[RFC,1/1] SUNRPC: increase max timeout for rebind to handle NFS server restart

Message ID 1676016656-26195-1-git-send-email-dai.ngo@oracle.com (mailing list archive)
State New, archived
Headers show
Series [RFC,1/1] SUNRPC: increase max timeout for rebind to handle NFS server restart | expand

Commit Message

Dai Ngo Feb. 10, 2023, 8:10 a.m. UTC
Occasionally NLM lock and unlock requests fail with EIO and ENOLCK,
respectively. This usually happens when the NFS server is restarted
while an NLM lock test is running.

Currently there is a 9-second limit for retrying the bind operation.
If the server is under load, the port mapper might take more than 9
seconds to become ready after the NFS server has restarted.

This patch increases the timeout for rebind from 9 to 30 seconds,
allowing a bit more time for the port mapper to become ready.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 include/linux/sunrpc/clnt.h  | 3 +++
 include/linux/sunrpc/sched.h | 4 ++--
 net/sunrpc/clnt.c            | 2 +-
 net/sunrpc/sched.c           | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

Comments

Dai Ngo Feb. 17, 2023, 6:22 p.m. UTC | #1
Hi Trond,

Could you please let me know your opinion on this patch?

Thanks,
-Dai

On 2/10/23 12:10 AM, Dai Ngo wrote:
> Occasionally NLM lock and unlock request fail with EIO and ENOLCK
> respectively. This usually happens when the NFS server is restarted
> while NLM lock test is running.
>
> Currently there is a 9 seconds limit for retrying the bind operation.
> If the server is under load the port mapper might take more than 9
> seconds to become ready after the NFS server restarted.
>
> This patch increases the timeout for rebind from 9 to 30 seconds
> allowing a bit more time for the port mapper to become ready.
>
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
>   include/linux/sunrpc/clnt.h  | 3 +++
>   include/linux/sunrpc/sched.h | 4 ++--
>   net/sunrpc/clnt.c            | 2 +-
>   net/sunrpc/sched.c           | 3 ++-
>   4 files changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
> index 770ef2cb5775..7f2dee56c121 100644
> --- a/include/linux/sunrpc/clnt.h
> +++ b/include/linux/sunrpc/clnt.h
> @@ -162,6 +162,9 @@ struct rpc_add_xprt_test {
>   #define RPC_CLNT_CREATE_REUSEPORT	(1UL << 11)
>   #define RPC_CLNT_CREATE_CONNECTED	(1UL << 12)
>   
> +#define	RPC_CLNT_REBIND_DELAY		3
> +#define	RPC_CLNT_REBIND_MAX_TIMEOUT	30
> +
>   struct rpc_clnt *rpc_create(struct rpc_create_args *args);
>   struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
>   				const struct rpc_program *, u32);
> diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
> index b8ca3ecaf8d7..e9dc142f10bb 100644
> --- a/include/linux/sunrpc/sched.h
> +++ b/include/linux/sunrpc/sched.h
> @@ -90,8 +90,8 @@ struct rpc_task {
>   #endif
>   	unsigned char		tk_priority : 2,/* Task priority */
>   				tk_garb_retry : 2,
> -				tk_cred_retry : 2,
> -				tk_rebind_retry : 2;
> +				tk_cred_retry : 2;
> +	unsigned char		tk_rebind_retry;
>   };
>   
>   typedef void			(*rpc_action)(struct rpc_task *);
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index 0b0b9f1eed46..6c89a1fa40bf 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -2053,7 +2053,7 @@ call_bind_status(struct rpc_task *task)
>   		if (task->tk_rebind_retry == 0)
>   			break;
>   		task->tk_rebind_retry--;
> -		rpc_delay(task, 3*HZ);
> +		rpc_delay(task, RPC_CLNT_REBIND_DELAY * HZ);
>   		goto retry_timeout;
>   	case -ENOBUFS:
>   		rpc_delay(task, HZ >> 2);
> diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
> index be587a308e05..5c18a35752aa 100644
> --- a/net/sunrpc/sched.c
> +++ b/net/sunrpc/sched.c
> @@ -817,7 +817,8 @@ rpc_init_task_statistics(struct rpc_task *task)
>   	/* Initialize retry counters */
>   	task->tk_garb_retry = 2;
>   	task->tk_cred_retry = 2;
> -	task->tk_rebind_retry = 2;
> +	task->tk_rebind_retry = RPC_CLNT_REBIND_MAX_TIMEOUT /
> +					RPC_CLNT_REBIND_DELAY;
>   
>   	/* starting timestamp */
>   	task->tk_start = ktime_get();
Dai Ngo Feb. 23, 2023, 5:40 a.m. UTC | #2
Hi Anna,

Just a reminder that this patch is still waiting for a review.

Thanks,
-Dai

On 2/17/23 10:22 AM, dai.ngo@oracle.com wrote:
> Hi Trond,
>
> Could you please let me know your opinion on this patch?
>
> Thanks,
> -Dai
>
> On 2/10/23 12:10 AM, Dai Ngo wrote:
>> Occasionally NLM lock and unlock request fail with EIO and ENOLCK
>> respectively. This usually happens when the NFS server is restarted
>> while NLM lock test is running.
>>
>> Currently there is a 9 seconds limit for retrying the bind operation.
>> If the server is under load the port mapper might take more than 9
>> seconds to become ready after the NFS server restarted.
>>
>> This patch increases the timeout for rebind from 9 to 30 seconds
>> allowing a bit more time for the port mapper to become ready.
>>
>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>> ---
>>   include/linux/sunrpc/clnt.h  | 3 +++
>>   include/linux/sunrpc/sched.h | 4 ++--
>>   net/sunrpc/clnt.c            | 2 +-
>>   net/sunrpc/sched.c           | 3 ++-
>>   4 files changed, 8 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
>> index 770ef2cb5775..7f2dee56c121 100644
>> --- a/include/linux/sunrpc/clnt.h
>> +++ b/include/linux/sunrpc/clnt.h
>> @@ -162,6 +162,9 @@ struct rpc_add_xprt_test {
>>   #define RPC_CLNT_CREATE_REUSEPORT    (1UL << 11)
>>   #define RPC_CLNT_CREATE_CONNECTED    (1UL << 12)
>>   +#define    RPC_CLNT_REBIND_DELAY        3
>> +#define    RPC_CLNT_REBIND_MAX_TIMEOUT    30
>> +
>>   struct rpc_clnt *rpc_create(struct rpc_create_args *args);
>>   struct rpc_clnt    *rpc_bind_new_program(struct rpc_clnt *,
>>                   const struct rpc_program *, u32);
>> diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
>> index b8ca3ecaf8d7..e9dc142f10bb 100644
>> --- a/include/linux/sunrpc/sched.h
>> +++ b/include/linux/sunrpc/sched.h
>> @@ -90,8 +90,8 @@ struct rpc_task {
>>   #endif
>>       unsigned char        tk_priority : 2,/* Task priority */
>>                   tk_garb_retry : 2,
>> -                tk_cred_retry : 2,
>> -                tk_rebind_retry : 2;
>> +                tk_cred_retry : 2;
>> +    unsigned char        tk_rebind_retry;
>>   };
>>     typedef void            (*rpc_action)(struct rpc_task *);
>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>> index 0b0b9f1eed46..6c89a1fa40bf 100644
>> --- a/net/sunrpc/clnt.c
>> +++ b/net/sunrpc/clnt.c
>> @@ -2053,7 +2053,7 @@ call_bind_status(struct rpc_task *task)
>>           if (task->tk_rebind_retry == 0)
>>               break;
>>           task->tk_rebind_retry--;
>> -        rpc_delay(task, 3*HZ);
>> +        rpc_delay(task, RPC_CLNT_REBIND_DELAY * HZ);
>>           goto retry_timeout;
>>       case -ENOBUFS:
>>           rpc_delay(task, HZ >> 2);
>> diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
>> index be587a308e05..5c18a35752aa 100644
>> --- a/net/sunrpc/sched.c
>> +++ b/net/sunrpc/sched.c
>> @@ -817,7 +817,8 @@ rpc_init_task_statistics(struct rpc_task *task)
>>       /* Initialize retry counters */
>>       task->tk_garb_retry = 2;
>>       task->tk_cred_retry = 2;
>> -    task->tk_rebind_retry = 2;
>> +    task->tk_rebind_retry = RPC_CLNT_REBIND_MAX_TIMEOUT /
>> +                    RPC_CLNT_REBIND_DELAY;
>>         /* starting timestamp */
>>       task->tk_start = ktime_get();
diff mbox series

Patch

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 770ef2cb5775..7f2dee56c121 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -162,6 +162,9 @@  struct rpc_add_xprt_test {
 #define RPC_CLNT_CREATE_REUSEPORT	(1UL << 11)
 #define RPC_CLNT_CREATE_CONNECTED	(1UL << 12)
 
+#define	RPC_CLNT_REBIND_DELAY		3
+#define	RPC_CLNT_REBIND_MAX_TIMEOUT	30
+
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
 				const struct rpc_program *, u32);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index b8ca3ecaf8d7..e9dc142f10bb 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -90,8 +90,8 @@  struct rpc_task {
 #endif
 	unsigned char		tk_priority : 2,/* Task priority */
 				tk_garb_retry : 2,
-				tk_cred_retry : 2,
-				tk_rebind_retry : 2;
+				tk_cred_retry : 2;
+	unsigned char		tk_rebind_retry;
 };
 
 typedef void			(*rpc_action)(struct rpc_task *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0b0b9f1eed46..6c89a1fa40bf 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2053,7 +2053,7 @@  call_bind_status(struct rpc_task *task)
 		if (task->tk_rebind_retry == 0)
 			break;
 		task->tk_rebind_retry--;
-		rpc_delay(task, 3*HZ);
+		rpc_delay(task, RPC_CLNT_REBIND_DELAY * HZ);
 		goto retry_timeout;
 	case -ENOBUFS:
 		rpc_delay(task, HZ >> 2);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index be587a308e05..5c18a35752aa 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -817,7 +817,8 @@  rpc_init_task_statistics(struct rpc_task *task)
 	/* Initialize retry counters */
 	task->tk_garb_retry = 2;
 	task->tk_cred_retry = 2;
-	task->tk_rebind_retry = 2;
+	task->tk_rebind_retry = RPC_CLNT_REBIND_MAX_TIMEOUT /
+					RPC_CLNT_REBIND_DELAY;
 
 	/* starting timestamp */
 	task->tk_start = ktime_get();