diff mbox series

[net-next] net: ipvs: random start for RR scheduler

Message ID 20220509122213.19508-1-imagedong@tencent.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next] net: ipvs: random start for RR scheduler | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/cc_maintainers success CCed 13 of 13 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 3 this patch: 3
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 52 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Menglong Dong May 9, 2022, 12:22 p.m. UTC
From: Menglong Dong <imagedong@tencent.com>

For now, the start of the RR scheduler is in the order of dest
service added, it will result in imbalance if the load balance
is done in client side and long connect is used.

For example, we have client1, client2, ..., client5 and real service
service1, service2, service3. All clients have the same ipvs config,
and each of them will create 2 long TCP connect to the virtual
service. Therefore, all the clients will connect to service1 and
service2, leaving service3 free.

Fix this by randomize the start of dest service to RR scheduler when
IP_VS_SVC_F_SCHED_RR_RANDOM is set.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
---
 include/uapi/linux/ip_vs.h    |  2 ++
 net/netfilter/ipvs/ip_vs_rr.c | 25 ++++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

Comments

Julian Anastasov May 9, 2022, 6:17 p.m. UTC | #1
Hello,

On Mon, 9 May 2022, menglong8.dong@gmail.com wrote:

> From: Menglong Dong <imagedong@tencent.com>
> 
> For now, the start of the RR scheduler is in the order of dest
> service added, it will result in imbalance if the load balance

	...order of added destinations,...

> is done in client side and long connect is used.

	..."long connections are used". Is this a case where
small number of connections are used? And the two connections
relatively overload the real servers?

> For example, we have client1, client2, ..., client5 and real service
> service1, service2, service3. All clients have the same ipvs config,
> and each of them will create 2 long TCP connect to the virtual
> service. Therefore, all the clients will connect to service1 and
> service2, leaving service3 free.

	You mean, there are many IPVS directors with same
config and each director gets 2 connections? Third connection
will get real server #3, right ? Also, are the clients local
to the director?

> Fix this by randomize the start of dest service to RR scheduler when

	..."randomizing the starting destination when"

> IP_VS_SVC_F_SCHED_RR_RANDOM is set.
> 
> Signed-off-by: Menglong Dong <imagedong@tencent.com>
> ---
>  include/uapi/linux/ip_vs.h    |  2 ++
>  net/netfilter/ipvs/ip_vs_rr.c | 25 ++++++++++++++++++++++++-
>  2 files changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> index 4102ddcb4e14..7f74bafd3211 100644
> --- a/include/uapi/linux/ip_vs.h
> +++ b/include/uapi/linux/ip_vs.h
> @@ -28,6 +28,8 @@
>  #define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
>  #define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
>  
> +#define IP_VS_SVC_F_SCHED_RR_RANDOM	IP_VS_SVC_F_SCHED1 /* random start */
> +
>  /*
>   *      Destination Server Flags
>   */
> diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
> index 38495c6f6c7c..e309d97bdd08 100644
> --- a/net/netfilter/ipvs/ip_vs_rr.c
> +++ b/net/netfilter/ipvs/ip_vs_rr.c
> @@ -22,13 +22,36 @@
>  
>  #include <net/ip_vs.h>
>  
> +static void ip_vs_rr_random_start(struct ip_vs_service *svc)
> +{
> +	struct list_head *cur;
> +	u32 start;
> +
> +	if (!(svc->flags | IP_VS_SVC_F_SCHED_RR_RANDOM) ||

	| -> &

> +	    svc->num_dests <= 1)
> +		return;
> +
> +	spin_lock_bh(&svc->sched_lock);
> +	start = get_random_u32() % svc->num_dests;

	May be prandom is more appropriate for non-crypto purposes. 
Also, not sure if it is a good idea to limit the number of steps,
eg. to 128...

	start = prandom_u32_max(min(svc->num_dests, 128U));

	or just use

	start = prandom_u32_max(svc->num_dests);

	Also, this line can be before the spin_lock_bh.

> +	cur = &svc->destinations;

	cur = svc->sched_data;

	... and to start from current svc->sched_data because
we are called for every added dest. Better to jump 0..127 steps
ahead, to avoid delay with long lists?

> +	while (start--)
> +		cur = cur->next;
> +	svc->sched_data = cur;
> +	spin_unlock_bh(&svc->sched_lock);
> +}
>  
>  static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
>  {
>  	svc->sched_data = &svc->destinations;
> +	ip_vs_rr_random_start(svc);
>  	return 0;
>  }
>  
> +static int ip_vs_rr_add_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
> +{
> +	ip_vs_rr_random_start(svc);
> +	return 0;
> +}
>  
>  static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
>  {
> @@ -104,7 +127,7 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
>  	.module =		THIS_MODULE,
>  	.n_list =		LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
>  	.init_service =		ip_vs_rr_init_svc,
> -	.add_dest =		NULL,
> +	.add_dest =		ip_vs_rr_add_dest,
>  	.del_dest =		ip_vs_rr_del_dest,
>  	.schedule =		ip_vs_rr_schedule,
>  };
> -- 
> 2.36.0

Regards

--
Julian Anastasov <ja@ssi.bg>
Menglong Dong May 10, 2022, 2:55 a.m. UTC | #2
On Tue, May 10, 2022 at 2:17 AM Julian Anastasov <ja@ssi.bg> wrote:
>
>
>         Hello,
>
> On Mon, 9 May 2022, menglong8.dong@gmail.com wrote:
>
> > From: Menglong Dong <imagedong@tencent.com>
> >
> > For now, the start of the RR scheduler is in the order of dest
> > service added, it will result in imbalance if the load balance
>
>         ...order of added destinations,...
>
> > is done in client side and long connect is used.
>
>         ..."long connections are used". Is this a case where
> small number of connections are used? And the two connections
> relatively overload the real servers?
>
> > For example, we have client1, client2, ..., client5 and real service
> > service1, service2, service3. All clients have the same ipvs config,
> > and each of them will create 2 long TCP connect to the virtual
> > service. Therefore, all the clients will connect to service1 and
> > service2, leaving service3 free.
>
>         You mean, there are many IPVS directors with same
> config and each director gets 2 connections? Third connection
> will get real server #3, right ? Also, are the clients local
> to the director?
>
> > Fix this by randomize the start of dest service to RR scheduler when
>
>         ..."randomizing the starting destination when"
>

Nice description :/

> > IP_VS_SVC_F_SCHED_RR_RANDOM is set.
> >
> > Signed-off-by: Menglong Dong <imagedong@tencent.com>
> > ---
> >  include/uapi/linux/ip_vs.h    |  2 ++
> >  net/netfilter/ipvs/ip_vs_rr.c | 25 ++++++++++++++++++++++++-
> >  2 files changed, 26 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> > index 4102ddcb4e14..7f74bafd3211 100644
> > --- a/include/uapi/linux/ip_vs.h
> > +++ b/include/uapi/linux/ip_vs.h
> > @@ -28,6 +28,8 @@
> >  #define IP_VS_SVC_F_SCHED_SH_FALLBACK        IP_VS_SVC_F_SCHED1 /* SH fallback */
> >  #define IP_VS_SVC_F_SCHED_SH_PORT    IP_VS_SVC_F_SCHED2 /* SH use port */
> >
> > +#define IP_VS_SVC_F_SCHED_RR_RANDOM  IP_VS_SVC_F_SCHED1 /* random start */
> > +
> >  /*
> >   *      Destination Server Flags
> >   */
> > diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
> > index 38495c6f6c7c..e309d97bdd08 100644
> > --- a/net/netfilter/ipvs/ip_vs_rr.c
> > +++ b/net/netfilter/ipvs/ip_vs_rr.c
> > @@ -22,13 +22,36 @@
> >
> >  #include <net/ip_vs.h>
> >
> > +static void ip_vs_rr_random_start(struct ip_vs_service *svc)
> > +{
> > +     struct list_head *cur;
> > +     u32 start;
> > +
> > +     if (!(svc->flags | IP_VS_SVC_F_SCHED_RR_RANDOM) ||
>
>         | -> &
>
> > +         svc->num_dests <= 1)
> > +             return;
> > +
> > +     spin_lock_bh(&svc->sched_lock);
> > +     start = get_random_u32() % svc->num_dests;
>
>         May be prandom is more appropriate for non-crypto purposes.
> Also, not sure if it is a good idea to limit the number of steps,
> eg. to 128...
>
>         start = prandom_u32_max(min(svc->num_dests, 128U));
>

Yeah, prandom_u32_max is a good choice, I'll use it instead.

>         or just use
>
>         start = prandom_u32_max(svc->num_dests);
>
>         Also, this line can be before the spin_lock_bh.
>
> > +     cur = &svc->destinations;
>
>         cur = svc->sched_data;
>
>         ... and to start from current svc->sched_data because
> we are called for every added dest. Better to jump 0..127 steps
> ahead, to avoid delay with long lists?
>

I'm a little afraid that the 'steps' may make the starting dest not
absolutely random, in terms of probability. For example, we have
256 services, and will the services in the middle have more chances
to be chosen as the start? It's just a feeling, I'm not good at
Probability :/

The delay that ip_vs_wrr_gcd_weight() caused is much more than
this case, so maybe the delay here can be ignored?

Thanks!
Menglong Dong
Menglong Dong May 10, 2022, 3:21 a.m. UTC | #3
On Tue, May 10, 2022 at 2:17 AM Julian Anastasov <ja@ssi.bg> wrote:
>
>
>         Hello,
>
> On Mon, 9 May 2022, menglong8.dong@gmail.com wrote:
>
> > From: Menglong Dong <imagedong@tencent.com>
> >
> > For now, the start of the RR scheduler is in the order of dest
> > service added, it will result in imbalance if the load balance
>
>         ...order of added destinations,...
>
> > is done in client side and long connect is used.
>
>         ..."long connections are used". Is this a case where
> small number of connections are used? And the two connections
> relatively overload the real servers?
>
> > For example, we have client1, client2, ..., client5 and real service
> > service1, service2, service3. All clients have the same ipvs config,
> > and each of them will create 2 long TCP connect to the virtual
> > service. Therefore, all the clients will connect to service1 and
> > service2, leaving service3 free.
>
>         You mean, there are many IPVS directors with same
> config and each director gets 2 connections? Third connection
> will get real server #3, right ? Also, are the clients local
> to the director?
>

Sorry to have missed your message here. Yeah, this is what I mean.
And in my case, the directors are local the to client, and each client
only have 2 connections. If the 3th connection happens, it will get #3
real server. And all directors connected to #1 and #2 real servers,
resulting in overload.

> > Fix this by randomize the start of dest service to RR scheduler when
>
>         ..."randomizing the starting destination when"
>
> > IP_VS_SVC_F_SCHED_RR_RANDOM is set.
> >
> > Signed-off-by: Menglong Dong <imagedong@tencent.com>
> > ---
> >  include/uapi/linux/ip_vs.h    |  2 ++
> >  net/netfilter/ipvs/ip_vs_rr.c | 25 ++++++++++++++++++++++++-
> >  2 files changed, 26 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> > index 4102ddcb4e14..7f74bafd3211 100644
> > --- a/include/uapi/linux/ip_vs.h
> > +++ b/include/uapi/linux/ip_vs.h
> > @@ -28,6 +28,8 @@
> >  #define IP_VS_SVC_F_SCHED_SH_FALLBACK        IP_VS_SVC_F_SCHED1 /* SH fallback */
> >  #define IP_VS_SVC_F_SCHED_SH_PORT    IP_VS_SVC_F_SCHED2 /* SH use port */
> >
> > +#define IP_VS_SVC_F_SCHED_RR_RANDOM  IP_VS_SVC_F_SCHED1 /* random start */
> > +
> >  /*
> >   *      Destination Server Flags
> >   */
> > diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
> > index 38495c6f6c7c..e309d97bdd08 100644
> > --- a/net/netfilter/ipvs/ip_vs_rr.c
> > +++ b/net/netfilter/ipvs/ip_vs_rr.c
> > @@ -22,13 +22,36 @@
> >
> >  #include <net/ip_vs.h>
> >
> > +static void ip_vs_rr_random_start(struct ip_vs_service *svc)
> > +{
> > +     struct list_head *cur;
> > +     u32 start;
> > +
> > +     if (!(svc->flags | IP_VS_SVC_F_SCHED_RR_RANDOM) ||
>
>         | -> &
>
> > +         svc->num_dests <= 1)
> > +             return;
> > +
> > +     spin_lock_bh(&svc->sched_lock);
> > +     start = get_random_u32() % svc->num_dests;
>
>         May be prandom is more appropriate for non-crypto purposes.
> Also, not sure if it is a good idea to limit the number of steps,
> eg. to 128...
>
>         start = prandom_u32_max(min(svc->num_dests, 128U));
>
>         or just use
>
>         start = prandom_u32_max(svc->num_dests);
>
>         Also, this line can be before the spin_lock_bh.
>
> > +     cur = &svc->destinations;
>
>         cur = svc->sched_data;
>
>         ... and to start from current svc->sched_data because
> we are called for every added dest. Better to jump 0..127 steps
> ahead, to avoid delay with long lists?
>
> > +     while (start--)
> > +             cur = cur->next;
> > +     svc->sched_data = cur;
> > +     spin_unlock_bh(&svc->sched_lock);
> > +}
> >
> >  static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
> >  {
> >       svc->sched_data = &svc->destinations;
> > +     ip_vs_rr_random_start(svc);
> >       return 0;
> >  }
> >
> > +static int ip_vs_rr_add_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
> > +{
> > +     ip_vs_rr_random_start(svc);
> > +     return 0;
> > +}
> >
> >  static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
> >  {
> > @@ -104,7 +127,7 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
> >       .module =               THIS_MODULE,
> >       .n_list =               LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
> >       .init_service =         ip_vs_rr_init_svc,
> > -     .add_dest =             NULL,
> > +     .add_dest =             ip_vs_rr_add_dest,
> >       .del_dest =             ip_vs_rr_del_dest,
> >       .schedule =             ip_vs_rr_schedule,
> >  };
> > --
> > 2.36.0
>
> Regards
>
> --
> Julian Anastasov <ja@ssi.bg>
>
diff mbox series

Patch

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 4102ddcb4e14..7f74bafd3211 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -28,6 +28,8 @@ 
 #define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
 #define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
 
+#define IP_VS_SVC_F_SCHED_RR_RANDOM	IP_VS_SVC_F_SCHED1 /* random start */
+
 /*
  *      Destination Server Flags
  */
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index 38495c6f6c7c..e309d97bdd08 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -22,13 +22,36 @@ 
 
 #include <net/ip_vs.h>
 
+static void ip_vs_rr_random_start(struct ip_vs_service *svc)
+{
+	struct list_head *cur;
+	u32 start;
+
+	if (!(svc->flags | IP_VS_SVC_F_SCHED_RR_RANDOM) ||
+	    svc->num_dests <= 1)
+		return;
+
+	spin_lock_bh(&svc->sched_lock);
+	start = get_random_u32() % svc->num_dests;
+	cur = &svc->destinations;
+	while (start--)
+		cur = cur->next;
+	svc->sched_data = cur;
+	spin_unlock_bh(&svc->sched_lock);
+}
 
 static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
 {
 	svc->sched_data = &svc->destinations;
+	ip_vs_rr_random_start(svc);
 	return 0;
 }
 
+static int ip_vs_rr_add_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+	ip_vs_rr_random_start(svc);
+	return 0;
+}
 
 static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
 {
@@ -104,7 +127,7 @@  static struct ip_vs_scheduler ip_vs_rr_scheduler = {
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
 	.init_service =		ip_vs_rr_init_svc,
-	.add_dest =		NULL,
+	.add_dest =		ip_vs_rr_add_dest,
 	.del_dest =		ip_vs_rr_del_dest,
 	.schedule =		ip_vs_rr_schedule,
 };