
[1/1] net/ipv4/inet_fragment: Batch fqdir destroy works

Message ID 20201208094529.23266-2-sjpark@amazon.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series net: Reduce rcu_barrier() contentions from 'unshare(CLONE_NEWNET)'

Checks

Context                          Check    Description
netdev/cover_letter              success
netdev/fixes_present             success
netdev/patch_count               success
netdev/tree_selection            success  Guessed tree name to be net-next
netdev/subject_prefix            warning  Target tree name not specified in the subject
netdev/source_inline             success  Was 0 now: 0
netdev/verify_signedoff          success
netdev/module_param              success  Was 0 now: 0
netdev/build_32bit               success  Errors and warnings before: 8012 this patch: 8012
netdev/kdoc                      success  Errors and warnings before: 1 this patch: 1
netdev/verify_fixes              success
netdev/checkpatch                warning  WARNING: line length of 90 exceeds 80 columns
netdev/build_allmodconfig_warn   success  Errors and warnings before: 8126 this patch: 8126
netdev/header_inline             success
netdev/stable                    success  Stable not CCed

Commit Message

SeongJae Park Dec. 8, 2020, 9:45 a.m. UTC
From: SeongJae Park <sjpark@amazon.de>

In 'fqdir_exit()', a work item for destruction of the 'fqdir' is
enqueued.  The work function, 'fqdir_work_fn()', calls 'rcu_barrier()'.
Under intensive 'fqdir_exit()' calls (e.g., frequent
'unshare(CLONE_NEWNET)' system calls), the resulting contention can
cause unacceptably high latency of 'rcu_barrier()'.  This commit avoids
such contention by doing the destruction in a batched manner, similar
to that of 'cleanup_net()'.

Signed-off-by: SeongJae Park <sjpark@amazon.de>
---
 include/net/inet_frag.h  |  2 +-
 net/ipv4/inet_fragment.c | 28 ++++++++++++++++++++--------
 2 files changed, 21 insertions(+), 9 deletions(-)
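
For context, the batching follows the same shape as 'cleanup_net()': exiting
objects are pushed onto a lockless list, the work is queued only when the list
goes from empty to non-empty, and a single work function drains the whole list
so that one 'rcu_barrier()' covers the entire batch.  A rough, schematic
sketch of the pattern (hypothetical 'struct obj', not the actual code in the
patch below; needs <linux/llist.h> and <linux/workqueue.h>):

	struct obj {
		struct llist_node destroy_list;
		/* ... */
	};

	static LLIST_HEAD(pending_destroy);
	static void destroy_work_fn(struct work_struct *work);
	static DECLARE_WORK(destroy_work, destroy_work_fn);

	void obj_exit(struct obj *o)
	{
		/* llist_add() returns true only if the list was empty,
		 * so the work is queued at most once per batch.
		 */
		if (llist_add(&o->destroy_list, &pending_destroy))
			queue_work(system_wq, &destroy_work);
	}

	static void destroy_work_fn(struct work_struct *work)
	{
		/* Atomically take ownership of everything queued so far */
		struct llist_node *batch = llist_del_all(&pending_destroy);

		/* ... tear down each object on 'batch' ... */

		rcu_barrier();	/* one barrier for the whole batch */

		/* ... free each object on 'batch' ... */
	}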

Comments

Jakub Kicinski Dec. 9, 2020, 11:16 p.m. UTC | #1
On Tue, 8 Dec 2020 10:45:29 +0100 SeongJae Park wrote:
> From: SeongJae Park <sjpark@amazon.de>
> 
> In 'fqdir_exit()', a work item for destruction of the 'fqdir' is
> enqueued.  The work function, 'fqdir_work_fn()', calls 'rcu_barrier()'.
> Under intensive 'fqdir_exit()' calls (e.g., frequent
> 'unshare(CLONE_NEWNET)' system calls), the resulting contention can
> cause unacceptably high latency of 'rcu_barrier()'.  This commit avoids
> such contention by doing the destruction in a batched manner, similar
> to that of 'cleanup_net()'.
> 
> Signed-off-by: SeongJae Park <sjpark@amazon.de>

Looks fine to me, but you haven't CCed Florian or Eric, who were the
last two people to touch this function. Please repost CCing them and
fixing the nit below, thanks!

>  static void fqdir_work_fn(struct work_struct *work)
>  {
> -	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
> -	struct inet_frags *f = fqdir->f;
> +	struct llist_node *kill_list;
> +	struct fqdir *fqdir;
> +	struct inet_frags *f;

nit: reorder fqdir and f to keep reverse xmas tree variable ordering.
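
For reference, reverse xmas tree ordering means the longest declaration line
comes first, so the reordered block would look like:

	struct llist_node *kill_list;
	struct inet_frags *f;
	struct fqdir *fqdir;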
Eric Dumazet Dec. 10, 2020, 12:17 a.m. UTC | #2
On 12/8/20 10:45 AM, SeongJae Park wrote:
> From: SeongJae Park <sjpark@amazon.de>
> 
> In 'fqdir_exit()', a work item for destruction of the 'fqdir' is
> enqueued.  The work function, 'fqdir_work_fn()', calls 'rcu_barrier()'.
> Under intensive 'fqdir_exit()' calls (e.g., frequent
> 'unshare(CLONE_NEWNET)' system calls), the resulting contention can
> cause unacceptably high latency of 'rcu_barrier()'.  This commit avoids
> such contention by doing the destruction in a batched manner, similar
> to that of 'cleanup_net()'.

Any numbers to share ? I have never seen an issue.

> 
> Signed-off-by: SeongJae Park <sjpark@amazon.de>
> ---
>  include/net/inet_frag.h  |  2 +-
>  net/ipv4/inet_fragment.c | 28 ++++++++++++++++++++--------
>  2 files changed, 21 insertions(+), 9 deletions(-)
> 
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index bac79e817776..558893d8810c 100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -20,7 +20,7 @@ struct fqdir {
>  
>  	/* Keep atomic mem on separate cachelines in structs that include it */
>  	atomic_long_t		mem ____cacheline_aligned_in_smp;
> -	struct work_struct	destroy_work;
> +	struct llist_node	destroy_list;
>  };
>  
>  /**
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 10d31733297d..796b559137c5 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -145,12 +145,19 @@ static void inet_frags_free_cb(void *ptr, void *arg)
>  		inet_frag_destroy(fq);
>  }
>  
> +static LLIST_HEAD(destroy_list);
> +
>  static void fqdir_work_fn(struct work_struct *work)
>  {
> -	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
> -	struct inet_frags *f = fqdir->f;
> +	struct llist_node *kill_list;
> +	struct fqdir *fqdir;
> +	struct inet_frags *f;
> +
> +	/* Atomically snapshot the list of fqdirs to destroy */
> +	kill_list = llist_del_all(&destroy_list);
>  
> -	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
> +	llist_for_each_entry(fqdir, kill_list, destroy_list)
> +		rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
> 


OK, it seems rhashtable_free_and_destroy() has cond_resched() so we are not going
to hold this cpu for long periods.
 
>  	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
>  	 * have completed, since they need to dereference fqdir.
> @@ -158,10 +165,13 @@ static void fqdir_work_fn(struct work_struct *work)
>  	 */
>  	rcu_barrier();
>  
> -	if (refcount_dec_and_test(&f->refcnt))
> -		complete(&f->completion);
> +	llist_for_each_entry(fqdir, kill_list, destroy_list) {

Don't we need the llist_for_each_entry_safe() variant here ???

> +		f = fqdir->f;
> +		if (refcount_dec_and_test(&f->refcnt))
> +			complete(&f->completion);
>  
> -	kfree(fqdir);
> +		kfree(fqdir);
> +	}
>  }
>  
>  int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
> @@ -184,10 +194,12 @@ int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
>  }
>  EXPORT_SYMBOL(fqdir_init);
>  
> +static DECLARE_WORK(fqdir_destroy_work, fqdir_work_fn);
> +
>  void fqdir_exit(struct fqdir *fqdir)
>  {
> -	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
> -	queue_work(system_wq, &fqdir->destroy_work);
> +	if (llist_add(&fqdir->destroy_list, &destroy_list))
> +		queue_work(system_wq, &fqdir_destroy_work);
>  }
>  EXPORT_SYMBOL(fqdir_exit);
>  
>
SeongJae Park Dec. 10, 2020, 6:43 a.m. UTC | #3
On Wed, 9 Dec 2020 15:16:59 -0800 Jakub Kicinski <kuba@kernel.org> wrote:

> On Tue, 8 Dec 2020 10:45:29 +0100 SeongJae Park wrote:
> > From: SeongJae Park <sjpark@amazon.de>
> > 
> > In 'fqdir_exit()', a work item for destruction of the 'fqdir' is
> > enqueued.  The work function, 'fqdir_work_fn()', calls 'rcu_barrier()'.
> > Under intensive 'fqdir_exit()' calls (e.g., frequent
> > 'unshare(CLONE_NEWNET)' system calls), the resulting contention can
> > cause unacceptably high latency of 'rcu_barrier()'.  This commit avoids
> > such contention by doing the destruction in a batched manner, similar
> > to that of 'cleanup_net()'.
> > 
> > Signed-off-by: SeongJae Park <sjpark@amazon.de>
> 
> Looks fine to me, but you haven't CCed Florian or Eric, who were the
> last two people to touch this function. Please repost CCing them and
> fixing the nit below, thanks!

Thank you for letting me know.  I will send the next version accordingly.

> 
> >  static void fqdir_work_fn(struct work_struct *work)
> >  {
> > -	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
> > -	struct inet_frags *f = fqdir->f;
> > +	struct llist_node *kill_list;
> > +	struct fqdir *fqdir;
> > +	struct inet_frags *f;
> 
> nit: reorder fqdir and f to keep reverse xmas tree variable ordering.

Hehe, ok, I will. :)


Thanks,
SeongJae Park
SeongJae Park Dec. 10, 2020, 7:27 a.m. UTC | #4
On Thu, 10 Dec 2020 01:17:58 +0100 Eric Dumazet <eric.dumazet@gmail.com> wrote:

> 
> 
> On 12/8/20 10:45 AM, SeongJae Park wrote:
> > From: SeongJae Park <sjpark@amazon.de>
> > 
> > In 'fqdir_exit()', a work item for destruction of the 'fqdir' is
> > enqueued.  The work function, 'fqdir_work_fn()', calls 'rcu_barrier()'.
> > Under intensive 'fqdir_exit()' calls (e.g., frequent
> > 'unshare(CLONE_NEWNET)' system calls), the resulting contention can
> > cause unacceptably high latency of 'rcu_barrier()'.  This commit avoids
> > such contention by doing the destruction in a batched manner, similar
> > to that of 'cleanup_net()'.
> 
> Any numbers to share ? I have never seen an issue.

On our machine with 40 CPU cores and 70GB of DRAM, 15GB of available memory
was consumed within 2 minutes while my artificial reproducer was running.  The
reproducer merely repeats 'unshare(CLONE_NEWNET)' in a loop 50,000 times.  The
reproducer is artificial, but it resembles the behavior of our real workloads.
While the reproducer ran, 'cleanup_net()' was called only 4 times.  The first
two calls finished quickly, but the third call took about 30 seconds, and the
fourth call did not finish until the reproducer did.  We also confirmed that
the third and fourth calls were just waiting for 'rcu_barrier()'.

I think you have not seen this issue before because we are doing very
intensive 'unshare()' calls.  Also, it is not reproducible on all hardware; on
my 6 CPU machine, the problem did not reproduce.
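
A minimal sketch of such a reproducer (not the exact program we used) is
simply a loop of 'unshare(CLONE_NEWNET)' calls; each call moves the task into
a fresh net namespace and drops the reference to the previous one, which later
triggers 'cleanup_net()' and 'fqdir_exit()':

	/* repro.c - needs CAP_SYS_ADMIN; build with: gcc -o repro repro.c */
	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		int i;

		for (i = 0; i < 50000; i++) {
			if (unshare(CLONE_NEWNET)) {
				perror("unshare");
				return 1;
			}
		}
		return 0;
	}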

> 
> > 
> > Signed-off-by: SeongJae Park <sjpark@amazon.de>
> > ---
> >  include/net/inet_frag.h  |  2 +-
> >  net/ipv4/inet_fragment.c | 28 ++++++++++++++++++++--------
> >  2 files changed, 21 insertions(+), 9 deletions(-)
> > 
> > diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> > index bac79e817776..558893d8810c 100644
> > --- a/include/net/inet_frag.h
> > +++ b/include/net/inet_frag.h
> > @@ -20,7 +20,7 @@ struct fqdir {
> >  
> >  	/* Keep atomic mem on separate cachelines in structs that include it */
> >  	atomic_long_t		mem ____cacheline_aligned_in_smp;
> > -	struct work_struct	destroy_work;
> > +	struct llist_node	destroy_list;
> >  };
> >  
> >  /**
> > diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> > index 10d31733297d..796b559137c5 100644
> > --- a/net/ipv4/inet_fragment.c
> > +++ b/net/ipv4/inet_fragment.c
> > @@ -145,12 +145,19 @@ static void inet_frags_free_cb(void *ptr, void *arg)
> >  		inet_frag_destroy(fq);
> >  }
> >  
> > +static LLIST_HEAD(destroy_list);
> > +
> >  static void fqdir_work_fn(struct work_struct *work)
> >  {
> > -	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
> > -	struct inet_frags *f = fqdir->f;
> > +	struct llist_node *kill_list;
> > +	struct fqdir *fqdir;
> > +	struct inet_frags *f;
> > +
> > +	/* Atomically snapshot the list of fqdirs to destroy */
> > +	kill_list = llist_del_all(&destroy_list);
> >  
> > -	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
> > +	llist_for_each_entry(fqdir, kill_list, destroy_list)
> > +		rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
> > 
> 
> 
> OK, it seems rhashtable_free_and_destroy() has cond_resched() so we are not going
> to hold this cpu for long periods.
>  
> >  	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
> >  	 * have completed, since they need to dereference fqdir.
> > @@ -158,10 +165,13 @@ static void fqdir_work_fn(struct work_struct *work)
> >  	 */
> >  	rcu_barrier();
> >  
> > -	if (refcount_dec_and_test(&f->refcnt))
> > -		complete(&f->completion);
> > +	llist_for_each_entry(fqdir, kill_list, destroy_list) {
> 
> Don't we need the llist_for_each_entry_safe() variant here ???

Oh, indeed.  I will do so in the next version.
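
For reference, with the safe variant the second loop would look roughly like
this ('tmp' being an extra 'struct fqdir *' local), so that the next node is
fetched before 'kfree(fqdir)' frees the current one:

	llist_for_each_entry_safe(fqdir, tmp, kill_list, destroy_list) {
		f = fqdir->f;
		if (refcount_dec_and_test(&f->refcnt))
			complete(&f->completion);

		kfree(fqdir);
	}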

> 
> > +		f = fqdir->f;
> > +		if (refcount_dec_and_test(&f->refcnt))
> > +			complete(&f->completion);
> >  
> > -	kfree(fqdir);
> > +		kfree(fqdir);
> > +	}


Thanks,
SeongJae Park

Patch

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index bac79e817776..558893d8810c 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -20,7 +20,7 @@  struct fqdir {
 
 	/* Keep atomic mem on separate cachelines in structs that include it */
 	atomic_long_t		mem ____cacheline_aligned_in_smp;
-	struct work_struct	destroy_work;
+	struct llist_node	destroy_list;
 };
 
 /**
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 10d31733297d..796b559137c5 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -145,12 +145,19 @@  static void inet_frags_free_cb(void *ptr, void *arg)
 		inet_frag_destroy(fq);
 }
 
+static LLIST_HEAD(destroy_list);
+
 static void fqdir_work_fn(struct work_struct *work)
 {
-	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
-	struct inet_frags *f = fqdir->f;
+	struct llist_node *kill_list;
+	struct fqdir *fqdir;
+	struct inet_frags *f;
+
+	/* Atomically snapshot the list of fqdirs to destroy */
+	kill_list = llist_del_all(&destroy_list);
 
-	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+	llist_for_each_entry(fqdir, kill_list, destroy_list)
+		rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
 
 	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
 	 * have completed, since they need to dereference fqdir.
@@ -158,10 +165,13 @@  static void fqdir_work_fn(struct work_struct *work)
 	 */
 	rcu_barrier();
 
-	if (refcount_dec_and_test(&f->refcnt))
-		complete(&f->completion);
+	llist_for_each_entry(fqdir, kill_list, destroy_list) {
+		f = fqdir->f;
+		if (refcount_dec_and_test(&f->refcnt))
+			complete(&f->completion);
 
-	kfree(fqdir);
+		kfree(fqdir);
+	}
 }
 
 int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
@@ -184,10 +194,12 @@  int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
 }
 EXPORT_SYMBOL(fqdir_init);
 
+static DECLARE_WORK(fqdir_destroy_work, fqdir_work_fn);
+
 void fqdir_exit(struct fqdir *fqdir)
 {
-	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
-	queue_work(system_wq, &fqdir->destroy_work);
+	if (llist_add(&fqdir->destroy_list, &destroy_list))
+		queue_work(system_wq, &fqdir_destroy_work);
 }
 EXPORT_SYMBOL(fqdir_exit);