
Batch unmount cleanup

Message ID 1508179936-15591-1-git-send-email-leon.gh.yang@gmail.com (mailing list archive)
State New, archived

Commit Message

Leon Yang Oct. 16, 2017, 6:52 p.m. UTC
From: Leon Yang <leon.gh.yang@gmail.com>

Each time the unmounted list is cleaned up, synchronize_rcu() is
called, which is relatively costly. Scheduling the cleanup on a
workqueue, similar to what is done in
net/core/net_namespace.c:cleanup_net, makes unmounting faster
without adding much overhead. This is especially useful for
servers with many containers, where mounting and unmounting
happen frequently.

Signed-off-by: Leon Yang <leon.gh.yang@gmail.com>
---
 fs/namespace.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)
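For context, the cleanup_net approach the commit message refers to boils down to queueing releases on a list and letting a single work item pay one synchronize_rcu() for the whole batch. A minimal, generic sketch of that pattern (made-up names, not the actual net/core code):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

/* Hypothetical example object and pending list. */
struct victim {
	struct list_head list;
};

static LIST_HEAD(pending);
static DEFINE_SPINLOCK(pending_lock);

static void batch_cleanup(struct work_struct *work)
{
	LIST_HEAD(batch);
	struct victim *v, *tmp;

	/* Grab everything queued so far in one go. */
	spin_lock(&pending_lock);
	list_splice_init(&pending, &batch);
	spin_unlock(&pending_lock);

	/* One grace-period wait covers the whole batch. */
	synchronize_rcu();

	list_for_each_entry_safe(v, tmp, &batch, list) {
		list_del(&v->list);
		kfree(v);
	}
}
static DECLARE_WORK(cleanup_work, batch_cleanup);

/* Callers queue a victim and return immediately. */
static void queue_victim(struct victim *v)
{
	spin_lock(&pending_lock);
	list_add_tail(&v->list, &pending);
	spin_unlock(&pending_lock);
	schedule_work(&cleanup_work);
}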

Comments

Al Viro Oct. 16, 2017, 10:19 p.m. UTC | #1
On Mon, Oct 16, 2017 at 01:52:16PM -0500, Leon Yang wrote:
> From: Leon Yang <leon.gh.yang@gmail.com>
> 
> Each time the unmounted list is cleaned up, synchronize_rcu() is
> called, which is relatively costly. Scheduling the cleanup on a
> workqueue, similar to what is done in
> net/core/net_namespace.c:cleanup_net, makes unmounting faster
> without adding much overhead. This is especially useful for
> servers with many containers, where mounting and unmounting
> happen frequently.

NAK.  You really do _not_ want to return from umount(2) too early.
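To make the objection concrete (illustrative userspace code, hypothetical path): callers commonly assume that once umount(2) returns success the filesystem is fully torn down and its backing device is free for reuse; deferring the final teardown to a workqueue would silently break that assumption.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (umount("/mnt/data") != 0) {	/* hypothetical mount point */
		perror("umount");
		return 1;
	}
	/*
	 * The next steps -- detaching a loop device, running fsck/mkfs,
	 * or re-mounting the same device -- expect the superblock to be
	 * gone by now.  If umount(2) returned before the real cleanup,
	 * they may see EBUSY.
	 */
	return 0;
}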
Eric W. Biederman Oct. 18, 2017, 1:38 a.m. UTC | #2
Leon Yang <leon.gh.yang@gmail.com> writes:

> From: Leon Yang <leon.gh.yang@gmail.com>
>
> Each time the unmounted list is cleaned up, synchronize_rcu() is
> called, which is relatively costly. Scheduling the cleanup on a
> workqueue, similar to what is done in
> net/core/net_namespace.c:cleanup_net, makes unmounting faster
> without adding much overhead. This is especially useful for
> servers with many containers, where mounting and unmounting
> happen frequently.

So this only matters if the umount calls are serialized in
userspace.  Otherwise multiple parallel copies of synchronize_rcu can
run in different threads.
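In other words, if userspace issues the umounts from separate threads rather than one serial loop, the per-call synchronize_rcu() waits overlap instead of adding up (illustrative code, hypothetical mount points):

#include <pthread.h>
#include <sys/mount.h>

static void *do_umount(void *path)
{
	/* Each thread's grace-period wait overlaps with the others'. */
	umount2(path, 0);
	return NULL;
}

int main(void)
{
	const char *paths[] = { "/mnt/c1", "/mnt/c2", "/mnt/c3" };
	pthread_t t[3];
	int i;

	for (i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, do_umount, (void *)paths[i]);
	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}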

What are you doing that leads you to think that time spent in
namespace_unlock is your problem?  And what kernel are you doing
it on?

There is the very unfortunate fact that mount propagation requires
global locks covering all of the mount namespaces.  So there are
limits to what you can do in parallel in multiple threads.  The
synchronize_rcu itself very much happens without a lock held.
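For reference, the current namespace_unlock(), reconstructed from the removed and context lines of the diff below, already drops namespace_sem before waiting out the grace period:

static void namespace_unlock(void)
{
	struct hlist_head head;

	hlist_move_list(&unmounted, &head);

	up_write(&namespace_sem);

	if (likely(hlist_empty(&head)))
		return;

	synchronize_rcu();

	group_pin_kill(&head);
}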

Further, in the case of tearing down a whole mount namespace, or
when using lazy unmounts, there is natural batching that takes
place, and all of the umounts happen with a single synchronize_rcu
inside a single namespace_unlock.
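For instance, a single lazy unmount of a container's mount tree already gets this batching for free: every mount underneath it is collected and released after one synchronize_rcu() inside one namespace_unlock() (illustrative userspace call, hypothetical path):

#include <sys/mount.h>

/* Detach the whole tree at once; the kernel tears it down as one batch. */
int detach_container_mounts(const char *root)
{
	return umount2(root, MNT_DETACH);
}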

The synchronize_rcu_expedited that happens in the network stack might
be interesting to play with.   But overall the situation is completely
different than what exists in the network stack.  Batching already
naturally exists.  The function synchronize_rcu is not called under a
lock.
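The experiment hinted at here would be a one-line swap in the existing namespace_unlock() (sketch only; whether the IPI cost of expedited grace periods is acceptable on this path is precisely the open question):

 	up_write(&namespace_sem);
 
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu();
+	synchronize_rcu_expedited();
 
 	group_pin_kill(&head);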

The only thing I can imagine you might profitably queue is the
entire put_mnt_ns operation, especially the drop_collected_mounts
part, where you could call umount_tree for several mount namespaces
simultaneously while holding the lock.

Without some numbers and a real use case I wouldn't suggest that though
because too much batching has bad side effects, such as extended lock
hold times.
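A very rough sketch of that direction, with deliberately hypothetical names (it assumes a new dead_list member in struct mnt_namespace, and glosses over propagation and pinning details):

/* Hypothetical: dying namespaces are parked here instead of being
 * torn down inline in put_mnt_ns(). */
static LIST_HEAD(dead_mnt_ns_list);
static DEFINE_SPINLOCK(dead_mnt_ns_lock);

static void mnt_ns_reaper(struct work_struct *work)
{
	LIST_HEAD(batch);
	struct mnt_namespace *ns, *tmp;

	spin_lock(&dead_mnt_ns_lock);
	list_splice_init(&dead_mnt_ns_list, &batch);
	spin_unlock(&dead_mnt_ns_lock);

	/* One namespace_lock()/namespace_unlock() pair -- and therefore
	 * one synchronize_rcu() -- covers every namespace in the batch. */
	namespace_lock();
	lock_mount_hash();
	list_for_each_entry(ns, &batch, dead_list)	/* dead_list is hypothetical */
		umount_tree(ns->root, UMOUNT_SYNC);
	unlock_mount_hash();
	namespace_unlock();

	list_for_each_entry_safe(ns, tmp, &batch, dead_list) {
		list_del(&ns->dead_list);
		free_mnt_ns(ns);
	}
}
static DECLARE_WORK(mnt_ns_reaper_work, mnt_ns_reaper);

put_mnt_ns() would then just park the namespace on the list and schedule the work instead of calling drop_collected_mounts() inline.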

In short:  What are you doing that is making the kernel run slowly?

Eric



> Signed-off-by: Leon Yang <leon.gh.yang@gmail.com>
> ---
>  fs/namespace.c | 27 +++++++++++++++++++++++----
>  1 file changed, 23 insertions(+), 4 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 3b601f1..864ce7e 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -68,6 +68,7 @@ static int mnt_group_start = 1;
>  static struct hlist_head *mount_hashtable __read_mostly;
>  static struct hlist_head *mountpoint_hashtable __read_mostly;
>  static struct kmem_cache *mnt_cache __read_mostly;
> +static struct workqueue_struct *unmounted_wq;
>  static DECLARE_RWSEM(namespace_sem);
>  
>  /* /sys/fs */
> @@ -1409,22 +1410,29 @@ EXPORT_SYMBOL(may_umount);
>  
>  static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
>  
> -static void namespace_unlock(void)
> +static void cleanup_unmounted(struct work_struct *work)
>  {
>  	struct hlist_head head;
> +	down_write(&namespace_sem);
>  
>  	hlist_move_list(&unmounted, &head);
>  
>  	up_write(&namespace_sem);
>  
> -	if (likely(hlist_empty(&head)))
> -		return;
> -
>  	synchronize_rcu();
>  
>  	group_pin_kill(&head);
>  }
>  
> +static DECLARE_WORK(unmounted_cleanup_work, cleanup_unmounted);
> +
> +static void namespace_unlock(void)
> +{
> +	if (!likely(hlist_empty(&unmounted)))
> +		queue_work(unmounted_wq, &unmounted_cleanup_work);
> +	up_write(&namespace_sem);
> +}
> +
>  static inline void namespace_lock(void)
>  {
>  	down_write(&namespace_sem);
> @@ -3276,6 +3284,17 @@ void __init mnt_init(void)
>  	init_mount_tree();
>  }
>  
> +static int __init unmounted_wq_init(void)
> +{
> +	/* Create workqueue for cleanup */
> +	unmounted_wq = create_singlethread_workqueue("unmounted");
> +	if (!unmounted_wq)
> +		panic("Could not create unmounted workq");
> +	return 0;
> +}
> +
> +pure_initcall(unmounted_wq_init);
> +
>  void put_mnt_ns(struct mnt_namespace *ns)
>  {
>  	if (!atomic_dec_and_test(&ns->count))

Patch

diff --git a/fs/namespace.c b/fs/namespace.c
index 3b601f1..864ce7e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -68,6 +68,7 @@  static int mnt_group_start = 1;
 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
+static struct workqueue_struct *unmounted_wq;
 static DECLARE_RWSEM(namespace_sem);
 
 /* /sys/fs */
@@ -1409,22 +1410,29 @@  EXPORT_SYMBOL(may_umount);
 
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 
-static void namespace_unlock(void)
+static void cleanup_unmounted(struct work_struct *work)
 {
 	struct hlist_head head;
+	down_write(&namespace_sem);
 
 	hlist_move_list(&unmounted, &head);
 
 	up_write(&namespace_sem);
 
-	if (likely(hlist_empty(&head)))
-		return;
-
 	synchronize_rcu();
 
 	group_pin_kill(&head);
 }
 
+static DECLARE_WORK(unmounted_cleanup_work, cleanup_unmounted);
+
+static void namespace_unlock(void)
+{
+	if (!likely(hlist_empty(&unmounted)))
+		queue_work(unmounted_wq, &unmounted_cleanup_work);
+	up_write(&namespace_sem);
+}
+
 static inline void namespace_lock(void)
 {
 	down_write(&namespace_sem);
@@ -3276,6 +3284,17 @@  void __init mnt_init(void)
 	init_mount_tree();
 }
 
+static int __init unmounted_wq_init(void)
+{
+	/* Create workqueue for cleanup */
+	unmounted_wq = create_singlethread_workqueue("unmounted");
+	if (!unmounted_wq)
+		panic("Could not create unmounted workq");
+	return 0;
+}
+
+pure_initcall(unmounted_wq_init);
+
 void put_mnt_ns(struct mnt_namespace *ns)
 {
 	if (!atomic_dec_and_test(&ns->count))