diff mbox

fs: only sync() superblocks reachable from the current namespace

Message ID 05434cda5cc3b461b5d70467b094904ad23fdc11.1517007510.git.osandov@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Omar Sandoval Jan. 26, 2018, 10:58 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

Currently, the sync() syscall is system-wide, so any process in a
container can cause significant I/O stalls across the system by calling
sync(). This is even true for filesystems which are not accessible in
the process' mount namespace. This patch scopes sync() to only write out
filesystems reachable in the current mount namespace, except for the
initial mount namespace, which still syncs everything to avoid
surprises. This fixes the broken isolation we were seeing here.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/sync.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 14 deletions(-)

Comments

Omar Sandoval Jan. 26, 2018, 11 p.m. UTC | #1
On Fri, Jan 26, 2018 at 02:58:39PM -0800, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Currently, the sync() syscall is system-wide, so any process in a
> container can cause significant I/O stalls across the system by calling
> sync(). This is even true for filesystems which are not accessible in
> the process' mount namespace. This patch scopes sync() to only write out
> filesystems reachable in the current mount namespace, except for the
> initial mount namespace, which still syncs everything to avoid
> surprises. This fixes the broken isolation we were seeing here.
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  fs/sync.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 51 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/sync.c b/fs/sync.c
> index 6e0a2cbaf6de..bde1e3196298 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -17,6 +17,7 @@
>  #include <linux/quotaops.h>
>  #include <linux/backing-dev.h>
>  #include "internal.h"
> +#include "mount.h"
>  
>  #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
>  			SYNC_FILE_RANGE_WAIT_AFTER)
> @@ -68,16 +69,46 @@ int sync_filesystem(struct super_block *sb)
>  }
>  EXPORT_SYMBOL(sync_filesystem);
>  
> -static void sync_inodes_one_sb(struct super_block *sb, void *arg)
> +struct sb_sync {
> +	/*
> +	 * Only sync superblocks reachable from this namespace. If NULL, sync
> +	 * everything.
> +	 */
> +	struct mnt_namespace *mnt_ns;
> +
> +	/* ->sync_fs() wait argument. */
> +	int wait;
> +};
> +
> +static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns)
> +{
> +	struct mount *mnt;
> +
> +	if (!mnt_ns)
> +		return 1;
> +
> +	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
> +		if (mnt->mnt_ns == mnt_ns)
> +			return 1;
> +	}

Sigh, of course, I forgot to grab the proper locks here. Will send a v2.

> +	return 0;
> +}
> +
> +static void sync_inodes_one_sb(struct super_block *sb, void *p)
>  {
> -	if (!sb_rdonly(sb))
> +	struct sb_sync *arg = p;
> +
> +	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns))
>  		sync_inodes_sb(sb);
>  }
>  
> -static void sync_fs_one_sb(struct super_block *sb, void *arg)
> +static void sync_fs_one_sb(struct super_block *sb, void *p)
>  {
> -	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
> -		sb->s_op->sync_fs(sb, *(int *)arg);
> +	struct sb_sync *arg = p;
> +
> +	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) &&
> +	    sb->s_op->sync_fs)
> +		sb->s_op->sync_fs(sb, arg->wait);
>  }
>  
>  static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
> @@ -107,12 +138,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
>   */
>  SYSCALL_DEFINE0(sync)
>  {
> -	int nowait = 0, wait = 1;
> +	struct sb_sync arg = {
> +		.mnt_ns = current->nsproxy->mnt_ns,
> +	};
> +
> +	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
> +		arg.mnt_ns = NULL;
>  
>  	wakeup_flusher_threads(WB_REASON_SYNC);
> -	iterate_supers(sync_inodes_one_sb, NULL);
> -	iterate_supers(sync_fs_one_sb, &nowait);
> -	iterate_supers(sync_fs_one_sb, &wait);
> +	iterate_supers(sync_inodes_one_sb, &arg);
> +	iterate_supers(sync_fs_one_sb, &arg);
> +	arg.wait = 1;
> +	iterate_supers(sync_fs_one_sb, &arg);
>  	iterate_bdevs(fdatawrite_one_bdev, NULL);
>  	iterate_bdevs(fdatawait_one_bdev, NULL);
>  	if (unlikely(laptop_mode))
> @@ -122,17 +159,17 @@ SYSCALL_DEFINE0(sync)
>  
>  static void do_sync_work(struct work_struct *work)
>  {
> -	int nowait = 0;
> +	struct sb_sync arg = {};
>  
>  	/*
>  	 * Sync twice to reduce the possibility we skipped some inodes / pages
>  	 * because they were temporarily locked
>  	 */
> -	iterate_supers(sync_inodes_one_sb, &nowait);
> -	iterate_supers(sync_fs_one_sb, &nowait);
> +	iterate_supers(sync_inodes_one_sb, &arg);
> +	iterate_supers(sync_fs_one_sb, &arg);
>  	iterate_bdevs(fdatawrite_one_bdev, NULL);
> -	iterate_supers(sync_inodes_one_sb, &nowait);
> -	iterate_supers(sync_fs_one_sb, &nowait);
> +	iterate_supers(sync_inodes_one_sb, &arg);
> +	iterate_supers(sync_fs_one_sb, &arg);
>  	iterate_bdevs(fdatawrite_one_bdev, NULL);
>  	printk("Emergency Sync complete\n");
>  	kfree(work);
> -- 
> 2.16.1
>
Al Viro Jan. 26, 2018, 11:13 p.m. UTC | #2
On Fri, Jan 26, 2018 at 02:58:39PM -0800, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Currently, the sync() syscall is system-wide, so any process in a
> container can cause significant I/O stalls across the system by calling
> sync(). This is even true for filesystems which are not accessible in
> the process' mount namespace. This patch scopes sync() to only write out
> filesystems reachable in the current mount namespace, except for the
> initial mount namespace, which still syncs everything to avoid
> surprises. This fixes the broken isolation we were seeing here.

> +static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns)
> +{
> +	struct mount *mnt;
> +
> +	if (!mnt_ns)
> +		return 1;
> +
> +	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
> +		if (mnt->mnt_ns == mnt_ns)
> +			return 1;
> +	}
> +	return 0;
> +}

Erm...  And just what is protecting the list here?

>  static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
> @@ -107,12 +138,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
>   */
>  SYSCALL_DEFINE0(sync)
>  {
> -	int nowait = 0, wait = 1;
> +	struct sb_sync arg = {
> +		.mnt_ns = current->nsproxy->mnt_ns,
> +	};
> +
> +	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
> +		arg.mnt_ns = NULL;
>  
>  	wakeup_flusher_threads(WB_REASON_SYNC);
> -	iterate_supers(sync_inodes_one_sb, NULL);
> -	iterate_supers(sync_fs_one_sb, &nowait);
> -	iterate_supers(sync_fs_one_sb, &wait);
> +	iterate_supers(sync_inodes_one_sb, &arg);
> +	iterate_supers(sync_fs_one_sb, &arg);
> +	arg.wait = 1;
> +	iterate_supers(sync_fs_one_sb, &arg);

So now sync() includes O(total vfsmounts on the system) walking the lists, no
matter what *and* in a situation when a lazy-unmounted filesystem is held active
by an opened file sync(2) won't touch that filesystem.  Unless done in the
magical namespace init(8) happens to run in.
Al Viro Jan. 26, 2018, 11:17 p.m. UTC | #3
On Fri, Jan 26, 2018 at 11:13:51PM +0000, Al Viro wrote:

> Erm...  And just what is protecting the list here?
> 
> >  static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
> > @@ -107,12 +138,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
> >   */
> >  SYSCALL_DEFINE0(sync)
> >  {
> > -	int nowait = 0, wait = 1;
> > +	struct sb_sync arg = {
> > +		.mnt_ns = current->nsproxy->mnt_ns,
> > +	};
> > +
> > +	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
> > +		arg.mnt_ns = NULL;
> >  
> >  	wakeup_flusher_threads(WB_REASON_SYNC);
> > -	iterate_supers(sync_inodes_one_sb, NULL);
> > -	iterate_supers(sync_fs_one_sb, &nowait);
> > -	iterate_supers(sync_fs_one_sb, &wait);
> > +	iterate_supers(sync_inodes_one_sb, &arg);
> > +	iterate_supers(sync_fs_one_sb, &arg);
> > +	arg.wait = 1;
> > +	iterate_supers(sync_fs_one_sb, &arg);
> 
> So now sync() includes O(total vfsmounts on the system) walking the lists, no
> matter what *and* in a situation when a lazy-unmounted filesystem is held active
> by an opened file sync(2) won't touch that filesystem.  Unless done in the
> magical namespace init(8) happens to run in.

BTW, if your process happens to have inherited an opened file from parent,
then unshares the namespace and unmounts the filesystem that file came
from, sync(2) won't affect the writes on that descriptor.
Omar Sandoval Jan. 26, 2018, 11:46 p.m. UTC | #4
On Fri, Jan 26, 2018 at 11:17:29PM +0000, Al Viro wrote:
> On Fri, Jan 26, 2018 at 11:13:51PM +0000, Al Viro wrote:
> 
> > Erm...  And just what is protecting the list here?

Addressed in v2/v3, sorry about that.

> > >  static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
> > > @@ -107,12 +138,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
> > >   */
> > >  SYSCALL_DEFINE0(sync)
> > >  {
> > > -	int nowait = 0, wait = 1;
> > > +	struct sb_sync arg = {
> > > +		.mnt_ns = current->nsproxy->mnt_ns,
> > > +	};
> > > +
> > > +	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
> > > +		arg.mnt_ns = NULL;
> > >  
> > >  	wakeup_flusher_threads(WB_REASON_SYNC);
> > > -	iterate_supers(sync_inodes_one_sb, NULL);
> > > -	iterate_supers(sync_fs_one_sb, &nowait);
> > > -	iterate_supers(sync_fs_one_sb, &wait);
> > > +	iterate_supers(sync_inodes_one_sb, &arg);
> > > +	iterate_supers(sync_fs_one_sb, &arg);
> > > +	arg.wait = 1;
> > > +	iterate_supers(sync_fs_one_sb, &arg);
> > 
> > So now sync() includes O(total vfsmounts on the system) walking the lists, no
> > matter what

This bit I thought about, and the only alternative I could come up with
was walking mnt_ns->list which makes it O(vfsmounts in the namespace)
which is at least capped at sysctl_mount_max. The downside of that is
that we have to track duplicates to handle super blocks mounted in more
than one place. And, of course, it doesn't address your other points.

> > *and* in a situation when a lazy-unmounted filesystem is held active
> > by an opened file sync(2) won't touch that filesystem.  Unless done in the
> > magical namespace init(8) happens to run in.

Hm, so we could always sync the filesystem if s_mounts is empty to cover
this.

> BTW, if your process happens to have inherited an opened file from parent,
> then unshares the namespace and unmounts the filesystem that file came
> from, sync(2) won't affect the writes on that descriptor.

But for this, I don't have any good ideas. Either we track these things
in the hot path or do something O(number of open fds). Do you have any
suggestions?
diff mbox

Patch

diff --git a/fs/sync.c b/fs/sync.c
index 6e0a2cbaf6de..bde1e3196298 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -17,6 +17,7 @@ 
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
 #include "internal.h"
+#include "mount.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -68,16 +69,46 @@  int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL(sync_filesystem);
 
-static void sync_inodes_one_sb(struct super_block *sb, void *arg)
+struct sb_sync {
+	/*
+	 * Only sync superblocks reachable from this namespace. If NULL, sync
+	 * everything.
+	 */
+	struct mnt_namespace *mnt_ns;
+
+	/* ->sync_fs() wait argument. */
+	int wait;
+};
+
+static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns)
+{
+	struct mount *mnt;
+
+	if (!mnt_ns)
+		return 1;
+
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->mnt_ns == mnt_ns)
+			return 1;
+	}
+	return 0;
+}
+
+static void sync_inodes_one_sb(struct super_block *sb, void *p)
 {
-	if (!sb_rdonly(sb))
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns))
 		sync_inodes_sb(sb);
 }
 
-static void sync_fs_one_sb(struct super_block *sb, void *arg)
+static void sync_fs_one_sb(struct super_block *sb, void *p)
 {
-	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, *(int *)arg);
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) &&
+	    sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, arg->wait);
 }
 
 static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
@@ -107,12 +138,18 @@  static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  */
 SYSCALL_DEFINE0(sync)
 {
-	int nowait = 0, wait = 1;
+	struct sb_sync arg = {
+		.mnt_ns = current->nsproxy->mnt_ns,
+	};
+
+	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
+		arg.mnt_ns = NULL;
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
-	iterate_supers(sync_fs_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &wait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
+	arg.wait = 1;
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	iterate_bdevs(fdatawait_one_bdev, NULL);
 	if (unlikely(laptop_mode))
@@ -122,17 +159,17 @@  SYSCALL_DEFINE0(sync)
 
 static void do_sync_work(struct work_struct *work)
 {
-	int nowait = 0;
+	struct sb_sync arg = {};
 
 	/*
 	 * Sync twice to reduce the possibility we skipped some inodes / pages
 	 * because they were temporarily locked
 	 */
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	printk("Emergency Sync complete\n");
 	kfree(work);