diff mbox series

[3/3] xfs: Fix stale data exposure when readahead races with hole punch

Message ID 20190711140012.1671-4-jack@suse.cz (mailing list archive)
State New, archived
Headers show
Series xfs: Fix races between readahead and hole punching | expand

Commit Message

Jan Kara July 11, 2019, 2 p.m. UTC
Hole puching currently evicts pages from page cache and then goes on to
remove blocks from the inode. This happens under both XFS_IOLOCK_EXCL
and XFS_MMAPLOCK_EXCL which provides appropriate serialization with
racing reads or page faults. However there is currently nothing that
prevents readahead triggered by fadvise() or madvise() from racing with
the hole punch and instantiating page cache page after hole punching has
evicted page cache in xfs_flush_unmap_range() but before it has removed
blocks from the inode. This page cache page will be mapping soon to be
freed block and that can lead to returning stale data to userspace or
even filesystem corruption.

Fix the problem by protecting handling of readahead requests by
XFS_IOLOCK_SHARED similarly as we protect reads.

CC: stable@vger.kernel.org
Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxjQNmxqmtA_VbYW0Su9rKRk2zobJmahcyeaEVOFKVQ5dw@mail.gmail.com/
Reported-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/xfs/xfs_file.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

Comments

Amir Goldstein July 11, 2019, 3:28 p.m. UTC | #1
On Thu, Jul 11, 2019 at 5:00 PM Jan Kara <jack@suse.cz> wrote:
>
> Hole puching currently evicts pages from page cache and then goes on to
> remove blocks from the inode. This happens under both XFS_IOLOCK_EXCL
> and XFS_MMAPLOCK_EXCL which provides appropriate serialization with
> racing reads or page faults. However there is currently nothing that
> prevents readahead triggered by fadvise() or madvise() from racing with
> the hole punch and instantiating page cache page after hole punching has
> evicted page cache in xfs_flush_unmap_range() but before it has removed
> blocks from the inode. This page cache page will be mapping soon to be
> freed block and that can lead to returning stale data to userspace or
> even filesystem corruption.
>
> Fix the problem by protecting handling of readahead requests by
> XFS_IOLOCK_SHARED similarly as we protect reads.
>
> CC: stable@vger.kernel.org
> Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxjQNmxqmtA_VbYW0Su9rKRk2zobJmahcyeaEVOFKVQ5dw@mail.gmail.com/
> Reported-by: Amir Goldstein <amir73il@gmail.com>
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---

Looks sane. (I'll let xfs developers offer reviewed-by tags)

>  fs/xfs/xfs_file.c | 20 ++++++++++++++++++++
>  1 file changed, 20 insertions(+)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 76748255f843..88fe3dbb3ba2 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -33,6 +33,7 @@
>  #include <linux/pagevec.h>
>  #include <linux/backing-dev.h>
>  #include <linux/mman.h>
> +#include <linux/fadvise.h>
>
>  static const struct vm_operations_struct xfs_file_vm_ops;
>
> @@ -939,6 +940,24 @@ xfs_file_fallocate(
>         return error;
>  }
>
> +STATIC int
> +xfs_file_fadvise(
> +       struct file *file,
> +       loff_t start,
> +       loff_t end,
> +       int advice)
> +{
> +       struct xfs_inode *ip = XFS_I(file_inode(file));
> +       int ret;
> +
> +       /* Readahead needs protection from hole punching and similar ops */
> +       if (advice == POSIX_FADV_WILLNEED)
> +               xfs_ilock(ip, XFS_IOLOCK_SHARED);
> +       ret = generic_fadvise(file, start, end, advice);
> +       if (advice == POSIX_FADV_WILLNEED)
> +               xfs_iunlock(ip, XFS_IOLOCK_SHARED);
> +       return ret;
> +}
>
>  STATIC loff_t
>  xfs_file_remap_range(
> @@ -1235,6 +1254,7 @@ const struct file_operations xfs_file_operations = {
>         .fsync          = xfs_file_fsync,
>         .get_unmapped_area = thp_get_unmapped_area,
>         .fallocate      = xfs_file_fallocate,
> +       .fadvise        = xfs_file_fadvise,
>         .remap_file_range = xfs_file_remap_range,
>  };
>
> --
> 2.16.4
>
Darrick J. Wong July 11, 2019, 3:49 p.m. UTC | #2
On Thu, Jul 11, 2019 at 06:28:54PM +0300, Amir Goldstein wrote:
> On Thu, Jul 11, 2019 at 5:00 PM Jan Kara <jack@suse.cz> wrote:
> >
> > Hole puching currently evicts pages from page cache and then goes on to
> > remove blocks from the inode. This happens under both XFS_IOLOCK_EXCL
> > and XFS_MMAPLOCK_EXCL which provides appropriate serialization with
> > racing reads or page faults. However there is currently nothing that
> > prevents readahead triggered by fadvise() or madvise() from racing with
> > the hole punch and instantiating page cache page after hole punching has
> > evicted page cache in xfs_flush_unmap_range() but before it has removed
> > blocks from the inode. This page cache page will be mapping soon to be
> > freed block and that can lead to returning stale data to userspace or
> > even filesystem corruption.
> >
> > Fix the problem by protecting handling of readahead requests by
> > XFS_IOLOCK_SHARED similarly as we protect reads.
> >
> > CC: stable@vger.kernel.org
> > Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxjQNmxqmtA_VbYW0Su9rKRk2zobJmahcyeaEVOFKVQ5dw@mail.gmail.com/
> > Reported-by: Amir Goldstein <amir73il@gmail.com>
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> 
> Looks sane. (I'll let xfs developers offer reviewed-by tags)
> 
> >  fs/xfs/xfs_file.c | 20 ++++++++++++++++++++
> >  1 file changed, 20 insertions(+)
> >
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > index 76748255f843..88fe3dbb3ba2 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -33,6 +33,7 @@
> >  #include <linux/pagevec.h>
> >  #include <linux/backing-dev.h>
> >  #include <linux/mman.h>
> > +#include <linux/fadvise.h>
> >
> >  static const struct vm_operations_struct xfs_file_vm_ops;
> >
> > @@ -939,6 +940,24 @@ xfs_file_fallocate(
> >         return error;
> >  }
> >
> > +STATIC int
> > +xfs_file_fadvise(
> > +       struct file *file,
> > +       loff_t start,
> > +       loff_t end,
> > +       int advice)

Indentation needs fixing here.

> > +{
> > +       struct xfs_inode *ip = XFS_I(file_inode(file));
> > +       int ret;
> > +
> > +       /* Readahead needs protection from hole punching and similar ops */
> > +       if (advice == POSIX_FADV_WILLNEED)
> > +               xfs_ilock(ip, XFS_IOLOCK_SHARED);

It's good to fix this race, but at the same time I wonder what's the
impact to processes writing to one part of a file waiting on IOLOCK_EXCL
while readahead holds IOLOCK_SHARED?

(bluh bluh range locks ftw bluh bluh)

Do we need a lock for DONTNEED?  I think the answer is that you have to
lock the page to drop it and that will protect us from <myriad punch and
truncate spaghetti> ... ?

> > +       ret = generic_fadvise(file, start, end, advice);
> > +       if (advice == POSIX_FADV_WILLNEED)
> > +               xfs_iunlock(ip, XFS_IOLOCK_SHARED);

Maybe it'd be better to do:

	int	lockflags = 0;

	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}

	ret = generic_fadvise(file, start, end, advice);

	if (lockflags)
		xfs_iunlock(ip, lockflags);

Just in case we some day want more or different types of inode locks?

--D

> > +       return ret;
> > +}
> >
> >  STATIC loff_t
> >  xfs_file_remap_range(
> > @@ -1235,6 +1254,7 @@ const struct file_operations xfs_file_operations = {
> >         .fsync          = xfs_file_fsync,
> >         .get_unmapped_area = thp_get_unmapped_area,
> >         .fallocate      = xfs_file_fallocate,
> > +       .fadvise        = xfs_file_fadvise,
> >         .remap_file_range = xfs_file_remap_range,
> >  };
> >
> > --
> > 2.16.4
> >
Jan Kara July 12, 2019, noon UTC | #3
On Thu 11-07-19 08:49:17, Darrick J. Wong wrote:
> On Thu, Jul 11, 2019 at 06:28:54PM +0300, Amir Goldstein wrote:
> > > +{
> > > +       struct xfs_inode *ip = XFS_I(file_inode(file));
> > > +       int ret;
> > > +
> > > +       /* Readahead needs protection from hole punching and similar ops */
> > > +       if (advice == POSIX_FADV_WILLNEED)
> > > +               xfs_ilock(ip, XFS_IOLOCK_SHARED);
> 
> It's good to fix this race, but at the same time I wonder what's the
> impact to processes writing to one part of a file waiting on IOLOCK_EXCL
> while readahead holds IOLOCK_SHARED?
> 
> (bluh bluh range locks ftw bluh bluh)

Yeah, with range locks this would have less impact. Also note that we hold
the lock only during page setup and IO submission. IO itself will already
happen without IOLOCK, only under page lock. But that's enough to stop the
race.

> Do we need a lock for DONTNEED?  I think the answer is that you have to
> lock the page to drop it and that will protect us from <myriad punch and
> truncate spaghetti> ... ?

Yeah, DONTNEED is just page writeback + invalidate. So page lock is enough
to protect from anything bad. Essentially we need IOLOCK only to protect
the places that creates new pages in page cache.

> > > +       ret = generic_fadvise(file, start, end, advice);
> > > +       if (advice == POSIX_FADV_WILLNEED)
> > > +               xfs_iunlock(ip, XFS_IOLOCK_SHARED);
> 
> Maybe it'd be better to do:
> 
> 	int	lockflags = 0;
> 
> 	if (advice == POSIX_FADV_WILLNEED) {
> 		lockflags = XFS_IOLOCK_SHARED;
> 		xfs_ilock(ip, lockflags);
> 	}
> 
> 	ret = generic_fadvise(file, start, end, advice);
> 
> 	if (lockflags)
> 		xfs_iunlock(ip, lockflags);
> 
> Just in case we some day want more or different types of inode locks?

OK, will do. Just I'll get to testing this only after I return from
vacation.

								Honza
Darrick J. Wong July 12, 2019, 5:56 p.m. UTC | #4
On Fri, Jul 12, 2019 at 02:00:04PM +0200, Jan Kara wrote:
> On Thu 11-07-19 08:49:17, Darrick J. Wong wrote:
> > On Thu, Jul 11, 2019 at 06:28:54PM +0300, Amir Goldstein wrote:
> > > > +{
> > > > +       struct xfs_inode *ip = XFS_I(file_inode(file));
> > > > +       int ret;
> > > > +
> > > > +       /* Readahead needs protection from hole punching and similar ops */
> > > > +       if (advice == POSIX_FADV_WILLNEED)
> > > > +               xfs_ilock(ip, XFS_IOLOCK_SHARED);
> > 
> > It's good to fix this race, but at the same time I wonder what's the
> > impact to processes writing to one part of a file waiting on IOLOCK_EXCL
> > while readahead holds IOLOCK_SHARED?
> > 
> > (bluh bluh range locks ftw bluh bluh)
> 
> Yeah, with range locks this would have less impact. Also note that we hold
> the lock only during page setup and IO submission. IO itself will already
> happen without IOLOCK, only under page lock. But that's enough to stop the
> race.

> > Do we need a lock for DONTNEED?  I think the answer is that you have to
> > lock the page to drop it and that will protect us from <myriad punch and
> > truncate spaghetti> ... ?
> 
> Yeah, DONTNEED is just page writeback + invalidate. So page lock is enough
> to protect from anything bad. Essentially we need IOLOCK only to protect
> the places that creates new pages in page cache.
> 
> > > > +       ret = generic_fadvise(file, start, end, advice);
> > > > +       if (advice == POSIX_FADV_WILLNEED)
> > > > +               xfs_iunlock(ip, XFS_IOLOCK_SHARED);
> > 
> > Maybe it'd be better to do:
> > 
> > 	int	lockflags = 0;
> > 
> > 	if (advice == POSIX_FADV_WILLNEED) {
> > 		lockflags = XFS_IOLOCK_SHARED;
> > 		xfs_ilock(ip, lockflags);
> > 	}
> > 
> > 	ret = generic_fadvise(file, start, end, advice);
> > 
> > 	if (lockflags)
> > 		xfs_iunlock(ip, lockflags);
> > 
> > Just in case we some day want more or different types of inode locks?
> 
> OK, will do. Just I'll get to testing this only after I return from
> vacation.

<nod>

--D
> 
> 								Honza
> 
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
diff mbox series

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 76748255f843..88fe3dbb3ba2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -33,6 +33,7 @@ 
 #include <linux/pagevec.h>
 #include <linux/backing-dev.h>
 #include <linux/mman.h>
+#include <linux/fadvise.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -939,6 +940,24 @@  xfs_file_fallocate(
 	return error;
 }
 
+STATIC int
+xfs_file_fadvise(
+	struct file *file,
+	loff_t start,
+	loff_t end,
+	int advice)
+{
+	struct xfs_inode *ip = XFS_I(file_inode(file));
+	int ret;
+
+	/* Readahead needs protection from hole punching and similar ops */
+	if (advice == POSIX_FADV_WILLNEED)
+		xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = generic_fadvise(file, start, end, advice);
+	if (advice == POSIX_FADV_WILLNEED)
+		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	return ret;
+}
 
 STATIC loff_t
 xfs_file_remap_range(
@@ -1235,6 +1254,7 @@  const struct file_operations xfs_file_operations = {
 	.fsync		= xfs_file_fsync,
 	.get_unmapped_area = thp_get_unmapped_area,
 	.fallocate	= xfs_file_fallocate,
+	.fadvise	= xfs_file_fadvise,
 	.remap_file_range = xfs_file_remap_range,
 };