
[2/2] xfs: use iomap_valid method to detect stale cached iomaps

Message ID 20220921082959.1411675-3-david@fromorbit.com (mailing list archive)
State New, archived
Headers show
Series iomap/xfs: fix data corruption due to stale cached iomaps

Commit Message

Dave Chinner Sept. 21, 2022, 8:29 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Now that iomap supports a mechanism to validate cached iomaps for
buffered write operations, hook it up to the XFS buffered write ops
so that we can avoid data corruptions that result from stale cached
iomaps. See:

https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/

or the ->iomap_valid() introduction commit for exact details of the
corruption vector.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_iomap.c | 53 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 4 deletions(-)

Comments

Darrick J. Wong Sept. 22, 2022, 3:44 a.m. UTC | #1
On Wed, Sep 21, 2022 at 06:29:59PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that iomap supports a mechanism to validate cached iomaps for
> buffered write operations, hook it up to the XFS buffered write ops
> so that we can avoid data corruptions that result from stale cached
> iomaps. See:
> 
> https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
> 
> or the ->iomap_valid() introduction commit for exact details of the
> corruption vector.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_iomap.c | 53 ++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 49 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 07da03976ec1..2e77ae817e6b 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -91,6 +91,12 @@ xfs_bmbt_to_iomap(
>  	if (xfs_ipincount(ip) &&
>  	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
>  		iomap->flags |= IOMAP_F_DIRTY;
> +
> +	/*
> +	 * Sample the extent tree sequence so that we can detect if the tree
> +	 * changes while the iomap is still being used.
> +	 */
> +	*((int *)&iomap->private) = READ_ONCE(ip->i_df.if_seq);
>  	return 0;
>  }
>  
> @@ -915,6 +921,7 @@ xfs_buffered_write_iomap_begin(
>  	int			allocfork = XFS_DATA_FORK;
>  	int			error = 0;
>  	unsigned int		lockmode = XFS_ILOCK_EXCL;
> +	u16			remap_flags = 0;
>  
>  	if (xfs_is_shutdown(mp))
>  		return -EIO;
> @@ -926,6 +933,20 @@ xfs_buffered_write_iomap_begin(
>  
>  	ASSERT(!XFS_IS_REALTIME_INODE(ip));
>  
> +	/*
> +	 * If we are remapping a stale iomap, preserve the IOMAP_F_NEW flag
> +	 * if it is passed to us. This will only be set if we are remapping a
> +	 * range that we just allocated and hence had set IOMAP_F_NEW on. We
> +	 * need to set it again here so any further writes over this newly
> +	 * allocated region we are remapping are preserved.
> +	 *
> +	 * This pairs with the code in xfs_buffered_write_iomap_end() that skips
> +	 * punching newly allocated delalloc regions that have iomaps marked as
> +	 * stale.
> +	 */
> +	if (iomap->flags & IOMAP_F_STALE)
> +		remap_flags = iomap->flags & IOMAP_F_NEW;
> +
>  	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
>  	if (error)
>  		return error;
> @@ -1100,7 +1121,7 @@ xfs_buffered_write_iomap_begin(
>  
>  found_imap:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
> +	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, remap_flags);

Ah, ok, so the ->iomap_begin function /is/ required to detect
IOMAP_F_STALE, carry over any IOMAP_F_NEW, and drop the IOMAP_F_STALE.

>  found_cow:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> @@ -1160,13 +1181,20 @@ xfs_buffered_write_iomap_end(
>  
>  	/*
>  	 * Trim delalloc blocks if they were allocated by this write and we
> -	 * didn't manage to write the whole range.
> +	 * didn't manage to write the whole range. If the iomap was marked stale
> +	 * because it is no longer valid, we are going to remap this range
> +	 * immediately, so don't punch it out.
>  	 *
> -	 * We don't need to care about racing delalloc as we hold i_mutex
> +	 * XXX (dgc): This next comment and assumption is totally bogus because
> +	 * iomap_page_mkwrite() runs through here and it doesn't hold the
> +	 * i_rwsem. Hence this whole error handling path may be badly broken.

That probably needs fixing, though I'll break that out as a separate
reply to the cover letter.

> +	 *
> +	 * We don't need to care about racing delalloc as we hold i_rwsem
>  	 * across the reserve/allocate/unreserve calls. If there are delalloc
>  	 * blocks in the range, they are ours.
>  	 */
> -	if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
> +	if (((iomap->flags & (IOMAP_F_NEW | IOMAP_F_STALE)) == IOMAP_F_NEW) &&
> +	    start_fsb < end_fsb) {
>  		truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
>  					 XFS_FSB_TO_B(mp, end_fsb) - 1);
>  
> @@ -1182,9 +1210,26 @@ xfs_buffered_write_iomap_end(
>  	return 0;
>  }
>  
> +/*
> + * Check that the iomap passed to us is still valid for the given offset and
> + * length.
> + */
> +static bool
> +xfs_buffered_write_iomap_valid(
> +	struct inode		*inode,
> +	const struct iomap	*iomap)
> +{
> +	int			seq = *((int *)&iomap->private);
> +
> +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
> +		return false;
> +	return true;
> +}

Wheee, thanks for tackling this one. :)

--D

> +
>  const struct iomap_ops xfs_buffered_write_iomap_ops = {
>  	.iomap_begin		= xfs_buffered_write_iomap_begin,
>  	.iomap_end		= xfs_buffered_write_iomap_end,
> +	.iomap_valid		= xfs_buffered_write_iomap_valid,
>  };
>  
>  static int
> -- 
> 2.37.2
>
Dave Chinner Sept. 23, 2022, 12:04 a.m. UTC | #2
On Wed, Sep 21, 2022 at 08:44:01PM -0700, Darrick J. Wong wrote:
> On Wed, Sep 21, 2022 at 06:29:59PM +1000, Dave Chinner wrote:
> >  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > @@ -1160,13 +1181,20 @@ xfs_buffered_write_iomap_end(
> >  
> >  	/*
> >  	 * Trim delalloc blocks if they were allocated by this write and we
> > -	 * didn't manage to write the whole range.
> > +	 * didn't manage to write the whole range. If the iomap was marked stale
> > +	 * because it is no longer valid, we are going to remap this range
> > +	 * immediately, so don't punch it out.
> >  	 *
> > -	 * We don't need to care about racing delalloc as we hold i_mutex
> > +	 * XXX (dgc): This next comment and assumption is totally bogus because
> > +	 * iomap_page_mkwrite() runs through here and it doesn't hold the
> > +	 * i_rwsem. Hence this whole error handling path may be badly broken.
> 
> That probably needs fixing, though I'll break that out as a separate
> reply to the cover letter.

I'll drop it for the moment - I wrote that note when I first noticed
the problem as a "reminder to self" to mention the problem in the
cover letter because....

> 
> > +	 *
> > +	 * We don't need to care about racing delalloc as we hold i_rwsem
> >  	 * across the reserve/allocate/unreserve calls. If there are delalloc
> >  	 * blocks in the range, they are ours.
> >  	 */
> > -	if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
> > +	if (((iomap->flags & (IOMAP_F_NEW | IOMAP_F_STALE)) == IOMAP_F_NEW) &&
> > +	    start_fsb < end_fsb) {
> >  		truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
> >  					 XFS_FSB_TO_B(mp, end_fsb) - 1);

.... I really don't like this "fix". If the next mapping (the
revalidated range) doesn't exactly fill the remainder of the
original delalloc mapping within EOF, we end up with delalloc blocks
within EOF that have no data in the page cache over them. i.e. this
relies on blind luck to avoid unflushable delalloc extents and is a
serious landmine to be leaving behind.

The fact we want buffered writes to move to shared i_rwsem operation
also means that we have no guarantee that nobody else has added data
into the page cache over this delalloc range. Hence punching out the
page cache and then the delalloc blocks is exactly the wrong thing
to be doing.

Further, racing mappings over this delalloc range mean that those
other contexts will also be trying to zero ranges of partial pages
because iomap_block_needs_zeroing() returns true for IOMAP_DELALLOC
mappings regardless of IOMAP_F_NEW.

Indeed, XFS is only using IOMAP_F_NEW on the initial delalloc
mapping to perform the above "do we need to punch out the unused
range" detection in xfs_buffered_write_iomap_end(). i.e. it's a flag
that says "we allocated this delalloc range", but it in no way
indicates "we are the only context that has written data into this
delalloc range".

Hence I suspect that the first thing we need to do here is get rid
of this use of IOMAP_F_NEW and the punching out of delalloc range
on write error. I think what we need to do here is walk the page
cache over the range of the remaining delalloc region and for every
hole that we find in the page cache, we punch only that range out.

We probably need to do this holding the mapping->invalidate_lock
exclusively to ensure the page cache contents do not change while
we are doing this walk - this will at least cause other contexts
that have the delalloc range mapped to block during page cache
insertion. This will then cause the ->iomap_valid() check they
run once the folio is inserted and locked to detect that the iomap
they hold is now invalid and needs remapping...

This would avoid the need for IOMAP_F_STALE and IOMAP_F_NEW to be
propagated into the new contexts - only iomap_iter() would need to
handle advancing STALE maps with 0 bytes processed specially....
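
Roughly something like this - a completely untested sketch, assuming
mapping_seek_hole_data() for the page cache walk and the current
xfs_bmap_punch_delalloc_range()/XFS_B_TO_FSB* signatures, just to
illustrate the shape of the walk:

static int
xfs_buffered_write_punch_delalloc_holes(
	struct xfs_inode	*ip,
	loff_t			start_byte,
	loff_t			end_byte)
{
	struct address_space	*mapping = VFS_I(ip)->i_mapping;
	struct xfs_mount	*mp = ip->i_mount;
	loff_t			punch_start = start_byte;
	int			error = 0;

	/*
	 * Hold the invalidate lock so the page cache contents are stable
	 * while we walk it. Racing contexts block on folio insertion and
	 * then fail their ->iomap_valid() check and remap.
	 */
	filemap_invalidate_lock(mapping);

	while (punch_start < end_byte) {
		loff_t	data_start, data_end;

		/* Find the start of the next page cache data in the range. */
		data_start = mapping_seek_hole_data(mapping, punch_start,
				end_byte, SEEK_DATA);
		if (data_start < 0 || data_start >= end_byte)
			data_start = end_byte;

		/* Punch out only the delalloc hole in front of that data. */
		if (data_start > punch_start) {
			xfs_fileoff_t	start_fsb = XFS_B_TO_FSBT(mp, punch_start);
			xfs_fileoff_t	end_fsb = XFS_B_TO_FSB(mp, data_start);

			error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
					end_fsb - start_fsb);
			if (error)
				break;
		}
		if (data_start >= end_byte)
			break;

		/* Skip over the cached data - it may back dirty folios. */
		data_end = mapping_seek_hole_data(mapping, data_start,
				end_byte, SEEK_HOLE);
		if (data_end < 0 || data_end > end_byte)
			data_end = end_byte;
		punch_start = data_end;
	}

	filemap_invalidate_unlock(mapping);
	return error;
}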

> > @@ -1182,9 +1210,26 @@ xfs_buffered_write_iomap_end(
> >  	return 0;
> >  }
> >  
> > +/*
> > + * Check that the iomap passed to us is still valid for the given offset and
> > + * length.
> > + */
> > +static bool
> > +xfs_buffered_write_iomap_valid(
> > +	struct inode		*inode,
> > +	const struct iomap	*iomap)
> > +{
> > +	int			seq = *((int *)&iomap->private);
> > +
> > +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
> > +		return false;
> > +	return true;
> > +}
> 
> Wheee, thanks for tackling this one. :)

I think this one might have a long way to run yet.... :/

-Dave.
Darrick J. Wong Sept. 28, 2022, 4:54 a.m. UTC | #3
On Fri, Sep 23, 2022 at 10:04:03AM +1000, Dave Chinner wrote:
> On Wed, Sep 21, 2022 at 08:44:01PM -0700, Darrick J. Wong wrote:
> > On Wed, Sep 21, 2022 at 06:29:59PM +1000, Dave Chinner wrote:
> > >  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > > @@ -1160,13 +1181,20 @@ xfs_buffered_write_iomap_end(
> > >  
> > >  	/*
> > >  	 * Trim delalloc blocks if they were allocated by this write and we
> > > -	 * didn't manage to write the whole range.
> > > +	 * didn't manage to write the whole range. If the iomap was marked stale
> > > +	 * because it is no longer valid, we are going to remap this range
> > > +	 * immediately, so don't punch it out.
> > >  	 *
> > > -	 * We don't need to care about racing delalloc as we hold i_mutex
> > > +	 * XXX (dgc): This next comment and assumption is totally bogus because
> > > +	 * iomap_page_mkwrite() runs through here and it doesn't hold the
> > > +	 * i_rwsem. Hence this whole error handling path may be badly broken.
> > 
> > That probably needs fixing, though I'll break that out as a separate
> > reply to the cover letter.
> 
> I'll drop it for the moment - I wrote that note when I first noticed
> the problem as a "reminder to self" to mention the problem in the
> cover letter because....
> 
> > 
> > > +	 *
> > > +	 * We don't need to care about racing delalloc as we hold i_rwsem
> > >  	 * across the reserve/allocate/unreserve calls. If there are delalloc
> > >  	 * blocks in the range, they are ours.
> > >  	 */
> > > -	if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
> > > +	if (((iomap->flags & (IOMAP_F_NEW | IOMAP_F_STALE)) == IOMAP_F_NEW) &&
> > > +	    start_fsb < end_fsb) {
> > >  		truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
> > >  					 XFS_FSB_TO_B(mp, end_fsb) - 1);
> 
> .... I really don't like this "fix". If the next mapping (the
> revalidated range) doesn't exactly fill the remainder of the
> original delalloc mapping within EOF, we end up with delalloc blocks
> within EOF that have no data in the page cache over them. i.e. this
> relies on blind luck to avoid unflushable delalloc extents and is a
> serious landmine to be leaving behind.

I'd kinda wondered over the years why not just leave pages in place and
in whatever state they were before, but never really wanted to dig too
deep into that.  I suppose I will when the v2 patchset arrives.

> The fact we want buffered writes to move to shared i_rwsem operation
> also means that we have no guarantee that nobody else has added data
> into the page cache over this delalloc range. Hence punching out the
> page cache and then the delalloc blocks is exactly the wrong thing
> to be doing.
> 
> Further, racing mappings over this delalloc range mean that those
> other contexts will also be trying to zero ranges of partial pages
> because iomap_block_needs_zeroing() returns true for IOMAP_DELALLOC
> mappings regardless of IOMAP_F_NEW.
> 
> Indeed, XFS is only using IOMAP_F_NEW on the initial delalloc
> mapping to perform the above "do we need to punch out the unused
> range" detection in xfs_buffered_write_iomap_end(). i.e. it's a flag
> that says "we allocated this delalloc range", but it in no way
> indicates "we are the only context that has written data into this
> delalloc range".
> 
> Hence I suspect that the first thing we need to do here is get rid
> of this use of IOMAP_F_NEW and the punching out of delalloc range
> on write error. I think what we need to do here is walk the page
> cache over the range of the remaining delalloc region and for every
> hole that we find in the page cache, we punch only that range out.

That would make more sense; I bet we'd have tripped over this as soon as
we shifted buffered writes to IOLOCK_SHARED and failed a write().

> We probably need to do this holding the mapping->invalidate_lock
> exclusively to ensure the page cache contents do not change while
> we are doing this walk - this will at least cause other contexts
> that have the delalloc range mapped to block during page cache
> insertion. This will then cause the ->iomap_valid() check they
> run once the folio is inserted and locked to detect that the iomap
> they hold is now invalid and needs remapping...

<nod>

> This would avoid the need for IOMAP_F_STALE and IOMAP_F_NEW to be
> propagated into the new contexts - only iomap_iter() would need to
> handle advancing STALE maps with 0 bytes processed specially....

Ooh nice.

> > > @@ -1182,9 +1210,26 @@ xfs_buffered_write_iomap_end(
> > >  	return 0;
> > >  }
> > >  
> > > +/*
> > > + * Check that the iomap passed to us is still valid for the given offset and
> > > + * length.
> > > + */
> > > +static bool
> > > +xfs_buffered_write_iomap_valid(
> > > +	struct inode		*inode,
> > > +	const struct iomap	*iomap)
> > > +{
> > > +	int			seq = *((int *)&iomap->private);
> > > +
> > > +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
> > > +		return false;
> > > +	return true;
> > > +}
> > 
> > Wheee, thanks for tackling this one. :)
> 
> I think this one might have a long way to run yet.... :/

It's gonna be a fun time backporting this all to 4.14. ;)

Btw, can you share the reproducer?

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Sept. 29, 2022, 1:45 a.m. UTC | #4
On Tue, Sep 27, 2022 at 09:54:27PM -0700, Darrick J. Wong wrote:
> On Fri, Sep 23, 2022 at 10:04:03AM +1000, Dave Chinner wrote:
> > On Wed, Sep 21, 2022 at 08:44:01PM -0700, Darrick J. Wong wrote:
> > > On Wed, Sep 21, 2022 at 06:29:59PM +1000, Dave Chinner wrote:
> > > > @@ -1182,9 +1210,26 @@ xfs_buffered_write_iomap_end(
> > > >  	return 0;
> > > >  }
> > > >  
> > > > +/*
> > > > + * Check that the iomap passed to us is still valid for the given offset and
> > > > + * length.
> > > > + */
> > > > +static bool
> > > > +xfs_buffered_write_iomap_valid(
> > > > +	struct inode		*inode,
> > > > +	const struct iomap	*iomap)
> > > > +{
> > > > +	int			seq = *((int *)&iomap->private);
> > > > +
> > > > +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
> > > > +		return false;
> > > > +	return true;
> > > > +}
> > > 
> > > Wheee, thanks for tackling this one. :)
> > 
> > I think this one might have a long way to run yet.... :/
> 
> It's gonna be a fun time backporting this all to 4.14. ;)

Hopefully it won't be a huge issue - the current code is more
contained to XFS and much less dependent on iomap iteration stuff...

> Btw, can you share the reproducer?

Not sure. The current reproducer I have is 2500 lines of complex C
code that was originally based on a reproducer the original reporter
provided. It does lots of stuff that isn't directly related to
reproducing the issue, and will be impossible to review and maintain
as it stands in fstests.

I will probably end up cutting it down to just a simple program that
reproduces the specific IO pattern that leads to the corruption
(reverse sequential non-block-aligned writes), then use the fstest
wrapper script to set up cgroup memory limits to cause writeback and
memory reclaim to race with the non-block-aligned writes. We only
need md5sums to detect corruption, so I think that the whole thing
can be done in a couple of hundred lines of shell and C code. If I
can reduce the write() IO pattern down to an xfs_io invocation,
everything can be done directly in the fstest script...

Cheers,

Dave.
Frank Sorenson Oct. 4, 2022, 11:34 p.m. UTC | #5
On 9/28/22 20:45, Dave Chinner wrote:
> On Tue, Sep 27, 2022 at 09:54:27PM -0700, Darrick J. Wong wrote:

>> Btw, can you share the reproducer?

> Not sure. The current reproducer I have is 2500 lines of complex C
> code that was originally based on a reproducer the original reporter
> provided. It does lots of stuff that isn't directly related to
> reproducing the issue, and will be impossible to review and maintain
> as it stands in fstests.

Too true.  Fortunately, now that I understand the necessary conditions
and IO patterns, I managed to prune it all down to ~75 lines of bash
calling xfs_io.  See below.

Frank
--
Frank Sorenson
sorenson@redhat.com
Principal Software Maintenance Engineer
Global Support Services - filesystems
Red Hat

###########################################
#!/bin/bash
#	Frank Sorenson <sorenson@redhat.com>, 2022

num_files=8
num_writers=3

KiB=1024
MiB=$(( $KiB * $KiB ))
GiB=$(( $KiB * $KiB * $KiB ))

file_size=$(( 500 * $MiB ))
#file_size=$(( 1 * $GiB ))
write_size=$(( 1 * $MiB ))
start_offset=512

num_loops=$(( ($file_size - $start_offset + (($num_writers * $write_size) - 1)) / ($num_writers * $write_size) ))
total_size=$(( ($num_loops * $num_writers * $write_size) + $start_offset ))

cgroup_path=/sys/fs/cgroup/test_write_bug
mkdir -p $cgroup_path || { echo "unable to create cgroup" ; exit ; }

max_mem=$(( 40 * $MiB ))
high_mem=$(( ($max_mem * 9) / 10 ))
echo $high_mem >$cgroup_path/memory.high
echo $max_mem >$cgroup_path/memory.max

mkdir -p testfiles
rm -f testfiles/expected
xfs_io -f -c "pwrite -b $((1 * $MiB)) -S 0x40 0 $total_size" testfiles/expected >/dev/null 2>&1
expected_sum=$(md5sum testfiles/expected | awk '{print $1}')

echo $$ > $cgroup_path/cgroup.procs || exit # put ourselves in the cgroup

do_one_testfile() {
	filenum=$1
	cpids=""
	offset=$start_offset

	rm -f testfiles/test$filenum
	xfs_io -f -c "pwrite -b $start_offset -S 0x40 0 $start_offset" testfiles/test$filenum >/dev/null 2>&1

	while [[ $offset -lt $file_size ]] ; do
		cpids=""
		for i in $(seq 1 $num_writers) ; do
			xfs_io -f -c "pwrite -b $write_size -S 0x40 $(( ($offset + (($num_writers - $i) * $write_size)  ) )) $write_size" testfiles/test$filenum >/dev/null 2>&1 &
			cpids="$cpids $!"
		done
		wait $cpids
		offset=$(( $offset + ($num_writers * $write_size) ))
	done
}

round=1
while [[ 42 ]] ; do
	echo "test round: $round"
	cpids=""
	for i in $(seq 1 $num_files) ; do
		do_one_testfile $i &
		cpids="$cpids $!"
	done
	wait $cpids

	replicated="" # now check the files
	for i in $(seq 1 $num_files) ; do
		sum=$(md5sum testfiles/test$i | awk '{print $1}')
		[[ $sum == $expected_sum ]] || replicated="$replicated testfiles/test$i"
	done

	[[ -n $replicated ]] && break
	round=$(($round + 1))
done
echo "replicated bug with: $replicated"
echo $$ > /sys/fs/cgroup/cgroup.procs
rmdir $cgroup_path
Darrick J. Wong Oct. 5, 2022, 1:34 a.m. UTC | #6
On Tue, Oct 04, 2022 at 06:34:03PM -0500, Frank Sorenson wrote:
> 
> 
> On 9/28/22 20:45, Dave Chinner wrote:
> > On Tue, Sep 27, 2022 at 09:54:27PM -0700, Darrick J. Wong wrote:
> 
> > > Btw, can you share the reproducer?
> 
> > Not sure. The current reproducer I have is 2500 lines of complex C
> > code that was originally based on a reproducer the original reporter
> > provided. It does lots of stuff that isn't directly related to
> > reproducing the issue, and will be impossible to review and maintain
> > as it stands in fstests.
> 
> Too true.  Fortunately, now that I understand the necessary conditions
> and IO patterns, I managed to prune it all down to ~75 lines of bash
> calling xfs_io.  See below.
> 
> Frank
> --
> Frank Sorenson
> sorenson@redhat.com
> Principal Software Maintenance Engineer
> Global Support Services - filesystems
> Red Hat
> 
> ###########################################
> #!/bin/bash
> #	Frank Sorenson <sorenson@redhat.com>, 2022
> 
> num_files=8
> num_writers=3
> 
> KiB=1024
> MiB=$(( $KiB * $KiB ))
> GiB=$(( $KiB * $KiB * $KiB ))
> 
> file_size=$(( 500 * $MiB ))
> #file_size=$(( 1 * $GiB ))
> write_size=$(( 1 * $MiB ))
> start_offset=512
> 
> num_loops=$(( ($file_size - $start_offset + (($num_writers * $write_size) - 1)) / ($num_writers * $write_size) ))
> total_size=$(( ($num_loops * $num_writers * $write_size) + $start_offset ))
> 
> cgroup_path=/sys/fs/cgroup/test_write_bug
> mkdir -p $cgroup_path || { echo "unable to create cgroup" ; exit ; }
> 
> max_mem=$(( 40 * $MiB ))
> high_mem=$(( ($max_mem * 9) / 10 ))
> echo $high_mem >$cgroup_path/memory.high
> echo $max_mem >$cgroup_path/memory.max

Hmm, so we set up a cgroup with a very low memory limit, and then kick off a
lot of threads doing IO... which I guess is how you ended up with a long
write to an unwritten extent that races with memory reclaim targeting a
dirty page at the end of that unwritten extent for writeback and
eviction.

I wonder, if we had a way to slow down iomap_write_iter, could we
simulate the writeback and eviction with sync_file_range and
madvise(MADV_FREE)?
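
Something along these lines, perhaps (untested user-space sketch; since
MADV_FREE only applies to private anonymous memory, it substitutes
posix_fadvise(POSIX_FADV_DONTNEED) for the eviction step):

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/*
 * Dirty a byte range, force writeback of just that range, then ask the
 * kernel to drop the now-clean pages from the page cache.
 */
static int write_sync_evict(int fd, off_t off, size_t len)
{
	char buf[4096];
	size_t done;

	memset(buf, 0x40, sizeof(buf));
	for (done = 0; done < len; done += sizeof(buf)) {
		size_t n = (len - done < sizeof(buf)) ? len - done : sizeof(buf);

		if (pwrite(fd, buf, n, off + done) < 0)
			return -1;
	}

	/* Write back only this byte range and wait for it to complete. */
	if (sync_file_range(fd, off, len,
			SYNC_FILE_RANGE_WAIT_BEFORE |
			SYNC_FILE_RANGE_WRITE |
			SYNC_FILE_RANGE_WAIT_AFTER) < 0)
		return -1;

	/* Pages are clean now - ask the kernel to evict them. */
	return posix_fadvise(fd, off, len, POSIX_FADV_DONTNEED);
}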

(I've been playing with a debug knob to slow down writeback for a
different corruption problem I've been working on, and it's taken the
repro time down from days to a 5 second fstest.)

Anyhow, thanks for the simplified repro, I'll keep thinking about this. :)

--D

> mkdir -p testfiles
> rm -f testfiles/expected
> xfs_io -f -c "pwrite -b $((1 * $MiB)) -S 0x40 0 $total_size" testfiles/expected >/dev/null 2>&1
> expected_sum=$(md5sum testfiles/expected | awk '{print $1}')
> 
> echo $$ > $cgroup_path/cgroup.procs || exit # put ourselves in the cgroup
> 
> do_one_testfile() {
> 	filenum=$1
> 	cpids=""
> 	offset=$start_offset
> 
> 	rm -f testfiles/test$filenum
> 	xfs_io -f -c "pwrite -b $start_offset -S 0x40 0 $start_offset" testfiles/test$filenum >/dev/null 2>&1
> 
> 	while [[ $offset -lt $file_size ]] ; do
> 		cpids=""
> 		for i in $(seq 1 $num_writers) ; do
> 			xfs_io -f -c "pwrite -b $write_size -S 0x40 $(( ($offset + (($num_writers - $i) * $write_size)  ) )) $write_size" testfiles/test$filenum >/dev/null 2>&1 &
> 			cpids="$cpids $!"
> 		done
> 		wait $cpids
> 		offset=$(( $offset + ($num_writers * $write_size) ))
> 	done
> }
> 
> round=1
> while [[ 42 ]] ; do
> 	echo "test round: $round"
> 	cpids=""
> 	for i in $(seq 1 $num_files) ; do
> 		do_one_testfile $i &
> 		cpids="$cpids $!"
> 	done
> 	wait $cpids
> 
> 	replicated="" # now check the files
> 	for i in $(seq 1 $num_files) ; do
> 		sum=$(md5sum testfiles/test$i | awk '{print $1}')
> 		[[ $sum == $expected_sum ]] || replicated="$replicated testfiles/test$i"
> 	done
> 
> 	[[ -n $replicated ]] && break
> 	round=$(($round + 1))
> done
> echo "replicated bug with: $replicated"
> echo $$ > /sys/fs/cgroup/cgroup.procs
> rmdir $cgroup_path

Patch

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 07da03976ec1..2e77ae817e6b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -91,6 +91,12 @@  xfs_bmbt_to_iomap(
 	if (xfs_ipincount(ip) &&
 	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
 		iomap->flags |= IOMAP_F_DIRTY;
+
+	/*
+	 * Sample the extent tree sequence so that we can detect if the tree
+	 * changes while the iomap is still being used.
+	 */
+	*((int *)&iomap->private) = READ_ONCE(ip->i_df.if_seq);
 	return 0;
 }
 
@@ -915,6 +921,7 @@  xfs_buffered_write_iomap_begin(
 	int			allocfork = XFS_DATA_FORK;
 	int			error = 0;
 	unsigned int		lockmode = XFS_ILOCK_EXCL;
+	u16			remap_flags = 0;
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -926,6 +933,20 @@  xfs_buffered_write_iomap_begin(
 
 	ASSERT(!XFS_IS_REALTIME_INODE(ip));
 
+	/*
+	 * If we are remapping a stale iomap, preserve the IOMAP_F_NEW flag
+	 * if it is passed to us. This will only be set if we are remapping a
+	 * range that we just allocated and hence had set IOMAP_F_NEW on. We
+	 * need to set it again here so any further writes over this newly
+	 * allocated region we are remapping are preserved.
+	 *
+	 * This pairs with the code in xfs_buffered_write_iomap_end() that skips
+	 * punching newly allocated delalloc regions that have iomaps marked as
+	 * stale.
+	 */
+	if (iomap->flags & IOMAP_F_STALE)
+		remap_flags = iomap->flags & IOMAP_F_NEW;
+
 	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
 	if (error)
 		return error;
@@ -1100,7 +1121,7 @@  xfs_buffered_write_iomap_begin(
 
 found_imap:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, remap_flags);
 
 found_cow:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1160,13 +1181,20 @@  xfs_buffered_write_iomap_end(
 
 	/*
 	 * Trim delalloc blocks if they were allocated by this write and we
-	 * didn't manage to write the whole range.
+	 * didn't manage to write the whole range. If the iomap was marked stale
+	 * because it is no longer valid, we are going to remap this range
+	 * immediately, so don't punch it out.
 	 *
-	 * We don't need to care about racing delalloc as we hold i_mutex
+	 * XXX (dgc): This next comment and assumption is totally bogus because
+	 * iomap_page_mkwrite() runs through here and it doesn't hold the
+	 * i_rwsem. Hence this whole error handling path may be badly broken.
+	 *
+	 * We don't need to care about racing delalloc as we hold i_rwsem
 	 * across the reserve/allocate/unreserve calls. If there are delalloc
 	 * blocks in the range, they are ours.
 	 */
-	if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
+	if (((iomap->flags & (IOMAP_F_NEW | IOMAP_F_STALE)) == IOMAP_F_NEW) &&
+	    start_fsb < end_fsb) {
 		truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
 					 XFS_FSB_TO_B(mp, end_fsb) - 1);
 
@@ -1182,9 +1210,26 @@  xfs_buffered_write_iomap_end(
 	return 0;
 }
 
+/*
+ * Check that the iomap passed to us is still valid for the given offset and
+ * length.
+ */
+static bool
+xfs_buffered_write_iomap_valid(
+	struct inode		*inode,
+	const struct iomap	*iomap)
+{
+	int			seq = *((int *)&iomap->private);
+
+	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
+		return false;
+	return true;
+}
+
 const struct iomap_ops xfs_buffered_write_iomap_ops = {
 	.iomap_begin		= xfs_buffered_write_iomap_begin,
 	.iomap_end		= xfs_buffered_write_iomap_end,
+	.iomap_valid		= xfs_buffered_write_iomap_valid,
 };
 
 static int