[6/7] xfs: use iomap_valid method to detect stale cached iomaps

Message ID 20221101003412.3842572-7-david@fromorbit.com (mailing list archive)
State Superseded
Series [1/7] xfs: write page faults in iomap are not buffered writes

Commit Message

Dave Chinner Nov. 1, 2022, 12:34 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Now that iomap supports a mechanism to validate cached iomaps for
buffered write operations, hook it up to the XFS buffered write ops
so that we can avoid data corruptions that result from stale cached
iomaps. See:

https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/

or the ->iomap_valid() introduction commit for exact details of the
corruption vector.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_bmap.c |  4 +--
 fs/xfs/xfs_aops.c        |  2 +-
 fs/xfs/xfs_iomap.c       | 69 ++++++++++++++++++++++++++++++----------
 fs/xfs/xfs_iomap.h       |  4 +--
 fs/xfs/xfs_pnfs.c        |  5 +--
 5 files changed, 61 insertions(+), 23 deletions(-)

Comments

kernel test robot Nov. 1, 2022, 9:15 a.m. UTC | #1
Hi Dave,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on xfs-linux/for-next]
[also build test WARNING on linus/master v6.1-rc3 next-20221101]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Dave-Chinner/xfs-write-page-faults-in-iomap-are-not-buffered-writes/20221101-104935
base:   https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git for-next
patch link:    https://lore.kernel.org/r/20221101003412.3842572-7-david%40fromorbit.com
patch subject: [PATCH 6/7] xfs: use iomap_valid method to detect stale cached iomaps
config: riscv-randconfig-r042-20221101
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 791a7ae1ba3efd6bca96338e10ffde557ba83920)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install riscv cross compiling tool for clang build
        # apt-get install binutils-riscv64-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/4a183febef8e4f12f2a256fb67bc3021122b662f
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Dave-Chinner/xfs-write-page-faults-in-iomap-are-not-buffered-writes/20221101-104935
        git checkout 4a183febef8e4f12f2a256fb67bc3021122b662f
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash fs/xfs/

If you fix the issue, kindly add the following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> fs/xfs/xfs_pnfs.c:183:6: warning: variable 'seq' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
           if (!error && write &&
               ^~~~~~~~~~~~~~~~~~
   fs/xfs/xfs_pnfs.c:213:52: note: uninitialized use occurs here
           error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
                                                             ^~~
   fs/xfs/xfs_pnfs.c:183:2: note: remove the 'if' if its condition is always true
           if (!error && write &&
           ^~~~~~~~~~~~~~~~~~~~~~
>> fs/xfs/xfs_pnfs.c:183:6: warning: variable 'seq' is used uninitialized whenever '&&' condition is false [-Wsometimes-uninitialized]
           if (!error && write &&
               ^~~~~~~~~~~~~~~
   fs/xfs/xfs_pnfs.c:213:52: note: uninitialized use occurs here
           error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
                                                             ^~~
   fs/xfs/xfs_pnfs.c:183:6: note: remove the '&&' if its condition is always true
           if (!error && write &&
               ^~~~~~~~~~~~~~~~~~
>> fs/xfs/xfs_pnfs.c:183:6: warning: variable 'seq' is used uninitialized whenever '&&' condition is false [-Wsometimes-uninitialized]
           if (!error && write &&
               ^~~~~~
   fs/xfs/xfs_pnfs.c:213:52: note: uninitialized use occurs here
           error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
                                                             ^~~
   fs/xfs/xfs_pnfs.c:183:6: note: remove the '&&' if its condition is always true
           if (!error && write &&
               ^~~~~~~~~
   fs/xfs/xfs_pnfs.c:128:11: note: initialize the variable 'seq' to silence this warning
           int                     seq;
                                      ^
                                       = 0
   3 warnings generated.
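
A minimal way to resolve this warning, consistent with the patch's own
convention of sampling the data fork sequence while the ILOCK is still
held, would be to sample it in the read-mapping branch as well. A
sketch, not necessarily the final fix:

	} else {
		/* Sample the data fork sequence before unlocking so the
		 * xfs_bmbt_to_iomap() call below never sees an
		 * uninitialized 'seq'. */
		seq = READ_ONCE(ip->i_df.if_seq);
		xfs_iunlock(ip, lock_flags);
	}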


vim +183 fs/xfs/xfs_pnfs.c

b39a04636fd7454 Dave Chinner      2022-01-31  106  
527851124d10f9c Christoph Hellwig 2015-02-16  107  /*
527851124d10f9c Christoph Hellwig 2015-02-16  108   * Get a layout for the pNFS client.
527851124d10f9c Christoph Hellwig 2015-02-16  109   */
527851124d10f9c Christoph Hellwig 2015-02-16  110  int
527851124d10f9c Christoph Hellwig 2015-02-16  111  xfs_fs_map_blocks(
527851124d10f9c Christoph Hellwig 2015-02-16  112  	struct inode		*inode,
527851124d10f9c Christoph Hellwig 2015-02-16  113  	loff_t			offset,
527851124d10f9c Christoph Hellwig 2015-02-16  114  	u64			length,
527851124d10f9c Christoph Hellwig 2015-02-16  115  	struct iomap		*iomap,
527851124d10f9c Christoph Hellwig 2015-02-16  116  	bool			write,
527851124d10f9c Christoph Hellwig 2015-02-16  117  	u32			*device_generation)
527851124d10f9c Christoph Hellwig 2015-02-16  118  {
527851124d10f9c Christoph Hellwig 2015-02-16  119  	struct xfs_inode	*ip = XFS_I(inode);
527851124d10f9c Christoph Hellwig 2015-02-16  120  	struct xfs_mount	*mp = ip->i_mount;
527851124d10f9c Christoph Hellwig 2015-02-16  121  	struct xfs_bmbt_irec	imap;
527851124d10f9c Christoph Hellwig 2015-02-16  122  	xfs_fileoff_t		offset_fsb, end_fsb;
527851124d10f9c Christoph Hellwig 2015-02-16  123  	loff_t			limit;
527851124d10f9c Christoph Hellwig 2015-02-16  124  	int			bmapi_flags = XFS_BMAPI_ENTIRE;
527851124d10f9c Christoph Hellwig 2015-02-16  125  	int			nimaps = 1;
527851124d10f9c Christoph Hellwig 2015-02-16  126  	uint			lock_flags;
527851124d10f9c Christoph Hellwig 2015-02-16  127  	int			error = 0;
4a183febef8e4f1 Dave Chinner      2022-11-01  128  	int			seq;
527851124d10f9c Christoph Hellwig 2015-02-16  129  
75c8c50fa16a23f Dave Chinner      2021-08-18  130  	if (xfs_is_shutdown(mp))
527851124d10f9c Christoph Hellwig 2015-02-16  131  		return -EIO;
527851124d10f9c Christoph Hellwig 2015-02-16  132  
527851124d10f9c Christoph Hellwig 2015-02-16  133  	/*
527851124d10f9c Christoph Hellwig 2015-02-16  134  	 * We can't export inodes residing on the realtime device.  The realtime
527851124d10f9c Christoph Hellwig 2015-02-16  135  	 * device doesn't have a UUID to identify it, so the client has no way
527851124d10f9c Christoph Hellwig 2015-02-16  136  	 * to find it.
527851124d10f9c Christoph Hellwig 2015-02-16  137  	 */
527851124d10f9c Christoph Hellwig 2015-02-16  138  	if (XFS_IS_REALTIME_INODE(ip))
527851124d10f9c Christoph Hellwig 2015-02-16  139  		return -ENXIO;
527851124d10f9c Christoph Hellwig 2015-02-16  140  
46eeb521b952471 Darrick J. Wong   2016-10-03  141  	/*
46eeb521b952471 Darrick J. Wong   2016-10-03  142  	 * The pNFS block layout spec actually supports reflink like
46eeb521b952471 Darrick J. Wong   2016-10-03  143  	 * functionality, but the Linux pNFS server doesn't implement it yet.
46eeb521b952471 Darrick J. Wong   2016-10-03  144  	 */
46eeb521b952471 Darrick J. Wong   2016-10-03  145  	if (xfs_is_reflink_inode(ip))
46eeb521b952471 Darrick J. Wong   2016-10-03  146  		return -ENXIO;
46eeb521b952471 Darrick J. Wong   2016-10-03  147  
527851124d10f9c Christoph Hellwig 2015-02-16  148  	/*
527851124d10f9c Christoph Hellwig 2015-02-16  149  	 * Lock out any other I/O before we flush and invalidate the pagecache,
527851124d10f9c Christoph Hellwig 2015-02-16  150  	 * and then hand out a layout to the remote system.  This is very
527851124d10f9c Christoph Hellwig 2015-02-16  151  	 * similar to direct I/O, except that the synchronization is much more
69eb5fa10eb283e Dan Williams      2018-03-20  152  	 * complicated.  See the comment near xfs_break_leased_layouts
69eb5fa10eb283e Dan Williams      2018-03-20  153  	 * for a detailed explanation.
527851124d10f9c Christoph Hellwig 2015-02-16  154  	 */
527851124d10f9c Christoph Hellwig 2015-02-16  155  	xfs_ilock(ip, XFS_IOLOCK_EXCL);
527851124d10f9c Christoph Hellwig 2015-02-16  156  
527851124d10f9c Christoph Hellwig 2015-02-16  157  	error = -EINVAL;
527851124d10f9c Christoph Hellwig 2015-02-16  158  	limit = mp->m_super->s_maxbytes;
527851124d10f9c Christoph Hellwig 2015-02-16  159  	if (!write)
527851124d10f9c Christoph Hellwig 2015-02-16  160  		limit = max(limit, round_up(i_size_read(inode),
527851124d10f9c Christoph Hellwig 2015-02-16  161  				     inode->i_sb->s_blocksize));
527851124d10f9c Christoph Hellwig 2015-02-16  162  	if (offset > limit)
527851124d10f9c Christoph Hellwig 2015-02-16  163  		goto out_unlock;
527851124d10f9c Christoph Hellwig 2015-02-16  164  	if (offset > limit - length)
527851124d10f9c Christoph Hellwig 2015-02-16  165  		length = limit - offset;
527851124d10f9c Christoph Hellwig 2015-02-16  166  
527851124d10f9c Christoph Hellwig 2015-02-16  167  	error = filemap_write_and_wait(inode->i_mapping);
527851124d10f9c Christoph Hellwig 2015-02-16  168  	if (error)
527851124d10f9c Christoph Hellwig 2015-02-16  169  		goto out_unlock;
527851124d10f9c Christoph Hellwig 2015-02-16  170  	error = invalidate_inode_pages2(inode->i_mapping);
527851124d10f9c Christoph Hellwig 2015-02-16  171  	if (WARN_ON_ONCE(error))
2bd3fa793aaa7e9 Christoph Hellwig 2020-11-11  172  		goto out_unlock;
527851124d10f9c Christoph Hellwig 2015-02-16  173  
527851124d10f9c Christoph Hellwig 2015-02-16  174  	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
527851124d10f9c Christoph Hellwig 2015-02-16  175  	offset_fsb = XFS_B_TO_FSBT(mp, offset);
527851124d10f9c Christoph Hellwig 2015-02-16  176  
527851124d10f9c Christoph Hellwig 2015-02-16  177  	lock_flags = xfs_ilock_data_map_shared(ip);
527851124d10f9c Christoph Hellwig 2015-02-16  178  	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
527851124d10f9c Christoph Hellwig 2015-02-16  179  				&imap, &nimaps, bmapi_flags);
527851124d10f9c Christoph Hellwig 2015-02-16  180  
88cdb7147b21b2d Christoph Hellwig 2019-10-30  181  	ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
88cdb7147b21b2d Christoph Hellwig 2019-10-30  182  
e696663a97e89f8 Christoph Hellwig 2019-10-30 @183  	if (!error && write &&
e696663a97e89f8 Christoph Hellwig 2019-10-30  184  	    (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) {
e696663a97e89f8 Christoph Hellwig 2019-10-30  185  		if (offset + length > XFS_ISIZE(ip))
e696663a97e89f8 Christoph Hellwig 2019-10-30  186  			end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
e696663a97e89f8 Christoph Hellwig 2019-10-30  187  		else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
e696663a97e89f8 Christoph Hellwig 2019-10-30  188  			end_fsb = min(end_fsb, imap.br_startoff +
e696663a97e89f8 Christoph Hellwig 2019-10-30  189  					       imap.br_blockcount);
e696663a97e89f8 Christoph Hellwig 2019-10-30  190  		xfs_iunlock(ip, lock_flags);
e696663a97e89f8 Christoph Hellwig 2019-10-30  191  
e696663a97e89f8 Christoph Hellwig 2019-10-30  192  		error = xfs_iomap_write_direct(ip, offset_fsb,
4a183febef8e4f1 Dave Chinner      2022-11-01  193  				end_fsb - offset_fsb, 0, &imap, &seq);
527851124d10f9c Christoph Hellwig 2015-02-16  194  		if (error)
527851124d10f9c Christoph Hellwig 2015-02-16  195  			goto out_unlock;
527851124d10f9c Christoph Hellwig 2015-02-16  196  
527851124d10f9c Christoph Hellwig 2015-02-16  197  		/*
307cdb54b80e036 Christoph Hellwig 2019-10-30  198  		 * Ensure the next transaction is committed synchronously so
307cdb54b80e036 Christoph Hellwig 2019-10-30  199  		 * that the blocks allocated and handed out to the client are
307cdb54b80e036 Christoph Hellwig 2019-10-30  200  		 * guaranteed to be present even after a server crash.
527851124d10f9c Christoph Hellwig 2015-02-16  201  		 */
b39a04636fd7454 Dave Chinner      2022-01-31  202  		error = xfs_fs_map_update_inode(ip);
472c6e46f589c26 Dave Chinner      2022-01-31  203  		if (!error)
472c6e46f589c26 Dave Chinner      2022-01-31  204  			error = xfs_log_force_inode(ip);
527851124d10f9c Christoph Hellwig 2015-02-16  205  		if (error)
527851124d10f9c Christoph Hellwig 2015-02-16  206  			goto out_unlock;
472c6e46f589c26 Dave Chinner      2022-01-31  207  
e696663a97e89f8 Christoph Hellwig 2019-10-30  208  	} else {
e696663a97e89f8 Christoph Hellwig 2019-10-30  209  		xfs_iunlock(ip, lock_flags);
527851124d10f9c Christoph Hellwig 2015-02-16  210  	}
527851124d10f9c Christoph Hellwig 2015-02-16  211  	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
527851124d10f9c Christoph Hellwig 2015-02-16  212  
4a183febef8e4f1 Dave Chinner      2022-11-01  213  	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
527851124d10f9c Christoph Hellwig 2015-02-16  214  	*device_generation = mp->m_generation;
527851124d10f9c Christoph Hellwig 2015-02-16  215  	return error;
527851124d10f9c Christoph Hellwig 2015-02-16  216  out_unlock:
527851124d10f9c Christoph Hellwig 2015-02-16  217  	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
527851124d10f9c Christoph Hellwig 2015-02-16  218  	return error;
527851124d10f9c Christoph Hellwig 2015-02-16  219  }
527851124d10f9c Christoph Hellwig 2015-02-16  220
Christoph Hellwig Nov. 2, 2022, 8:41 a.m. UTC | #2
> +	*((int *)&iomap->private) = sequence;

> +static bool
> +xfs_buffered_write_iomap_valid(
> +	struct inode		*inode,
> +	const struct iomap	*iomap)
> +{
> +	int			seq = *((int *)&iomap->private);

I really hate this stuffing of the sequence into the private pointer.
The iomap structure isn't so size constrained that we have to do that,
so we can just add a sequence number field directly to it.  I don't
think that is a layering violation, as the concept of a sequence
number is pretty generic and we'll probably need it for all file systems
eventually.
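
Concretely, that suggestion would amount to adding a field along the
lines of

	u32	seq;	/* mapping validity cookie */

to struct iomap (the name 'seq' is an assumption; nothing had been
settled at this point in the discussion), and replacing the type-punned
store in xfs_bmbt_to_iomap() with a plain assignment:

	/* Hypothetical: store the sequence directly, no pointer punning. */
	iomap->seq = sequence;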

> +
> +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
> +		return false;

Which makes me wonder if we could do away with the callback entirely
by adding an optional sequence number pointer to the iomap_iter.  If set
the core code compares it against iomap->seq and we get rid of the
per-folio indirect call, and boilerplate code that would need to be
implemented in every file system.
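
That core-side comparison could be as simple as the following sketch,
assuming struct iomap_iter grows an optional 'seq' pointer and struct
iomap grows the 'seq' field sketched above (both are assumptions, not
existing API):

	static bool
	iomap_iter_valid(
		const struct iomap_iter	*iter)
	{
		/* Filesystems that do not set iter->seq opt out. */
		if (!iter->seq)
			return true;
		return iter->iomap.seq == READ_ONCE(*iter->seq);
	}
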
Darrick J. Wong Nov. 2, 2022, 5:19 p.m. UTC | #3
On Tue, Nov 01, 2022 at 11:34:11AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that iomap supports a mechanism to validate cached iomaps for
> buffered write operations, hook it up to the XFS buffered write ops
> so that we can avoid data corruptions that result from stale cached
> iomaps. See:
> 
> https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
> 
> or the ->iomap_valid() introduction commit for exact details of the
> corruption vector.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_bmap.c |  4 +--
>  fs/xfs/xfs_aops.c        |  2 +-
>  fs/xfs/xfs_iomap.c       | 69 ++++++++++++++++++++++++++++++----------
>  fs/xfs/xfs_iomap.h       |  4 +--
>  fs/xfs/xfs_pnfs.c        |  5 +--
>  5 files changed, 61 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 49d0d4ea63fc..db225130618c 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -4551,8 +4551,8 @@ xfs_bmapi_convert_delalloc(
>  	 * the extent.  Just return the real extent at this offset.
>  	 */
>  	if (!isnullstartblock(bma.got.br_startblock)) {
> -		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
>  		*seq = READ_ONCE(ifp->if_seq);
> +		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
>  		goto out_trans_cancel;
>  	}
>  
> @@ -4599,8 +4599,8 @@ xfs_bmapi_convert_delalloc(
>  	XFS_STATS_INC(mp, xs_xstrat_quick);
>  
>  	ASSERT(!isnullstartblock(bma.got.br_startblock));
> -	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
>  	*seq = READ_ONCE(ifp->if_seq);
> +	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
>  
>  	if (whichfork == XFS_COW_FORK)

Hmm.  @ifp here could be the cow fork, which means *seq will be from the
cow fork.  This I think is going to cause problems in
xfs_buffered_write_iomap_valid...

>  		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 5d1a995b15f8..ca5a9e45a48c 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -373,7 +373,7 @@ xfs_map_blocks(
>  	    isnullstartblock(imap.br_startblock))
>  		goto allocate_blocks;
>  
> -	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
> +	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
>  	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
>  	return 0;
>  allocate_blocks:
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 2d48fcc7bd6f..5053ffcf10fe 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -54,7 +54,8 @@ xfs_bmbt_to_iomap(
>  	struct iomap		*iomap,
>  	struct xfs_bmbt_irec	*imap,
>  	unsigned int		mapping_flags,
> -	u16			iomap_flags)
> +	u16			iomap_flags,
> +	int			sequence)

Nit: The sequence numbers are unsigned int, not signed int.

>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> @@ -91,6 +92,9 @@ xfs_bmbt_to_iomap(
>  	if (xfs_ipincount(ip) &&
>  	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
>  		iomap->flags |= IOMAP_F_DIRTY;
> +
> +	/* The extent tree sequence is needed for iomap validity checking. */
> +	*((int *)&iomap->private) = sequence;
>  	return 0;
>  }
>  
> @@ -195,7 +199,8 @@ xfs_iomap_write_direct(
>  	xfs_fileoff_t		offset_fsb,
>  	xfs_fileoff_t		count_fsb,
>  	unsigned int		flags,
> -	struct xfs_bmbt_irec	*imap)
> +	struct xfs_bmbt_irec	*imap,
> +	int			*seq)
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	struct xfs_trans	*tp;
> @@ -285,6 +290,8 @@ xfs_iomap_write_direct(
>  		error = xfs_alert_fsblock_zero(ip, imap);
>  
>  out_unlock:
> +	if (seq)
> +		*seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	return error;
>  
> @@ -743,6 +750,7 @@ xfs_direct_write_iomap_begin(
>  	bool			shared = false;
>  	u16			iomap_flags = 0;
>  	unsigned int		lockmode = XFS_ILOCK_SHARED;
> +	int			seq;
>  
>  	ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
>  
> @@ -811,9 +819,10 @@ xfs_direct_write_iomap_begin(
>  			goto out_unlock;
>  	}
>  
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, lockmode);
>  	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
> -	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
> +	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
>  
>  allocate_blocks:
>  	error = -EAGAIN;
> @@ -839,24 +848,25 @@ xfs_direct_write_iomap_begin(
>  	xfs_iunlock(ip, lockmode);
>  
>  	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
> -			flags, &imap);
> +			flags, &imap, &seq);
>  	if (error)
>  		return error;
>  
>  	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
>  	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
> -				 iomap_flags | IOMAP_F_NEW);
> +				 iomap_flags | IOMAP_F_NEW, seq);
>  
>  out_found_cow:
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, lockmode);
>  	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
>  	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
>  	if (imap.br_startblock != HOLESTARTBLOCK) {
> -		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
> +		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
>  		if (error)
>  			return error;
>  	}
> -	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
> +	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);

Here we've found a mapping from the COW fork and we're encoding it into
the struct iomap.  Why is the sequence number from the *data* fork and
not the COW fork?
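
(For a mapping handed out from the COW fork, one would presumably
expect something like

	seq = READ_ONCE(ip->i_cowfp->if_seq);

here instead; which fork to sample and revalidate against is exactly
what the rest of this thread tries to pin down.)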

>  
>  out_unlock:
>  	if (lockmode)
> @@ -915,6 +925,7 @@ xfs_buffered_write_iomap_begin(
>  	int			allocfork = XFS_DATA_FORK;
>  	int			error = 0;
>  	unsigned int		lockmode = XFS_ILOCK_EXCL;
> +	int			seq;
>  
>  	if (xfs_is_shutdown(mp))
>  		return -EIO;
> @@ -1094,26 +1105,29 @@ xfs_buffered_write_iomap_begin(
>  	 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
>  	 * them out if the write happens to fail.
>  	 */
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
> -	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
> +	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
>  
>  found_imap:
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
> +	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
>  
>  found_cow:
> +	seq = READ_ONCE(ip->i_df.if_seq);

Same question here.

>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	if (imap.br_startoff <= offset_fsb) {
> -		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
> +		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
>  		if (error)
>  			return error;
>  		return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
> -					 IOMAP_F_SHARED);
> +					 IOMAP_F_SHARED, seq);
>  	}
>  
>  	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
> -	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
> +	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
>  
>  out_unlock:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> @@ -1328,9 +1342,26 @@ xfs_buffered_write_iomap_end(
>  	return 0;
>  }
>  
> +/*
> + * Check that the iomap passed to us is still valid for the given offset and
> + * length.
> + */
> +static bool
> +xfs_buffered_write_iomap_valid(
> +	struct inode		*inode,
> +	const struct iomap	*iomap)
> +{
> +	int			seq = *((int *)&iomap->private);
> +
> +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))

Why is it ok to sample the data fork's sequence number even if the
mapping came from the COW fork?  That doesn't make sense to me,
conceptually.  I definitely think it's buggy that the revalidation
might not sample from the same sequence counter as the original
measurement.

This increases my curiosity: why shouldn't we return the u32
counter and a u32 pointer in the struct iomap so that we can keep the
sample/revalidation straight in XFS, and iomap can avoid a second
indirect call?

> +		return false;
> +	return true;
> +}
> +
>  const struct iomap_ops xfs_buffered_write_iomap_ops = {
>  	.iomap_begin		= xfs_buffered_write_iomap_begin,
>  	.iomap_end		= xfs_buffered_write_iomap_end,
> +	.iomap_valid		= xfs_buffered_write_iomap_valid,
>  };
>  
>  /*
> @@ -1359,6 +1390,7 @@ xfs_read_iomap_begin(
>  	int			nimaps = 1, error = 0;
>  	bool			shared = false;
>  	unsigned int		lockmode = XFS_ILOCK_SHARED;
> +	int			seq;
>  
>  	ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
>  
> @@ -1372,13 +1404,14 @@ xfs_read_iomap_begin(
>  			       &nimaps, 0);
>  	if (!error && (flags & IOMAP_REPORT))
>  		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> +	seq = READ_ONCE(ip->i_df.if_seq);
>  	xfs_iunlock(ip, lockmode);
>  
>  	if (error)
>  		return error;
>  	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
>  	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
> -				 shared ? IOMAP_F_SHARED : 0);
> +				 shared ? IOMAP_F_SHARED : 0, seq);
>  }
>  
>  const struct iomap_ops xfs_read_iomap_ops = {
> @@ -1438,7 +1471,7 @@ xfs_seek_iomap_begin(
>  			end_fsb = min(end_fsb, data_fsb);
>  		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
>  		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
> -					  IOMAP_F_SHARED);
> +				IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq));

Here we explicitly sample from the cow fork but the comparison is done
against the data fork.  That /probably/ doesn't matter for reads since
you're not proposing that we revalidate on those.  Should we?  What
happens if userspace hands us a large read() from an unwritten extent
and the same dirty/writeback/reclaim sequence happens to the last folio
in that read() range before read_iter actually gets there?

>  		/*
>  		 * This is a COW extent, so we must probe the page cache
>  		 * because there could be dirty page cache being backed
> @@ -1460,7 +1493,8 @@ xfs_seek_iomap_begin(
>  	imap.br_state = XFS_EXT_NORM;
>  done:
>  	xfs_trim_extent(&imap, offset_fsb, end_fsb);
> -	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0,
> +			READ_ONCE(ip->i_df.if_seq));
>  out_unlock:
>  	xfs_iunlock(ip, lockmode);
>  	return error;
> @@ -1486,6 +1520,7 @@ xfs_xattr_iomap_begin(
>  	struct xfs_bmbt_irec	imap;
>  	int			nimaps = 1, error = 0;
>  	unsigned		lockmode;
> +	int			seq;
>  
>  	if (xfs_is_shutdown(mp))
>  		return -EIO;
> @@ -1502,12 +1537,14 @@ xfs_xattr_iomap_begin(
>  	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>  			       &nimaps, XFS_BMAPI_ATTRFORK);
>  out_unlock:
> +
> +	seq = READ_ONCE(ip->i_af.if_seq);
>  	xfs_iunlock(ip, lockmode);
>  
>  	if (error)
>  		return error;
>  	ASSERT(nimaps);
> -	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
> +	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
>  }
>  
>  const struct iomap_ops xfs_xattr_iomap_ops = {
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 0f62ab633040..792fed2a9072 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -13,14 +13,14 @@ struct xfs_bmbt_irec;
>  
>  int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
>  		xfs_fileoff_t count_fsb, unsigned int flags,
> -		struct xfs_bmbt_irec *imap);
> +		struct xfs_bmbt_irec *imap, int *sequence);
>  int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
>  xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
>  		xfs_fileoff_t end_fsb);
>  
>  int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
>  		struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
> -		u16 iomap_flags);
> +		u16 iomap_flags, int sequence);
>  
>  int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
>  		bool *did_zero);
> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> index 37a24f0f7cd4..eea507a80c5c 100644
> --- a/fs/xfs/xfs_pnfs.c
> +++ b/fs/xfs/xfs_pnfs.c
> @@ -125,6 +125,7 @@ xfs_fs_map_blocks(
>  	int			nimaps = 1;
>  	uint			lock_flags;
>  	int			error = 0;
> +	int			seq;
>  
>  	if (xfs_is_shutdown(mp))
>  		return -EIO;
> @@ -189,7 +190,7 @@ xfs_fs_map_blocks(
>  		xfs_iunlock(ip, lock_flags);
>  
>  		error = xfs_iomap_write_direct(ip, offset_fsb,
> -				end_fsb - offset_fsb, 0, &imap);
> +				end_fsb - offset_fsb, 0, &imap, &seq);

I got a compiler warning about seq not being set in the case where we
don't call xfs_iomap_write_direct.

--D

>  		if (error)
>  			goto out_unlock;
>  
> @@ -209,7 +210,7 @@ xfs_fs_map_blocks(
>  	}
>  	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
>  
> -	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
>  	*device_generation = mp->m_generation;
>  	return error;
>  out_unlock:
> -- 
> 2.37.2
>
Dave Chinner Nov. 2, 2022, 9:39 p.m. UTC | #4
On Wed, Nov 02, 2022 at 01:41:25AM -0700, Christoph Hellwig wrote:
> > +	*((int *)&iomap->private) = sequence;
> 
> > +static bool
> > +xfs_buffered_write_iomap_valid(
> > +	struct inode		*inode,
> > +	const struct iomap	*iomap)
> > +{
> > +	int			seq = *((int *)&iomap->private);
> 
> I really hate this stuffing of the sequence into the private pointer.

Oh, I'm no fan of it either. It was this or work out how to support
sequence numbers/cookies directly in iomaps...

> The iomap structure isn't so size constrained that we have to do that,
> so we can just add a sequence number field directly to it.  I don't
> think that is a layering violation, as the concept of a sequence
> number is pretty generic and we'll probably need it for all file systems
> eventually.

*nod*

This was the least of my worries trying to get this code to work. I
didn't have to think about it this way, so it was one less thing to
worry about.

My concerns with putting it into the iomap is that different
filesystems will have different mechanisms for detecting stale
iomaps. The way we do it with a generation counter is pretty coarse
as any change to the extent map will invalidate the iomap regardless
of whether they overlap or not.

If, in future, we want something more complex and finer grained
(e.g. an iext tree cursor) to allow us to determine if the
change to the extent tree actually modified the extent backing the
iomap, then we are going to need an opaque cookie of some kind, not
a u32 or a u32*.

> > +
> > +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
> > +		return false;
> 
> Which makes me wonder if we could do away with the callback entirely
> by adding an optional sequence number pointer to the iomap_iter.  If set
> the core code compares it against iomap->seq and we get rid of the
> per-folio indirect call, and boilerplate code that would need to be
> implemented in every file system.

I'm not convinced that this is the right way to proceed.

The writeback code also has cached iomap validity checks via a
callback.  The checks in xfs_imap_valid() require more than just a
single sequence number check - there are two sequence numbers, one
of which is conditionally checked when the inode may have COW
operations occurring.
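
For reference, the writeback-side check referred to here has roughly
the following shape (paraphrased and simplified from the era's
fs/xfs/xfs_aops.c, not quoted verbatim):

	static bool
	xfs_imap_valid(
		struct iomap_writepage_ctx *wpc,
		struct xfs_inode	*ip,
		loff_t			offset)
	{
		if (offset < wpc->iomap.offset ||
		    offset >= wpc->iomap.offset + wpc->iomap.length)
			return false;
		/* The data fork sequence is always checked... */
		if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
			return false;
		/* ...the COW fork sequence only if COW data may exist. */
		if (xfs_inode_has_cow_data(ip) &&
		    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
			return false;
		return true;
	}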

Hence I don't think that encoding a single u32 into the iomap is
generic enough even for the current "check the cached iomap is not
stale" use cases we have. I'd really like to move towards a common
mechanism for both the write and writeback paths.

I didn't have the brain capacity to think all this through at the
time, so I just stuffed the sequence number in the private field and
moved on to the next part of the problem.

Indeed, I haven't even checked if the read path might be susceptible
to stale cached iomaps yet....

Cheers,

Dave.
Dave Chinner Nov. 2, 2022, 10:36 p.m. UTC | #5
On Wed, Nov 02, 2022 at 10:19:33AM -0700, Darrick J. Wong wrote:
> On Tue, Nov 01, 2022 at 11:34:11AM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that iomap supports a mechanism to validate cached iomaps for
> > buffered write operations, hook it up to the XFS buffered write ops
> > so that we can avoid data corruptions that result from stale cached
> > iomaps. See:
> > 
> > https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
> > 
> > or the ->iomap_valid() introduction commit for exact details of the
> > corruption vector.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/libxfs/xfs_bmap.c |  4 +--
> >  fs/xfs/xfs_aops.c        |  2 +-
> >  fs/xfs/xfs_iomap.c       | 69 ++++++++++++++++++++++++++++++----------
> >  fs/xfs/xfs_iomap.h       |  4 +--
> >  fs/xfs/xfs_pnfs.c        |  5 +--
> >  5 files changed, 61 insertions(+), 23 deletions(-)
> > 
> > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > index 49d0d4ea63fc..db225130618c 100644
> > --- a/fs/xfs/libxfs/xfs_bmap.c
> > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > @@ -4551,8 +4551,8 @@ xfs_bmapi_convert_delalloc(
> >  	 * the extent.  Just return the real extent at this offset.
> >  	 */
> >  	if (!isnullstartblock(bma.got.br_startblock)) {
> > -		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
> >  		*seq = READ_ONCE(ifp->if_seq);
> > +		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
> >  		goto out_trans_cancel;
> >  	}
> >  
> > @@ -4599,8 +4599,8 @@ xfs_bmapi_convert_delalloc(
> >  	XFS_STATS_INC(mp, xs_xstrat_quick);
> >  
> >  	ASSERT(!isnullstartblock(bma.got.br_startblock));
> > -	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
> >  	*seq = READ_ONCE(ifp->if_seq);
> > +	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
> >  
> >  	if (whichfork == XFS_COW_FORK)
> 
> Hmm.  @ifp here could be the cow fork, which means *seq will be from the
> cow fork.  This I think is going to cause problems in
> xfs_buffered_write_iomap_valid...

We can't get here from the buffered write path. This can only be
reached by the writeback path via:

iomap_do_writepage
  iomap_writepage_map
    xfs_map_blocks
      xfs_convert_blocks
        xfs_bmapi_convert_delalloc

Hence the sequence numbers stored in iomap->private here are
completely ignored by the writeback code. They still get stored
in the struct xfs_writepage_ctx and checked by xfs_imap_valid(), so
the changes here were really just making sure all the
xfs_bmbt_to_iomap() callers stored the sequence number consistently.

> > @@ -839,24 +848,25 @@ xfs_direct_write_iomap_begin(
> >  	xfs_iunlock(ip, lockmode);
> >  
> >  	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
> > -			flags, &imap);
> > +			flags, &imap, &seq);
> >  	if (error)
> >  		return error;
> >  
> >  	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
> >  	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
> > -				 iomap_flags | IOMAP_F_NEW);
> > +				 iomap_flags | IOMAP_F_NEW, seq);
> >  
> >  out_found_cow:
> > +	seq = READ_ONCE(ip->i_df.if_seq);
> >  	xfs_iunlock(ip, lockmode);
> >  	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
> >  	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
> >  	if (imap.br_startblock != HOLESTARTBLOCK) {
> > -		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
> > +		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
> >  		if (error)
> >  			return error;
> >  	}
> > -	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
> > +	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
> 
> Here we've found a mapping from the COW fork and we're encoding it into
> the struct iomap.  Why is the sequence number from the *data* fork and
> not the COW fork?

Because my brain isn't big enough to hold all this code straight in
my head.  I found it all too easy to get confused by what iomap is
passed where and how to tell them apart and I could not tell if
there was an easy way to tell if the iomap passed to
xfs_iomap_valid() needed to check against the data or cow fork.

Hence I set it up such that the xfs_iomap_valid() callback only
checks the data fork sequence so the only correct thing to set in
the iomap is the data fork sequence and moved on to the next part of
the problem...

But, again, this information probably should be encoded into the
sequence cookie we store. Which begs the question - if we are doing
a COW operation, we have to check that neither the data fork (the
srcmap) nor the COW fork (the iter->iomap) has changed, right? This
is what the writeback code does (i.e. checks both data and COW fork
sequence numbers), so perhaps that's also what we should be doing
here?

i.e. either the cookie for COW operations needs to contain both
data+cow sequence numbers, and both need to match to proceed, or we
have to validate both the srcmap and the iter->iomap with separate
callouts once the folio is locked.

Thoughts?
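
One way to encode that in a single value, sketched here on the
assumption that the cookie stays a u64 rather than becoming an opaque
structure, is to pack both fork sequence numbers:

	/* Hypothetical helper: pack the data and COW fork sequence
	 * numbers into one u64 cookie so that a COW mapping can be
	 * revalidated against both forks with a single comparison. */
	static inline u64
	xfs_iomap_seq_cookie(
		struct xfs_inode	*ip,
		bool			cow)
	{
		u64			cookie = READ_ONCE(ip->i_df.if_seq);

		if (cow && ip->i_cowfp)
			cookie |= (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32;
		return cookie;
	}

The ->iomap_valid() side would then recompute the cookie the same way
and invalidate the mapping if the values no longer match.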

> > @@ -1328,9 +1342,26 @@ xfs_buffered_write_iomap_end(
> >  	return 0;
> >  }
> >  
> > +/*
> > + * Check that the iomap passed to us is still valid for the given offset and
> > + * length.
> > + */
> > +static bool
> > +xfs_buffered_write_iomap_valid(
> > +	struct inode		*inode,
> > +	const struct iomap	*iomap)
> > +{
> > +	int			seq = *((int *)&iomap->private);
> > +
> > +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
> 
> Why is it ok to sample the data fork's sequence number even if the
> mapping came from the COW fork?  That doesn't make sense to me,
> conceptually.  I definitely think it's buggy that the revalidation
> might not sample from the same sequence counter as the original
> measurement.

Yup, see above.

> >  const struct iomap_ops xfs_read_iomap_ops = {
> > @@ -1438,7 +1471,7 @@ xfs_seek_iomap_begin(
> >  			end_fsb = min(end_fsb, data_fsb);
> >  		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
> >  		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
> > -					  IOMAP_F_SHARED);
> > +				IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq));
> 
> Here we explicitly sample from the cow fork but the comparison is done
> against the data fork.  That /probably/ doesn't matter for reads since
> you're not proposing that we revalidate on those.  Should we?

As I said elsewhere, I haven't even looked at the read path. The
iomap code is now complex enough that I have trouble following it
and keeping all the corner cases in my head. This is the down side
of having people smarter than you write the code you need to debug
when shit inevitably goes wrong.

> What
> happens if userspace hands us a large read() from an unwritten extent
> and the same dirty/writeback/reclaim sequence happens to the last folio
> in that read() range before read_iter actually gets there?

No idea - shit may well go wrong there in the same way, but I've had
my hands full just fixing the write path. I'm *really* tired of
fighting with the iomap code right now...

> > @@ -189,7 +190,7 @@ xfs_fs_map_blocks(
> >  		xfs_iunlock(ip, lock_flags);
> >  
> >  		error = xfs_iomap_write_direct(ip, offset_fsb,
> > -				end_fsb - offset_fsb, 0, &imap);
> > +				end_fsb - offset_fsb, 0, &imap, &seq);
> 
> I got a compiler warning about seq not being set in the case where we
> don't call xfs_iomap_write_direct.

Yup, gcc 11.2 doesn't warn about it. Now to find out why my build
system is using gcc 11.2 when I upgraded it months ago to use
gcc-12 because of exactly these sorts of issues.... :(

-Dave.
Christoph Hellwig Nov. 4, 2022, 8:14 a.m. UTC | #6
On Thu, Nov 03, 2022 at 08:39:27AM +1100, Dave Chinner wrote:
> My concerns with putting it into the iomap is that different
> filesystems will have different mechanisms for detecting stale
> iomaps. The way we do it with a generation counter is pretty coarse
> as any change to the extent map will invalidate the iomap regardless
> of whether they overlap or not.

OTOH it is a good way to nudge users to at least implement this simple
but working scheme, and it has no real overhead if people later want
to do something more fancy.

> If, in future, we want something more complex and finer grained
> (e.g. an iext tree cursor) to allow us to determine if the
> change to the extent tree actually modified the extent backing the
> iomap, then we are going to need an opaque cookie of some kind, not
> a u32 or a u32*.

Yes, but for that it actually has to be worth it and be implemented.
Darrick J. Wong Nov. 8, 2022, midnight UTC | #7
On Thu, Nov 03, 2022 at 09:36:05AM +1100, Dave Chinner wrote:
> On Wed, Nov 02, 2022 at 10:19:33AM -0700, Darrick J. Wong wrote:
> > On Tue, Nov 01, 2022 at 11:34:11AM +1100, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Now that iomap supports a mechanism to validate cached iomaps for
> > > buffered write operations, hook it up to the XFS buffered write ops
> > > so that we can avoid data corruptions that result from stale cached
> > > iomaps. See:
> > > 
> > > https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
> > > 
> > > or the ->iomap_valid() introduction commit for exact details of the
> > > corruption vector.
> > > 
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > ---
> > >  fs/xfs/libxfs/xfs_bmap.c |  4 +--
> > >  fs/xfs/xfs_aops.c        |  2 +-
> > >  fs/xfs/xfs_iomap.c       | 69 ++++++++++++++++++++++++++++++----------
> > >  fs/xfs/xfs_iomap.h       |  4 +--
> > >  fs/xfs/xfs_pnfs.c        |  5 +--
> > >  5 files changed, 61 insertions(+), 23 deletions(-)
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > > index 49d0d4ea63fc..db225130618c 100644
> > > --- a/fs/xfs/libxfs/xfs_bmap.c
> > > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > > @@ -4551,8 +4551,8 @@ xfs_bmapi_convert_delalloc(
> > >  	 * the extent.  Just return the real extent at this offset.
> > >  	 */
> > >  	if (!isnullstartblock(bma.got.br_startblock)) {
> > > -		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
> > >  		*seq = READ_ONCE(ifp->if_seq);
> > > +		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
> > >  		goto out_trans_cancel;
> > >  	}
> > >  
> > > @@ -4599,8 +4599,8 @@ xfs_bmapi_convert_delalloc(
> > >  	XFS_STATS_INC(mp, xs_xstrat_quick);
> > >  
> > >  	ASSERT(!isnullstartblock(bma.got.br_startblock));
> > > -	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
> > >  	*seq = READ_ONCE(ifp->if_seq);
> > > +	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
> > >  
> > >  	if (whichfork == XFS_COW_FORK)
> > 
> > Hmm.  @ifp here could be the cow fork, which means *seq will be from the
> > cow fork.  This I think is going to cause problems in
> > xfs_buffered_write_iomap_valid...
> 
> We can't get here from the buffered write path. This can only be
> reached by the writeback path via:
> 
> iomap_do_writepage
>   iomap_writepage_map
>     xfs_map_blocks
>       xfs_convert_blocks
>         xfs_bmapi_convert_delalloc
> 
> Hence the sequence numbers stored in iomap->private here are
> completely ignored by the writeback code. They still get stored
> in the struct xfs_writepage_ctx and checked by xfs_imap_valid(), so
> the changes here were really just making sure all the
> xfs_bmbt_to_iomap() callers stored the sequence number consistently.
> 
> > > @@ -839,24 +848,25 @@ xfs_direct_write_iomap_begin(
> > >  	xfs_iunlock(ip, lockmode);
> > >  
> > >  	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
> > > -			flags, &imap);
> > > +			flags, &imap, &seq);
> > >  	if (error)
> > >  		return error;
> > >  
> > >  	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
> > >  	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
> > > -				 iomap_flags | IOMAP_F_NEW);
> > > +				 iomap_flags | IOMAP_F_NEW, seq);
> > >  
> > >  out_found_cow:
> > > +	seq = READ_ONCE(ip->i_df.if_seq);
> > >  	xfs_iunlock(ip, lockmode);
> > >  	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
> > >  	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
> > >  	if (imap.br_startblock != HOLESTARTBLOCK) {
> > > -		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
> > > +		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
> > >  		if (error)
> > >  			return error;
> > >  	}
> > > -	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
> > > +	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
> > 
> > Here we've found a mapping from the COW fork and we're encoding it into
> > the struct iomap.  Why is the sequence number from the *data* fork and
> > not the COW fork?
> 
> Because my brain isn't big enough to hold all this code straight in
> my head.  I found it all too easy to get confused by what iomap is
> passed where and how to tell them apart and I could not tell if
> there was an easy way to tell if the iomap passed to
> xfs_iomap_valid() needed to check against the data or cow fork.
> 
> Hence I set it up such that the xfs_iomap_valid() callback only
> checks the data fork sequence so the only correct thing to set in
> the iomap is the data fork sequence and moved on to the next part of
> the problem...
> 
> But, again, this information probably should be encoded into the
> sequence cookie we store. Which begs the question - if we are doing
> a COW operation, we have to check that neither the data fork (the
> srcmap) nor the COW fork (the iter->iomap) has changed, right? This
> is what the writeback code does (i.e. checks both data and COW fork
> sequence numbers), so perhaps that's also what we should be doing
> here?
> 
> i.e. either the cookie for COW operations needs to contain both
> data+cow sequence numbers, and both need to match to proceed, or we
> have to validate both the srcmap and the iter->iomap with separate
> callouts once the folio is locked.
> 
> Thoughts?

As we sorta discussed last week on IRC, I guess you could start by
encoding both values (as needed) in something fugly like this in struct
iomap:

union {
	void *private;
	u64 cookie;
};

But the longer term solution is (I suspect) to make a new struct that
XFS can envelop:

struct iomap_pagecache_ctx {
	struct iomap_iter	iter;	/* do not touch */
	struct iomap_page_ops	*ops;	/* callers can attach here */
};

struct xfs_pagecache_ctx {
	struct iomap_pagecache_ctx	ipctx;
	u32		data_seq;
	u32		cow_seq;
};

and pass that into the iomap functions:

	struct xfs_pagecache_ctx	ctx = { };

	ret = iomap_file_buffered_write(iocb, from, &ctx,
			&xfs_buffered_write_iomap_ops);

so that xfs_iomap_revalidate can do the usual struct unwrapping:

	struct xfs_pagecache_ctx	*xpctx;

	xpctx = container_of(ipctx, struct xfs_pagecache_ctx, ipctx);
	if (xpctx->data_seq != ip->i_df.if_seq)
		return NOPE;

But that's a pretty aggressive refactoring, not suitable for a bugfix;
do it afterwards while you're waiting for the rest of us to check over
part 1, etc.

--D

> > > @@ -1328,9 +1342,26 @@ xfs_buffered_write_iomap_end(
> > >  	return 0;
> > >  }
> > >  
> > > +/*
> > > + * Check that the iomap passed to us is still valid for the given offset and
> > > + * length.
> > > + */
> > > +static bool
> > > +xfs_buffered_write_iomap_valid(
> > > +	struct inode		*inode,
> > > +	const struct iomap	*iomap)
> > > +{
> > > +	int			seq = *((int *)&iomap->private);
> > > +
> > > +	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
> > 
> > Why is it ok to sample the data fork's sequence number even if the
> > mapping came from the COW fork?  That doesn't make sense to me,
> > conceptually.  I definitely think it's buggy that the revalidation
> > might not sample from the same sequence counter as the original
> > measurement.
> 
> Yup, see above.
> 
> > >  const struct iomap_ops xfs_read_iomap_ops = {
> > > @@ -1438,7 +1471,7 @@ xfs_seek_iomap_begin(
> > >  			end_fsb = min(end_fsb, data_fsb);
> > >  		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
> > >  		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
> > > -					  IOMAP_F_SHARED);
> > > +				IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq));
> > 
> > Here we explicitly sample from the cow fork but the comparison is done
> > against the data fork.  That /probably/ doesn't matter for reads since
> > you're not proposing that we revalidate on those.  Should we?
> 
> As I said elsewhere, I haven't even looked at the read path. The
> iomap code is now complex enough that I have trouble following it
> and keeping all the corner cases in my head. This is the down side
> of having people smarter than you write the code you need to debug
> when shit inevitably goes wrong.
> 
> > What
> > happens if userspace hands us a large read() from an unwritten extent
> > and the same dirty/writeback/reclaim sequence happens to the last folio
> > in that read() range before read_iter actually gets there?
> 
> No idea - shit may well go wrong there in the same way, but I've had
> my hands full just fixing the write path. I'm *really* tired of
> fighting with the iomap code right now...
> 
> > > @@ -189,7 +190,7 @@ xfs_fs_map_blocks(
> > >  		xfs_iunlock(ip, lock_flags);
> > >  
> > >  		error = xfs_iomap_write_direct(ip, offset_fsb,
> > > -				end_fsb - offset_fsb, 0, &imap);
> > > +				end_fsb - offset_fsb, 0, &imap, &seq);
> > 
> > I got a compiler warning about seq not being set in the case where we
> > don't call xfs_iomap_write_direct.
> 
> Yup, gcc 11.2 doesn't warn about it. Now to find out why my build
> system is using gcc 11.2 when I upgraded it months ago to use
> gcc-12 because of exactly these sorts of issues.... :(

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

Patch

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 49d0d4ea63fc..db225130618c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4551,8 +4551,8 @@  xfs_bmapi_convert_delalloc(
 	 * the extent.  Just return the real extent at this offset.
 	 */
 	if (!isnullstartblock(bma.got.br_startblock)) {
-		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
 		*seq = READ_ONCE(ifp->if_seq);
+		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
 		goto out_trans_cancel;
 	}
 
@@ -4599,8 +4599,8 @@  xfs_bmapi_convert_delalloc(
 	XFS_STATS_INC(mp, xs_xstrat_quick);
 
 	ASSERT(!isnullstartblock(bma.got.br_startblock));
-	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
 	*seq = READ_ONCE(ifp->if_seq);
+	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
 
 	if (whichfork == XFS_COW_FORK)
 		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5d1a995b15f8..ca5a9e45a48c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -373,7 +373,7 @@  xfs_map_blocks(
 	    isnullstartblock(imap.br_startblock))
 		goto allocate_blocks;
 
-	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
+	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
 	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
 	return 0;
 allocate_blocks:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2d48fcc7bd6f..5053ffcf10fe 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -54,7 +54,8 @@  xfs_bmbt_to_iomap(
 	struct iomap		*iomap,
 	struct xfs_bmbt_irec	*imap,
 	unsigned int		mapping_flags,
-	u16			iomap_flags)
+	u16			iomap_flags,
+	int			sequence)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
@@ -91,6 +92,9 @@  xfs_bmbt_to_iomap(
 	if (xfs_ipincount(ip) &&
 	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
 		iomap->flags |= IOMAP_F_DIRTY;
+
+	/* The extent tree sequence is needed for iomap validity checking. */
+	*((int *)&iomap->private) = sequence;
 	return 0;
 }
 
@@ -195,7 +199,8 @@  xfs_iomap_write_direct(
 	xfs_fileoff_t		offset_fsb,
 	xfs_fileoff_t		count_fsb,
 	unsigned int		flags,
-	struct xfs_bmbt_irec	*imap)
+	struct xfs_bmbt_irec	*imap,
+	int			*seq)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
@@ -285,6 +290,8 @@  xfs_iomap_write_direct(
 		error = xfs_alert_fsblock_zero(ip, imap);
 
 out_unlock:
+	if (seq)
+		*seq = READ_ONCE(ip->i_df.if_seq);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 
@@ -743,6 +750,7 @@  xfs_direct_write_iomap_begin(
 	bool			shared = false;
 	u16			iomap_flags = 0;
 	unsigned int		lockmode = XFS_ILOCK_SHARED;
+	int			seq;
 
 	ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
 
@@ -811,9 +819,10 @@  xfs_direct_write_iomap_begin(
 			goto out_unlock;
 	}
 
+	seq = READ_ONCE(ip->i_df.if_seq);
 	xfs_iunlock(ip, lockmode);
 	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
 
 allocate_blocks:
 	error = -EAGAIN;
@@ -839,24 +848,25 @@  xfs_direct_write_iomap_begin(
 	xfs_iunlock(ip, lockmode);
 
 	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
-			flags, &imap);
+			flags, &imap, &seq);
 	if (error)
 		return error;
 
 	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
 	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
-				 iomap_flags | IOMAP_F_NEW);
+				 iomap_flags | IOMAP_F_NEW, seq);
 
 out_found_cow:
+	seq = READ_ONCE(ip->i_df.if_seq);
 	xfs_iunlock(ip, lockmode);
 	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
 	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
 	if (imap.br_startblock != HOLESTARTBLOCK) {
-		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
 		if (error)
 			return error;
 	}
-	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
+	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
 
 out_unlock:
 	if (lockmode)
@@ -915,6 +925,7 @@  xfs_buffered_write_iomap_begin(
 	int			allocfork = XFS_DATA_FORK;
 	int			error = 0;
 	unsigned int		lockmode = XFS_ILOCK_EXCL;
+	int			seq;
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -1094,26 +1105,29 @@  xfs_buffered_write_iomap_begin(
 	 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
 	 * them out if the write happens to fail.
 	 */
+	seq = READ_ONCE(ip->i_df.if_seq);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
 
 found_imap:
+	seq = READ_ONCE(ip->i_df.if_seq);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
 
 found_cow:
+	seq = READ_ONCE(ip->i_df.if_seq);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	if (imap.br_startoff <= offset_fsb) {
-		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
 		if (error)
 			return error;
 		return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
-					 IOMAP_F_SHARED);
+					 IOMAP_F_SHARED, seq);
 	}
 
 	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
-	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
+	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
 
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1328,9 +1342,26 @@  xfs_buffered_write_iomap_end(
 	return 0;
 }
 
+/*
+ * Check that the iomap passed to us is still valid for the given offset and
+ * length.
+ */
+static bool
+xfs_buffered_write_iomap_valid(
+	struct inode		*inode,
+	const struct iomap	*iomap)
+{
+	int			seq = *((int *)&iomap->private);
+
+	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
+		return false;
+	return true;
+}
+
 const struct iomap_ops xfs_buffered_write_iomap_ops = {
 	.iomap_begin		= xfs_buffered_write_iomap_begin,
 	.iomap_end		= xfs_buffered_write_iomap_end,
+	.iomap_valid		= xfs_buffered_write_iomap_valid,
 };
 
 /*
@@ -1359,6 +1390,7 @@  xfs_read_iomap_begin(
 	int			nimaps = 1, error = 0;
 	bool			shared = false;
 	unsigned int		lockmode = XFS_ILOCK_SHARED;
+	int			seq;
 
 	ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
 
@@ -1372,13 +1404,14 @@  xfs_read_iomap_begin(
 			       &nimaps, 0);
 	if (!error && (flags & IOMAP_REPORT))
 		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+	seq = READ_ONCE(ip->i_df.if_seq);
 	xfs_iunlock(ip, lockmode);
 
 	if (error)
 		return error;
 	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
 	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
-				 shared ? IOMAP_F_SHARED : 0);
+				 shared ? IOMAP_F_SHARED : 0, seq);
 }
 
 const struct iomap_ops xfs_read_iomap_ops = {
@@ -1438,7 +1471,7 @@  xfs_seek_iomap_begin(
 			end_fsb = min(end_fsb, data_fsb);
 		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
 		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
-					  IOMAP_F_SHARED);
+				IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq));
 		/*
 		 * This is a COW extent, so we must probe the page cache
 		 * because there could be dirty page cache being backed
@@ -1460,7 +1493,8 @@  xfs_seek_iomap_begin(
 	imap.br_state = XFS_EXT_NORM;
 done:
 	xfs_trim_extent(&imap, offset_fsb, end_fsb);
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0,
+			READ_ONCE(ip->i_df.if_seq));
 out_unlock:
 	xfs_iunlock(ip, lockmode);
 	return error;
@@ -1486,6 +1520,7 @@  xfs_xattr_iomap_begin(
 	struct xfs_bmbt_irec	imap;
 	int			nimaps = 1, error = 0;
 	unsigned		lockmode;
+	int			seq;
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -1502,12 +1537,14 @@  xfs_xattr_iomap_begin(
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, XFS_BMAPI_ATTRFORK);
 out_unlock:
+
+	seq = READ_ONCE(ip->i_af.if_seq);
 	xfs_iunlock(ip, lockmode);
 
 	if (error)
 		return error;
 	ASSERT(nimaps);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
 }
 
 const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 0f62ab633040..792fed2a9072 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,14 +13,14 @@  struct xfs_bmbt_irec;
 
 int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
 		xfs_fileoff_t count_fsb, unsigned int flags,
-		struct xfs_bmbt_irec *imap);
+		struct xfs_bmbt_irec *imap, int *sequence);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
 		xfs_fileoff_t end_fsb);
 
 int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
 		struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
-		u16 iomap_flags);
+		u16 iomap_flags, int sequence);
 
 int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
 		bool *did_zero);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 37a24f0f7cd4..eea507a80c5c 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -125,6 +125,7 @@  xfs_fs_map_blocks(
 	int			nimaps = 1;
 	uint			lock_flags;
 	int			error = 0;
+	int			seq;
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -189,7 +190,7 @@  xfs_fs_map_blocks(
 		xfs_iunlock(ip, lock_flags);
 
 		error = xfs_iomap_write_direct(ip, offset_fsb,
-				end_fsb - offset_fsb, 0, &imap);
+				end_fsb - offset_fsb, 0, &imap, &seq);
 		if (error)
 			goto out_unlock;
 
@@ -209,7 +210,7 @@  xfs_fs_map_blocks(
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
 	*device_generation = mp->m_generation;
 	return error;
 out_unlock: