diff mbox series

[11/14] xfs: move the seq counters for buffered writes to a private struct

Message ID 166801780645.3992140.5437978046181951687.stgit@magnolia (mailing list archive)
State New, archived
Headers show
Series xfs, iomap: fix data corruption due to stale cached iomaps | expand

Commit Message

Darrick J. Wong Nov. 9, 2022, 6:16 p.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

To avoid racing with writeback, XFS needs to revalidate (io)mappings
after grabbing each folio lock.  Right now, XFS stashes the sequence
counter values in the (void *pointer) field of the iomap, which is not
the greatest solution.  First, we don't record which fork was sampled,
which means that the current code which samples either fork and compares
it to the data fork seqcount is wrong.  Second, if another thread
touches the cow fork, we (conservatively) want to revalidate because
*something* has changed.

The previous three patches reorganized the iomap callbacks to pass the
iomap_iter to the ->iomap_{begin,end} functions and provided a way to
set the iter->private field for a buffered write.  Now we can update the
buffered write paths in XFS to allocate the necessary memory to sample
both forks' sequence counters and revalidate the data fork during a
write operation.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/buffered-io.c   |    2 -
 fs/xfs/libxfs/xfs_bmap.c |    4 +-
 fs/xfs/xfs_aops.c        |    2 -
 fs/xfs/xfs_file.c        |    3 +
 fs/xfs/xfs_iomap.c       |  115 ++++++++++++++++++++++++++++------------------
 fs/xfs/xfs_iomap.h       |   10 +++-
 fs/xfs/xfs_pnfs.c        |    5 +-
 fs/xfs/xfs_reflink.c     |    3 +
 include/linux/iomap.h    |    2 -
 9 files changed, 90 insertions(+), 56 deletions(-)

Comments

Christoph Hellwig Nov. 15, 2022, 8:38 a.m. UTC | #1
This looks reasonable, but needs to be folded into the patch
introducing the sequence validation.
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 779244960153..130890907615 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -631,7 +631,7 @@  static int iomap_write_begin(struct iomap_iter *iter,
 	 * to zero) and corrupt data.
 	 */
 	if (ops->iomap_valid) {
-		bool iomap_valid = ops->iomap_valid(iter->inode, &iter->iomap);
+		bool iomap_valid = ops->iomap_valid(iter);
 
 		if (!iomap_valid) {
 			iter->iomap.flags |= IOMAP_F_STALE;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index db225130618c..bdaa55e725a8 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4552,7 +4552,7 @@  xfs_bmapi_convert_delalloc(
 	 */
 	if (!isnullstartblock(bma.got.br_startblock)) {
 		*seq = READ_ONCE(ifp->if_seq);
-		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
+		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
 		goto out_trans_cancel;
 	}
 
@@ -4600,7 +4600,7 @@  xfs_bmapi_convert_delalloc(
 
 	ASSERT(!isnullstartblock(bma.got.br_startblock));
 	*seq = READ_ONCE(ifp->if_seq);
-	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq);
+	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
 
 	if (whichfork == XFS_COW_FORK)
 		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index ca5a9e45a48c..5d1a995b15f8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -373,7 +373,7 @@  xfs_map_blocks(
 	    isnullstartblock(imap.br_startblock))
 		goto allocate_blocks;
 
-	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
+	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
 	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
 	return 0;
 allocate_blocks:
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f3671e22ba16..64a1e8982467 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -701,6 +701,7 @@  xfs_file_buffered_write(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
+	struct xfs_iomap_buffered_ctx ibc = { };
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			ret;
@@ -722,7 +723,7 @@  xfs_file_buffered_write(
 
 	trace_xfs_file_buffered_write(iocb, from);
 	ret = iomap_file_buffered_write(iocb, from,
-			&xfs_buffered_write_iomap_ops, NULL);
+			&xfs_buffered_write_iomap_ops, &ibc);
 	if (likely(ret >= 0))
 		iocb->ki_pos += ret;
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 00a4b60c97e9..c3c23524a3d2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -48,14 +48,30 @@  xfs_alert_fsblock_zero(
 	return -EFSCORRUPTED;
 }
 
+static inline void
+xfs_iomap_sample_seq(
+	const struct iomap_iter		*iter,
+	struct xfs_inode		*ip)
+{
+	struct xfs_iomap_buffered_ctx	*ibc = iter->private;
+
+	if (!ibc)
+		return;
+
+	/* The extent tree sequence is needed for iomap validity checking. */
+	ibc->data_seq = READ_ONCE(ip->i_df.if_seq);
+	ibc->has_cow_seq = xfs_inode_has_cow_data(ip);
+	if (ibc->has_cow_seq)
+		ibc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
+}
+
 int
 xfs_bmbt_to_iomap(
 	struct xfs_inode	*ip,
 	struct iomap		*iomap,
 	struct xfs_bmbt_irec	*imap,
 	unsigned int		mapping_flags,
-	u16			iomap_flags,
-	int			sequence)
+	u16			iomap_flags)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
@@ -92,9 +108,6 @@  xfs_bmbt_to_iomap(
 	if (xfs_ipincount(ip) &&
 	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
 		iomap->flags |= IOMAP_F_DIRTY;
-
-	/* The extent tree sequence is needed for iomap validity checking. */
-	*((int *)&iomap->private) = sequence;
 	return 0;
 }
 
@@ -193,14 +206,14 @@  xfs_iomap_eof_align_last_fsb(
 	return end_fsb;
 }
 
-int
-xfs_iomap_write_direct(
+static int
+__xfs_iomap_write_direct(
 	struct xfs_inode	*ip,
 	xfs_fileoff_t		offset_fsb,
 	xfs_fileoff_t		count_fsb,
 	unsigned int		flags,
 	struct xfs_bmbt_irec	*imap,
-	int			*seq)
+	const struct iomap_iter	*iter)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
@@ -290,8 +303,8 @@  xfs_iomap_write_direct(
 		error = xfs_alert_fsblock_zero(ip, imap);
 
 out_unlock:
-	if (seq)
-		*seq = READ_ONCE(ip->i_df.if_seq);
+	if (iter)
+		xfs_iomap_sample_seq(iter, ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 
@@ -300,6 +313,18 @@  xfs_iomap_write_direct(
 	goto out_unlock;
 }
 
+int
+xfs_iomap_write_direct(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		offset_fsb,
+	xfs_fileoff_t		count_fsb,
+	unsigned int		flags,
+	struct xfs_bmbt_irec	*imap)
+{
+	return __xfs_iomap_write_direct(ip, offset_fsb, count_fsb, flags, imap,
+			NULL);
+}
+
 STATIC bool
 xfs_quota_need_throttle(
 	struct xfs_inode	*ip,
@@ -751,7 +776,6 @@  xfs_direct_write_iomap_begin(
 	bool			shared = false;
 	u16			iomap_flags = 0;
 	unsigned int		lockmode = XFS_ILOCK_SHARED;
-	int			seq;
 
 	ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
 
@@ -820,10 +844,10 @@  xfs_direct_write_iomap_begin(
 			goto out_unlock;
 	}
 
-	seq = READ_ONCE(ip->i_df.if_seq);
+	xfs_iomap_sample_seq(iter, ip);
 	xfs_iunlock(ip, lockmode);
 	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
 
 allocate_blocks:
 	error = -EAGAIN;
@@ -848,26 +872,26 @@  xfs_direct_write_iomap_begin(
 		end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
 	xfs_iunlock(ip, lockmode);
 
-	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
-			flags, &imap, &seq);
+	error = __xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
+			flags, &imap, iter);
 	if (error)
 		return error;
 
 	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
 	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
-				 iomap_flags | IOMAP_F_NEW, seq);
+				 iomap_flags | IOMAP_F_NEW);
 
 out_found_cow:
-	seq = READ_ONCE(ip->i_df.if_seq);
+	xfs_iomap_sample_seq(iter, ip);
 	xfs_iunlock(ip, lockmode);
 	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
 	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
 	if (imap.br_startblock != HOLESTARTBLOCK) {
-		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
 		if (error)
 			return error;
 	}
-	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
 
 out_unlock:
 	if (lockmode)
@@ -926,7 +950,6 @@  xfs_buffered_write_iomap_begin(
 	int			allocfork = XFS_DATA_FORK;
 	int			error = 0;
 	unsigned int		lockmode = XFS_ILOCK_EXCL;
-	int			seq;
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -1105,29 +1128,29 @@  xfs_buffered_write_iomap_begin(
 	 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
 	 * them out if the write happens to fail.
 	 */
-	seq = READ_ONCE(ip->i_df.if_seq);
+	xfs_iomap_sample_seq(iter, ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
 
 found_imap:
-	seq = READ_ONCE(ip->i_df.if_seq);
+	xfs_iomap_sample_seq(iter, ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
 
 found_cow:
-	seq = READ_ONCE(ip->i_df.if_seq);
+	xfs_iomap_sample_seq(iter, ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	if (imap.br_startoff <= offset_fsb) {
-		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
 		if (error)
 			return error;
 		return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
-					 IOMAP_F_SHARED, seq);
+					 IOMAP_F_SHARED);
 	}
 
 	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
-	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
+	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
 
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1340,12 +1363,12 @@  xfs_buffered_write_iomap_end(
  */
 static bool
 xfs_buffered_write_iomap_valid(
-	struct inode		*inode,
-	const struct iomap	*iomap)
+	const struct iomap_iter		*iter)
 {
-	int			seq = *((int *)&iomap->private);
+	struct xfs_iomap_buffered_ctx	*ibc = iter->private;
+	struct xfs_inode		*ip = XFS_I(iter->inode);
 
-	if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq))
+	if (ibc->data_seq != READ_ONCE(ip->i_df.if_seq))
 		return false;
 	return true;
 }
@@ -1383,7 +1406,6 @@  xfs_read_iomap_begin(
 	int			nimaps = 1, error = 0;
 	bool			shared = false;
 	unsigned int		lockmode = XFS_ILOCK_SHARED;
-	int			seq;
 
 	ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
 
@@ -1397,14 +1419,15 @@  xfs_read_iomap_begin(
 			       &nimaps, 0);
 	if (!error && (flags & IOMAP_REPORT))
 		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
-	seq = READ_ONCE(ip->i_df.if_seq);
+	if (!error)
+		xfs_iomap_sample_seq(iter, ip);
 	xfs_iunlock(ip, lockmode);
 
 	if (error)
 		return error;
 	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
 	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
-				 shared ? IOMAP_F_SHARED : 0, seq);
+				 shared ? IOMAP_F_SHARED : 0);
 }
 
 const struct iomap_ops xfs_read_iomap_ops = {
@@ -1463,9 +1486,13 @@  xfs_seek_iomap_begin(
 	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
 		if (data_fsb < cow_fsb + cmap.br_blockcount)
 			end_fsb = min(end_fsb, data_fsb);
+		xfs_iomap_sample_seq(iter, ip);
 		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
 		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
-				IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq));
+				IOMAP_F_SHARED);
+		if (error)
+			goto out_unlock;
+
 		/*
 		 * This is a COW extent, so we must probe the page cache
 		 * because there could be dirty page cache being backed
@@ -1487,8 +1514,8 @@  xfs_seek_iomap_begin(
 	imap.br_state = XFS_EXT_NORM;
 done:
 	xfs_trim_extent(&imap, offset_fsb, end_fsb);
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0,
-			READ_ONCE(ip->i_df.if_seq));
+	xfs_iomap_sample_seq(iter, ip);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
 out_unlock:
 	xfs_iunlock(ip, lockmode);
 	return error;
@@ -1515,7 +1542,6 @@  xfs_xattr_iomap_begin(
 	struct xfs_bmbt_irec	imap;
 	int			nimaps = 1, error = 0;
 	unsigned		lockmode;
-	int			seq;
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -1532,14 +1558,13 @@  xfs_xattr_iomap_begin(
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, XFS_BMAPI_ATTRFORK);
 out_unlock:
-
-	seq = READ_ONCE(ip->i_af.if_seq);
+	xfs_iomap_sample_seq(iter, ip);
 	xfs_iunlock(ip, lockmode);
 
 	if (error)
 		return error;
 	ASSERT(nimaps);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
 }
 
 const struct iomap_ops xfs_xattr_iomap_ops = {
@@ -1553,13 +1578,14 @@  xfs_zero_range(
 	loff_t			len,
 	bool			*did_zero)
 {
+	struct xfs_iomap_buffered_ctx ibc = { };
 	struct inode		*inode = VFS_I(ip);
 
 	if (IS_DAX(inode))
 		return dax_zero_range(inode, pos, len, did_zero,
 				      &xfs_direct_write_iomap_ops);
 	return iomap_zero_range(inode, pos, len, did_zero,
-				&xfs_buffered_write_iomap_ops, NULL);
+				&xfs_buffered_write_iomap_ops, &ibc);
 }
 
 int
@@ -1568,11 +1594,12 @@  xfs_truncate_page(
 	loff_t			pos,
 	bool			*did_zero)
 {
+	struct xfs_iomap_buffered_ctx ibc = { };
 	struct inode		*inode = VFS_I(ip);
 
 	if (IS_DAX(inode))
 		return dax_truncate_page(inode, pos, did_zero,
 					&xfs_direct_write_iomap_ops);
 	return iomap_truncate_page(inode, pos, did_zero,
-				   &xfs_buffered_write_iomap_ops, NULL);
+				   &xfs_buffered_write_iomap_ops, &ibc);
 }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 792fed2a9072..369de43ae568 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -11,16 +11,22 @@ 
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
+struct xfs_iomap_buffered_ctx {
+	unsigned int	data_seq;
+	unsigned int	cow_seq;
+	bool		has_cow_seq;
+};
+
 int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
 		xfs_fileoff_t count_fsb, unsigned int flags,
-		struct xfs_bmbt_irec *imap, int *sequence);
+		struct xfs_bmbt_irec *imap);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
 		xfs_fileoff_t end_fsb);
 
 int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
 		struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
-		u16 iomap_flags, int sequence);
+		u16 iomap_flags);
 
 int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
 		bool *did_zero);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index eea507a80c5c..37a24f0f7cd4 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -125,7 +125,6 @@  xfs_fs_map_blocks(
 	int			nimaps = 1;
 	uint			lock_flags;
 	int			error = 0;
-	int			seq;
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -190,7 +189,7 @@  xfs_fs_map_blocks(
 		xfs_iunlock(ip, lock_flags);
 
 		error = xfs_iomap_write_direct(ip, offset_fsb,
-				end_fsb - offset_fsb, 0, &imap, &seq);
+				end_fsb - offset_fsb, 0, &imap);
 		if (error)
 			goto out_unlock;
 
@@ -210,7 +209,7 @@  xfs_fs_map_blocks(
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
 	*device_generation = mp->m_generation;
 	return error;
 out_unlock:
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 31b7b6e5db45..33b4853a31d5 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1683,6 +1683,7 @@  xfs_reflink_unshare(
 	xfs_off_t		offset,
 	xfs_off_t		len)
 {
+	struct xfs_iomap_buffered_ctx ibc = { };
 	struct inode		*inode = VFS_I(ip);
 	int			error;
 
@@ -1694,7 +1695,7 @@  xfs_reflink_unshare(
 	inode_dio_wait(inode);
 
 	error = iomap_file_unshare(inode, offset, len,
-			&xfs_buffered_write_iomap_ops, NULL);
+			&xfs_buffered_write_iomap_ops, &ibc);
 	if (error)
 		goto out;
 
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 152353164a5a..af8ead9ac362 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -182,7 +182,7 @@  struct iomap_ops {
 	 * This is called with the folio over the specified file position
 	 * held locked by the iomap code.
 	 */
-	bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
+	bool (*iomap_valid)(const struct iomap_iter *iter);
 };
 
 /**