diff mbox series

[2/4] xfs: make file data allocations observe the 'forcealign' flag

Message ID 170404855929.1770028.8502538039360735032.stgit@frogsfrogsfrogs (mailing list archive)
State New
Headers show
Series [1/4] xfs: create a new inode flag to require extsize alignment of file data space | expand

Commit Message

Darrick J. Wong Dec. 31, 2023, 10:03 p.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

The existing extsize hint code already did the work of expanding file
range mapping requests so that the range is aligned to the hint value.
Now add the code we need to guarantee that the space allocations are
also always aligned.

Co-developed-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c |   48 ++++++++++++++++++++++++++++++++++++++++++----
 fs/xfs/xfs_iomap.c       |    4 +++-
 fs/xfs/xfs_reflink.c     |    4 ++++
 3 files changed, 51 insertions(+), 5 deletions(-)

Comments

John Garry Jan. 16, 2024, 9:26 a.m. UTC | #1
Hi Darrick,

As mentioned internally, we have an issue for atomic writes [0] that we 
get an aligned and but not fully-written extent when we initially write 
a size less than the forcealign size, like:

#/mkfs.xfs -f -d forcealign=16k /dev/sda
...
# mount /dev/sda mnt
# touch  mnt/file
# /test-pwritev2 -a -d -l 4096 -p 0 /root/mnt/file # direct IO, atomic 
write, 4096B at pos 0
# filefrag -v mnt/file
Filesystem type is: 58465342
File size of mnt/file is 4096 (1 block of 4096 bytes)
  ext:     logical_offset:        physical_offset: length:   expected: 
flags:
    0:        0..       0:         24..        24:      1: 
last,eof
mnt/file: 1 extent found
# /test-pwritev2 -a -d -l 16384 -p 0 /root/mnt/file
wrote -1 bytes at pos 0 write_size=16384
#

This causes an issue for atomic writes in that the 16K write means 2x 
mappings and then 2x BIOs, which we cannot tolerate.

So how about this change on top:

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 731260a5af6d..6609f1058ae3 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -685,6 +685,12 @@ xfs_can_free_eofblocks(
         end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
         if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
                 end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
+
+       /* Don't trim eof blocks */
+       if (xfs_inode_force_align(ip)) {
+               end_fsb = roundup_64(end_fsb, xfs_get_extsz_hint(ip));
+       }
+
         last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
         if (last_fsb <= end_fsb)
                 return false;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0c7008322326..c906e3a424d1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -291,6 +291,10 @@ xfs_iomap_write_direct(
                 }david@fromorbit.com
         }

+       if (xfs_inode_force_align(ip)) {
+               bmapi_flags = XFS_BMAPI_ZERO;
+       }
+
         error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
                         rblocks, force, &tp);
         if (error)
lines 1-38/38 (END)


Which gives:

#/mkfs.xfs -d forcealign=16k /dev/sda
...
# /test-pwritev2 -a -d -l 4096 -p 0 /root/mnt/file
wrote 4096 bytes at pos 0 write_size=4096
# filefrag -v mnt/file
Filesystem type is: 58465342
File size of mnt/file is 4096 (1 block of 4096 bytes)
  ext:     logical_offset:        physical_offset: length:   expected: 
flags:
    0:        0..       3:         24..        27:      4: 
last,eof
mnt/file: 1 extent found
#
# /test-pwritev2 -a -d -l 16384 -p 0 /root/mnt/file
wrote 16384 bytes at pos 0 write_size=16384
# filefrag -v mnt/file
Filesystem type is: 58465342
File size of mnt/file is 16384 (4 blocks of 4096 bytes)
  ext:     logical_offset:        physical_offset: length:   expected: 
flags:
    0:        0..       3:         24..        27:      4: 
last,eof
mnt/file: 1 extent found
#

Or maybe make that change under FS_XFLAG_ATOMICWRITES flag. Previously 
we were pre-zero'ing the complete file to get around this.

Thanks,
John

[0] 
https://lore.kernel.org/linux-scsi/20240111161522.GB16626@lst.de/T/#mbc6824fbe9ce62c9506aa4c3f281173747695d77 
(just referencing for others)
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 7d0d82353a745..b14761ec96b87 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3351,6 +3351,17 @@  xfs_bmap_btalloc_accounting(
 		args->len);
 }
 
+static inline bool
+xfs_bmap_use_forcealign(
+	const struct xfs_bmalloca	*ap)
+{
+	if (!xfs_inode_force_align(ap->ip))
+		return false;
+
+	return  (ap->flags & XFS_BMAPI_COWFORK) ||
+		(ap->datatype & XFS_ALLOC_USERDATA);
+}
+
 static int
 xfs_bmap_compute_alignments(
 	struct xfs_bmalloca	*ap,
@@ -3366,6 +3377,17 @@  xfs_bmap_compute_alignments(
 	else if (mp->m_dalign)
 		stripe_align = mp->m_dalign;
 
+	/*
+	 * File data mappings with forced alignment can use the stripe
+	 * alignment if it's a multiple of the forcealign value.  Otherwise,
+	 * use the regular forcealign value.
+	 */
+	if (xfs_bmap_use_forcealign(ap)) {
+		if (!stripe_align || stripe_align % mp->m_sb.sb_rextsize)
+			stripe_align = mp->m_sb.sb_rextsize;
+		args->alignment = stripe_align;
+	}
+
 	if (ap->flags & XFS_BMAPI_COWFORK)
 		align = xfs_get_cowextsz_hint(ap->ip);
 	else if (ap->datatype & XFS_ALLOC_USERDATA)
@@ -3438,6 +3460,9 @@  xfs_bmap_exact_minlen_extent_alloc(
 
 	ASSERT(ap->length);
 
+	if (xfs_inode_force_align(ap->ip))
+		return -ENOSPC;
+
 	if (ap->minlen != 1) {
 		ap->blkno = NULLFSBLOCK;
 		ap->length = 0;
@@ -3511,6 +3536,7 @@  xfs_bmap_btalloc_at_eof(
 {
 	struct xfs_mount	*mp = args->mp;
 	struct xfs_perag	*caller_pag = args->pag;
+	int			orig_alignment = args->alignment;
 	int			error;
 
 	/*
@@ -3585,10 +3611,10 @@  xfs_bmap_btalloc_at_eof(
 
 	/*
 	 * Allocation failed, so turn return the allocation args to their
-	 * original non-aligned state so the caller can proceed on allocation
-	 * failure as if this function was never called.
+	 * original state so the caller can proceed on allocation failure as
+	 * if this function was never called.
 	 */
-	args->alignment = 1;
+	args->alignment = orig_alignment;
 	return 0;
 }
 
@@ -3611,6 +3637,10 @@  xfs_bmap_btalloc_low_space(
 {
 	int			error;
 
+	/* Don't try unaligned last-chance allocations with forcealign */
+	if (xfs_inode_force_align(ap->ip))
+		return -ENOSPC;
+
 	if (args->minlen > ap->minlen) {
 		args->minlen = ap->minlen;
 		error = xfs_alloc_vextent_start_ag(args, ap->blkno);
@@ -4115,7 +4145,9 @@  xfs_bmap_alloc_userdata(
 		if (bma->offset == 0)
 			bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
 
-		if (mp->m_dalign && bma->length >= mp->m_dalign) {
+		/* forcealign mode reuses the stripe unit alignment code */
+		if (xfs_inode_force_align(bma->ip) ||
+		    (mp->m_dalign && bma->length >= mp->m_dalign)) {
 			error = xfs_bmap_isaeof(bma, whichfork);
 			if (error)
 				return error;
@@ -6381,6 +6413,10 @@  xfs_extlen_t
 xfs_get_extsz_hint(
 	struct xfs_inode	*ip)
 {
+	/* forcealign means we align to rextsize */
+	if (xfs_inode_force_align(ip))
+		return ip->i_mount->m_sb.sb_rextsize;
+
 	/*
 	 * No point in aligning allocations if we need to COW to actually
 	 * write to them.
@@ -6405,6 +6441,10 @@  xfs_get_cowextsz_hint(
 {
 	xfs_extlen_t		a, b;
 
+	/* forcealign means we align to rextsize */
+	if (xfs_inode_force_align(ip))
+		return ip->i_mount->m_sb.sb_rextsize;
+
 	a = 0;
 	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 		a = ip->i_cowextsize;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 559e8e7855952..3bfbcbed1bd68 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -185,7 +185,9 @@  xfs_eof_alignment(
 		 * If mounted with the "-o swalloc" option the alignment is
 		 * increased from the strip unit size to the stripe width.
 		 */
-		if (mp->m_swidth && xfs_has_swalloc(mp))
+		if (xfs_inode_force_align(ip))
+			align = xfs_get_extsz_hint(ip);
+		else if (mp->m_swidth && xfs_has_swalloc(mp))
 			align = mp->m_swidth;
 		else if (mp->m_dalign)
 			align = mp->m_dalign;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 0c54522404963..da39da13fcd7d 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1803,6 +1803,10 @@  xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) != IS_DAX(inode_out))
 		goto out_unlock;
 
+	/* XXX Can't reflink forcealign files for now */
+	if (xfs_inode_force_align(src) || xfs_inode_force_align(dest))
+		goto out_unlock;
+
 	/* Check non-power of two alignment issues, if necessary. */
 	if (XFS_IS_REALTIME_INODE(dest) && !is_power_of_2(alloc_unit)) {
 		ret = xfs_reflink_remap_check_rtalign(src, pos_in, dest,