Message ID | 20250303171120.2837067-13-john.g.garry@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | large atomic writes for xfs with CoW | expand |
On Mon, Mar 03, 2025 at 05:11:20PM +0000, John Garry wrote: > When issuing an atomic write by the CoW method, give the block allocator a > hint to align to the extszhint. > > This means that we have a better chance to issuing the atomic write via > HW offload next time. > > It does mean that the inode extszhint should be set appropriately for the > expected atomic write size. > > Reviewed-by: "Darrick J. Wong" <djwong@kernel.org> > Signed-off-by: John Garry <john.g.garry@oracle.com> > --- > fs/xfs/libxfs/xfs_bmap.c | 7 ++++++- > fs/xfs/libxfs/xfs_bmap.h | 6 +++++- > fs/xfs/xfs_reflink.c | 8 ++++++-- > 3 files changed, 17 insertions(+), 4 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c > index 0ef19f1469ec..9bfdfb7cdcae 100644 > --- a/fs/xfs/libxfs/xfs_bmap.c > +++ b/fs/xfs/libxfs/xfs_bmap.c > @@ -3454,6 +3454,12 @@ xfs_bmap_compute_alignments( > align = xfs_get_cowextsz_hint(ap->ip); > else if (ap->datatype & XFS_ALLOC_USERDATA) > align = xfs_get_extsz_hint(ap->ip); > + > + if (align > 1 && ap->flags & XFS_BMAPI_EXTSZALIGN) needs () around the & logic. if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN)) > + args->alignment = align; > + else > + args->alignment = 1; When is args->alignment not already initialised to 1? > + > if (align) { > if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, > ap->eof, 0, ap->conv, &ap->offset, > @@ -3782,7 +3788,6 @@ xfs_bmap_btalloc( > .wasdel = ap->wasdel, > .resv = XFS_AG_RESV_NONE, > .datatype = ap->datatype, > - .alignment = 1, > .minalignslop = 0, > }; Oh, you removed the initialisation to 1, so now we have the possibility of getting args->alignment = 0 anywhere in the allocation stack? FWIW, we've been trying to get rid of that case - args->alignment should always be 1 if no alignment is necessary so we don't ahve to special case alignment of 0 (meaning no alignemnt) anywhere. This seems like a step backwards from that perspective... > xfs_fileoff_t orig_offset; > diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h > index 4b721d935994..e6baa81e20d8 100644 > --- a/fs/xfs/libxfs/xfs_bmap.h > +++ b/fs/xfs/libxfs/xfs_bmap.h > @@ -87,6 +87,9 @@ struct xfs_bmalloca { > /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ > #define XFS_BMAPI_NORMAP (1u << 10) > > +/* Try to align allocations to the extent size hint */ > +#define XFS_BMAPI_EXTSZALIGN (1u << 11) Don't we already do that? Or is this doing something subtle and non-obvious like overriding stripe width alignment for large atomic writes? -Dave.
On 09/03/2025 22:03, Dave Chinner wrote: > On Mon, Mar 03, 2025 at 05:11:20PM +0000, John Garry wrote: >> When issuing an atomic write by the CoW method, give the block allocator a >> hint to align to the extszhint. >> >> This means that we have a better chance to issuing the atomic write via >> HW offload next time. >> >> It does mean that the inode extszhint should be set appropriately for the >> expected atomic write size. >> >> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org> >> Signed-off-by: John Garry <john.g.garry@oracle.com> >> --- >> fs/xfs/libxfs/xfs_bmap.c | 7 ++++++- >> fs/xfs/libxfs/xfs_bmap.h | 6 +++++- >> fs/xfs/xfs_reflink.c | 8 ++++++-- >> 3 files changed, 17 insertions(+), 4 deletions(-) >> >> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c >> index 0ef19f1469ec..9bfdfb7cdcae 100644 >> --- a/fs/xfs/libxfs/xfs_bmap.c >> +++ b/fs/xfs/libxfs/xfs_bmap.c >> @@ -3454,6 +3454,12 @@ xfs_bmap_compute_alignments( >> align = xfs_get_cowextsz_hint(ap->ip); >> else if (ap->datatype & XFS_ALLOC_USERDATA) >> align = xfs_get_extsz_hint(ap->ip); >> + >> + if (align > 1 && ap->flags & XFS_BMAPI_EXTSZALIGN) > > needs () around the & logic. ok > > if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN)) > >> + args->alignment = align; >> + else >> + args->alignment = 1; > > When is args->alignment not already initialised to 1? > >> + >> if (align) { >> if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, >> ap->eof, 0, ap->conv, &ap->offset, >> @@ -3782,7 +3788,6 @@ xfs_bmap_btalloc( >> .wasdel = ap->wasdel, >> .resv = XFS_AG_RESV_NONE, >> .datatype = ap->datatype, >> - .alignment = 1, >> .minalignslop = 0, >> }; > > Oh, you removed the initialisation to 1, so now we have the > possibility of getting args->alignment = 0 anywhere in the > allocation stack? > > FWIW, we've been trying to get rid of that case - args->alignment should > always be 1 if no alignment is necessary so we don't ahve to special > case alignment of 0 (meaning no alignemnt) anywhere. This seems > like a step backwards from that perspective... As I recall, doing this was a suggestion when developing the forcealign support (as it had similar logic). Anyway, I can leave the init to 1 in xfs_bmap_btalloc() > > > >> xfs_fileoff_t orig_offset; >> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h >> index 4b721d935994..e6baa81e20d8 100644 >> --- a/fs/xfs/libxfs/xfs_bmap.h >> +++ b/fs/xfs/libxfs/xfs_bmap.h >> @@ -87,6 +87,9 @@ struct xfs_bmalloca { >> /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ >> #define XFS_BMAPI_NORMAP (1u << 10) >> >> +/* Try to align allocations to the extent size hint */ >> +#define XFS_BMAPI_EXTSZALIGN (1u << 11) > > Don't we already do that? > > Or is this doing something subtle and non-obvious like overriding > stripe width alignment for large atomic writes? > stripe alignment only comes into play for eof allocation. args->alignment is used in xfs_alloc_compute_aligned() to actually align the start bno. If I don't have this, then we can get this ping-pong affect when overwriting atomically the same region: # dd if=/dev/zero of=mnt/file bs=1M count=10 conv=fsync # xfs_bmap -vp mnt/file mnt/file: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..20479]: 192..20671 0 (192..20671) 20480 000000 # /xfs_io -d -C "pwrite -b 64k -V 1 -A -D 0 64k" mnt/file wrote 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0525 sec (1.190 MiB/sec and 19.0425 ops/sec) # xfs_bmap -vp mnt/file mnt/file: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..127]: 20672..20799 0 (20672..20799) 128 000000 1: [128..20479]: 320..20671 0 (320..20671) 20352 000000 # /xfs_io -d -C "pwrite -b 64k -V 1 -A -D 0 64k" mnt/file wrote 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0524 sec (1.191 MiB/sec and 19.0581 ops/sec) # xfs_bmap -vp mnt/file mnt/file: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..20479]: 192..20671 0 (192..20671) 20480 000000 # /xfs_io -d -C "pwrite -b 64k -V 1 -A -D 0 64k" mnt/file wrote 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0524 sec (1.191 MiB/sec and 19.0611 ops/sec) # xfs_bmap -vp mnt/file mnt/file: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..127]: 20672..20799 0 (20672..20799) 128 000000 1: [128..20479]: 320..20671 0 (320..20671) 20352 000000 We are never getting aligned extents wrt write length, and so have to fall back to the SW-based atomic write always. That is not what we want. Thanks, John
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0ef19f1469ec..9bfdfb7cdcae 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3454,6 +3454,12 @@ xfs_bmap_compute_alignments( align = xfs_get_cowextsz_hint(ap->ip); else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); + + if (align > 1 && ap->flags & XFS_BMAPI_EXTSZALIGN) + args->alignment = align; + else + args->alignment = 1; + if (align) { if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, @@ -3782,7 +3788,6 @@ xfs_bmap_btalloc( .wasdel = ap->wasdel, .resv = XFS_AG_RESV_NONE, .datatype = ap->datatype, - .alignment = 1, .minalignslop = 0, }; xfs_fileoff_t orig_offset; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 4b721d935994..e6baa81e20d8 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -87,6 +87,9 @@ struct xfs_bmalloca { /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP (1u << 10) +/* Try to align allocations to the extent size hint */ +#define XFS_BMAPI_EXTSZALIGN (1u << 11) + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -98,7 +101,8 @@ struct xfs_bmalloca { { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ - { XFS_BMAPI_NORMAP, "NORMAP" } + { XFS_BMAPI_NORMAP, "NORMAP" },\ + { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" } static inline int xfs_bmapi_aflag(int w) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 844e2b43357b..72fb60df9a53 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -445,6 +445,11 @@ xfs_reflink_fill_cow_hole( int error; bool found; bool atomic_sw = flags & XFS_REFLINK_ATOMIC_SW; + uint32_t bmapi_flags = XFS_BMAPI_COWFORK | + XFS_BMAPI_PREALLOC; + + if (atomic_sw) + bmapi_flags |= XFS_BMAPI_EXTSZALIGN; resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); @@ -478,8 +483,7 @@ xfs_reflink_fill_cow_hole( /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, - XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, - &nimaps); + bmapi_flags, 0, cmap, &nimaps); if (error) goto out_trans_cancel;