diff mbox series

[v6,11/12] xfs: add xfs_compute_atomic_write_unit_max()

Message ID 20250408104209.1852036-12-john.g.garry@oracle.com (mailing list archive)
State New
Headers show
Series large atomic writes for xfs | expand

Commit Message

John Garry April 8, 2025, 10:42 a.m. UTC
Now that CoW-based atomic writes are supported, update the max size of an
atomic write for the data device.

The limit of a CoW-based atomic write will be the limit of the number of
logitems which can fit into a single transaction.

In addition, the max atomic write size needs to be aligned to the agsize.
Limit the size of atomic writes to the greatest power-of-two factor of the
agsize so that allocations for an atomic write will always be aligned
compatibly with the alignment requirements of the storage.

rtvol is not commonly used, so it is not very important to support large
atomic writes there initially.

Furthermore, adding large atomic writes for rtvol would be complicated due
to alignment already offered by rtextsize and also the limitation of
reflink support only being possible when rtextsize is a power-of-2.

Function xfs_atomic_write_logitems() is added to find the limit on the
number of log items which can fit in a single transaction.

Darrick Wong contributed the changes in xfs_atomic_write_logitems()
originally, but may now be outdated by [0].

[0] https://lore.kernel.org/linux-xfs/20250406172227.GC6307@frogsfrogsfrogs/

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/xfs_mount.c | 36 ++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_mount.h |  5 +++++
 fs/xfs/xfs_super.c | 22 ++++++++++++++++++++++
 fs/xfs/xfs_super.h |  1 +
 4 files changed, 64 insertions(+)

Comments

Darrick J. Wong April 8, 2025, 9:28 p.m. UTC | #1
On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> Now that CoW-based atomic writes are supported, update the max size of an
> atomic write for the data device.
> 
> The limit of a CoW-based atomic write will be the limit of the number of
> logitems which can fit into a single transaction.
> 
> In addition, the max atomic write size needs to be aligned to the agsize.
> Limit the size of atomic writes to the greatest power-of-two factor of the
> agsize so that allocations for an atomic write will always be aligned
> compatibly with the alignment requirements of the storage.
> 
> rtvol is not commonly used, so it is not very important to support large
> atomic writes there initially.
> 
> Furthermore, adding large atomic writes for rtvol would be complicated due
> to alignment already offered by rtextsize and also the limitation of
> reflink support only be possible for rtextsize is a power-of-2.
> 
> Function xfs_atomic_write_logitems() is added to find the limit the number
> of log items which can fit in a single transaction.
> 
> Darrick Wong contributed the changes in xfs_atomic_write_logitems()
> originally, but may now be outdated by [0].
> 
> [0] https://lore.kernel.org/linux-xfs/20250406172227.GC6307@frogsfrogsfrogs/
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_mount.c | 36 ++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_mount.h |  5 +++++
>  fs/xfs/xfs_super.c | 22 ++++++++++++++++++++++
>  fs/xfs/xfs_super.h |  1 +
>  4 files changed, 64 insertions(+)
> 
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 00b53f479ece..27a737202637 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -666,6 +666,37 @@ xfs_agbtree_compute_maxlevels(
>  	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
>  }
>  
> +static inline void
> +xfs_compute_atomic_write_unit_max(
> +	struct xfs_mount	*mp)
> +{
> +	xfs_agblock_t		agsize = mp->m_sb.sb_agblocks;
> +	unsigned int		max_extents_logitems;
> +	unsigned int		max_agsize;
> +
> +	if (!xfs_has_reflink(mp)) {
> +		mp->m_atomic_write_unit_max = 1;
> +		return;
> +	}
> +
> +	/*
> +	 * Find limit according to logitems.
> +	 */
> +	max_extents_logitems = xfs_atomic_write_logitems(mp);
> +
> +	/*
> +	 * Also limit the size of atomic writes to the greatest power-of-two
> +	 * factor of the agsize so that allocations for an atomic write will
> +	 * always be aligned compatibly with the alignment requirements of the
> +	 * storage.
> +	 * The greatest power-of-two is the value according to the lowest bit
> +	 * set.
> +	 */
> +	max_agsize = 1 << (ffs(agsize) - 1);
> +
> +	mp->m_atomic_write_unit_max = min(max_extents_logitems, max_agsize);
> +}
> +
>  /* Compute maximum possible height for realtime btree types for this fs. */
>  static inline void
>  xfs_rtbtree_compute_maxlevels(
> @@ -842,6 +873,11 @@ xfs_mountfs(
>  	 */
>  	xfs_trans_init(mp);
>  
> +	/*
> +	 * Pre-calculate atomic write unit max.
> +	 */
> +	xfs_compute_atomic_write_unit_max(mp);
> +
>  	/*
>  	 * Allocate and initialize the per-ag data.
>  	 */
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 799b84220ebb..4462bffbf0ff 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -230,6 +230,11 @@ typedef struct xfs_mount {
>  	bool			m_update_sb;	/* sb needs update in mount */
>  	unsigned int		m_max_open_zones;
>  
> +	/*
> +	 * data device max atomic write.
> +	 */
> +	xfs_extlen_t		m_atomic_write_unit_max;
> +
>  	/*
>  	 * Bitsets of per-fs metadata that have been checked and/or are sick.
>  	 * Callers must hold m_sb_lock to access these two fields.
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b2dd0c0bf509..42b2b7540507 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
>  	return -ENOMEM;
>  }
>  
> +unsigned int
> +xfs_atomic_write_logitems(
> +	struct xfs_mount	*mp)
> +{
> +	unsigned int		efi = xfs_efi_item_overhead(1);
> +	unsigned int		rui = xfs_rui_item_overhead(1);
> +	unsigned int		cui = xfs_cui_item_overhead(1);
> +	unsigned int		bui = xfs_bui_item_overhead(1);

Intent items can be relogged during a transaction roll, so you need to
add the done item overhead too, e.g.

	const unsigned int	efi = xfs_efi_item_overhead(1) +
				      xfs_efd_item_overhead(1);
	const unsigned int	rui = xfs_rui_item_overhead(1) +
				      xfs_rud_item_overhead();
	const unsigned int	cui = xfs_cui_item_overhead(1) +
				      xfs_cud_item_overhead();
	const unsigned int	bui = xfs_bui_item_overhead(1) +
				      xfs_bud_item_overhead();

> +	unsigned int		logres = M_RES(mp)->tr_write.tr_logres;
> +
> +	/*
> +	 * Maximum overhead to complete an atomic write ioend in software:
> +	 * remove data fork extent + remove cow fork extent +
> +	 * map extent into data fork
> +	 */
> +	unsigned int		atomic_logitems =
> +		(bui + cui + rui + efi) + (cui + rui) + (bui + rui);

You still have to leave enough space to finish at least one step of the
intent types that can be attached to the untorn cow ioend.  Assuming
that you have functions to compute the reservation needed to finish one
step of each of the four intent item types, the worst case reservation
to finish one item is:

	/* Overhead to finish one step of each intent item type */
	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
	const unsigned int	f3 = xfs_calc_finish_cui_reservation(mp, 1);
	const unsigned int	f4 = xfs_calc_finish_bui_reservation(mp, 1);

	/* We only finish one item per transaction in a chain */
	const unsigned int	step_size = max(f4, max3(f1, f2, f3));

So the worst case limit on the number of loops through
xfs_reflink_remap_extent is:

	return rounddown_pow_of_two((logres - step_size) /
			atomic_logitems);

and that's the maximum software untorn write unit.  On my system that
gets you 128 blocks, but YMMV.  Those xfs_calc_finish_*_reservation
helpers look something like this:

/*
 * Finishing an EFI can free the blocks and bmap blocks (t2):
 *    the agf for each of the ags: nr * sector size
 *    the agfl for each of the ags: nr * sector size
 *    the super block to reflect the freed blocks: sector size
 *    worst case split in allocation btrees per extent assuming nr extents:
 *		nr exts * 2 trees * (2 * max depth - 1) * block size
 */
inline unsigned int
xfs_calc_finish_efi_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr)
{
	return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
	       xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
			       mp->m_sb.sb_blocksize);
}

/*
 * Finishing an RUI is the same as an EFI.  We can split the rmap btree twice
 * on each end of the record, and that can cause the AGFL to be refilled or
 * emptied out.
 */
inline unsigned int
xfs_calc_finish_rui_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr)
{
	if (!xfs_has_rmapbt(mp))
		return 0;
	return xfs_calc_finish_efi_reservation(mp, nr);
}

/*
 * In finishing a BUI, we can modify:
 *    the inode being truncated: inode size
 *    dquots
 *    the inode's bmap btree: (max depth + 1) * block size
 */
inline unsigned int
xfs_calc_finish_bui_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr)
{
	return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES +
	       xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
			       mp->m_sb.sb_blocksize);
}


/*
 * Finishing a data device refcount updates (t1):
 *    the agfs of the ags containing the blocks: nr_ops * sector size
 *    the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
 */
inline unsigned int
xfs_calc_finish_cui_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr_ops)
{
	if (!xfs_has_reflink(mp))
		return 0;

	return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
	       xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops),
			       mp->m_sb.sb_blocksize);
}

--D

> +
> +	/* atomic write limits are always a power-of-2 */
> +	return rounddown_pow_of_two(logres / (2 * atomic_logitems));
> +}
> +
>  STATIC void
>  xfs_destroy_mount_workqueues(
>  	struct xfs_mount	*mp)
> diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
> index c0e85c1e42f2..e0f82be9093a 100644
> --- a/fs/xfs/xfs_super.h
> +++ b/fs/xfs/xfs_super.h
> @@ -100,5 +100,6 @@ extern struct workqueue_struct *xfs_discard_wq;
>  #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
>  
>  struct dentry *xfs_debugfs_mkdir(const char *name, struct dentry *parent);
> +unsigned int xfs_atomic_write_logitems(struct xfs_mount *mp);
>  
>  #endif	/* __XFS_SUPER_H__ */
> -- 
> 2.31.1
> 
>
Dave Chinner April 8, 2025, 10:47 p.m. UTC | #2
On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> Now that CoW-based atomic writes are supported, update the max size of an
> atomic write for the data device.
> 
> The limit of a CoW-based atomic write will be the limit of the number of
> logitems which can fit into a single transaction.

I still think this is the wrong way to define the maximum
size of a COW-based atomic write because it is going to change from
filesystem to filesystem and that variability in supported maximum
length will be exposed to userspace...

i.e. Maximum supported atomic write size really should be defined as
a well documented fixed size (e.g. 16MB). Then the transaction
reservations sizes needed to perform that conversion can be
calculated directly from that maximum size and optimised directly
for the conversion operation that atomic writes need to perform.

.....

> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b2dd0c0bf509..42b2b7540507 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
>  	return -ENOMEM;
>  }
>  
> +unsigned int
> +xfs_atomic_write_logitems(
> +	struct xfs_mount	*mp)
> +{
> +	unsigned int		efi = xfs_efi_item_overhead(1);
> +	unsigned int		rui = xfs_rui_item_overhead(1);
> +	unsigned int		cui = xfs_cui_item_overhead(1);
> +	unsigned int		bui = xfs_bui_item_overhead(1);
> +	unsigned int		logres = M_RES(mp)->tr_write.tr_logres;
> +
> +	/*
> +	 * Maximum overhead to complete an atomic write ioend in software:
> +	 * remove data fork extent + remove cow fork extent +
> +	 * map extent into data fork
> +	 */
> +	unsigned int		atomic_logitems =
> +		(bui + cui + rui + efi) + (cui + rui) + (bui + rui);

This seems wrong. Unmap from the data fork only logs a (bui + cui)
pair, we don't log a RUI or an EFI until the transaction that
processes the BUI or CUI actually frees an extent from the BMBT
or removes a block from the refcount btree.

We also need to be able to relog all the intents and everything that
was modified, so we effectively have at least one
xfs_allocfree_block_count() reservation needed here as well. Even
finishing an invalidation BUI can result in BMBT block allocation
occurring if the operation splits an existing extent record and the
insert of the new record causes a BMBT block split....


> +
> +	/* atomic write limits are always a power-of-2 */
> +	return rounddown_pow_of_two(logres / (2 * atomic_logitems));

What is the magic 2 in that division?

> +}

Also this function does not belong in xfs_super.c - that file is for
interfacing with the VFS layer.  Calculating log reservation
constants at mount time is done in xfs_trans_resv.c - I suspect most
of the code in this patch should probably be moved there and run
from xfs_trans_resv_calc()...

-Dave.
Darrick J. Wong April 9, 2025, 12:41 a.m. UTC | #3
On Wed, Apr 09, 2025 at 08:47:09AM +1000, Dave Chinner wrote:
> On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> > Now that CoW-based atomic writes are supported, update the max size of an
> > atomic write for the data device.
> > 
> > The limit of a CoW-based atomic write will be the limit of the number of
> > logitems which can fit into a single transaction.
> 
> I still think this is the wrong way to define the maximum
> size of a COW-based atomic write because it is going to change from
> filesystem to filesystem and that variability in supported maximum
> length will be exposed to userspace...
> 
> i.e. Maximum supported atomic write size really should be defined as
> a well documented fixed size (e.g. 16MB). Then the transaction
> reservations sizes needed to perform that conversion can be
> calculated directly from that maximum size and optimised directly
> for the conversion operation that atomic writes need to perform.

I'll get to this below...

> .....
> 
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index b2dd0c0bf509..42b2b7540507 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
> >  	return -ENOMEM;
> >  }
> >  
> > +unsigned int
> > +xfs_atomic_write_logitems(
> > +	struct xfs_mount	*mp)
> > +{
> > +	unsigned int		efi = xfs_efi_item_overhead(1);
> > +	unsigned int		rui = xfs_rui_item_overhead(1);
> > +	unsigned int		cui = xfs_cui_item_overhead(1);
> > +	unsigned int		bui = xfs_bui_item_overhead(1);
> > +	unsigned int		logres = M_RES(mp)->tr_write.tr_logres;
> > +
> > +	/*
> > +	 * Maximum overhead to complete an atomic write ioend in software:
> > +	 * remove data fork extent + remove cow fork extent +
> > +	 * map extent into data fork
> > +	 */
> > +	unsigned int		atomic_logitems =
> > +		(bui + cui + rui + efi) + (cui + rui) + (bui + rui);
> 
> This seems wrong. Unmap from the data fork only logs a (bui + cui)
> pair, we don't log a RUI or an EFI until the transaction that
> processes the BUI or CUI actually frees an extent from the the BMBT
> or removes a block from the refcount btree.

This is tricky -- the first transaction in the chain creates a BUI and a
CUI and that's all it needs.

Then we roll to finish the BUI.  The second transaction needs space for
the BUD, an RUI, and enough space to relog the CUI (== CUI + CUD).

Then we roll again to finish the RUI.  This third transaction needs
space for the RUD and space to relog the CUI.

Roll again, fourth transaction needs space for the CUD and possibly a
new EFI.

Roll again, fifth transaction needs space for an EFD.

	const unsigned int	efi = xfs_efi_log_space(1);
	const unsigned int	efd = xfs_efd_log_space(1);
	const unsigned int	rui = xfs_rui_log_space(1);
	const unsigned int	rud = xfs_rud_log_space();
	const unsigned int	cui = xfs_cui_log_space(1);
	const unsigned int	cud = xfs_cud_log_space();
	const unsigned int	bui = xfs_bui_log_space(1);
	const unsigned int	bud = xfs_bud_log_space();

	/*
	 * Maximum overhead to complete an atomic write ioend in software:
	 * remove data fork extent + remove cow fork extent + map extent into
	 * data fork.
	 *
	 * tx0: Creates a BUI and a CUI and that's all it needs.
	 *
	 * tx1: Roll to finish the BUI.  Need space for the BUD, an RUI, and
	 * enough space to relog the CUI (== CUI + CUD).
	 *
	 * tx2: Roll again to finish the RUI.  Need space for the RUD and space
	 * to relog the CUI.
	 *
	 * tx3: Roll again, need space for the CUD and possibly a new EFI.
	 *
	 * tx4: Roll again, need space for an EFD.
	 */
	const unsigned int	tx0 = bui + cui;
	const unsigned int	tx1 = bud + rui + cui + cud;
	const unsigned int	tx2 = rud + cui + cud;
	const unsigned int	tx3 = cud + efi;
	const unsigned int	tx4 = efd;

	const unsigned int	per_intent = max3(max3(tx0, tx1, tx2), tx3, tx4);

> We also need to be able to relog all the intents and everything that
> was modified, so we effectively have at least one
> xfs_allocfree_block_count() reservation needed here as well. Even
> finishing an invalidation BUI can result in BMBT block allocation
> occurring if the operation splits an existing extent record and the
> insert of the new record causes a BMBT block split....

Agreed.

> > +
> > +	/* atomic write limits are always a power-of-2 */
> > +	return rounddown_pow_of_two(logres / (2 * atomic_logitems));
> 
> What is the magic 2 in that division?

That's handwaving the lack of a computation involving
xfs_allocfree_block_count.  A better version would be to figure out the
log space needed:

	/* Overhead to finish one step of each intent item type */
	const unsigned int	f1 = libxfs_calc_finish_efi_reservation(mp, 1);
	const unsigned int	f2 = libxfs_calc_finish_rui_reservation(mp, 1);
	const unsigned int	f3 = libxfs_calc_finish_cui_reservation(mp, 1);
	const unsigned int	f4 = libxfs_calc_finish_bui_reservation(mp, 1);

	/* We only finish one item per transaction in a chain */
	const unsigned int	step_size = max(f4, max3(f1, f2, f3));

and then you have what you need to figure out the logres needed to
support a given awu_max, or vice versa:

	if (desired_max) {
		dbprintf(
 "desired_max: %u\nstep_size: %u\nper_intent: %u\nlogres: %u\n",
				desired_max, step_size, per_intent,
				(desired_max * per_intent) + step_size);
	} else if (logres) {
		dbprintf(
 "logres: %u\nstep_size: %u\nper_intent: %u\nmax_awu: %u\n",
				logres, step_size, per_intent,
				logres >= step_size ? (logres - step_size) / per_intent : 0);
	}

I hacked this into xfs_db so that I could compute a variety of
scenarios.  Let's pretend I have a 10T filesystem with 4k fsblocks and
the default configuration.

# xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 244216"
type 0 logres 244216 logcount 5 flags 0x4
type 1 logres 428928 logcount 5 flags 0x4
<snip>
minlogsize logres 648576 logcount 8

To emulate a 16MB untorn write, you'd need a logres of:

desired_max: 4096
step_size: 107520
per_intent: 208
logres: 959488

959488 > 648576, which would alter the minlogsize calculation.  I know
we banned tiny filesystems a few years ago, but there's still a risk in
increasing it.

For the tr_write logres that John picked, the awu_max we can emulate is:

logres: 244216
step_size: 107520
per_intent: 208
max_awu: 657

That's enough to emulate a 2MB untorn write.

Now let's try again with a realtime filesystem, because the log
reservations are absurdly huge there:

# xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 417400"
type 0 logres 417400 logcount 5 flags 0x4
type 1 logres 772736 logcount 5 flags 0x4
<snip>
minlogsize logres 772736 logcount 5

For 16MB, you'd need a logres of:

desired_max: 4096
step_size: 107520
per_intent: 208
logres: 959488

959488 > 772736, which would alter the minlogsize calculation.

For the tr_write logres that John picked, the awu_max we can emulate is:

logres: 417400
step_size: 107520
per_intent: 208
max_awu: 1489

This is more than enough to handle a 4MB atomic write without affecting
any of the other filesystem geometry.  Now I'll try a 400MB filesystem:

# xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 142840"
type 0 logres 142840 logcount 5 flags 0x4
type 1 logres 226176 logcount 5 flags 0x4
<snip>
minlogsize logres 445824 logcount 8

desired_max: 4096
step_size: 56832
per_intent: 208
logres: 908800

Definitely still above minlogsize.

logres: 142840
step_size: 56832
per_intent: 208
max_awu: 413

We can still simulate a 1MB untorn write even on a tiny filesystem.

On a 1k fsblock 400MB filesystem:

# xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 63352"
type 0 logres 63352 logcount 5 flags 0x4
type 1 logres 103296 logcount 5 flags 0x4
<snip>
minlogsize logres 153984 logcount 8

desired_max: 4096
step_size: 26112
per_intent: 208
logres: 878080

Again we see that we'd have to increase the min logsize to support a
16MB untorn write.

logres: 63352
step_size: 26112
per_intent: 208
max_awu: 179

But we can still emulate a 128K untorn write in software.

This is why I don't agree with adding a static 16MB limit -- we clearly
don't need it to emulate current hardware, which can commit up to 64k
atomically.  Future hardware can increase that by 64x and we'll still be
ok with using the existing tr_write transaction type.

By contrast, adding a 16MB limit would result in a much larger minimum
log size.  If we add that to struct xfs_trans_resv for all filesystems
then we run the risk of some ancient filesystem with a 12M log
suddenly failing to mount on a new kernel.

I don't see the point.

--D

> > +}
> 
> Also this function does not belong in xfs_super.c - that file is for
> interfacing with the VFS layer.  Calculating log reservation
> constants at mount time is done in xfs_trans_resv.c - I suspect most
> of the code in this patch should probably be moved there and run
> from xfs_trans_resv_calc()...
> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner April 9, 2025, 5:30 a.m. UTC | #4
On Tue, Apr 08, 2025 at 05:41:56PM -0700, Darrick J. Wong wrote:
> On Wed, Apr 09, 2025 at 08:47:09AM +1000, Dave Chinner wrote:
> > On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> > > Now that CoW-based atomic writes are supported, update the max size of an
> > > atomic write for the data device.
> > > 
> > > The limit of a CoW-based atomic write will be the limit of the number of
> > > logitems which can fit into a single transaction.
> > 
> > I still think this is the wrong way to define the maximum
> > size of a COW-based atomic write because it is going to change from
> > filesystem to filesystem and that variability in supported maximum
> > length will be exposed to userspace...
> > 
> > i.e. Maximum supported atomic write size really should be defined as
> > a well documented fixed size (e.g. 16MB). Then the transaction
> > reservations sizes needed to perform that conversion can be
> > calculated directly from that maximum size and optimised directly
> > for the conversion operation that atomic writes need to perform.
> 
> I'll get to this below...

I'll paste it here so it's all in one context.

> This is why I don't agree with adding a static 16MB limit -- we clearly
> don't need it to emulate current hardware, which can commit up to 64k
> atomically.  Future hardware can increase that by 64x and we'll still be
> ok with using the existing tr_write transaction type.
> 
> By contrast, adding a 16MB limit would result in a much larger minimum
> log size.  If we add that to struct xfs_trans_resv for all filesystems
> then we run the risk of some ancient filesystem with a 12M log failing
> suddenly failing to mount on a new kernel.
> 
> I don't see the point.

You've got stuck on the example size of 16MB I gave, not
the actual reason I gave that example.

That is: we should not be exposing some unpredictable,
filesystem-geometry-dependent maximum atomic size to a userspace API.  We need
to define a maximum size that we support, that we can clearly
document and guarantee will work on all filesystem geometries.

What size that is needs to be discussed - all you've done is
demonstrate that 16MB would require a larger minimum log size for a
small set of historic/legacy filesystem configs.

I'm actually quite fine with adding new reservations that change
minimum required log sizes - this isn't a show-stopper in any way.

Atomic writes are optional functionality, so if the log size is too
small for the atomic write transaction requirements, then we don't
enable the atomic write functionality on that filesystem. Hence the
minimum required log size for the filesystem -does not change- and
the filesystem mounts as normal..

i.e. the user hasn't lost anything on these tiny log filesystems
when the kernel is upgraded to support software atomic writes.
All that happens is that the legacy filesystem will never support
atomic writes....

Such a mount time check allows us to design the atomic write
functionality around a fixed upper size bound that we can guarantee
will work correctly. i.e. the size of the transaction reservation
needed for it is irrelevant because we guarantee to only run that
transaction on filesytsems with logs large enough to support it.

Having a consistent maximum atomic write size makes it easier for
userspace to create logic and algorithms around, and it makes it
much easier for us to do boundary condition testing as we don't have
to reverse engineer the expected boundaries from filesystem geometry
in test code. Fixed sizes make API verification and testing a whole
lot simpler.

And, finally, if everything is sized from an single API constant, it
makes it easy to modify the supported size in future. The 64MB
minimum log size gives us lots of headroom to increase supported
atomic write sizes, so if userspace finds that it really needs
larger sizes than what we initially support, it's trivial to change
both the kernel and the test code to support a larger size...

> > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > index b2dd0c0bf509..42b2b7540507 100644
> > > --- a/fs/xfs/xfs_super.c
> > > +++ b/fs/xfs/xfs_super.c
> > > @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
> > >  	return -ENOMEM;
> > >  }
> > >  
> > > +unsigned int
> > > +xfs_atomic_write_logitems(
> > > +	struct xfs_mount	*mp)
> > > +{
> > > +	unsigned int		efi = xfs_efi_item_overhead(1);
> > > +	unsigned int		rui = xfs_rui_item_overhead(1);
> > > +	unsigned int		cui = xfs_cui_item_overhead(1);
> > > +	unsigned int		bui = xfs_bui_item_overhead(1);
> > > +	unsigned int		logres = M_RES(mp)->tr_write.tr_logres;
> > > +
> > > +	/*
> > > +	 * Maximum overhead to complete an atomic write ioend in software:
> > > +	 * remove data fork extent + remove cow fork extent +
> > > +	 * map extent into data fork
> > > +	 */
> > > +	unsigned int		atomic_logitems =
> > > +		(bui + cui + rui + efi) + (cui + rui) + (bui + rui);
> > 
> > This seems wrong. Unmap from the data fork only logs a (bui + cui)
> > pair, we don't log a RUI or an EFI until the transaction that
> > processes the BUI or CUI actually frees an extent from the the BMBT
> > or removes a block from the refcount btree.
> 
> This is tricky -- the first transaction in the chain creates a BUI and a
> CUI and that's all it needs.
> 
> Then we roll to finish the BUI.  The second transaction needs space for
> the BUD, an RUI, and enough space to relog the CUI (== CUI + CUD).
> 
> Then we roll again to finish the RUI.  This third transaction needs
> space for the RUD and space to relog the CUI.
> 
> Roll again, fourth transaction needs space for the CUD and possibly a
> new EFI.
> 
> Roll again, fifth transaction needs space for an EFD.

Yes, that is exactly the point I was making.

> > > +
> > > +	/* atomic write limits are always a power-of-2 */
> > > +	return rounddown_pow_of_two(logres / (2 * atomic_logitems));
> > 
> > What is the magic 2 in that division?
> 
> That's handwaving the lack of a computation involving
> xfs_allocfree_block_count.  A better version would be to figure out the
> log space needed:
> 
> 	/* Overhead to finish one step of each intent item type */
> 	const unsigned int	f1 = libxfs_calc_finish_efi_reservation(mp, 1);
> 	const unsigned int	f2 = libxfs_calc_finish_rui_reservation(mp, 1);

Yup, those should be a single call to xfs_calc_buf_res(xfs_allocfree_block_count())

> 	const unsigned int	f3 = libxfs_calc_finish_cui_reservation(mp, 1);

Similarly, xfs_calc_refcountbt_reservation().

> 	const unsigned int	f4 = libxfs_calc_finish_bui_reservation(mp, 1);

xfs_calc_write_reservation() but for a single extent instead of 2.
Also, I think the bui finish needs to take into account dquots, too.

> 	/* We only finish one item per transaction in a chain */
> 	const unsigned int	step_size = max(f4, max3(f1, f2, f3));

And that is xfs_calc_itruncate_reservation(), except it reserves
space for 1 extent to be processed instead of 4.

FWIW, I'd suggest that these helpers make for a better way of
calculating high level reservations such as
xfs_calc_write_reservation() and xfs_calc_itruncate_reservation()
because those functions currently don't take into account how
intents are relogged during those operations.

> and then you have what you need to figure out the logres needed to
> support a given awu_max, or vice versa:
> 
> 	if (desired_max) {
> 		dbprintf(
>  "desired_max: %u\nstep_size: %u\nper_intent: %u\nlogres: %u\n",
> 				desired_max, step_size, per_intent,
> 				(desired_max * per_intent) + step_size);
> 	} else if (logres) {
> 		dbprintf(
>  "logres: %u\nstep_size: %u\nper_intent: %u\nmax_awu: %u\n",
> 				logres, step_size, per_intent,
> 				logres >= step_size ? (logres - step_size) / per_intent : 0);
> 	}
> 
> I hacked this into xfs_db so that I could compute a variety of
> scenarios.  Let's pretend I have a 10T filesystem with 4k fsblocks and
> the default configuration.
> 
> # xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 244216"
> type 0 logres 244216 logcount 5 flags 0x4
> type 1 logres 428928 logcount 5 flags 0x4
> <snip>
> minlogsize logres 648576 logcount 8
> 
> To emulate a 16MB untorn write, you'd need a logres of:
> 
> desired_max: 4096
> step_size: 107520
> per_intent: 208
> logres: 959488
>
> 959488 > 648576, which would alter the minlogsize calculation.  I know
> we banned tiny filesystems a few years ago, but there's still a risk in
> increasing it.

Yeah, it's a bit under a megabyte of reservation space. That's no
big deal, as log reservations get much larger than this as block
size and log stripe units are increased.

<snip the rest, they all show roughly the same thing>

Ok, these numbers pretty much prove my point - that a fixed max
atomic write size somewhere around 16MB isn't going to be a problem
for any filesystem that uses the historic small fs default log size
(10MB) or larger.

Let's avoid the internal XFS minlogsize problems by disabling the
atomic write functionality on the increasingly rare legacy
filesystems where the log is too small. That allows us to design and
implement sane userspace API bounds and guarantees for atomic writes
and not expose internal filesystem constraints and inconsistencies
to applications...

-Dave.
John Garry April 9, 2025, 8:15 a.m. UTC | #5
On 09/04/2025 06:30, Dave Chinner wrote:
>> This is why I don't agree with adding a static 16MB limit -- we clearly
>> don't need it to emulate current hardware, which can commit up to 64k
>> atomically.  Future hardware can increase that by 64x and we'll still be
>> ok with using the existing tr_write transaction type.
>>
>> By contrast, adding a 16MB limit would result in a much larger minimum
>> log size.  If we add that to struct xfs_trans_resv for all filesystems
>> then we run the risk of some ancient filesystem with a 12M log
>> suddenly failing to mount on a new kernel.
>>
>> I don't see the point.
> You've got stuck on the example size of 16MB I gave, not
> the actual reason I gave that example.

You did provide a relatively large value in 16MB. When I say relative, I 
mean relative to what can be achieved with HW offload today.

The target user we see for this feature is DBs, and they want to do 
writes in the 16/32/64KB size range. Indeed, these are the sort of sizes 
we see supported in terms of disk atomic write support today.

Furthermore, they (DBs) want fast and predictable performance which HW 
offload provides. They do not want to use a slow software-based 
solution. Such a software-based solution will always be slower, as we 
need to deal with block alloc/de-alloc and extent remapping for every write.

So are there people who really want very large atomic write support and 
will tolerate slow performance, i.e. slower than what can be achieved 
with double-write buffer or some other application logging?

Thanks,
John
Dave Chinner April 9, 2025, 10:49 p.m. UTC | #6
On Wed, Apr 09, 2025 at 09:15:23AM +0100, John Garry wrote:
> On 09/04/2025 06:30, Dave Chinner wrote:
> > > This is why I don't agree with adding a static 16MB limit -- we clearly
> > > don't need it to emulate current hardware, which can commit up to 64k
> > > atomically.  Future hardware can increase that by 64x and we'll still be
> > > ok with using the existing tr_write transaction type.
> > > 
> > > By contrast, adding a 16MB limit would result in a much larger minimum
> > > log size.  If we add that to struct xfs_trans_resv for all filesystems
> > > then we run the risk of some ancient filesystem with a 12M log
> > > suddenly failing to mount on a new kernel.
> > > 
> > > I don't see the point.
> > You've got stuck on the example size of 16MB I gave, not
> > the actual reason I gave that example.
> 
> You did provide a relatively large value in 16MB. When I say relative, I
> mean relative to what can be achieved with HW offload today.
> 
> The target user we see for this feature is DBs, and they want to do writes
> in the 16/32/64KB size range. Indeed, these are the sort of sizes we see
> supported in terms of disk atomic write support today.

The target user I see for RWF_ATOMIC write is applications
overwriting files safely (e.g. config files, documents, etc).

This requires an atomic write operation that is large enough to
overwrite the file entirely in one go.

i.e. we need to think about how RWF_ATOMIC is applicable to the
entire userspace ecosystem, not just a narrow database specific
niche. Databases really want atomic writes to avoid the need for
WAL, whereas application developers keep asking us for safe
file overwrite without fsync() for arbitrary sized files and IO.

> Furthermore, they (DBs) want fast and predictable performance which HW
> offload provides. They do not want to use a slow software-based solution.
> Such a software-based solution will always be slower, as we need to deal
> with block alloc/de-alloc and extent remapping for every write.

"slow" is relative to the use case for atomic writes.

> So are there people who really want very large atomic write support and will
> tolerate slow performance, i.e. slower than what can be achieved with
> double-write buffer or some other application logging?

Large atomic write support solves the O_PONIES problem, which is
fundamentally a performance problem w.r.t. ensuring data integrity.
I'll quote myself when you asked this exact same question back about
4 months ago:

| "At this point we actually provide app developers with what they've
| been repeatedly asking kernel filesystem engineers to provide them
| for the past 20 years: a way of overwriting arbitrary file data
| safely without needing an expensive fdatasync operation on every
| file that gets modified.
| 
| Put simply: atomic writes have a huge potential to fundamentally
| change the way applications interact with Linux filesystems and to
| make it *much* simpler for applications to safely overwrite user
| data.  Hence there is an imperative here to make the foundational
| support for this technology solid and robust because atomic writes
| are going to be with us for the next few decades..."

https://lwn.net/Articles/1001770/

-Dave.
Darrick J. Wong April 9, 2025, 11:46 p.m. UTC | #7
On Wed, Apr 09, 2025 at 03:30:28PM +1000, Dave Chinner wrote:
> On Tue, Apr 08, 2025 at 05:41:56PM -0700, Darrick J. Wong wrote:
> > On Wed, Apr 09, 2025 at 08:47:09AM +1000, Dave Chinner wrote:
> > > On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> > > > Now that CoW-based atomic writes are supported, update the max size of an
> > > > atomic write for the data device.
> > > > 
> > > > The limit of a CoW-based atomic write will be the limit of the number of
> > > > logitems which can fit into a single transaction.
> > > 
> > > I still think this is the wrong way to define the maximum
> > > size of a COW-based atomic write because it is going to change from
> > > filesystem to filesystem and that variability in supported maximum
> > > length will be exposed to userspace...
> > > 
> > > i.e. Maximum supported atomic write size really should be defined as
> > > a well documented fixed size (e.g. 16MB). Then the transaction
> > > reservations sizes needed to perform that conversion can be
> > > calculated directly from that maximum size and optimised directly
> > > for the conversion operation that atomic writes need to perform.
> > 
> > I'll get to this below...
> 
> I'll paste it here so it's all in one context.
> 
> > This is why I don't agree with adding a static 16MB limit -- we clearly
> > don't need it to emulate current hardware, which can commit up to 64k
> > atomically.  Future hardware can increase that by 64x and we'll still be
> > ok with using the existing tr_write transaction type.
> > 
> > By contrast, adding a 16MB limit would result in a much larger minimum
> > log size.  If we add that to struct xfs_trans_resv for all filesystems
> > then we run the risk of some ancient filesystem with a 12M log
> > suddenly failing to mount on a new kernel.
> > 
> > I don't see the point.
> 
> You've got stuck on the example size of 16MB I gave, not
> the actual reason I gave that example.

Well you kept saying 16MB, so that's why I ran the numbers on it.

> That is: we should not be exposing some unpredictable, filesystem
> geometry depending maximum atomic size to a userspace API.  We need
> to define a maximum size that we support, that we can clearly
> document and guarantee will work on all filesystem geometries.
> 
> What size that is needs to be discussed - all you've done is
> demonstrate that 16MB would require a larger minimum log size for a
> small set of historic/legacy filesystem configs.

No, I've done more than that.  I've shown that it's possible to
determine the maximum permissible size of software untorn write
completions given a log reservation, which means that we can:

1) Determine if we can backstop hardware untorn writes with software
untorn writes.

2) Determine the largest untorn write we can complete in software for
an existing filesystems with no other changes required.

in addition to 3) Determine if we can land an untorn write of a
specific size.

We care about (1).  (2) is basically done (albeit with calculation
errors) by this patchset.  (3) is not something we're interested in.
If you believe that having a fixed upper size bound that isn't tied to
existing log reservations is worth pursuing, then please send patches
building off this series.

I think it wouldn't be much work to have a larger fixed limit that
doesn't depend on any of the existing static transaction reservations.
Use the computation I provided to decide if that limit won't push the
filesystem too close to the maximum transaction size, then dynamically
compute the reservation at ioend time for the specific IO being
completed.

But again, our users don't want that, they're fine with 64k.  They
really don't want the software fallback at all.  If you have users who
prefer a well known static limit, then please work with them.

> I'm actually quite fine with adding new reservations that change
> minimum required log sizes - this isn't a show-stopper in any way.

I'm not convinced.  The minimum log size footgun still exists; it's just
that the new(ish) 64M floor makes it much less likely to happen.  The
image factories have been upgraded with newer xfsprogs so the number of
complaints will (at long last) slowly go down, but 12M logs are still
causing problems.

> Atomic writes are optional functionality, so if the log size is too
> small for the atomic write transaction requirements, then we don't
> enable the atomic write functionality on that filesystem. Hence the
> minimum required log size for the filesystem -does not change- and
> the filesystem mounts as normal..
> 
> i.e. the user hasn't lost anything on these tiny log filesystems
> when the kernel is upgraded to support software atomic writes.
> All that happens is that the legacy filesystem will never support
> atomic writes....
> 
> Such a mount time check allows us to design the atomic write
> functionality around a fixed upper size bound that we can guarantee
> will work correctly. i.e. the size of the transaction reservation
> needed for it is irrelevant because we guarantee to only run that
> transaction on filesytsems with logs large enough to support it.

Yes, that's what I've been trying to say all along.

> Having a consistent maximum atomic write size makes it easier for
> userspace to create logic and algorithms around, and it makes it
> much easier for us to do boundary condition testing as we don't have
> to reverse engineer the expected boundaries from filesysetm geometry
> in test code. Fixed sizes make API verification and testing a whole
> lot simpler.

John added a statx field so that the kernel can advertise the hardware
and software untorn write capabilities.  Are you saying that's not good
enough even for (1) and (2) from above?

If it turns out that users want a higher limit, they can come talk to us
about adding a mkfs option or whatever to declare that they want more,
and then we can enforce that.

> And, finally, if everything is sized from an single API constant, it
> makes it easy to modify the supported size in future. The 64MB
> minimum log size gives us lots of headroom to increase supported
> atomic write sizes, so if userspace finds that it really needs
> larger sizes than what we initially support, it's trivial to change
> both the kernel and the test code to support a larger size...
> 
> > > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > > index b2dd0c0bf509..42b2b7540507 100644
> > > > --- a/fs/xfs/xfs_super.c
> > > > +++ b/fs/xfs/xfs_super.c
> > > > @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
> > > >  	return -ENOMEM;
> > > >  }
> > > >  
> > > > +unsigned int
> > > > +xfs_atomic_write_logitems(
> > > > +	struct xfs_mount	*mp)
> > > > +{
> > > > +	unsigned int		efi = xfs_efi_item_overhead(1);
> > > > +	unsigned int		rui = xfs_rui_item_overhead(1);
> > > > +	unsigned int		cui = xfs_cui_item_overhead(1);
> > > > +	unsigned int		bui = xfs_bui_item_overhead(1);
> > > > +	unsigned int		logres = M_RES(mp)->tr_write.tr_logres;
> > > > +
> > > > +	/*
> > > > +	 * Maximum overhead to complete an atomic write ioend in software:
> > > > +	 * remove data fork extent + remove cow fork extent +
> > > > +	 * map extent into data fork
> > > > +	 */
> > > > +	unsigned int		atomic_logitems =
> > > > +		(bui + cui + rui + efi) + (cui + rui) + (bui + rui);
> > > 
> > > This seems wrong. Unmap from the data fork only logs a (bui + cui)
> > > pair, we don't log a RUI or an EFI until the transaction that
> > > processes the BUI or CUI actually frees an extent from the the BMBT
> > > or removes a block from the refcount btree.
> > 
> > This is tricky -- the first transaction in the chain creates a BUI and a
> > CUI and that's all it needs.
> > 
> > Then we roll to finish the BUI.  The second transaction needs space for
> > the BUD, an RUI, and enough space to relog the CUI (== CUI + CUD).
> > 
> > Then we roll again to finish the RUI.  This third transaction needs
> > space for the RUD and space to relog the CUI.
> > 
> > Roll again, fourth transaction needs space for the CUD and possibly a
> > new EFI.
> > 
> > Roll again, fifth transaction needs space for an EFD.
> 
> Yes, that is exactly the point I was making.

It's also slightly wrong -- if an extent referenced by the chain isn't
actively being worked on, its BUI/CUI can get relogged.  So the actual
computation is:

	relog = bui + bud + cui + cud;
	per_intent = max(tx0..tx4, relog);

That doesn't seem to change the output though.

> > > > +
> > > > +	/* atomic write limits are always a power-of-2 */
> > > > +	return rounddown_pow_of_two(logres / (2 * atomic_logitems));
> > > 
> > > What is the magic 2 in that division?
> > 
> > That's handwaving the lack of a computation involving
> > xfs_allocfree_block_count.  A better version would be to figure out the
> > log space needed:
> > 
> > 	/* Overhead to finish one step of each intent item type */
> > 	const unsigned int	f1 = libxfs_calc_finish_efi_reservation(mp, 1);
> > 	const unsigned int	f2 = libxfs_calc_finish_rui_reservation(mp, 1);
> 
> Yup, those should be a single call to xfs_calc_buf_res(xfs_allocfree_block_count())
> 
> > 	const unsigned int	f3 = libxfs_calc_finish_cui_reservation(mp, 1);
> 
> Similarly, xfs_calc_refcountbt_reservation().
> 
> > 	const unsigned int	f4 = libxfs_calc_finish_bui_reservation(mp, 1);
> 
> xfs_calc_write_reservation() but for a single extent instead of 2.
> Also, I think the bui finish needs to take into account dquots, too.
> 
> > 	/* We only finish one item per transaction in a chain */
> > 	const unsigned int	step_size = max(f4, max3(f1, f2, f3));
> 
> And that is xfs_calc_itruncate_reservation(), except it reserves
> space for 1 extent to be processed instead of 4.
> 
> FWIW, I'd suggest that these helpers make for a better way of
> calculating high level reservations such as
> xfs_calc_write_reservation() and xfs_calc_itruncate_reservation()
> because those functions currently don't take into account how
> intents are relogged during those operations.
> 
> > and then you have what you need to figure out the logres needed to
> > support a given awu_max, or vice versa:
> > 
> > 	if (desired_max) {
> > 		dbprintf(
> >  "desired_max: %u\nstep_size: %u\nper_intent: %u\nlogres: %u\n",
> > 				desired_max, step_size, per_intent,
> > 				(desired_max * per_intent) + step_size);
> > 	} else if (logres) {
> > 		dbprintf(
> >  "logres: %u\nstep_size: %u\nper_intent: %u\nmax_awu: %u\n",
> > 				logres, step_size, per_intent,
> > 				logres >= step_size ? (logres - step_size) / per_intent : 0);
> > 	}
> > 
> > I hacked this into xfs_db so that I could compute a variety of
> > scenarios.  Let's pretend I have a 10T filesystem with 4k fsblocks and
> > the default configuration.
> > 
> > # xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 244216"
> > type 0 logres 244216 logcount 5 flags 0x4
> > type 1 logres 428928 logcount 5 flags 0x4
> > <snip>
> > minlogsize logres 648576 logcount 8
> > 
> > To emulate a 16MB untorn write, you'd need a logres of:
> > 
> > desired_max: 4096
> > step_size: 107520
> > per_intent: 208
> > logres: 959488
> >
> > 959488 > 648576, which would alter the minlogsize calculation.  I know
> > we banned tiny filesystems a few years ago, but there's still a risk in
> > increasing it.
> 
> Yeah, it's a bit under a megabyte of reservation space. That's no
> big deal, as log reservations get much larger than this as block
> size and log stripe units are increased.
> 
> <snip the rest, they all show roughly the same thing>
> 
> Ok, these numbers pretty much prove my point - that a fixed max
> atomic write size somewhere around 16MB isn't going to be a problem
> for any filesystem that uses the historic small fs default log size
> (10MB) or larger.
> 
> Let's avoid the internal XFS minlogsize problems by disabling the
> atomic write functionality on the increasingly rare legacy
> filesystems where the log is too small. That allows us to design and
> implement sane userspace API bounds and guarantees for atomic writes
> and not expose internal filesystem constraints and inconsistencies
> to applications...

Ok then.

--D

> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
John Garry April 10, 2025, 8:58 a.m. UTC | #8
On 09/04/2025 23:49, Dave Chinner wrote:
>> You did provide a relatively large value in 16MB. When I say relative, I
>> mean relative to what can be achieved with HW offload today.
>>
>> The target user we see for this feature is DBs, and they want to do writes
>> in the 16/32/64KB size range. Indeed, these are the sort of sizes we see
>> supported in terms of disk atomic write support today.
> The target user I see for RWF_ATOMIC write is applications
> overwriting files safely (e.g. config files, documents, etc).
> 
> This requires an atomic write operation that is large enough to
> overwrite the file entirely in one go.
> 
> i.e. we need to think about how RWF_ATOMIC is applicable to the
> entire userspace ecosystem, not just a narrow database specific
> niche. Databases really want atomic writes to avoid the need for
> WAL, whereas application developers keep asking us for safe
> file overwrite without fsync() for arbitrary sized files and IO.

If they want to use this API, then arbitrary-sized files will need to be 
power-of-2 sized files (if the user is happy to atomic update all of the 
file).

I would have thought that the work Christoph did with O_ATOMIC a few 
years ago for atomic file updates would be more suited to the scenario 
which you mention.

Anyway, back to the main topic..

What is the method you propose that the admin can use to fix this upper 
atomic write limit? Is this an mkfs and/or mount option?

I appreciate that I am asking the same question as Darrick did in his 
follow up mail.

John
diff mbox series

Patch

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 00b53f479ece..27a737202637 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -666,6 +666,37 @@  xfs_agbtree_compute_maxlevels(
 	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
 }
 
+static inline void
+xfs_compute_atomic_write_unit_max(
+	struct xfs_mount	*mp)
+{
+	xfs_agblock_t		agsize = mp->m_sb.sb_agblocks;
+	unsigned int		max_extents_logitems;
+	unsigned int		max_agsize;
+
+	if (!xfs_has_reflink(mp)) {
+		mp->m_atomic_write_unit_max = 1;
+		return;
+	}
+
+	/*
+	 * Find limit according to logitems.
+	 */
+	max_extents_logitems = xfs_atomic_write_logitems(mp);
+
+	/*
+	 * Also limit the size of atomic writes to the greatest power-of-two
+	 * factor of the agsize so that allocations for an atomic write will
+	 * always be aligned compatibly with the alignment requirements of the
+	 * storage.
+	 * The greatest power-of-two is the value according to the lowest bit
+	 * set.
+	 */
+	max_agsize = 1 << (ffs(agsize) - 1);
+
+	mp->m_atomic_write_unit_max = min(max_extents_logitems, max_agsize);
+}
+
 /* Compute maximum possible height for realtime btree types for this fs. */
 static inline void
 xfs_rtbtree_compute_maxlevels(
@@ -842,6 +873,11 @@  xfs_mountfs(
 	 */
 	xfs_trans_init(mp);
 
+	/*
+	 * Pre-calculate atomic write unit max.
+	 */
+	xfs_compute_atomic_write_unit_max(mp);
+
 	/*
 	 * Allocate and initialize the per-ag data.
 	 */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 799b84220ebb..4462bffbf0ff 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -230,6 +230,11 @@  typedef struct xfs_mount {
 	bool			m_update_sb;	/* sb needs update in mount */
 	unsigned int		m_max_open_zones;
 
+	/*
+	 * data device max atomic write.
+	 */
+	xfs_extlen_t		m_atomic_write_unit_max;
+
 	/*
 	 * Bitsets of per-fs metadata that have been checked and/or are sick.
 	 * Callers must hold m_sb_lock to access these two fields.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b2dd0c0bf509..42b2b7540507 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -615,6 +615,28 @@  xfs_init_mount_workqueues(
 	return -ENOMEM;
 }
 
+unsigned int
+xfs_atomic_write_logitems(
+	struct xfs_mount	*mp)
+{
+	unsigned int		efi = xfs_efi_item_overhead(1);
+	unsigned int		rui = xfs_rui_item_overhead(1);
+	unsigned int		cui = xfs_cui_item_overhead(1);
+	unsigned int		bui = xfs_bui_item_overhead(1);
+	unsigned int		logres = M_RES(mp)->tr_write.tr_logres;
+
+	/*
+	 * Maximum overhead to complete an atomic write ioend in software:
+	 * remove data fork extent + remove cow fork extent +
+	 * map extent into data fork
+	 */
+	unsigned int		atomic_logitems =
+		(bui + cui + rui + efi) + (cui + rui) + (bui + rui);
+
+	/* atomic write limits are always a power-of-2 */
+	return rounddown_pow_of_two(logres / (2 * atomic_logitems));
+}
+
 STATIC void
 xfs_destroy_mount_workqueues(
 	struct xfs_mount	*mp)
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index c0e85c1e42f2..e0f82be9093a 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -100,5 +100,6 @@  extern struct workqueue_struct *xfs_discard_wq;
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
 
 struct dentry *xfs_debugfs_mkdir(const char *name, struct dentry *parent);
+unsigned int xfs_atomic_write_logitems(struct xfs_mount *mp);
 
 #endif	/* __XFS_SUPER_H__ */