diff mbox series

[4/6] fs: xfs: Support atomic write for statx

Message ID 20240124142645.9334-5-john.g.garry@oracle.com (mailing list archive)
State Superseded, archived
Headers show
Series block atomic writes for XFS | expand

Commit Message

John Garry Jan. 24, 2024, 2:26 p.m. UTC
Support providing info on atomic write unit min and max for an inode.

For simplicity, currently we limit the min at the FS block size, but a
lower limit could be supported in future.

The atomic write unit min and max is limited by the guaranteed extent
alignment for the inode.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/xfs_iops.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_iops.h |  4 ++++
 2 files changed, 49 insertions(+)

Comments

Darrick J. Wong Feb. 2, 2024, 6:05 p.m. UTC | #1
On Wed, Jan 24, 2024 at 02:26:43PM +0000, John Garry wrote:
> Support providing info on atomic write unit min and max for an inode.
> 
> For simplicity, currently we limit the min at the FS block size, but a
> lower limit could be supported in future.
> 
> The atomic write unit min and max is limited by the guaranteed extent
> alignment for the inode.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_iops.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_iops.h |  4 ++++
>  2 files changed, 49 insertions(+)
> 
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index a0d77f5f512e..0890d2f70f4d 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -546,6 +546,44 @@ xfs_stat_blksize(
>  	return PAGE_SIZE;
>  }
>  
> +void xfs_get_atomic_write_attr(

static void?

> +	struct xfs_inode *ip,
> +	unsigned int *unit_min,
> +	unsigned int *unit_max)

Weird indenting here.

> +{
> +	xfs_extlen_t		extsz = xfs_get_extsz(ip);
> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> +	struct block_device	*bdev = target->bt_bdev;
> +	unsigned int		awu_min, awu_max, align;
> +	struct request_queue	*q = bdev->bd_queue;
> +	struct xfs_mount	*mp = ip->i_mount;
> +
> +	/*
> +	 * Convert to multiples of the BLOCKSIZE (as we support a minimum
> +	 * atomic write unit of BLOCKSIZE).
> +	 */
> +	awu_min = queue_atomic_write_unit_min_bytes(q);
> +	awu_max = queue_atomic_write_unit_max_bytes(q);
> +
> +	awu_min &= ~mp->m_blockmask;

Why do you round /down/ the awu_min value here?

> +	awu_max &= ~mp->m_blockmask;

Actually -- since the atomic write units have to be powers of 2, why is
rounding needed here at all?

> +
> +	align = XFS_FSB_TO_B(mp, extsz);
> +
> +	if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
> +	    !is_power_of_2(align)) {

...and if you take my suggestion to make a common helper to validate the
atomic write unit parameters, this can collapse into:

	alloc_unit_bytes = xfs_inode_alloc_unitsize(ip);
	if (!xfs_inode_has_atomicwrites(ip) ||
	    !bdev_validate_atomic_write(bdev, alloc_unit_bytes)) {
		/* not supported, return zeroes */
		*unit_min = 0;
		*unit_max = 0;
		return;
	}

	*unit_min = max(alloc_unit_bytes, awu_min);
	*unit_max = min(alloc_unit_bytes, awu_max);

--D

> +		*unit_min = 0;
> +		*unit_max = 0;
> +	} else {
> +		if (awu_min)
> +			*unit_min = min(awu_min, align);
> +		else
> +			*unit_min = mp->m_sb.sb_blocksize;
> +
> +		*unit_max = min(awu_max, align);
> +	}
> +}
> +
>  STATIC int
>  xfs_vn_getattr(
>  	struct mnt_idmap	*idmap,
> @@ -619,6 +657,13 @@ xfs_vn_getattr(
>  			stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
>  			stat->dio_offset_align = bdev_logical_block_size(bdev);
>  		}
> +		if (request_mask & STATX_WRITE_ATOMIC) {
> +			unsigned int unit_min, unit_max;
> +
> +			xfs_get_atomic_write_attr(ip, &unit_min, &unit_max);
> +			generic_fill_statx_atomic_writes(stat,
> +				unit_min, unit_max);
> +		}
>  		fallthrough;
>  	default:
>  		stat->blksize = xfs_stat_blksize(ip);
> diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
> index 7f84a0843b24..76dd4c3687aa 100644
> --- a/fs/xfs/xfs_iops.h
> +++ b/fs/xfs/xfs_iops.h
> @@ -19,4 +19,8 @@ int xfs_vn_setattr_size(struct mnt_idmap *idmap,
>  int xfs_inode_init_security(struct inode *inode, struct inode *dir,
>  		const struct qstr *qstr);
>  
> +void xfs_get_atomic_write_attr(struct xfs_inode *ip,
> +		unsigned int *unit_min,
> +		unsigned int *unit_max);
> +
>  #endif /* __XFS_IOPS_H__ */
> -- 
> 2.31.1
> 
>
John Garry Feb. 5, 2024, 1:10 p.m. UTC | #2
On 02/02/2024 18:05, Darrick J. Wong wrote:
> On Wed, Jan 24, 2024 at 02:26:43PM +0000, John Garry wrote:
>> Support providing info on atomic write unit min and max for an inode.
>>
>> For simplicity, currently we limit the min at the FS block size, but a
>> lower limit could be supported in future.
>>
>> The atomic write unit min and max is limited by the guaranteed extent
>> alignment for the inode.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/xfs/xfs_iops.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>   fs/xfs/xfs_iops.h |  4 ++++
>>   2 files changed, 49 insertions(+)
>>
>> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
>> index a0d77f5f512e..0890d2f70f4d 100644
>> --- a/fs/xfs/xfs_iops.c
>> +++ b/fs/xfs/xfs_iops.c
>> @@ -546,6 +546,44 @@ xfs_stat_blksize(
>>   	return PAGE_SIZE;
>>   }
>>   
>> +void xfs_get_atomic_write_attr(
> 
> static void?

We use this in the iomap and statx code

> 
>> +	struct xfs_inode *ip,
>> +	unsigned int *unit_min,
>> +	unsigned int *unit_max)
> 
> Weird indenting here.

hmmm... I thought that this was the XFS style

Can you show how it should look?

> 
>> +{
>> +	xfs_extlen_t		extsz = xfs_get_extsz(ip);
>> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
>> +	struct block_device	*bdev = target->bt_bdev;
>> +	unsigned int		awu_min, awu_max, align;
>> +	struct request_queue	*q = bdev->bd_queue;
>> +	struct xfs_mount	*mp = ip->i_mount;
>> +
>> +	/*
>> +	 * Convert to multiples of the BLOCKSIZE (as we support a minimum
>> +	 * atomic write unit of BLOCKSIZE).
>> +	 */
>> +	awu_min = queue_atomic_write_unit_min_bytes(q);
>> +	awu_max = queue_atomic_write_unit_max_bytes(q);
>> +
>> +	awu_min &= ~mp->m_blockmask;
> 
> Why do you round /down/ the awu_min value here?

This is just to ensure that we returning *unit_min >= BLOCKSIZE

For example, if awu_min, max 1K, 64K from the bdev, we now have 0 and 
64K. And below this gives us awu_min, max of 4k, 64k.

Maybe there is a more logical way of doing this.

> 
>> +	awu_max &= ~mp->m_blockmask;
> 
> Actually -- since the atomic write units have to be powers of 2, why is
> rounding needed here at all?

Sure, but the bdev can report a awu_min < BLOCKSIZE

> 
>> +
>> +	align = XFS_FSB_TO_B(mp, extsz);
>> +
>> +	if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
>> +	    !is_power_of_2(align)) {
> 
> ...and if you take my suggestion to make a common helper to validate the
> atomic write unit parameters, this can collapse into:
> 
> 	alloc_unit_bytes = xfs_inode_alloc_unitsize(ip);
> 	if (!xfs_inode_has_atomicwrites(ip) ||
> 	    !bdev_validate_atomic_write(bdev, alloc_unit_bytes))  > 		/* not supported, return zeroes */
> 		*unit_min = 0;
> 		*unit_max = 0;
> 		return;
> 	}
> 
> 	*unit_min = max(alloc_unit_bytes, awu_min);
> 	*unit_max = min(alloc_unit_bytes, awu_max);

Again, we need to ensure that *unit_min >= BLOCKSIZE

Thanks,
John
Ojaswin Mujoo Feb. 9, 2024, 7 a.m. UTC | #3
Hi John,

Thanks for the patch, I've added some review comments and questions
below.

On Wed, Jan 24, 2024 at 02:26:43PM +0000, John Garry wrote:
> Support providing info on atomic write unit min and max for an inode.
> 
> For simplicity, currently we limit the min at the FS block size, but a
> lower limit could be supported in future.
> 
> The atomic write unit min and max is limited by the guaranteed extent
> alignment for the inode.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_iops.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_iops.h |  4 ++++
>  2 files changed, 49 insertions(+)
> 
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index a0d77f5f512e..0890d2f70f4d 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -546,6 +546,44 @@ xfs_stat_blksize(
>  	return PAGE_SIZE;
>  }
>  
> +void xfs_get_atomic_write_attr(
> +	struct xfs_inode *ip,
> +	unsigned int *unit_min,
> +	unsigned int *unit_max)
> +{
> +	xfs_extlen_t		extsz = xfs_get_extsz(ip);
> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> +	struct block_device	*bdev = target->bt_bdev;
> +	unsigned int		awu_min, awu_max, align;
> +	struct request_queue	*q = bdev->bd_queue;
> +	struct xfs_mount	*mp = ip->i_mount;
> +
> +	/*
> +	 * Convert to multiples of the BLOCKSIZE (as we support a minimum
> +	 * atomic write unit of BLOCKSIZE).
> +	 */
> +	awu_min = queue_atomic_write_unit_min_bytes(q);
> +	awu_max = queue_atomic_write_unit_max_bytes(q);
> +
> +	awu_min &= ~mp->m_blockmask;
> +	awu_max &= ~mp->m_blockmask;

I don't understand why we try to round down the awu_max to blocks size
here and not just have an explicit check of (awu_max < blocksize).

I think the issue with changing the awu_max is that we are using awu_max
to also indirectly reflect the alignment so as to ensure we don't cross
atomic boundaries set by the hw (eg we check uint_max % atomic alignment
== 0 in scsi). So once we change the awu_max, there's a chance that even
if an atomic write aligns to the new awu_max it still doesn't have the
right alignment and fails. 

It works right now since eveything is power of 2 but it should cause
issues incase we decide to remove that limitation.  Anyways, I think
this implicit behavior of things working since eveything is a power of 2
should atleast be documented in a comment, so these things are
immediately clear. 

> +
> +	align = XFS_FSB_TO_B(mp, extsz);
> +
> +	if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
> +	    !is_power_of_2(align)) {

Correct me if I'm wrong but here as well, the is_power_of_2(align) is
esentially checking if the align % uinit_max == 0 (or vice versa if
unit_max is greater) so that an allocation of extsize will always align
nicely as needed by the device. 

So maybe we should use the % expression explicitly so that the intention
is immediately clear.

> +		*unit_min = 0;
> +		*unit_max = 0;
> +	} else {
> +		if (awu_min)
> +			*unit_min = min(awu_min, align);

How will the min() here work? If awu_min is the minumum set by the
device, how can statx be allowed to advertise something smaller than
that?

If I understand correctly, right now the way we set awu_min in scsi and
nvme, the follwoing should usually be true for a sane device:

 awu_min <= blocks size of fs <= align

 so the min() anyways becomes redundant, but if we do assume that there
 might be some weird devices with awu_min absurdly large (SCSI with
 high atomic granularity) we still can't actually advertise a min
 smaller than that of the device, or am I missing something here?

> +		else
> +			*unit_min = mp->m_sb.sb_blocksize;
> +
> +		*unit_max = min(awu_max, align);
> +	}
> +}
> +

Regards,
ojaswin
John Garry Feb. 9, 2024, 5:30 p.m. UTC | #4
>> +void xfs_get_atomic_write_attr(
>> +	struct xfs_inode *ip,
>> +	unsigned int *unit_min,
>> +	unsigned int *unit_max)
>> +{
>> +	xfs_extlen_t		extsz = xfs_get_extsz(ip);
>> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
>> +	struct block_device	*bdev = target->bt_bdev;
>> +	unsigned int		awu_min, awu_max, align;
>> +	struct request_queue	*q = bdev->bd_queue;
>> +	struct xfs_mount	*mp = ip->i_mount;
>> +
>> +	/*
>> +	 * Convert to multiples of the BLOCKSIZE (as we support a minimum
>> +	 * atomic write unit of BLOCKSIZE).
>> +	 */
>> +	awu_min = queue_atomic_write_unit_min_bytes(q);
>> +	awu_max = queue_atomic_write_unit_max_bytes(q);
>> +
>> +	awu_min &= ~mp->m_blockmask;
>> +	awu_max &= ~mp->m_blockmask;
> 
> I don't understand why we try to round down the awu_max to blocks size
> here and not just have an explicit check of (awu_max < blocksize).
We have later check for !awu_max:

if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
...

So what we are doing is ensuring that the awu_max which the device 
reports is at least FS block size. If it is not, then we cannot support 
atomic writes.

Indeed, my NVMe drive reports awu_max  = 4K. So for XFS configured for 
64K block size, we will say that we don't support atomic writes.

> 
> I think the issue with changing the awu_max is that we are using awu_max
> to also indirectly reflect the alignment so as to ensure we don't cross
> atomic boundaries set by the hw (eg we check uint_max % atomic alignment
> == 0 in scsi). So once we change the awu_max, there's a chance that even
> if an atomic write aligns to the new awu_max it still doesn't have the
> right alignment and fails.

All these values should be powers-of-2, so rounding down should not 
affect whether we would not cross an atomic write boundary.

> 
> It works right now since eveything is power of 2 but it should cause
> issues incase we decide to remove that limitation. 

Sure, but that is a fundamental principle of this atomic write support. 
Not having powers-of-2 requirement for atomic writes will affect many 
things.

> Anyways, I think
> this implicit behavior of things working since eveything is a power of 2
> should atleast be documented in a comment, so these things are
> immediately clear.
> 
>> +
>> +	align = XFS_FSB_TO_B(mp, extsz);
>> +
>> +	if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
>> +	    !is_power_of_2(align)) {
> 
> Correct me if I'm wrong but here as well, the is_power_of_2(align) is
> esentially checking if the align % uinit_max == 0 (or vice versa if
> unit_max is greater) 

yes

>so that an allocation of extsize will always align
> nicely as needed by the device.
>

I'm trying to keep things simple now.

In theory we could allow, say, align == 12 FSB, and then could say 
awu_max = 4.

The same goes for atomic write boundary in NVMe. Currently we say that 
it needs to be a power-of-2. However, it really just needs to be a 
multiple of awu_max. So if some HW did report a !power-of-2 atomic write 
boundary, we could reduce awu_max reported until to fits the power-of-2 
rule and also is cleanly divisible into atomic write boundary. But that 
is just not what HW will report (I expect). We live in a power-of-2 data 
granularity world.

> So maybe we should use the % expression explicitly so that the intention
> is immediately clear.

As mentioned, I wanted to keep it simple. In addition, it's a bit of a 
mess for the FS block allocator to work with odd sizes, like 12. And it 
does not suit RAID stripe alignment, which works in powers-of-2.

> 
>> +		*unit_min = 0;
>> +		*unit_max = 0;
>> +	} else {
>> +		if (awu_min)
>> +			*unit_min = min(awu_min, align);
> 
> How will the min() here work? If awu_min is the minumum set by the
> device, how can statx be allowed to advertise something smaller than
> that?
The idea is that is that if the awu_min reported by the device is less 
than the FS block size, then we report awu_min = FS block size. We 
already know that awu_max => FS block size, since we got this far, so 
saying that awu_min = FS block size is ok.

Otherwise it is the minimum of alignment and awu_min. I suppose that 
does not make much sense, and we should just always require awu_min = FS 
block size.

> 
> If I understand correctly, right now the way we set awu_min in scsi and
> nvme, the follwoing should usually be true for a sane device:
> 
>   awu_min <= blocks size of fs <= align
> 
>   so the min() anyways becomes redundant, but if we do assume that there
>   might be some weird devices with awu_min absurdly large (SCSI with
>   high atomic granularity) we still can't actually advertise a min
>   smaller than that of the device, or am I missing something here?

As above, I might just ensure that we can do awu_min = FS block size and 
not deal with crazy devices.

Thanks,
John
Ojaswin Mujoo Feb. 12, 2024, 11:48 a.m. UTC | #5
On Fri, Feb 09, 2024 at 05:30:50PM +0000, John Garry wrote:
> 
> > > +void xfs_get_atomic_write_attr(
> > > +	struct xfs_inode *ip,
> > > +	unsigned int *unit_min,
> > > +	unsigned int *unit_max)
> > > +{
> > > +	xfs_extlen_t		extsz = xfs_get_extsz(ip);
> > > +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> > > +	struct block_device	*bdev = target->bt_bdev;
> > > +	unsigned int		awu_min, awu_max, align;
> > > +	struct request_queue	*q = bdev->bd_queue;
> > > +	struct xfs_mount	*mp = ip->i_mount;
> > > +
> > > +	/*
> > > +	 * Convert to multiples of the BLOCKSIZE (as we support a minimum
> > > +	 * atomic write unit of BLOCKSIZE).
> > > +	 */
> > > +	awu_min = queue_atomic_write_unit_min_bytes(q);
> > > +	awu_max = queue_atomic_write_unit_max_bytes(q);
> > > +
> > > +	awu_min &= ~mp->m_blockmask;
> > > +	awu_max &= ~mp->m_blockmask;
> > 
> > I don't understand why we try to round down the awu_max to blocks size
> > here and not just have an explicit check of (awu_max < blocksize).
> We have later check for !awu_max:
> 
> if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
> ...
> 
> So what we are doing is ensuring that the awu_max which the device reports
> is at least FS block size. If it is not, then we cannot support atomic
> writes.
> 
> Indeed, my NVMe drive reports awu_max  = 4K. So for XFS configured for 64K
> block size, we will say that we don't support atomic writes.

> 
> > 
> > I think the issue with changing the awu_max is that we are using awu_max
> > to also indirectly reflect the alignment so as to ensure we don't cross
> > atomic boundaries set by the hw (eg we check uint_max % atomic alignment
> > == 0 in scsi). So once we change the awu_max, there's a chance that even
> > if an atomic write aligns to the new awu_max it still doesn't have the
> > right alignment and fails.
> 
> All these values should be powers-of-2, so rounding down should not affect
> whether we would not cross an atomic write boundary.


> 
> > 
> > It works right now since eveything is power of 2 but it should cause
> > issues incase we decide to remove that limitation.
> 
> Sure, but that is a fundamental principle of this atomic write support. Not
> having powers-of-2 requirement for atomic writes will affect many things.
> 

Correct, so the only reason for the rounding down is to ensure that
awu_max is not smaller than our block size but the way these checks are
right now doesn't make it obvious. It also raises questions like why we
are changing these min and max values especially why are we rounding
*down* min.

I think we should just have explicit (unit_[min/max] < bs) checks
without trying to round down the values.

> > Anyways, I think
> > this implicit behavior of things working since eveything is a power of 2
> > should atleast be documented in a comment, so these things are
> > immediately clear.
> > 
> > > +
> > > +	align = XFS_FSB_TO_B(mp, extsz);
> > > +
> > > +	if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
> > > +	    !is_power_of_2(align)) {
> > 
> > Correct me if I'm wrong but here as well, the is_power_of_2(align) is
> > esentially checking if the align % uinit_max == 0 (or vice versa if
> > unit_max is greater)
> 
> yes
> 
> > so that an allocation of extsize will always align
> > nicely as needed by the device.
> > 
> 
> I'm trying to keep things simple now.
> 
> In theory we could allow, say, align == 12 FSB, and then could say awu_max =
> 4.
> 
> The same goes for atomic write boundary in NVMe. Currently we say that it
> needs to be a power-of-2. However, it really just needs to be a multiple of
> awu_max. So if some HW did report a !power-of-2 atomic write boundary, we
> could reduce awu_max reported until to fits the power-of-2 rule and also is
> cleanly divisible into atomic write boundary. But that is just not what HW
> will report (I expect). We live in a power-of-2 data granularity world.

True, we ideally won't expect the hw to report that but why not just
make the check as (awu_max % align) so that:

1. intention is immediately clear
2. It'll directly work for non power of 2 boundary in future without
change.

Just my 2 cents.
> 
> > So maybe we should use the % expression explicitly so that the intention
> > is immediately clear.
> 
> As mentioned, I wanted to keep it simple. In addition, it's a bit of a mess
> for the FS block allocator to work with odd sizes, like 12. And it does not
> suit RAID stripe alignment, which works in powers-of-2.
> 
> > 
> > > +		*unit_min = 0;
> > > +		*unit_max = 0;
> > > +	} else {
> > > +		if (awu_min)
> > > +			*unit_min = min(awu_min, align);
> > 
> > How will the min() here work? If awu_min is the minumum set by the
> > device, how can statx be allowed to advertise something smaller than
> > that?
> The idea is that is that if the awu_min reported by the device is less than
> the FS block size, then we report awu_min = FS block size. We already know
> that awu_max => FS block size, since we got this far, so saying that awu_min
> = FS block size is ok.
> 
> Otherwise it is the minimum of alignment and awu_min. I suppose that does
> not make much sense, and we should just always require awu_min = FS block
> size.

Yep, that also works. We should just set the minimum to FS block size,
since that min() operator there is confusing.

Regards,
ojaswin
Ojaswin Mujoo Feb. 12, 2024, 12:05 p.m. UTC | #6
On Fri, Feb 09, 2024 at 05:30:50PM +0000, John Garry wrote:
> The same goes for atomic write boundary in NVMe. Currently we say that it
> needs to be a power-of-2. However, it really just needs to be a multiple of
> awu_max. So if some HW did report a !power-of-2 atomic write boundary, we

Hey John, sorry for double reply but can you point out where this
requrement is stated in the spec?

For example in NVME 2.1.4.3 Command Set spec I can see that

> The boundary size shall be greater than or equal to the corresponding
> atomic write size

However I'm not able to find the multiple-of-unit-max reqirement in the
spec. Maybe I'm missing something?

Regards,
ojaswin

> could reduce awu_max reported until to fits the power-of-2 rule and also is
> cleanly divisible into atomic write boundary. But that is just not what HW
> will report (I expect). We live in a power-of-2 data granularity world.
Darrick J. Wong Feb. 13, 2024, 5:37 p.m. UTC | #7
On Mon, Feb 05, 2024 at 01:10:54PM +0000, John Garry wrote:
> On 02/02/2024 18:05, Darrick J. Wong wrote:
> > On Wed, Jan 24, 2024 at 02:26:43PM +0000, John Garry wrote:
> > > Support providing info on atomic write unit min and max for an inode.
> > > 
> > > For simplicity, currently we limit the min at the FS block size, but a
> > > lower limit could be supported in future.
> > > 
> > > The atomic write unit min and max is limited by the guaranteed extent
> > > alignment for the inode.
> > > 
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > >   fs/xfs/xfs_iops.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
> > >   fs/xfs/xfs_iops.h |  4 ++++
> > >   2 files changed, 49 insertions(+)
> > > 
> > > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > > index a0d77f5f512e..0890d2f70f4d 100644
> > > --- a/fs/xfs/xfs_iops.c
> > > +++ b/fs/xfs/xfs_iops.c
> > > @@ -546,6 +546,44 @@ xfs_stat_blksize(
> > >   	return PAGE_SIZE;
> > >   }
> > > +void xfs_get_atomic_write_attr(
> > 
> > static void?
> 
> We use this in the iomap and statx code
> 
> > 
> > > +	struct xfs_inode *ip,
> > > +	unsigned int *unit_min,
> > > +	unsigned int *unit_max)
> > 
> > Weird indenting here.
> 
> hmmm... I thought that this was the XFS style
> 
> Can you show how it should look?

The parameter declarations should line up with the local variables:

void
xfs_get_atomic_write_attr(
	struct xfs_inode	*ip,
	unsigned int		*unit_min,
	unsigned int		*unit_max)
{
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	struct block_device	*bdev = target->bt_bdev;
	struct request_queue	*q = bdev->bd_queue;
	struct xfs_mount	*mp = ip->i_mount;
	unsigned int		awu_min, awu_max, align;
	xfs_extlen_t		extsz = xfs_get_extsz(ip);

> > 
> > > +{
> > > +	xfs_extlen_t		extsz = xfs_get_extsz(ip);
> > > +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> > > +	struct block_device	*bdev = target->bt_bdev;
> > > +	unsigned int		awu_min, awu_max, align;
> > > +	struct request_queue	*q = bdev->bd_queue;
> > > +	struct xfs_mount	*mp = ip->i_mount;
> > > +
> > > +	/*
> > > +	 * Convert to multiples of the BLOCKSIZE (as we support a minimum
> > > +	 * atomic write unit of BLOCKSIZE).
> > > +	 */
> > > +	awu_min = queue_atomic_write_unit_min_bytes(q);
> > > +	awu_max = queue_atomic_write_unit_max_bytes(q);
> > > +
> > > +	awu_min &= ~mp->m_blockmask;
> > 
> > Why do you round /down/ the awu_min value here?
> 
> This is just to ensure that we returning *unit_min >= BLOCKSIZE
> 
> For example, if awu_min, max 1K, 64K from the bdev, we now have 0 and 64K.
> And below this gives us awu_min, max of 4k, 64k.
> 
> Maybe there is a more logical way of doing this.

	awu_min = roundup(queue_atomic_write_unit_min_bytes(q),
			  mp->m_sb.sb_blocksize);

?

> 
> > 
> > > +	awu_max &= ~mp->m_blockmask;
> > 
> > Actually -- since the atomic write units have to be powers of 2, why is
> > rounding needed here at all?
> 
> Sure, but the bdev can report a awu_min < BLOCKSIZE
> 
> > 
> > > +
> > > +	align = XFS_FSB_TO_B(mp, extsz);
> > > +
> > > +	if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
> > > +	    !is_power_of_2(align)) {
> > 
> > ...and if you take my suggestion to make a common helper to validate the
> > atomic write unit parameters, this can collapse into:
> > 
> > 	alloc_unit_bytes = xfs_inode_alloc_unitsize(ip);
> > 	if (!xfs_inode_has_atomicwrites(ip) ||
> > 	    !bdev_validate_atomic_write(bdev, alloc_unit_bytes))  > 		/* not supported, return zeroes */
> > 		*unit_min = 0;
> > 		*unit_max = 0;
> > 		return;
> > 	}
> > 
> > 	*unit_min = max(alloc_unit_bytes, awu_min);
> > 	*unit_max = min(alloc_unit_bytes, awu_max);
> 
> Again, we need to ensure that *unit_min >= BLOCKSIZE

The file allocation unit and hence the return value of
xfs_inode_alloc_unitsize is always a multiple of sb_blocksize.

--D

> Thanks,
> John
> 
>
John Garry Feb. 14, 2024, 12:26 p.m. UTC | #8
On 13/02/2024 17:37, Darrick J. Wong wrote:
>> We use this in the iomap and statx code
>>
>>>> +	struct xfs_inode *ip,
>>>> +	unsigned int *unit_min,
>>>> +	unsigned int *unit_max)
>>> Weird indenting here.
>> hmmm... I thought that this was the XFS style
>>
>> Can you show how it should look?
> The parameter declarations should line up with the local variables:
> 
> void
> xfs_get_atomic_write_attr(
> 	struct xfs_inode	*ip,
> 	unsigned int		*unit_min,
> 	unsigned int		*unit_max)
> {
> 	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> 	struct block_device	*bdev = target->bt_bdev;
> 	struct request_queue	*q = bdev->bd_queue;
> 	struct xfs_mount	*mp = ip->i_mount;
> 	unsigned int		awu_min, awu_max, align;
> 	xfs_extlen_t		extsz = xfs_get_extsz(ip);
> 
>>>> +{
>>>> +	xfs_extlen_t		extsz = xfs_get_extsz(ip);
>>>> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
>>>> +	struct block_device	*bdev = target->bt_bdev;
>>>> +	unsigned int		awu_min, awu_max, align;
>>>> +	struct request_queue	*q = bdev->bd_queue;
>>>> +	struct xfs_mount	*mp = ip->i_mount;
>>>> +
>>>> +	/*
>>>> +	 * Convert to multiples of the BLOCKSIZE (as we support a minimum
>>>> +	 * atomic write unit of BLOCKSIZE).
>>>> +	 */
>>>> +	awu_min = queue_atomic_write_unit_min_bytes(q);
>>>> +	awu_max = queue_atomic_write_unit_max_bytes(q);
>>>> +
>>>> +	awu_min &= ~mp->m_blockmask;
>>> Why do you round/down/  the awu_min value here?
>> This is just to ensure that we returning *unit_min >= BLOCKSIZE
>>
>> For example, if awu_min, max 1K, 64K from the bdev, we now have 0 and 64K.
>> And below this gives us awu_min, max of 4k, 64k.
>>
>> Maybe there is a more logical way of doing this.
> 	awu_min = roundup(queue_atomic_write_unit_min_bytes(q),
> 			  mp->m_sb.sb_blocksize);
> 
> ?

Yeah, I think that all this can be simplified to be made more obvious.

> 
>>>> +	awu_max &= ~mp->m_blockmask;
>>> Actually -- since the atomic write units have to be powers of 2, why is
>>> rounding needed here at all?
>> Sure, but the bdev can report a awu_min < BLOCKSIZE
>>
>>>> +
>>>> +	align = XFS_FSB_TO_B(mp, extsz);
>>>> +
>>>> +	if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
>>>> +	    !is_power_of_2(align)) {
>>> ...and if you take my suggestion to make a common helper to validate the
>>> atomic write unit parameters, this can collapse into:
>>>
>>> 	alloc_unit_bytes = xfs_inode_alloc_unitsize(ip);
>>> 	if (!xfs_inode_has_atomicwrites(ip) ||
>>> 	    !bdev_validate_atomic_write(bdev, alloc_unit_bytes))  > 		/* not supported, return zeroes */
>>> 		*unit_min = 0;
>>> 		*unit_max = 0;
>>> 		return;
>>> 	}
>>>
>>> 	*unit_min = max(alloc_unit_bytes, awu_min);
>>> 	*unit_max = min(alloc_unit_bytes, awu_max);
>> Again, we need to ensure that *unit_min >= BLOCKSIZE
> The file allocation unit and hence the return value of
> xfs_inode_alloc_unitsize is always a multiple of sb_blocksize.

Right, but this value is coming from HW and we are just ensuring that 
the awu_min which we report is >= BLOCKSIZE. xfs_inode_alloc_unitsize() 
return value will really guide unit_max.

Anyway, again I can make this all more obvious.

Thanks,
John
diff mbox series

Patch

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a0d77f5f512e..0890d2f70f4d 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -546,6 +546,44 @@  xfs_stat_blksize(
 	return PAGE_SIZE;
 }
 
+void xfs_get_atomic_write_attr(
+	struct xfs_inode *ip,
+	unsigned int *unit_min,
+	unsigned int *unit_max)
+{
+	xfs_extlen_t		extsz = xfs_get_extsz(ip);
+	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
+	struct block_device	*bdev = target->bt_bdev;
+	unsigned int		awu_min, awu_max, align;
+	struct request_queue	*q = bdev->bd_queue;
+	struct xfs_mount	*mp = ip->i_mount;
+
+	/*
+	 * Convert to multiples of the BLOCKSIZE (as we support a minimum
+	 * atomic write unit of BLOCKSIZE).
+	 */
+	awu_min = queue_atomic_write_unit_min_bytes(q);
+	awu_max = queue_atomic_write_unit_max_bytes(q);
+
+	awu_min &= ~mp->m_blockmask;
+	awu_max &= ~mp->m_blockmask;
+
+	align = XFS_FSB_TO_B(mp, extsz);
+
+	if (!awu_max || !xfs_inode_atomicwrites(ip) || !align ||
+	    !is_power_of_2(align)) {
+		*unit_min = 0;
+		*unit_max = 0;
+	} else {
+		if (awu_min)
+			*unit_min = min(awu_min, align);
+		else
+			*unit_min = mp->m_sb.sb_blocksize;
+
+		*unit_max = min(awu_max, align);
+	}
+}
+
 STATIC int
 xfs_vn_getattr(
 	struct mnt_idmap	*idmap,
@@ -619,6 +657,13 @@  xfs_vn_getattr(
 			stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
 			stat->dio_offset_align = bdev_logical_block_size(bdev);
 		}
+		if (request_mask & STATX_WRITE_ATOMIC) {
+			unsigned int unit_min, unit_max;
+
+			xfs_get_atomic_write_attr(ip, &unit_min, &unit_max);
+			generic_fill_statx_atomic_writes(stat,
+				unit_min, unit_max);
+		}
 		fallthrough;
 	default:
 		stat->blksize = xfs_stat_blksize(ip);
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 7f84a0843b24..76dd4c3687aa 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -19,4 +19,8 @@  int xfs_vn_setattr_size(struct mnt_idmap *idmap,
 int xfs_inode_init_security(struct inode *inode, struct inode *dir,
 		const struct qstr *qstr);
 
+void xfs_get_atomic_write_attr(struct xfs_inode *ip,
+		unsigned int *unit_min,
+		unsigned int *unit_max);
+
 #endif /* __XFS_IOPS_H__ */