diff mbox series

[3/3] xfs: report the correct read/write dio alignment for reflinked inodes

Message ID 20240828051149.1897291-4-hch@lst.de (mailing list archive)
State Not Applicable, archived
Headers show
Series [1/3] fs: reformat the statx definition | expand

Commit Message

Christoph Hellwig Aug. 28, 2024, 5:11 a.m. UTC
For I/O to reflinked blocks we always need to write an entire new
file system block, and the code enforces the file system block alignment
for the entire file if it has any reflinked blocks.

Use the new STATX_DIO_READ_ALIGN flag to report the asymmetric read
vs write alignments for reflinked files.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iops.c | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

Comments

Darrick J. Wong Aug. 28, 2024, 4:23 p.m. UTC | #1
On Wed, Aug 28, 2024 at 08:11:03AM +0300, Christoph Hellwig wrote:
> For I/O to reflinked blocks we always need to write an entire new
> file system block, and the code enforces the file system block alignment
> for the entire file if it has any reflinked blocks.
> 
> Use the new STATX_DIO_READ_ALIGN flag to report the asymmetric read
> vs write alignments for reflinked files.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_iops.c | 37 +++++++++++++++++++++++++++++--------
>  1 file changed, 29 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 1cdc8034f54d93..de2fc12688dc23 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -570,6 +570,33 @@ xfs_stat_blksize(
>  	return PAGE_SIZE;
>  }
>  
> +static void
> +xfs_report_dioalign(
> +	struct xfs_inode	*ip,
> +	struct kstat		*stat)
> +{
> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> +	struct block_device	*bdev = target->bt_bdev;
> +
> +	stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
> +	stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
> +	stat->dio_read_offset_align = bdev_logical_block_size(bdev);
> +
> +	/*
> +	 * On COW inodes we are forced to always rewrite an entire file system
> +	 * block.
> +	 *
> +	 * Because applications assume they can do sector sized direct writes
> +	 * on XFS we provide an emulation by doing a read-modify-write cycle
> +	 * through the cache, but that is highly inefficient.  Thus report the
> +	 * natively supported size here.
> +	 */
> +	if (xfs_is_cow_inode(ip))
> +		stat->dio_offset_align = ip->i_mount->m_sb.sb_blocksize;

xfs_inode_alloc_unitsize(), since we can only cow full allocation units.

(Not necessary today, but we might as well make it work for rtreflink
from the start.)

--D

> +	else
> +		stat->dio_offset_align = stat->dio_read_offset_align;
> +}
> +
>  STATIC int
>  xfs_vn_getattr(
>  	struct mnt_idmap	*idmap,
> @@ -635,14 +662,8 @@ xfs_vn_getattr(
>  		stat->rdev = inode->i_rdev;
>  		break;
>  	case S_IFREG:
> -		if (request_mask & STATX_DIOALIGN) {
> -			struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> -			struct block_device	*bdev = target->bt_bdev;
> -
> -			stat->result_mask |= STATX_DIOALIGN;
> -			stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
> -			stat->dio_offset_align = bdev_logical_block_size(bdev);
> -		}
> +		if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN))
> +			xfs_report_dioalign(ip, stat);
>  		fallthrough;
>  	default:
>  		stat->blksize = xfs_stat_blksize(ip);
> -- 
> 2.43.0
> 
>
Dave Chinner Aug. 29, 2024, 1:13 a.m. UTC | #2
On Wed, Aug 28, 2024 at 08:11:03AM +0300, Christoph Hellwig wrote:
> For I/O to reflinked blocks we always need to write an entire new
> file system block, and the code enforces the file system block alignment
> for the entire file if it has any reflinked blocks.
> 
> Use the new STATX_DIO_READ_ALIGN flag to report the asymmetric read
> vs write alignments for reflinked files.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_iops.c | 37 +++++++++++++++++++++++++++++--------
>  1 file changed, 29 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 1cdc8034f54d93..de2fc12688dc23 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -570,6 +570,33 @@ xfs_stat_blksize(
>  	return PAGE_SIZE;
>  }
>  
> +static void
> +xfs_report_dioalign(
> +	struct xfs_inode	*ip,
> +	struct kstat		*stat)
> +{
> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> +	struct block_device	*bdev = target->bt_bdev;
> +
> +	stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
> +	stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
> +	stat->dio_read_offset_align = bdev_logical_block_size(bdev);
> +
> +	/*
> +	 * On COW inodes we are forced to always rewrite an entire file system
> +	 * block.
> +	 *
> +	 * Because applications assume they can do sector sized direct writes
> +	 * on XFS we provide an emulation by doing a read-modify-write cycle
> +	 * through the cache, but that is highly inefficient.  Thus report the
> +	 * natively supported size here.
> +	 */
> +	if (xfs_is_cow_inode(ip))
> +		stat->dio_offset_align = ip->i_mount->m_sb.sb_blocksize;
> +	else
> +		stat->dio_offset_align = stat->dio_read_offset_align;

It might be worth making it explicitly clear that logical block size
aligned IO for COW operations will still work. I think that's what
you are trying to say, but it took me a while to work out. Perhaps
something like:

	/*
	 * COW operations are inefficient on sub-fsblock aligned
	 * ranges.  They need to copy the entire block, so the
	 * minimum IO size we will ever do in this case is a single
	 * filesystem block.
	 *
	 * Even though we support sector sized IO on COW inodes, we
	 * want to help applications avoid the costly RMW cycle it
	 * requires for COW inodes. Hence report the native
	 * filesystem allocation unit size here to indicate the
	 * smallest alignment that will avoid RMW cycles in the DIO
	 * write path.
	 */

-Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1cdc8034f54d93..de2fc12688dc23 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -570,6 +570,33 @@  xfs_stat_blksize(
 	return PAGE_SIZE;
 }
 
+static void
+xfs_report_dioalign(
+	struct xfs_inode	*ip,
+	struct kstat		*stat)
+{
+	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
+	struct block_device	*bdev = target->bt_bdev;
+
+	stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
+	stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+	stat->dio_read_offset_align = bdev_logical_block_size(bdev);
+
+	/*
+	 * On COW inodes we are forced to always rewrite an entire file system
+	 * block.
+	 *
+	 * Because applications assume they can do sector sized direct writes
+	 * on XFS we provide an emulation by doing a read-modify-write cycle
+	 * through the cache, but that is highly inefficient.  Thus report the
+	 * natively supported size here.
+	 */
+	if (xfs_is_cow_inode(ip))
+		stat->dio_offset_align = ip->i_mount->m_sb.sb_blocksize;
+	else
+		stat->dio_offset_align = stat->dio_read_offset_align;
+}
+
 STATIC int
 xfs_vn_getattr(
 	struct mnt_idmap	*idmap,
@@ -635,14 +662,8 @@  xfs_vn_getattr(
 		stat->rdev = inode->i_rdev;
 		break;
 	case S_IFREG:
-		if (request_mask & STATX_DIOALIGN) {
-			struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
-			struct block_device	*bdev = target->bt_bdev;
-
-			stat->result_mask |= STATX_DIOALIGN;
-			stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
-			stat->dio_offset_align = bdev_logical_block_size(bdev);
-		}
+		if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN))
+			xfs_report_dioalign(ip, stat);
 		fallthrough;
 	default:
 		stat->blksize = xfs_stat_blksize(ip);