diff mbox series

[1/3] block: export dma_alignment attribute

Message ID 20220513161339.1580042-1-kbusch@fb.com (mailing list archive)
State New, archived
Headers show
Series [1/3] block: export dma_alignment attribute | expand

Commit Message

Keith Busch May 13, 2022, 4:13 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

User space may want to know how to align their buffers to avoid
bouncing. Export the queue attribute.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-sysfs.c | 7 +++++++
 1 file changed, 7 insertions(+)

Comments

Christoph Hellwig May 16, 2022, 6:52 a.m. UTC | #1
I'm not against the sysfs file, but for applications this is a
horrible interface.  We really need something that works on the fd.

The XFS_IOC_DIOINFO ioctl from xfs is one, although ugly, option.
The other is to export the alignment requirements through statx.
We had that whole discussion with the inline crypto direct I/O support,
and we really need to tackle it rather sooner than later.
Keith Busch May 16, 2022, 2:32 p.m. UTC | #2
On Sun, May 15, 2022 at 11:52:46PM -0700, Christoph Hellwig wrote:
> I'm not against the sysfs file, but for applications this is a
> horrible interface.  We really need something that works on the fd.

Yeah, sysfs is not very convenient for the intended use.
 
> The XFS_IOC_DIOINFO ioctl from xfs is one, although ugly, option.
> The other is to export the alignment requirements through statx.
> We had that whole discussion with the inline crypto direct I/O support,
> and we really need to tackle it rather sooner than later.

I'll try out assigning one of the spares in 'struct statx' for the purpose. I
like the interface for that, and looks easy enough to pass along the dma
alignment requirement through it.
Keith Busch May 17, 2022, 10:55 p.m. UTC | #3
On Mon, May 16, 2022 at 08:32:22AM -0600, Keith Busch wrote:
>  
> > The XFS_IOC_DIOINFO ioctl from xfs is one, although ugly, option.
> > The other is to export the alignment requirements through statx.
> > We had that whole discussion with the inline crypto direct I/O support,
> > and we really need to tackle it rather sooner than later.
> 
> I'll try out assigning one of the spares in 'struct statx' for the purpose. I
> like the interface for that, and looks easy enough to pass along the dma
> alignment requirement through it.

A little less easy than I thought... This is what I'm coming up with, and it's
bad enough that I'm sure there has to be a better way. Specifically concerning
is the new "do_dma()" function below, and was the only way I managed to get the
correct 'struct block_device' whether we're stat'ing a filesystem file or raw
block device file.

---
diff --git a/fs/stat.c b/fs/stat.c
index 5c2c94464e8b..72c0b36599c2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -17,6 +17,7 @@
 #include <linux/syscalls.h>
 #include <linux/pagemap.h>
 #include <linux/compat.h>
+#include <linux/blkdev.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -594,6 +595,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
 	tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid);
 	tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid);
 	tmp.stx_mode = stat->mode;
+	tmp.stx_dma = stat->dma;
 	tmp.stx_ino = stat->ino;
 	tmp.stx_size = stat->size;
 	tmp.stx_blocks = stat->blocks;
@@ -615,6 +617,35 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
 	return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
 }
 
+static void do_dma(int dfd, struct filename *filename, struct kstat *stat)
+{
+	struct open_flags op = {};
+	struct block_device *bdev;
+	struct file *f;
+
+	stat->dma = 511;
+
+	if (S_ISBLK(stat->mode)) {
+		bdev = blkdev_get_no_open(stat->rdev);
+
+		if (bdev) {
+			stat->dma = bdev_dma_alignment(bdev);
+			blkdev_put_no_open(bdev);
+		}
+		return;
+	}
+
+	f = do_filp_open(dfd, filename, &op);
+	if (IS_ERR(f))
+		return;
+
+	bdev = f->f_inode->i_sb->s_bdev;
+	if (bdev)
+		stat->dma = bdev_dma_alignment(bdev);
+	fput(f);
+}
+
 int do_statx(int dfd, struct filename *filename, unsigned int flags,
 	     unsigned int mask, struct statx __user *buffer)
 {
@@ -630,6 +661,7 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
 	if (error)
 		return error;
 
+	do_dma(dfd, filename, &stat);
 	return cp_statx(&stat, buffer);
 }
 
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 7df06931f25d..0a12c7498aa0 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -50,6 +50,7 @@ struct kstat {
 	struct timespec64 btime;			/* File creation time */
 	u64		blocks;
 	u64		mnt_id;
+	u16		dma;
 };
 
 #endif
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 1500a0f58041..380820c29c35 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -106,7 +106,7 @@ struct statx {
 	__u32	stx_uid;	/* User ID of owner */
 	__u32	stx_gid;	/* Group ID of owner */
 	__u16	stx_mode;	/* File mode */
-	__u16	__spare0[1];
+	__u16	stx_dma;	/* DMA alignment */
 	/* 0x20 */
 	__u64	stx_ino;	/* Inode number */
 	__u64	stx_size;	/* File size */
--
Christoph Hellwig May 18, 2022, 7:23 a.m. UTC | #4
On Tue, May 17, 2022 at 04:55:20PM -0600, Keith Busch wrote:
> A little less easy than I thought... This is what I'm coming up with, and it's
> bad enough that I'm sure there has to be a better way. Specifically concerning
> is the new "do_dma()" function below, and was the only way I managed to get the
> correct 'struct block_device' whether we're stat'ing a filesystem file or raw
> block device file.

I don't think doing this in common code makes much sense.  The core
VFS code should not have to know if something is on a block device or
not.

Instead add a new getattr method to block/bdev.c for the block devices,
and just have a helper to set the alinments(s) based on that by it,
and any file systems that is made ready to accept lower alignment.
And I'd prefer to do them individully and tested as there might be all
kinds of assumptions.  For all other instances keep the value as 0
for unknown.

> --- a/include/uapi/linux/stat.h
> +++ b/include/uapi/linux/stat.h
> @@ -106,7 +106,7 @@ struct statx {
>  	__u32	stx_uid;	/* User ID of owner */
>  	__u32	stx_gid;	/* Group ID of owner */
>  	__u16	stx_mode;	/* File mode */
> -	__u16	__spare0[1];
> +	__u16	stx_dma;	/* DMA alignment */

I'd name this dio_mem_alignment, because it is is:

 a) specific to direct I/O
 b) DMA is just the implementation detail, but not the user semantics

while we're at it, please also add a dio_file_alignment for the
alignment in the file, which can be sector or fs block size.  I'm
perfectly fine if you only do it for the block layer first, I'll
take up the wok to update the most common file systems after that.
diff mbox series

Patch

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 88bd41d4cb59..14607565d781 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -274,6 +274,11 @@  static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page
 	return queue_var_show(q->limits.virt_boundary_mask, page);
 }
 
+static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_dma_alignment(q), page);
+}
+
 #define QUEUE_SYSFS_BIT_FNS(name, flag, neg)				\
 static ssize_t								\
 queue_##name##_show(struct request_queue *q, char *page)		\
@@ -606,6 +611,7 @@  QUEUE_RO_ENTRY(queue_dax, "dax");
 QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
 QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
 QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
@@ -667,6 +673,7 @@  static struct attribute *queue_attrs[] = {
 	&blk_throtl_sample_time_entry.attr,
 #endif
 	&queue_virt_boundary_mask_entry.attr,
+	&queue_dma_alignment_entry.attr,
 	NULL,
 };