[4/5] io_uring: add support for dma pre-mapping

Message ID 20220726173814.2264573-5-kbusch@fb.com (mailing list archive)
State: New
Series: dma mapping optimisations

Commit Message

Keith Busch July 26, 2022, 5:38 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

Provide a new register operation that can request pre-mapping a known
bvec with the specific driver implementation backing the requested file
descriptor. If successful, io_uring will use the returned dma tag for
future fixed buffer requests to the same file.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/uapi/linux/io_uring.h |  12 ++++
 io_uring/io_uring.c           | 129 ++++++++++++++++++++++++++++++++++
 io_uring/net.c                |   2 +-
 io_uring/rsrc.c               |  13 +++-
 io_uring/rsrc.h               |  16 ++++-
 io_uring/rw.c                 |   2 +-
 6 files changed, 166 insertions(+), 8 deletions(-)
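
For illustration, a minimal userspace sketch of driving the proposed
registration (not part of the patch; the io_uring_map_buffers layout and the
IORING_REGISTER_MAP_BUFFERS opcode are taken from the uapi hunk below, so this
only builds against headers from this series, and the raw syscall is used
because liburing has no wrapper for the new opcode):

#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Pre-map registered fixed buffers [0, 4) against the device backing 'fd'.
 * Error handling is minimal; the buffer indexes are examples only. */
static int map_fixed_buffers(int ring_fd, int fd)
{
	struct io_uring_map_buffers map;

	memset(&map, 0, sizeof(map));
	map.fd = fd;		/* block device or file to map against */
	map.buf_start = 0;	/* first registered buffer index */
	map.buf_end = 4;	/* one past the last index to map */

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_MAP_BUFFERS, &map, 1);
}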

Comments

Al Viro July 26, 2022, 11:12 p.m. UTC | #1
On Tue, Jul 26, 2022 at 10:38:13AM -0700, Keith Busch wrote:

> +	if (S_ISBLK(file_inode(file)->i_mode))
> +		bdev = I_BDEV(file->f_mapping->host);
> +	else if (S_ISREG(file_inode(file)->i_mode))
> +		bdev = file->f_inode->i_sb->s_bdev;

*blink*

Just what's the intended use of the second case here?
Keith Busch July 27, 2022, 1:58 p.m. UTC | #2
On Wed, Jul 27, 2022 at 12:12:53AM +0100, Al Viro wrote:
> On Tue, Jul 26, 2022 at 10:38:13AM -0700, Keith Busch wrote:
> 
> > +	if (S_ISBLK(file_inode(file)->i_mode))
> > +		bdev = I_BDEV(file->f_mapping->host);
> > +	else if (S_ISREG(file_inode(file)->i_mode))
> > +		bdev = file->f_inode->i_sb->s_bdev;
> 
> *blink*
> 
> Just what's the intended use of the second case here?

??

The use case is the same as the first's: dma map the user addresses to the
backing storage. There are two cases here because getting the block_device
for a regular filesystem file is different from getting it for a raw block
device.
Al Viro July 27, 2022, 2:04 p.m. UTC | #3
On Wed, Jul 27, 2022 at 07:58:29AM -0600, Keith Busch wrote:
> On Wed, Jul 27, 2022 at 12:12:53AM +0100, Al Viro wrote:
> > On Tue, Jul 26, 2022 at 10:38:13AM -0700, Keith Busch wrote:
> > 
> > > +	if (S_ISBLK(file_inode(file)->i_mode))
> > > +		bdev = I_BDEV(file->f_mapping->host);
> > > +	else if (S_ISREG(file_inode(file)->i_mode))
> > > +		bdev = file->f_inode->i_sb->s_bdev;
> > 
> > *blink*
> > 
> > Just what's the intended use of the second case here?
> 
> ??
> 
> The use case is same as the first's: dma map the user addresses to the backing
> storage. There's two cases here because getting the block_device for a regular
> filesystem file is different than a raw block device.

Excuse me, but "file on some filesystem + block number on underlying device"
makes no sense as an API...
Al Viro July 27, 2022, 2:11 p.m. UTC | #4
On Tue, Jul 26, 2022 at 10:38:13AM -0700, Keith Busch wrote:

> +	file = fget(map.fd);
> +	if (!file)
> +		return -EBADF;
> +
> +	if (S_ISBLK(file_inode(file)->i_mode))
> +		bdev = I_BDEV(file->f_mapping->host);
> +	else if (S_ISREG(file_inode(file)->i_mode))
> +		bdev = file->f_inode->i_sb->s_bdev;
> +	else
> +		return -EOPNOTSUPP;
> +
> +	for (i = map.buf_start; i < map.buf_end; i++) {
> +		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
> +		void *tag;
> +
> +		if (imu->dma_tag) {
> +			ret = -EBUSY;
> +			goto err;
> +		}
> +
> +		tag = block_dma_map(bdev, imu->bvec, imu->nr_bvecs);
> +		if (IS_ERR(tag)) {
> +			ret = PTR_ERR(tag);
> +			goto err;
> +		}
> +
> +		imu->dma_tag = tag;
> +		imu->dma_file = file;
> +		imu->bdev = bdev;
> +	}
> +
> +	fput(file);

This, BTW, is completely insane - what happens if you follow that
with close(map.fd)?  A bunch of dangling struct file references?

I really don't understand what you are trying to do here.
Keith Busch July 27, 2022, 2:48 p.m. UTC | #5
On Wed, Jul 27, 2022 at 03:11:05PM +0100, Al Viro wrote:
> On Tue, Jul 26, 2022 at 10:38:13AM -0700, Keith Busch wrote:
> 
> > +	file = fget(map.fd);
> > +	if (!file)
> > +		return -EBADF;
> > +
> > +	if (S_ISBLK(file_inode(file)->i_mode))
> > +		bdev = I_BDEV(file->f_mapping->host);
> > +	else if (S_ISREG(file_inode(file)->i_mode))
> > +		bdev = file->f_inode->i_sb->s_bdev;
> > +	else
> > +		return -EOPNOTSUPP;
> > +
> > +	for (i = map.buf_start; i < map.buf_end; i++) {
> > +		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
> > +		void *tag;
> > +
> > +		if (imu->dma_tag) {
> > +			ret = -EBUSY;
> > +			goto err;
> > +		}
> > +
> > +		tag = block_dma_map(bdev, imu->bvec, imu->nr_bvecs);
> > +		if (IS_ERR(tag)) {
> > +			ret = PTR_ERR(tag);
> > +			goto err;
> > +		}
> > +
> > +		imu->dma_tag = tag;
> > +		imu->dma_file = file;
> > +		imu->bdev = bdev;
> > +	}
> > +
> > +	fput(file);
> 
> This, BTW, is completely insane - what happens if you follow that
> with close(map.fd)?  A bunch of dangling struct file references?

This should have been tied to files registered with the io_uring instance
holding a reference, and cleaned up when the files are unregistered. I may be
missing some cases here, so I'll fix that up.

> I really don't understand what you are trying to do here

We want to register userspace addresses with the block_device just once. We can
skip costly per-IO setup this way.
Keith Busch July 27, 2022, 3:04 p.m. UTC | #6
On Wed, Jul 27, 2022 at 03:04:56PM +0100, Al Viro wrote:
> On Wed, Jul 27, 2022 at 07:58:29AM -0600, Keith Busch wrote:
> > On Wed, Jul 27, 2022 at 12:12:53AM +0100, Al Viro wrote:
> > > On Tue, Jul 26, 2022 at 10:38:13AM -0700, Keith Busch wrote:
> > > 
> > > > +	if (S_ISBLK(file_inode(file)->i_mode))
> > > > +		bdev = I_BDEV(file->f_mapping->host);
> > > > +	else if (S_ISREG(file_inode(file)->i_mode))
> > > > +		bdev = file->f_inode->i_sb->s_bdev;
> > > 
> > > *blink*
> > > 
> > > Just what's the intended use of the second case here?
> > 
> > ??
> > 
> > The use case is same as the first's: dma map the user addresses to the backing
> > storage. There's two cases here because getting the block_device for a regular
> > filesystem file is different than a raw block device.
> 
> Excuse me, but "file on some filesystem + block number on underlying device"
> makes no sense as an API...

Sorry if I'm misunderstanding your concern here.

The API is a file descriptor + index range of registered buffers (which is a
pre-existing io_uring API). The file descriptor can come from opening either a
raw block device (ex: /dev/nvme0n1), or any regular file on a mounted
filesystem using nvme as a backing store.

You don't need to know about specific block numbers. You can use the result
with any offset in the underlying block device.

This also isn't necessarily tied to nvme-pci; that's just the only low-level
driver I've enabled in this series, but others may come later.
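
To make the intended flow concrete, a hypothetical liburing-style follow-up
(a sketch, not from this series): the read below uses the pre-existing
fixed-buffer path, and io_uring is expected to pick up the dma tag internally
once the buffer index and file match a prior mapping.

#include <liburing.h>

/* Fixed-buffer read against the same fd that was pre-mapped earlier.
 * buf_index 0 must refer to a previously registered (and pre-mapped)
 * buffer; the file offset of 0 is arbitrary for the example. */
static int read_with_premapped_buf(struct io_uring *ring, int fd,
				   void *buf, unsigned len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	io_uring_prep_read_fixed(sqe, fd, buf, len, 0, 0);
	return io_uring_submit(ring);
}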
Al Viro July 27, 2022, 3:26 p.m. UTC | #7
On Wed, Jul 27, 2022 at 08:48:29AM -0600, Keith Busch wrote:

> > This, BTW, is completely insane - what happens if you follow that
> > with close(map.fd)?  A bunch of dangling struct file references?
> 
> This should have been tied to files registered with the io_uring instance
> holding a reference, and cleaned up when the files are unregistered. I may be
> missing some cases here, so I'll fix that up.

???

Your code does the following sequence:
	file = fget(some number)
	store the obtained pointer in a lot of places
	fput(file)

What is "may be missing" and what kind of "registration" could possibly
help here?  As soon as fget() had returned the reference, another thread
might have removed it from the descriptor table, leaving you the sole holder
of reference to object.  In that case it will be destroyed by fput(), making
its memory free for reuse.

Looks like you have some very odd idea of what the struct file lifetime rules
are...

> > I really don't understand what you are trying to do here
> 
> We want to register userspace addresses with the block_device just once. We can
> skip costly per-IO setup this way.

Explain, please.  How will those be used afterwards and how will IO be matched
with the file you've passed here?
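
For reference, a minimal sketch of the struct file lifetime rule being
described here (placeholder names; nothing like this exists in the patch):
whoever stores the pointer must hold its own reference for as long as the
pointer can be used, and drop it only at teardown.

#include <linux/file.h>
#include <linux/fs.h>

/* Placeholder container; stands in for whatever structure outlives the
 * registration call. */
struct mapping_state {
	struct file *file;
};

static int stash_file(struct mapping_state *st, int fd)
{
	struct file *file = fget(fd);

	if (!file)
		return -EBADF;
	st->file = file;	/* keep the fget() reference ... */
	return 0;
}

static void release_mapping(struct mapping_state *st)
{
	fput(st->file);		/* ... and drop it only when unmapping */
	st->file = NULL;
}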
Dave Chinner July 27, 2022, 10:32 p.m. UTC | #8
On Wed, Jul 27, 2022 at 09:04:25AM -0600, Keith Busch wrote:
> On Wed, Jul 27, 2022 at 03:04:56PM +0100, Al Viro wrote:
> > On Wed, Jul 27, 2022 at 07:58:29AM -0600, Keith Busch wrote:
> > > On Wed, Jul 27, 2022 at 12:12:53AM +0100, Al Viro wrote:
> > > > On Tue, Jul 26, 2022 at 10:38:13AM -0700, Keith Busch wrote:
> > > > 
> > > > > +	if (S_ISBLK(file_inode(file)->i_mode))
> > > > > +		bdev = I_BDEV(file->f_mapping->host);
> > > > > +	else if (S_ISREG(file_inode(file)->i_mode))
> > > > > +		bdev = file->f_inode->i_sb->s_bdev;
> > > > 
> > > > *blink*
> > > > 
> > > > Just what's the intended use of the second case here?
> > > 
> > > ??
> > > 
> > > The use case is same as the first's: dma map the user addresses to the backing
> > > storage. There's two cases here because getting the block_device for a regular
> > > filesystem file is different than a raw block device.
> > 
> > Excuse me, but "file on some filesystem + block number on underlying device"
> > makes no sense as an API...
> 
> Sorry if I'm misunderstanding your concern here.
> 
> The API is a file descriptor + index range of registered buffers (which is a
> pre-existing io_uring API). The file descriptor can come from opening either a
> raw block device (ex: /dev/nvme0n1), or any regular file on a mounted
> filesystem using nvme as a backing store.

That's fundamentally flawed. Filesystems can have multiple block
devices backing them that the VFS doesn't actually know about (e.g.
btrfs, XFS, etc). Further, some of these filesystems can spread
individual file data across multiple block devices, i.e. the backing
bdev changes as the file offset changes....

Filesystems might not even have a block device (NFS, CIFS, etc) -
what happens if you call this function on a file belonging to such a
filesystem?

> You don't need to know about specific block numbers. You can use the result
> with any offset in the underlying block device.

Sure, but how exactly do you know what block device the file
offset maps to?

We have entire layers like fs/iomap or bufferheads for this - their
entire purpose in life is to efficiently manage the translation
between {file, file_offset} and {dev, dev_offset} for the purposes
of IO and data access...

Cheers,

Dave.
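
As background on the translation layer mentioned above, a conceptual
kernel-side sketch (assuming the iomap machinery; not code from this series)
of how a {file, offset} range resolves to a per-extent {bdev, dev_offset}
pair:

#include <linux/iomap.h>

/* Walk the extents backing [pos, pos + len) of 'inode'. The iomap_ops are
 * filesystem-specific; the backing device can change from one extent to the
 * next, which is why sb->s_bdev cannot simply be assumed. */
static int walk_file_extents(struct inode *inode, const struct iomap_ops *ops,
			     loff_t pos, loff_t len)
{
	struct iomap_iter iter = {
		.inode = inode,
		.pos   = pos,
		.len   = len,
		.flags = IOMAP_REPORT,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0) {
		if (iter.iomap.type == IOMAP_MAPPED) {
			struct block_device *bdev = iter.iomap.bdev;
			loff_t dev_off = iter.iomap.addr +
					 (iter.pos - iter.iomap.offset);

			/* any pre-mapping would have to be per {bdev, extent} */
			(void)bdev;
			(void)dev_off;
		}
		iter.processed = iomap_length(&iter);
	}
	return ret;
}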
Keith Busch July 27, 2022, 11 p.m. UTC | #9
On Thu, Jul 28, 2022 at 08:32:32AM +1000, Dave Chinner wrote:
> On Wed, Jul 27, 2022 at 09:04:25AM -0600, Keith Busch wrote:
> > On Wed, Jul 27, 2022 at 03:04:56PM +0100, Al Viro wrote:
> > > On Wed, Jul 27, 2022 at 07:58:29AM -0600, Keith Busch wrote:
> > > > On Wed, Jul 27, 2022 at 12:12:53AM +0100, Al Viro wrote:
> > > > > On Tue, Jul 26, 2022 at 10:38:13AM -0700, Keith Busch wrote:
> > > > > 
> > > > > > +	if (S_ISBLK(file_inode(file)->i_mode))
> > > > > > +		bdev = I_BDEV(file->f_mapping->host);
> > > > > > +	else if (S_ISREG(file_inode(file)->i_mode))
> > > > > > +		bdev = file->f_inode->i_sb->s_bdev;
> > > > > 
> > > > > *blink*
> > > > > 
> > > > > Just what's the intended use of the second case here?
> > > > 
> > > > ??
> > > > 
> > > > The use case is same as the first's: dma map the user addresses to the backing
> > > > storage. There's two cases here because getting the block_device for a regular
> > > > filesystem file is different than a raw block device.
> > > 
> > > Excuse me, but "file on some filesystem + block number on underlying device"
> > > makes no sense as an API...
> > 
> > Sorry if I'm misunderstanding your concern here.
> > 
> > The API is a file descriptor + index range of registered buffers (which is a
> > pre-existing io_uring API). The file descriptor can come from opening either a
> > raw block device (ex: /dev/nvme0n1), or any regular file on a mounted
> > filesystem using nvme as a backing store.
> 
> That's fundamentally flawed. Filesystems can have multiple block
> devices backing them that the VFS doesn't actually know about (e.g.
> btrfs, XFS, etc). Further, some of these filesystems can spread
> indiivdual file data across mutliple block devices i.e. the backing
> bdev changes as file offset changes....
> 
> Filesystems might not even have a block device (NFS, CIFS, etc) -
> what happens if you call this function on a file belonging to such a
> filesystem?

The block_device driver has to opt in to this feature. If a multi-device block
driver wants to opt in to this, then it would be responsible for translating
that driver's specific cookie to whatever representation the drivers it stacks
atop require. Otherwise, the cookie threaded through the bio is an opaque
value: nothing between io_uring and the block_device driver needs to decode it.

If the block_device doesn't support providing this cookie, then io_uring just
falls back to the existing, less optimal method, and all will continue to work
as it does today.
Dave Chinner July 28, 2022, 2:35 a.m. UTC | #10
On Wed, Jul 27, 2022 at 05:00:09PM -0600, Keith Busch wrote:
> On Thu, Jul 28, 2022 at 08:32:32AM +1000, Dave Chinner wrote:
> > On Wed, Jul 27, 2022 at 09:04:25AM -0600, Keith Busch wrote:
> > > On Wed, Jul 27, 2022 at 03:04:56PM +0100, Al Viro wrote:
> > > > On Wed, Jul 27, 2022 at 07:58:29AM -0600, Keith Busch wrote:
> > > > > On Wed, Jul 27, 2022 at 12:12:53AM +0100, Al Viro wrote:
> > > > > > On Tue, Jul 26, 2022 at 10:38:13AM -0700, Keith Busch wrote:
> > > > > > 
> > > > > > > +	if (S_ISBLK(file_inode(file)->i_mode))
> > > > > > > +		bdev = I_BDEV(file->f_mapping->host);
> > > > > > > +	else if (S_ISREG(file_inode(file)->i_mode))
> > > > > > > +		bdev = file->f_inode->i_sb->s_bdev;
> > > > > > 
> > > > > > *blink*
> > > > > > 
> > > > > > Just what's the intended use of the second case here?
> > > > > 
> > > > > ??
> > > > > 
> > > > > The use case is same as the first's: dma map the user addresses to the backing
> > > > > storage. There's two cases here because getting the block_device for a regular
> > > > > filesystem file is different than a raw block device.
> > > > 
> > > > Excuse me, but "file on some filesystem + block number on underlying device"
> > > > makes no sense as an API...
> > > 
> > > Sorry if I'm misunderstanding your concern here.
> > > 
> > > The API is a file descriptor + index range of registered buffers (which is a
> > > pre-existing io_uring API). The file descriptor can come from opening either a
> > > raw block device (ex: /dev/nvme0n1), or any regular file on a mounted
> > > filesystem using nvme as a backing store.
> > 
> > That's fundamentally flawed. Filesystems can have multiple block
> > devices backing them that the VFS doesn't actually know about (e.g.
> > btrfs, XFS, etc). Further, some of these filesystems can spread
> > indiivdual file data across mutliple block devices i.e. the backing
> > bdev changes as file offset changes....
> > 
> > Filesystems might not even have a block device (NFS, CIFS, etc) -
> > what happens if you call this function on a file belonging to such a
> > filesystem?
> 
> The block_device driver has to opt-in to this feature. If a multi-device block
> driver wants to opt-in to this, then it would be responsible to handle
> translating that driver's specific cookie to whatever representation the
> drivers it stacks atop require. Otherwise, the cookie threaded through the bio
> is an opque value: nothing between io_uring and the block_device driver need to
> decode it.

I'm not talking about "multi-device" block devices like we build
with DM or MD to present a single stacked block device to the
filesystem. I'm talking about the fact that both btrfs and XFS
support multiple *independent* block devices in the one filesystem.

i.e.:

# mkfs.xfs -r rtdev=/dev/nvme0n1 -l logdev=/dev/nvme1n1,size=2000m /dev/nvme2n1
meta-data=/dev/nvme2n1           isize=512    agcount=4, agsize=22893287 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=1        finobt=1, sparse=1, rmapbt=0
         =                       reflink=0    bigtime=1 inobtcount=1 nrext64=0
data     =                       bsize=4096   blocks=91573146, imaxpct=25
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
log      =/dev/nvme1n1           bsize=4096   blocks=512000, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =/dev/nvme0n1           extsz=4096   blocks=91573146, rtextents=91573146
#

This builds an XFS filesystem which can write file data to either
/dev/nvme0n1 or /dev/nvme2n1, and journal IO will get sent to a
third block dev (/dev/nvme1n1).

So, which block device do we map for the DMA buffers that contain
the file data for any given file in that filesystem? There is no
guarantee that it is sb->s_bdev, because it only points at one of
the two block devices that can contain file data.

Btrfs is similar, but it might stripe data across /dev/nvme0n1,
/dev/nvme1n1 and /dev/nvme2n1 for a single file's writes (and hence
reads) and so needs separate DMA mappings for each block device just
to do IO direct to/from one file....

Indeed, for XFS there's no requirement that the block devices have
the same capabilities or even storage types - the rtdev could be
spinning disks, the logdev an nvme SSD, and the datadev pmem. If
XFS has to do something special, it queries the bdev it needs to
operate on (e.g. DAX mappings are only allowed on pmem based
devices).

Hence it is invalid to assume that sb->s_bdev points at the actual
block device the data for any given regular file is stored on. It is
also invalid to assume the characteristics of the device in
sb->s_bdev are common for all files in the filesystem.

IOWs, the only way you can make something like this work is via
filesystem mapping infrastructure that translates a file offset to
a {dev, dev_offset} tuple, to tell you what persistently mapped
device buffers you need to use for IO to the given file {offset,len}
range that IO needs to be done on....

Cheers,

Dave.
Keith Busch July 28, 2022, 1:25 p.m. UTC | #11
On Thu, Jul 28, 2022 at 12:35:11PM +1000, Dave Chinner wrote:
> On Wed, Jul 27, 2022 at 05:00:09PM -0600, Keith Busch wrote:
> > The block_device driver has to opt-in to this feature. If a multi-device block
> > driver wants to opt-in to this, then it would be responsible to handle
> > translating that driver's specific cookie to whatever representation the
> > drivers it stacks atop require. Otherwise, the cookie threaded through the bio
> > is an opque value: nothing between io_uring and the block_device driver need to
> > decode it.
> 
> I'm not talking about "multi-device" block devices like we build
> with DM or MD to present a single stacked block device to the
> filesystem. I'm talking about the fact that both btrfs and XFS
> support multiple *independent* block devices in the one filesystem.
> 
> i.e.:
> 
> # mkfs.xfs -r rtdev=/dev/nvme0n1 -l logdev=/dev/nvme1n1,size=2000m /dev/nvme2n1
> meta-data=/dev/nvme2n1           isize=512    agcount=4, agsize=22893287 blks
>          =                       sectsz=512   attr=2, projid32bit=1
>          =                       crc=1        finobt=1, sparse=1, rmapbt=0
>          =                       reflink=0    bigtime=1 inobtcount=1 nrext64=0
> data     =                       bsize=4096   blocks=91573146, imaxpct=25
>          =                       sunit=0      swidth=0 blks
> naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
> log      =/dev/nvme1n1           bsize=4096   blocks=512000, version=2
>          =                       sectsz=512   sunit=0 blks, lazy-count=1
> realtime =/dev/nvme0n1           extsz=4096   blocks=91573146, rtextents=91573146
> #
> 
> This builds an XFS filesystem which can write file data to either
> /dev/nvme0n1 or /dev/nvme2n1, and journal IO will get sent to a
> third block dev (/dev/nvme1n1).
> 
> So, which block device do we map for the DMA buffers that contain
> the file data for any given file in that filesystem? There is no
> guarantee that is is sb->s_bdev, because it only points at one of
> the two block devices that can contain file data.
> 
> Btrfs is similar, but it might stripe data across /dev/nvme0n1,
> /dev/nvme1n1 and /dev/nvme2n1 for a single file writes (and hence
> reads) and so needs separate DMA mappings for each block device just
> to do IO direct to/from one file....
> 
> Indeed, for XFS there's no requirement that the block devices have
> the same capabilities or even storage types - the rtdev could be
> spinning disks, the logdev an nvme SSD, and the datadev is pmem. If
> XFs has to do something special, it queries the bdev it needs to
> operate on (e.g. DAX mappings are only allowed on pmem based
> devices).
> 
> Hence it is invalid to assume that sb->s_bdev points at the actual
> block device the data for any given regular file is stored on. It is
> also invalid to assume the characteristics of the device in
> sb->s_bdev are common for all files in the filesystem.
> 
> IOWs, the only way you can make something like this work via
> filesystem mapping infrastructure to translate file offset to
> to a {dev, dev_offset} tuple to tell you what persistently mapped
> device buffers you need to use for IO to the given file {offset,len}
> range that IO needs to be done on....

Thank you for the explanation. I understand now, sorry for my previous
misunderstanding.

I may consider just supporting direct raw block devices initially if I can't
find a viable solution quickly enough.

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@  enum {
 	IORING_REGISTER_NOTIFIERS		= 26,
 	IORING_UNREGISTER_NOTIFIERS		= 27,
 
+	/* dma map registered buffers */
+	IORING_REGISTER_MAP_BUFFERS		= 28,
+	IORING_REGISTER_UNMAP_BUFFERS		= 29,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -661,4 +665,12 @@  struct io_uring_recvmsg_out {
 	__u32 flags;
 };
 
+struct io_uring_map_buffers {
+	__s32	fd;
+	__s32	buf_start;
+	__s32	buf_end;
+	__u32	flags;
+	__u64	rsvd[2];
+};
+
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1d600a63643b..12f7354e0423 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3704,6 +3704,123 @@  static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 	return ret;
 }
 
+#ifdef CONFIG_BLOCK
+static int get_map_range(struct io_ring_ctx *ctx,
+			 struct io_uring_map_buffers *map, void __user *arg)
+{
+	int ret;
+
+	if (copy_from_user(map, arg, sizeof(*map)))
+		return -EFAULT;
+	if (map->flags || map->rsvd[0] || map->rsvd[1])
+		return -EINVAL;
+	if (map->buf_start < 0)
+		return -EINVAL;
+	if (map->buf_start >= ctx->nr_user_bufs)
+		return -EINVAL;
+	if (map->buf_end > ctx->nr_user_bufs)
+		map->buf_end = ctx->nr_user_bufs;
+
+	ret = map->buf_end - map->buf_start;
+	if (ret <= 0)
+		return -EINVAL;
+
+	return ret;
+}
+
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{
+	if (imu->dma_tag)
+		block_dma_unmap(imu->bdev, imu->dma_tag);
+}
+
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	int i, ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+	}
+
+	return 0;
+}
+
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	struct block_device *bdev;
+	struct file *file;
+	int ret, i;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	file = fget(map.fd);
+	if (!file)
+		return -EBADF;
+
+	if (S_ISBLK(file_inode(file)->i_mode))
+		bdev = I_BDEV(file->f_mapping->host);
+	else if (S_ISREG(file_inode(file)->i_mode))
+		bdev = file->f_inode->i_sb->s_bdev;
+	else
+		return -EOPNOTSUPP;
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		void *tag;
+
+		if (imu->dma_tag) {
+			ret = -EBUSY;
+			goto err;
+		}
+
+		tag = block_dma_map(bdev, imu->bvec, imu->nr_bvecs);
+		if (IS_ERR(tag)) {
+			ret = PTR_ERR(tag);
+			goto err;
+		}
+
+		imu->dma_tag = tag;
+		imu->dma_file = file;
+		imu->bdev = bdev;
+	}
+
+	fput(file);
+	return 0;
+err:
+	while (--i >= map.buf_start) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+	}
+	fput(file);
+	return ret;
+}
+#else /* CONFIG_BLOCK */
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BLOCK */
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -3870,6 +3987,18 @@  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_notif_unregister(ctx);
 		break;
+	case IORING_REGISTER_MAP_BUFFERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_map_buffers(ctx, arg);
+		break;
+	case IORING_REGISTER_UNMAP_BUFFERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_unmap_buffers(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 8276b9537194..68a996318959 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -977,7 +977,7 @@  int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
 		ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
-					(u64)(uintptr_t)zc->buf, zc->len);
+					(u64)(uintptr_t)zc->buf, zc->len, NULL);
 		if (unlikely(ret))
 				return ret;
 	} else {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 59704b9ac537..1a7a8dedbbd5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,6 +148,7 @@  static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 			unpin_user_page(imu->bvec[i].bv_page);
 		if (imu->acct_pages)
 			io_unaccount_mem(ctx, imu->acct_pages);
+		io_dma_unmap(imu);
 		kvfree(imu);
 	}
 	*slot = NULL;
@@ -1285,6 +1286,7 @@  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	imu->ubuf = (unsigned long) iov->iov_base;
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
 	imu->nr_bvecs = nr_pages;
+	imu->dma_tag = NULL;
 	*pimu = imu;
 	ret = 0;
 done:
@@ -1359,9 +1361,8 @@  int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
-int io_import_fixed(int ddir, struct iov_iter *iter,
-			   struct io_mapped_ubuf *imu,
-			   u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+		    u64 buf_addr, size_t len, struct file *file)
 {
 	u64 buf_end;
 	size_t offset;
@@ -1379,6 +1380,12 @@  int io_import_fixed(int ddir, struct iov_iter *iter,
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
+	if (imu->dma_tag && file == imu->dma_file) {
+		unsigned long nr_segs = (buf_addr & (PAGE_SIZE - 1)) +
+					(len >> PAGE_SHIFT);
+		iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, nr_segs, len);
+		return 0;
+	}
 	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
 
 	if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..6e63b7a57b34 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,11 @@  struct io_mapped_ubuf {
 	u64		ubuf_end;
 	unsigned int	nr_bvecs;
 	unsigned long	acct_pages;
+	void		*dma_tag;
+	struct file	*dma_file;
+#ifdef CONFIG_BLOCK
+	struct block_device *bdev;
+#endif
 	struct bio_vec	bvec[];
 };
 
@@ -64,9 +69,14 @@  int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 			 struct io_rsrc_data *data_to_kill);
 
-int io_import_fixed(int ddir, struct iov_iter *iter,
-			   struct io_mapped_ubuf *imu,
-			   u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+		    u64 buf_addr, size_t len, struct file *file);
+
+#ifdef CONFIG_BLOCK
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+#else
+static inline void io_dma_unmap(struct io_mapped_ubuf *imu) {}
+#endif
 
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@  static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 	ssize_t ret;
 
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
-		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
 		if (ret)
 			return ERR_PTR(ret);
 		return NULL;