diff mbox

[05/10] dax: provide an iomap based dax read/write path

Message ID 1473438884-674-6-git-send-email-hch@lst.de (mailing list archive)
State New, archived
Headers show

Commit Message

Christoph Hellwig Sept. 9, 2016, 4:34 p.m. UTC
This is a much simpler implementation of the DAX read/write path that makes
use of the iomap infrastructure.  It does not try to mirror the direct I/O
calling conventions and thus doesn't have to deal with i_dio_count or the
end_io handler, but instead leaves locking and filesystem-specific I/O
completion to the caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/dax.c              | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iomap.h |   2 +
 2 files changed, 105 insertions(+)

Comments

Ross Zwisler Sept. 13, 2016, 11 p.m. UTC | #1
On Fri, Sep 09, 2016 at 06:34:39PM +0200, Christoph Hellwig wrote:
> This is a much simpler implementation of the DAX read/write path that makes
> use of the iomap infrastructure.  It does not try to mirror the direct I/O
> calling conventions and thus doesn't have to deal with i_dio_count or the
> end_io handler, but instead leaves locking and filesystem-specific I/O
> completion to the caller.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/dax.c              | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/iomap.h |   2 +
>  2 files changed, 105 insertions(+)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 84343ce..57ad456 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -31,6 +31,8 @@
>  #include <linux/vmstat.h>
>  #include <linux/pfn_t.h>
>  #include <linux/sizes.h>
> +#include <linux/iomap.h>
> +#include "internal.h"
>  
>  /*
>   * We use lowest available bit in exceptional entry for locking, other two
> @@ -1241,3 +1243,104 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
>  	return dax_zero_page_range(inode, from, length, get_block);
>  }
>  EXPORT_SYMBOL_GPL(dax_truncate_page);
> +
> +#ifdef CONFIG_FS_IOMAP
> +static loff_t
> +iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> +		struct iomap *iomap)
> +{
> +	struct iov_iter *iter = data;
> +	loff_t end = pos + length, done = 0;
> +	ssize_t ret = 0;
> +
> +	if (iov_iter_rw(iter) == READ) {
> +		end = min(end, i_size_read(inode));
> +		if (pos >= end)
> +			return 0;
> +
> +		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
> +			return iov_iter_zero(min(length, end - pos), iter);
> +	}
> +
> +	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
> +		return -EIO;
> +
> +	while (pos < end) {
> +		unsigned offset = pos & (PAGE_SIZE - 1);
> +		struct blk_dax_ctl dax = { 0 };
> +		ssize_t map_len;
> +
> +		dax.sector = iomap->blkno +
> +			(((pos & PAGE_MASK) - iomap->offset) >> 9);
> +		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
> +		map_len = dax_map_atomic(iomap->bdev, &dax);
> +		if (map_len < 0) {
> +			ret = map_len;
> +			break;
> +		}
> +
> +		dax.addr += offset;
> +		map_len -= offset;
> +		if (map_len > end - pos)
> +			map_len = end - pos;
> +
> +		if (iov_iter_rw(iter) == WRITE)
> +			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
> +		else
> +			map_len = copy_to_iter(dax.addr, map_len, iter);
> +		dax_unmap_atomic(iomap->bdev, &dax);
> +		if (map_len <= 0) {
> +			ret = map_len ? map_len : -EFAULT;
> +			break;
> +		}
> +
> +		pos += map_len;
> +		length -= map_len;
> +		done += map_len;
> +	}
> +
> +	return done ? done : ret;
> +}
> +
> +/**
> + * iomap_dax_rw - Perform I/O to a DAX file
> + * @iocb: The control block for this I/O
> + * @iter: The addresses to do I/O from or to
> + * @ops: iomap ops passed from the file system
> + *
> + * This funtions performs read and write operations to directly mapped

	   function

> + * persistent memory.  The callers needs to take care of read/write exclusion
> + * and evicting any page cache pages in the region under I/O.
> + */
> +ssize_t
> +iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
> +		struct iomap_ops *ops)
> +{
> +	struct inode *inode = iocb->ki_filp->f_mapping->host;
> +	loff_t pos = iocb->ki_pos, ret = 0, done = 0;

Just a note that 'ret' is loff_t about half the time in the iomap code and
ssize_t the other half.  I guess it doesn't really matter since they should
both be big signed values (64 bits on x86_64), but it's a bit inconsistent.

> +	size_t count = iov_iter_count(iter);
> +	unsigned flags = 0;
> +
> +	if (!count)
> +		return 0;
> +
> +	if (iov_iter_rw(iter) == WRITE)
> +		flags |= IOMAP_WRITE;
> +
> +	do {
> +		ret = iomap_apply(inode, pos, count, flags, ops, iter,
> +				  iomap_dax_actor);
> +		if (ret <= 0)
> +			break;
> +		pos += ret;
> +		done += ret;
> +	} while ((count = iov_iter_count(iter)));
> +
> +	if (!done)
> +		return ret;
> +
> +	iocb->ki_pos += done;
> +	return done;
> +}

I think you can remove the special casing around 'done' and 'count' and make
this a bit simpler:

/*
 * Simplified variant of iomap_dax_rw(): the up-front "!count" early return
 * and the separate "!done" exit are folded into the loop condition and the
 * final return expression.
 *
 * Returns the number of bytes processed if any progress was made; otherwise
 * the last value returned by iomap_apply() (0 if the iterator was empty).
 */
ssize_t
iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
		struct iomap_ops *ops)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
	unsigned flags = 0;
	size_t count;

	if (iov_iter_rw(iter) == WRITE)
		flags |= IOMAP_WRITE;

	/* An empty iterator never enters the loop, so ret and done stay 0. */
	 while ((count = iov_iter_count(iter))) {
		ret = iomap_apply(inode, pos, count, flags, ops, iter,
				  iomap_dax_actor);
		/* Stop on a negative (error) return or when nothing was processed. */
		if (ret <= 0)
			break;
		pos += ret;
		done += ret;
	}

	/* Adding zero is a no-op, so ki_pos can be advanced unconditionally. */
	iocb->ki_pos += done;
	/* Partial progress takes precedence over a later error held in ret. */
	return done ? done : ret;
}

This is now very similar to iomap_file_buffered_write().

> +EXPORT_SYMBOL_GPL(iomap_dax_rw);
> +#endif /* CONFIG_FS_IOMAP */
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 14d7067..3d5f785 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -65,6 +65,8 @@ struct iomap_ops {
>  
>  ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
>  		struct iomap_ops *ops);
> +ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
> +		struct iomap_ops *ops);
>  int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
>  		bool *did_zero, struct iomap_ops *ops);
>  int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> -- 
> 2.1.4
> 
> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/dax.c b/fs/dax.c
index 84343ce..57ad456 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,8 @@ 
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
 
 /*
  * We use lowest available bit in exceptional entry for locking, other two
@@ -1241,3 +1243,104 @@  int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 	return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *iter = data;
+	loff_t end = pos + length, done = 0;
+	ssize_t ret = 0;
+
+	if (iov_iter_rw(iter) == READ) {
+		end = min(end, i_size_read(inode));
+		if (pos >= end)
+			return 0;
+
+		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+			return iov_iter_zero(min(length, end - pos), iter);
+	}
+
+	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+		return -EIO;
+
+	while (pos < end) {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+		struct blk_dax_ctl dax = { 0 };
+		ssize_t map_len;
+
+		dax.sector = iomap->blkno +
+			(((pos & PAGE_MASK) - iomap->offset) >> 9);
+		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+		map_len = dax_map_atomic(iomap->bdev, &dax);
+		if (map_len < 0) {
+			ret = map_len;
+			break;
+		}
+
+		dax.addr += offset;
+		map_len -= offset;
+		if (map_len > end - pos)
+			map_len = end - pos;
+
+		if (iov_iter_rw(iter) == WRITE)
+			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+		else
+			map_len = copy_to_iter(dax.addr, map_len, iter);
+		dax_unmap_atomic(iomap->bdev, &dax);
+		if (map_len <= 0) {
+			ret = map_len ? map_len : -EFAULT;
+			break;
+		}
+
+		pos += map_len;
+		length -= map_len;
+		done += map_len;
+	}
+
+	return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb: The control block for this I/O
+ * @iter: The addresses to do I/O from or to
+ * @ops: iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory.  The caller needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+	size_t count = iov_iter_count(iter);
+	unsigned flags = 0;
+
+	if (!count)
+		return 0;
+
+	if (iov_iter_rw(iter) == WRITE)
+		flags |= IOMAP_WRITE;
+
+	do {
+		ret = iomap_apply(inode, pos, count, flags, ops, iter,
+				  iomap_dax_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		done += ret;
+	} while ((count = iov_iter_count(iter)));
+
+	if (!done)
+		return ret;
+
+	iocb->ki_pos += done;
+	return done;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
+#endif /* CONFIG_FS_IOMAP */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 14d7067..3d5f785 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -65,6 +65,8 @@  struct iomap_ops {
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		struct iomap_ops *ops);
+ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
 		bool *did_zero, struct iomap_ops *ops);
 int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,