[06/10] btrfs: dax write support
diff mbox series

Message ID 20181205122835.19290-7-rgoldwyn@suse.de
State New
Headers show
Series
  • btrfs: Support for DAX devices
Related show

Commit Message

Goldwyn Rodrigues Dec. 5, 2018, 12:28 p.m. UTC
From: Goldwyn Rodrigues <rgoldwyn@suse.com>

This is a combination of direct and buffered I/O. The similarity
with direct I/O is that it needs to allocate space before
writing. The similarity with buffered I/O is that when the data is
not page-aligned, it needs to copy parts of the previous extents. In
order to accomplish that, keep a reference to the first and last
extent (if required) and then perform allocations. If the "pos"
or "end" is not aligned, copy the data from the first and last extent
respectively.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/btrfs/ctree.h |   1 +
 fs/btrfs/dax.c   | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/file.c  |   4 +-
 3 files changed, 125 insertions(+), 1 deletion(-)

Comments

Johannes Thumshirn Dec. 5, 2018, 1:56 p.m. UTC | #1
On 05/12/2018 13:28, Goldwyn Rodrigues wrote:
[...]

> +static int copy_extent_page(struct extent_map *em, void *daddr, u64 pos)
> +{
> +        struct dax_device *dax_dev;

^ spaces instead of tabs?

> +	void *saddr;
> +	sector_t start;
> +	size_t len;
> +
> +	if (em->block_start == EXTENT_MAP_HOLE) {
> +		memset(daddr, 0, PAGE_SIZE);
> +	} else {
> +		dax_dev = fs_dax_get_by_bdev(em->bdev);
> +		start = (get_start_sect(em->bdev) << 9) + (em->block_start + (pos - em->start));
> +		len = dax_direct_access(dax_dev, PHYS_PFN(start), 1, &saddr, NULL);
> +		memcpy(daddr, saddr, PAGE_SIZE);
> +	}
> +	free_extent_map(em);
> +
> +	return 0;
> +}
> +
> +

copy_extent_page() always returns 0, why not make it void?
Plus a nit: double newline.

> +ssize_t btrfs_file_dax_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	ssize_t ret, done = 0, count = iov_iter_count(from);
> +        struct inode *inode = file_inode(iocb->ki_filp);
^ again spaces vs tabs.

> +	u64 pos = iocb->ki_pos;
> +	u64 start = round_down(pos, PAGE_SIZE);
> +	u64 end = round_up(pos + count, PAGE_SIZE);
> +	struct extent_state *cached_state = NULL;
> +	struct extent_changeset *data_reserved = NULL;
> +	struct extent_map *first = NULL, *last = NULL;
> +
> +	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, end - start);
> +	if (ret < 0)
> +		return ret;
> +
> +	/* Grab a reference of the first extent to copy data */
> +	if (start < pos) {
> +		first = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, end - start, 0);
> +		if (IS_ERR(first)) {
> +			ret = PTR_ERR(first);
> +			goto out2;
> +		}
> +	}

You're using 'end - start' at least twice here, maybe you could move
'len' out of the loop and use it for btrfs_delalloc_reserve_space() and
btrfs_get_extent() as well.

> +
> +	/* Grab a reference of the last extent to copy data */
> +	if (pos + count < end) {
> +		last = btrfs_get_extent(BTRFS_I(inode), NULL, 0, end - PAGE_SIZE, PAGE_SIZE, 0);
> +		if (IS_ERR(last)) {
> +			ret = PTR_ERR(last);
> +			goto out2;
> +		}
> +	}
> +
> +	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
> +	while (done < count) {
> +		struct extent_map *em;
> +		struct dax_device *dax_dev;
> +		int offset = pos & (PAGE_SIZE - 1);
> +		u64 estart = round_down(pos, PAGE_SIZE);
> +		u64 elen = end - estart;
> +		size_t len = count - done;
> +		sector_t dstart;
> +		void *daddr;
> +		ssize_t maplen;
> +
> +		/* Read the current extent */
> +                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, estart, elen, 0);

Space again.

> +		if (IS_ERR(em)) {
> +			ret = PTR_ERR(em);
> +			goto out;
> +		}
> +
> +		/* Get a new extent */
> +		ret = btrfs_get_extent_map_write(&em, NULL, inode, estart, elen);
> +		if (ret < 0)
> +			goto out;
> +
> +		dax_dev = fs_dax_get_by_bdev(em->bdev);
> +		/* Calculate start address start of destination extent */
> +		dstart = (get_start_sect(em->bdev) << 9) + em->block_start;
> +		maplen = dax_direct_access(dax_dev, PHYS_PFN(dstart),
> +				PHYS_PFN(em->len), &daddr, NULL);
> +
> +		/* Copy front of extent page */
> +		if (offset)
> +			ret = copy_extent_page(first, daddr, estart);
> +
> +		/* Copy end of extent page */
> +		if ((pos + len > estart + PAGE_SIZE) && (pos + len < em->start + em->len))
> +			ret = copy_extent_page(last, daddr + em->len - PAGE_SIZE, em->start + em->len - PAGE_SIZE);
> +
> +		/* Copy the data from the iter */
> +		maplen = PFN_PHYS(maplen);
> +		maplen -= offset;
> +		ret = dax_copy_from_iter(dax_dev, dstart, daddr + offset, maplen, from);
> +		if (ret < 0)
> +			goto out;
> +		pos += ret;
> +		done += ret;
> +	}
> +out:

out_unlock?

> +	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
> +	if (done) {
> +		btrfs_update_ordered_extent(inode, start,
> +				end - start, true);
> +		iocb->ki_pos += done;
> +		if (iocb->ki_pos > i_size_read(inode))
> +			i_size_write(inode, iocb->ki_pos);
> +	}
> +
> +	btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
> +out2:

out?

> +	if (count - done > 0)
> +		btrfs_delalloc_release_space(inode, data_reserved, pos,
> +				count - done, true);
> +	extent_changeset_free(data_reserved);
> +        return done ? done : ret;
> +
> +}
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index ef6ed93f44d1..29a3b12e6660 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1964,7 +1964,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
>  	if (sync)
>  		atomic_inc(&BTRFS_I(inode)->sync_writers);
>  
> -	if (iocb->ki_flags & IOCB_DIRECT) {
> +	if (IS_DAX(inode)) {
> +		num_written = btrfs_file_dax_write(iocb, from);
> +	} else if (iocb->ki_flags & IOCB_DIRECT) {
>  		num_written = __btrfs_direct_write(iocb, from);
>  	} else {
>  		num_written = btrfs_buffered_write(iocb, from);
>

Patch
diff mbox series

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a0d296b0d826..d91ff283a966 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3693,6 +3693,7 @@  int btree_readahead_hook(struct extent_buffer *eb, int err);
 #ifdef CONFIG_FS_DAX
 /* dax.c */
 ssize_t btrfs_file_dax_read(struct kiocb *iocb, struct iov_iter *to);
+ssize_t btrfs_file_dax_write(struct kiocb *iocb, struct iov_iter *from);
 #endif /* CONFIG_FS_DAX */
 
 static inline int is_fstree(u64 rootid)
diff --git a/fs/btrfs/dax.c b/fs/btrfs/dax.c
index 5a297674adec..4000259a426c 100644
--- a/fs/btrfs/dax.c
+++ b/fs/btrfs/dax.c
@@ -2,6 +2,7 @@ 
 #include <linux/uio.h>
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "extent_io.h"
 
 static ssize_t em_dax_rw(struct inode *inode, struct extent_map *em, u64 pos,
 		u64 len, struct iov_iter *iter)
@@ -71,3 +72,123 @@  ssize_t btrfs_file_dax_read(struct kiocb *iocb, struct iov_iter *to)
         return done ? done : ret;
 }
 
+static int copy_extent_page(struct extent_map *em, void *daddr, u64 pos)
+{
+        struct dax_device *dax_dev;
+	void *saddr;
+	sector_t start;
+	size_t len;
+
+	if (em->block_start == EXTENT_MAP_HOLE) {
+		memset(daddr, 0, PAGE_SIZE);
+	} else {
+		dax_dev = fs_dax_get_by_bdev(em->bdev);
+		start = (get_start_sect(em->bdev) << 9) + (em->block_start + (pos - em->start));
+		len = dax_direct_access(dax_dev, PHYS_PFN(start), 1, &saddr, NULL);
+		memcpy(daddr, saddr, PAGE_SIZE);
+	}
+	free_extent_map(em);
+
+	return 0;
+}
+
+
+ssize_t btrfs_file_dax_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t ret, done = 0, count = iov_iter_count(from);
+        struct inode *inode = file_inode(iocb->ki_filp);
+	u64 pos = iocb->ki_pos;
+	u64 start = round_down(pos, PAGE_SIZE);
+	u64 end = round_up(pos + count, PAGE_SIZE);
+	struct extent_state *cached_state = NULL;
+	struct extent_changeset *data_reserved = NULL;
+	struct extent_map *first = NULL, *last = NULL;
+
+	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, end - start);
+	if (ret < 0)
+		return ret;
+
+	/* Grab a reference of the first extent to copy data */
+	if (start < pos) {
+		first = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, end - start, 0);
+		if (IS_ERR(first)) {
+			ret = PTR_ERR(first);
+			goto out2;
+		}
+	}
+
+	/* Grab a reference of the last extent to copy data */
+	if (pos + count < end) {
+		last = btrfs_get_extent(BTRFS_I(inode), NULL, 0, end - PAGE_SIZE, PAGE_SIZE, 0);
+		if (IS_ERR(last)) {
+			ret = PTR_ERR(last);
+			goto out2;
+		}
+	}
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+	while (done < count) {
+		struct extent_map *em;
+		struct dax_device *dax_dev;
+		int offset = pos & (PAGE_SIZE - 1);
+		u64 estart = round_down(pos, PAGE_SIZE);
+		u64 elen = end - estart;
+		size_t len = count - done;
+		sector_t dstart;
+		void *daddr;
+		ssize_t maplen;
+
+		/* Read the current extent */
+                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, estart, elen, 0);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+
+		/* Get a new extent */
+		ret = btrfs_get_extent_map_write(&em, NULL, inode, estart, elen);
+		if (ret < 0)
+			goto out;
+
+		dax_dev = fs_dax_get_by_bdev(em->bdev);
+		/* Calculate start address start of destination extent */
+		dstart = (get_start_sect(em->bdev) << 9) + em->block_start;
+		maplen = dax_direct_access(dax_dev, PHYS_PFN(dstart),
+				PHYS_PFN(em->len), &daddr, NULL);
+
+		/* Copy front of extent page */
+		if (offset)
+			ret = copy_extent_page(first, daddr, estart);
+
+		/* Copy end of extent page */
+		if ((pos + len > estart + PAGE_SIZE) && (pos + len < em->start + em->len))
+			ret = copy_extent_page(last, daddr + em->len - PAGE_SIZE, em->start + em->len - PAGE_SIZE);
+
+		/* Copy the data from the iter */
+		maplen = PFN_PHYS(maplen);
+		maplen -= offset;
+		ret = dax_copy_from_iter(dax_dev, dstart, daddr + offset, maplen, from);
+		if (ret < 0)
+			goto out;
+		pos += ret;
+		done += ret;
+	}
+out:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+	if (done) {
+		btrfs_update_ordered_extent(inode, start,
+				end - start, true);
+		iocb->ki_pos += done;
+		if (iocb->ki_pos > i_size_read(inode))
+			i_size_write(inode, iocb->ki_pos);
+	}
+
+	btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
+out2:
+	if (count - done > 0)
+		btrfs_delalloc_release_space(inode, data_reserved, pos,
+				count - done, true);
+	extent_changeset_free(data_reserved);
+        return done ? done : ret;
+
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ef6ed93f44d1..29a3b12e6660 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1964,7 +1964,9 @@  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	if (sync)
 		atomic_inc(&BTRFS_I(inode)->sync_writers);
 
-	if (iocb->ki_flags & IOCB_DIRECT) {
+	if (IS_DAX(inode)) {
+		num_written = btrfs_file_dax_write(iocb, from);
+	} else if (iocb->ki_flags & IOCB_DIRECT) {
 		num_written = __btrfs_direct_write(iocb, from);
 	} else {
 		num_written = btrfs_buffered_write(iocb, from);