diff mbox series

[v9,7/8] xfs: support CoW in fsdax mode

Message ID 20210915104501.4146910-8-ruansy.fnst@fujitsu.com (mailing list archive)
State Superseded
Headers show
Series fsdax,xfs: Add reflink&dedupe support for fsdax | expand

Commit Message

Shiyang Ruan Sept. 15, 2021, 10:45 a.m. UTC
In fsdax mode, WRITE and ZERO on a shared extent need CoW performed.
After that, new allocated extents needs to be remapped to the file.
So, add a CoW identification in ->iomap_begin(), and implement
->iomap_end() to do the remapping work.

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
---
 fs/xfs/xfs_bmap_util.c |  3 +--
 fs/xfs/xfs_file.c      |  6 +++---
 fs/xfs/xfs_iomap.c     | 38 +++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_iomap.h     | 30 ++++++++++++++++++++++++++++++
 fs/xfs/xfs_iops.c      |  7 +++----
 fs/xfs/xfs_reflink.c   |  3 +--
 6 files changed, 75 insertions(+), 12 deletions(-)

Comments

Darrick J. Wong Sept. 16, 2021, 12:22 a.m. UTC | #1
On Wed, Sep 15, 2021 at 06:45:00PM +0800, Shiyang Ruan wrote:
> In fsdax mode, WRITE and ZERO on a shared extent need CoW performed.
> After that, new allocated extents needs to be remapped to the file.
> So, add a CoW identification in ->iomap_begin(), and implement
> ->iomap_end() to do the remapping work.
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  fs/xfs/xfs_bmap_util.c |  3 +--
>  fs/xfs/xfs_file.c      |  6 +++---
>  fs/xfs/xfs_iomap.c     | 38 +++++++++++++++++++++++++++++++++++++-
>  fs/xfs/xfs_iomap.h     | 30 ++++++++++++++++++++++++++++++
>  fs/xfs/xfs_iops.c      |  7 +++----
>  fs/xfs/xfs_reflink.c   |  3 +--
>  6 files changed, 75 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 73a36b7be3bd..0681250e0a5d 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1009,8 +1009,7 @@ xfs_free_file_space(
>  		return 0;
>  	if (offset + len > XFS_ISIZE(ip))
>  		len = XFS_ISIZE(ip) - offset;
> -	error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
> -			&xfs_buffered_write_iomap_ops);
> +	error = xfs_iomap_zero_range(ip, offset, len, NULL);
>  	if (error)
>  		return error;
>  
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 7aa943edfc02..2ef1930374d2 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -704,7 +704,7 @@ xfs_file_dax_write(
>  	pos = iocb->ki_pos;
>  
>  	trace_xfs_file_dax_write(iocb, from);
> -	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
> +	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
>  	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
>  		i_size_write(inode, iocb->ki_pos);
>  		error = xfs_setfilesize(ip, pos, ret);
> @@ -1329,8 +1329,8 @@ __xfs_filemap_fault(
>  		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
>  				(write_fault && !vmf->cow_page) ?
> -				 &xfs_direct_write_iomap_ops :
> -				 &xfs_read_iomap_ops);
> +					&xfs_dax_write_iomap_ops :
> +					&xfs_read_iomap_ops);

Hmm... I wonder if this should get hoisted to a "xfs_dax_iomap_fault"
wrapper like you did for xfs_iomap_zero_range?

>  		if (ret & VM_FAULT_NEEDDSYNC)
>  			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
>  		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 093758440ad5..6fa3b377cb81 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -761,7 +761,8 @@ xfs_direct_write_iomap_begin(
>  
>  		/* may drop and re-acquire the ilock */
>  		error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
> -				&lockmode, flags & IOMAP_DIRECT);
> +				&lockmode,
> +				(flags & IOMAP_DIRECT) || IS_DAX(inode));
>  		if (error)
>  			goto out_unlock;
>  		if (shared)
> @@ -854,6 +855,41 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
>  	.iomap_begin		= xfs_direct_write_iomap_begin,
>  };
>  
> +static int
> +xfs_dax_write_iomap_end(
> +	struct inode 		*inode,
> +	loff_t 			pos,
> +	loff_t 			length,
> +	ssize_t 		written,
> +	unsigned 		flags,
> +	struct iomap 		*iomap)

Whitespace nit:     ^ space before a tab.

> +{
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	/*
> +	 * Usually we use @written to indicate whether the operation was
> +	 * successful.  But it is always positive or zero.  The CoW needs the
> +	 * actual error code from actor().  So, get it from
> +	 * iomap_iter->processed.

Hm.  All six arguments are derived from the struct iomap_iter, so maybe
it makes more sense to pass that in?  I'll poke around with this more
tomorrow.

> +	 */
> +	const struct iomap_iter *iter =
> +				container_of(iomap, typeof(*iter), iomap);
> +
> +	if (!xfs_is_cow_inode(ip))
> +		return 0;
> +
> +	if (iter->processed <= 0) {
> +		xfs_reflink_cancel_cow_range(ip, pos, length, true);
> +		return 0;
> +	}
> +
> +	return xfs_reflink_end_cow(ip, pos, iter->processed);
> +}
> +
> +const struct iomap_ops xfs_dax_write_iomap_ops = {
> +	.iomap_begin 	= xfs_direct_write_iomap_begin,

Space before tab    ^

> +	.iomap_end	= xfs_dax_write_iomap_end,
> +};
> +
>  static int
>  xfs_buffered_write_iomap_begin(

Also, we have an related request to drop the EXPERIMENTAL tag for
non-DAX reflink.  Whichever patch enables dax+reflink for xfs needs to
make it clear that reflink + any possibility of DAX emits an
EXPERIMENTAL warning.

--D

>  	struct inode		*inode,
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 7d3703556d0e..92679a0c3578 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -45,5 +45,35 @@ extern const struct iomap_ops xfs_direct_write_iomap_ops;
>  extern const struct iomap_ops xfs_read_iomap_ops;
>  extern const struct iomap_ops xfs_seek_iomap_ops;
>  extern const struct iomap_ops xfs_xattr_iomap_ops;
> +extern const struct iomap_ops xfs_dax_write_iomap_ops;
> +
> +static inline int
> +xfs_iomap_zero_range(
> +	struct xfs_inode	*ip,
> +	loff_t			pos,
> +	loff_t			len,
> +	bool			*did_zero)
> +{
> +	struct inode		*inode = VFS_I(ip);
> +
> +	return iomap_zero_range(inode, pos, len, did_zero,
> +			IS_DAX(inode) ?
> +				&xfs_dax_write_iomap_ops :
> +				&xfs_buffered_write_iomap_ops);
> +}
> +
> +static inline int
> +xfs_iomap_truncate_page(
> +	struct xfs_inode	*ip,
> +	loff_t			pos,
> +	bool			*did_zero)
> +{
> +	struct inode		*inode = VFS_I(ip);
> +
> +	return iomap_truncate_page(inode, pos, did_zero,
> +			IS_DAX(inode)?
> +				&xfs_dax_write_iomap_ops :
> +				&xfs_buffered_write_iomap_ops);
> +}
>  
>  #endif /* __XFS_IOMAP_H__*/
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index a607d6aca5c4..332e6208dffd 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -911,8 +911,8 @@ xfs_setattr_size(
>  	 */
>  	if (newsize > oldsize) {
>  		trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
> -		error = iomap_zero_range(inode, oldsize, newsize - oldsize,
> -				&did_zeroing, &xfs_buffered_write_iomap_ops);
> +		error = xfs_iomap_zero_range(ip, oldsize, newsize - oldsize,
> +				&did_zeroing);
>  	} else {
>  		/*
>  		 * iomap won't detect a dirty page over an unwritten block (or a
> @@ -924,8 +924,7 @@ xfs_setattr_size(
>  						     newsize);
>  		if (error)
>  			return error;
> -		error = iomap_truncate_page(inode, newsize, &did_zeroing,
> -				&xfs_buffered_write_iomap_ops);
> +		error = xfs_iomap_truncate_page(ip, newsize, &did_zeroing);
>  	}
>  
>  	if (error)
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 7ecea0311e88..9d876e268734 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1269,8 +1269,7 @@ xfs_reflink_zero_posteof(
>  		return 0;
>  
>  	trace_xfs_zero_eof(ip, isize, pos - isize);
> -	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
> -			&xfs_buffered_write_iomap_ops);
> +	return xfs_iomap_zero_range(ip, isize, pos - isize, NULL);
>  }
>  
>  /*
> -- 
> 2.33.0
> 
> 
>
Christoph Hellwig Sept. 16, 2021, 6:23 a.m. UTC | #2
On Wed, Sep 15, 2021 at 06:45:00PM +0800, Shiyang Ruan wrote:
> +static int
> +xfs_dax_write_iomap_end(
> +	struct inode 		*inode,
> +	loff_t 			pos,
> +	loff_t 			length,
> +	ssize_t 		written,
> +	unsigned 		flags,
> +	struct iomap 		*iomap)
> +{
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	/*
> +	 * Usually we use @written to indicate whether the operation was
> +	 * successful.  But it is always positive or zero.  The CoW needs the
> +	 * actual error code from actor().  So, get it from
> +	 * iomap_iter->processed.
> +	 */
> +	const struct iomap_iter *iter =
> +				container_of(iomap, typeof(*iter), iomap);
> +
> +	if (!xfs_is_cow_inode(ip))
> +		return 0;
> +
> +	if (iter->processed <= 0) {
> +		xfs_reflink_cancel_cow_range(ip, pos, length, true);
> +		return 0;
> +	}
> +
> +	return xfs_reflink_end_cow(ip, pos, iter->processed);

Didn't we come to the conflusion last time that we don't actually
need to poke into the iomap_iter here as the written argument is equal
to iter->processed if it is > 0:

	if (iter->iomap.length && ops->iomap_end) {
		ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
				iter->processed > 0 ? iter->processed : 0,
				iter->flags, &iter->iomap);
		..

So should be able to just do:

static int
xfs_dax_write_iomap_end(
	struct inode 		*inode,
	loff_t 			pos,
	loff_t 			length,
	ssize_t 		written,
	unsigned 		flags,
	struct iomap 		*iomap)
{
	struct xfs_inode	*ip = XFS_I(inode);

	if (!xfs_is_cow_inode(ip))
		return 0;

	if (!written) {
		xfs_reflink_cancel_cow_range(ip, pos, length, true);
		return 0;
	}

	return xfs_reflink_end_cow(ip, pos, written);
}
Christoph Hellwig Sept. 16, 2021, 6:32 a.m. UTC | #3
On Wed, Sep 15, 2021 at 05:22:27PM -0700, Darrick J. Wong wrote:
> >  		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> >  		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
> >  				(write_fault && !vmf->cow_page) ?
> > -				 &xfs_direct_write_iomap_ops :
> > -				 &xfs_read_iomap_ops);
> > +					&xfs_dax_write_iomap_ops :
> > +					&xfs_read_iomap_ops);
> 
> Hmm... I wonder if this should get hoisted to a "xfs_dax_iomap_fault"
> wrapper like you did for xfs_iomap_zero_range?

This has just a single users, so the classic argument won't apply.  That
being said __xfs_filemap_fault is a complete mess to due the calling
conventions of the various VFS methods multiplexed into it.  So yes,
splitting out a xfs_dax_iomap_fault to wrap the above plus the
dax_finish_sync_fault call might not actually be a bad idea nevertheless.

> > +	struct xfs_inode	*ip = XFS_I(inode);
> > +	/*
> > +	 * Usually we use @written to indicate whether the operation was
> > +	 * successful.  But it is always positive or zero.  The CoW needs the
> > +	 * actual error code from actor().  So, get it from
> > +	 * iomap_iter->processed.
> 
> Hm.  All six arguments are derived from the struct iomap_iter, so maybe
> it makes more sense to pass that in?  I'll poke around with this more
> tomorrow.

I'd argue against just changing the calling conventions for ->iomap_end
now.  The original iter patches from willy allowed passing a single
next callback combinging iomap_begin and iomap_end in a way that with
a little magic we can avoid the indirect calls entirely.  I think we'll
need to experiment with that that a bit and see if is worth the effort
first.  I plan to do that but I might not get to it immediate.  If some
else wants to take over I'm fine with that.

> >  static int
> >  xfs_buffered_write_iomap_begin(
> 
> Also, we have an related request to drop the EXPERIMENTAL tag for
> non-DAX reflink.  Whichever patch enables dax+reflink for xfs needs to
> make it clear that reflink + any possibility of DAX emits an
> EXPERIMENTAL warning.

More importantly before we can merge this series we also need the VM
level support for reflink-aware reverse mapping.  So while this series
here is no in a good enough shape I don't see how we could merge it
without that other series as we'd have to disallow mmap for reflink+dax
files otherwise.
Shiyang Ruan Sept. 16, 2021, 8:51 a.m. UTC | #4
On 2021/9/16 14:23, Christoph Hellwig wrote:
> On Wed, Sep 15, 2021 at 06:45:00PM +0800, Shiyang Ruan wrote:
>> +static int
>> +xfs_dax_write_iomap_end(
>> +	struct inode 		*inode,
>> +	loff_t 			pos,
>> +	loff_t 			length,
>> +	ssize_t 		written,
>> +	unsigned 		flags,
>> +	struct iomap 		*iomap)
>> +{
>> +	struct xfs_inode	*ip = XFS_I(inode);
>> +	/*
>> +	 * Usually we use @written to indicate whether the operation was
>> +	 * successful.  But it is always positive or zero.  The CoW needs the
>> +	 * actual error code from actor().  So, get it from
>> +	 * iomap_iter->processed.
>> +	 */
>> +	const struct iomap_iter *iter =
>> +				container_of(iomap, typeof(*iter), iomap);
>> +
>> +	if (!xfs_is_cow_inode(ip))
>> +		return 0;
>> +
>> +	if (iter->processed <= 0) {
>> +		xfs_reflink_cancel_cow_range(ip, pos, length, true);
>> +		return 0;
>> +	}
>> +
>> +	return xfs_reflink_end_cow(ip, pos, iter->processed);
> 
> Didn't we come to the conflusion last time that we don't actually
> need to poke into the iomap_iter here as the written argument is equal
> to iter->processed if it is > 0:
> 
> 	if (iter->iomap.length && ops->iomap_end) {
> 		ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
> 				iter->processed > 0 ? iter->processed : 0,
> 				iter->flags, &iter->iomap);
> 		..
> 
> So should be able to just do:
> 
> static int
> xfs_dax_write_iomap_end(
> 	struct inode 		*inode,
> 	loff_t 			pos,
> 	loff_t 			length,
> 	ssize_t 		written,
> 	unsigned 		flags,
> 	struct iomap 		*iomap)
> {
> 	struct xfs_inode	*ip = XFS_I(inode);
> 
> 	if (!xfs_is_cow_inode(ip))
> 		return 0;
> 
> 	if (!written) {
> 		xfs_reflink_cancel_cow_range(ip, pos, length, true);
> 		return 0;
> 	}
> 
> 	return xfs_reflink_end_cow(ip, pos, written);
> }
> 

I see.  Thanks for your guidance.


--
Ruan
Darrick J. Wong Sept. 17, 2021, 3:33 p.m. UTC | #5
On Thu, Sep 16, 2021 at 08:32:51AM +0200, Christoph Hellwig wrote:
> On Wed, Sep 15, 2021 at 05:22:27PM -0700, Darrick J. Wong wrote:
> > >  		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > >  		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
> > >  				(write_fault && !vmf->cow_page) ?
> > > -				 &xfs_direct_write_iomap_ops :
> > > -				 &xfs_read_iomap_ops);
> > > +					&xfs_dax_write_iomap_ops :
> > > +					&xfs_read_iomap_ops);
> > 
> > Hmm... I wonder if this should get hoisted to a "xfs_dax_iomap_fault"
> > wrapper like you did for xfs_iomap_zero_range?
> 
> This has just a single users, so the classic argument won't apply.  That
> being said __xfs_filemap_fault is a complete mess to due the calling
> conventions of the various VFS methods multiplexed into it.  So yes,
> splitting out a xfs_dax_iomap_fault to wrap the above plus the
> dax_finish_sync_fault call might not actually be a bad idea nevertheless.

Agree.

> > > +	struct xfs_inode	*ip = XFS_I(inode);
> > > +	/*
> > > +	 * Usually we use @written to indicate whether the operation was
> > > +	 * successful.  But it is always positive or zero.  The CoW needs the
> > > +	 * actual error code from actor().  So, get it from
> > > +	 * iomap_iter->processed.
> > 
> > Hm.  All six arguments are derived from the struct iomap_iter, so maybe
> > it makes more sense to pass that in?  I'll poke around with this more
> > tomorrow.
> 
> I'd argue against just changing the calling conventions for ->iomap_end
> now.  The original iter patches from willy allowed passing a single
> next callback combinging iomap_begin and iomap_end in a way that with
> a little magic we can avoid the indirect calls entirely.  I think we'll
> need to experiment with that that a bit and see if is worth the effort
> first.  I plan to do that but I might not get to it immediate.  If some
> else wants to take over I'm fine with that.

Ah, I forgot that.  Yay Etch-a-Sketch brain. <shake> -ENODATA ;)

> > >  static int
> > >  xfs_buffered_write_iomap_begin(
> > 
> > Also, we have an related request to drop the EXPERIMENTAL tag for
> > non-DAX reflink.  Whichever patch enables dax+reflink for xfs needs to
> > make it clear that reflink + any possibility of DAX emits an
> > EXPERIMENTAL warning.
> 
> More importantly before we can merge this series we also need the VM
> level support for reflink-aware reverse mapping.  So while this series
> here is no in a good enough shape I don't see how we could merge it
> without that other series as we'd have to disallow mmap for reflink+dax
> files otherwise.

I've forgotten why we need mm level reverse mapping again?  The pmem
poison stuff can use ->media_failure (or whatever it was called,
memory_failure?) to find all the owners and notify them.  Was there
some other accounting reason that fell out of my brain?

I'm more afraid of 'sharing pages between files needs mm support'
sparking another multi-year folioesque fight with the mm people.

--D
Christoph Hellwig Sept. 21, 2021, 8:14 a.m. UTC | #6
On Fri, Sep 17, 2021 at 08:33:04AM -0700, Darrick J. Wong wrote:
> > More importantly before we can merge this series we also need the VM
> > level support for reflink-aware reverse mapping.  So while this series
> > here is no in a good enough shape I don't see how we could merge it
> > without that other series as we'd have to disallow mmap for reflink+dax
> > files otherwise.
> 
> I've forgotten why we need mm level reverse mapping again?  The pmem
> poison stuff can use ->media_failure (or whatever it was called,
> memory_failure?) to find all the owners and notify them.  Was there
> some other accounting reason that fell out of my brain?
> 
> I'm more afraid of 'sharing pages between files needs mm support'
> sparking another multi-year folioesque fight with the mm people.

Because of the way page->mapping is used by DAX.  But I think this is
mostly under control in the other series.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 73a36b7be3bd..0681250e0a5d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1009,8 +1009,7 @@  xfs_free_file_space(
 		return 0;
 	if (offset + len > XFS_ISIZE(ip))
 		len = XFS_ISIZE(ip) - offset;
-	error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
-			&xfs_buffered_write_iomap_ops);
+	error = xfs_iomap_zero_range(ip, offset, len, NULL);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7aa943edfc02..2ef1930374d2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -704,7 +704,7 @@  xfs_file_dax_write(
 	pos = iocb->ki_pos;
 
 	trace_xfs_file_dax_write(iocb, from);
-	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
+	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 		i_size_write(inode, iocb->ki_pos);
 		error = xfs_setfilesize(ip, pos, ret);
@@ -1329,8 +1329,8 @@  __xfs_filemap_fault(
 		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
 				(write_fault && !vmf->cow_page) ?
-				 &xfs_direct_write_iomap_ops :
-				 &xfs_read_iomap_ops);
+					&xfs_dax_write_iomap_ops :
+					&xfs_read_iomap_ops);
 		if (ret & VM_FAULT_NEEDDSYNC)
 			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
 		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 093758440ad5..6fa3b377cb81 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -761,7 +761,8 @@  xfs_direct_write_iomap_begin(
 
 		/* may drop and re-acquire the ilock */
 		error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
-				&lockmode, flags & IOMAP_DIRECT);
+				&lockmode,
+				(flags & IOMAP_DIRECT) || IS_DAX(inode));
 		if (error)
 			goto out_unlock;
 		if (shared)
@@ -854,6 +855,41 @@  const struct iomap_ops xfs_direct_write_iomap_ops = {
 	.iomap_begin		= xfs_direct_write_iomap_begin,
 };
 
+static int
+xfs_dax_write_iomap_end(
+	struct inode 		*inode,
+	loff_t 			pos,
+	loff_t 			length,
+	ssize_t 		written,
+	unsigned 		flags,
+	struct iomap 		*iomap)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	/*
+	 * Usually we use @written to indicate whether the operation was
+	 * successful.  But it is always positive or zero.  The CoW needs the
+	 * actual error code from actor().  So, get it from
+	 * iomap_iter->processed.
+	 */
+	const struct iomap_iter *iter =
+				container_of(iomap, typeof(*iter), iomap);
+
+	if (!xfs_is_cow_inode(ip))
+		return 0;
+
+	if (iter->processed <= 0) {
+		xfs_reflink_cancel_cow_range(ip, pos, length, true);
+		return 0;
+	}
+
+	return xfs_reflink_end_cow(ip, pos, iter->processed);
+}
+
+const struct iomap_ops xfs_dax_write_iomap_ops = {
+	.iomap_begin 	= xfs_direct_write_iomap_begin,
+	.iomap_end	= xfs_dax_write_iomap_end,
+};
+
 static int
 xfs_buffered_write_iomap_begin(
 	struct inode		*inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7d3703556d0e..92679a0c3578 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -45,5 +45,35 @@  extern const struct iomap_ops xfs_direct_write_iomap_ops;
 extern const struct iomap_ops xfs_read_iomap_ops;
 extern const struct iomap_ops xfs_seek_iomap_ops;
 extern const struct iomap_ops xfs_xattr_iomap_ops;
+extern const struct iomap_ops xfs_dax_write_iomap_ops;
+
+static inline int
+xfs_iomap_zero_range(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	loff_t			len,
+	bool			*did_zero)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	return iomap_zero_range(inode, pos, len, did_zero,
+			IS_DAX(inode) ?
+				&xfs_dax_write_iomap_ops :
+				&xfs_buffered_write_iomap_ops);
+}
+
+static inline int
+xfs_iomap_truncate_page(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	bool			*did_zero)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	return iomap_truncate_page(inode, pos, did_zero,
+			IS_DAX(inode)?
+				&xfs_dax_write_iomap_ops :
+				&xfs_buffered_write_iomap_ops);
+}
 
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a607d6aca5c4..332e6208dffd 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -911,8 +911,8 @@  xfs_setattr_size(
 	 */
 	if (newsize > oldsize) {
 		trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
-		error = iomap_zero_range(inode, oldsize, newsize - oldsize,
-				&did_zeroing, &xfs_buffered_write_iomap_ops);
+		error = xfs_iomap_zero_range(ip, oldsize, newsize - oldsize,
+				&did_zeroing);
 	} else {
 		/*
 		 * iomap won't detect a dirty page over an unwritten block (or a
@@ -924,8 +924,7 @@  xfs_setattr_size(
 						     newsize);
 		if (error)
 			return error;
-		error = iomap_truncate_page(inode, newsize, &did_zeroing,
-				&xfs_buffered_write_iomap_ops);
+		error = xfs_iomap_truncate_page(ip, newsize, &did_zeroing);
 	}
 
 	if (error)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 7ecea0311e88..9d876e268734 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1269,8 +1269,7 @@  xfs_reflink_zero_posteof(
 		return 0;
 
 	trace_xfs_zero_eof(ip, isize, pos - isize);
-	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
-			&xfs_buffered_write_iomap_ops);
+	return xfs_iomap_zero_range(ip, isize, pos - isize, NULL);
 }
 
 /*