diff mbox series

[v5,01/25] fuse: Fix dax truncate/punch_hole fault path

Message ID f20cc2603bd33ee05ec4bc4cc7327cec61119796.1736221254.git-series.apopple@nvidia.com (mailing list archive)
State Superseded
Headers show
Series fs/dax: Fix ZONE_DEVICE page reference counts | expand

Commit Message

Alistair Popple Jan. 7, 2025, 3:42 a.m. UTC
FS DAX requires file systems to call into the DAX layout prior to
unlinking inodes to ensure there is no ongoing DMA or other remote
access to the direct mapped page. The fuse file system implements
fuse_dax_break_layouts() to do this which includes a comment
indicating that passing dmap_end == 0 leads to unmapping of the whole
file.

However this is not true - passing dmap_end == 0 will not unmap
anything before dmap_start, and further more
dax_layout_busy_page_range() will not scan any of the range to see if
there maybe ongoing DMA access to the range. Fix this by checking for
dmap_end == 0 in fuse_dax_break_layouts() and pass the entire file
range to dax_layout_busy_page_range().

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Fixes: 6ae330cad6ef ("virtiofs: serialize truncate/punch_hole and dax fault path")
Cc: Vivek Goyal <vgoyal@redhat.com>

---

I am not at all familiar with the fuse file system driver so I have no
idea if the comment is relevant or not and whether the documented
behaviour for dmap_end == 0 is ever relied upon. However this seemed
like the safest fix unless someone more familiar with fuse can confirm
that dmap_end == 0 is never used.
---
 fs/fuse/dax.c | 4 ++++
 1 file changed, 4 insertions(+)

Comments

Dan Williams Jan. 8, 2025, 10:30 p.m. UTC | #1
Alistair Popple wrote:
> FS DAX requires file systems to call into the DAX layout prior to
> unlinking inodes to ensure there is no ongoing DMA or other remote
> access to the direct mapped page. The fuse file system implements
> fuse_dax_break_layouts() to do this which includes a comment
> indicating that passing dmap_end == 0 leads to unmapping of the whole
> file.
> 
> However this is not true - passing dmap_end == 0 will not unmap
> anything before dmap_start, and further more
> dax_layout_busy_page_range() will not scan any of the range to see if
> there maybe ongoing DMA access to the range.

It would be useful to clarify that this is bug was found by inspection
and that there are no known end user reports of trouble but that the
failure more would look like random fs corruption. The window is hard to
hit because a block needs to be truncated, reallocated to
a file, and written to before stale DMA could corrupt it. So that may
contribute to the fact that fuse-dax users have not reported an issue
since v5.10.

> Fix this by checking for dmap_end == 0 in fuse_dax_break_layouts() and
> pass the entire file range to dax_layout_busy_page_range().

That's not what this patch does, maybe a rebase error that pushed the
@dmap_end fixup after the call to dax_layout_busy_page_range?

However, I don't think this is quite the right fix, more below...

> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> Fixes: 6ae330cad6ef ("virtiofs: serialize truncate/punch_hole and dax fault path")
> Cc: Vivek Goyal <vgoyal@redhat.com>
> 
> ---
> 
> I am not at all familiar with the fuse file system driver so I have no
> idea if the comment is relevant or not and whether the documented
> behaviour for dmap_end == 0 is ever relied upon. However this seemed
> like the safest fix unless someone more familiar with fuse can confirm
> that dmap_end == 0 is never used.

It is used in several places and has been wrong since day one. I believe
the original commit simply misunderstood that
dax_layout_busy_page_range() semantics are analogous to
invalidate_inode_pages2_range() semantics in terms of what @start and
@end mean.

You can add:

Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

...if you end up doing a resend, or I will add it on applying to
nvdimm.git if the rebase does not end up being too prickly.

-- 8< --
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index c5d1feaa239c..455c4a16080b 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -681,7 +681,6 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
 			0, 0, fuse_wait_dax_page(inode));
 }
 
-/* dmap_end == 0 leads to unmapping of whole file */
 int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
 				  u64 dmap_end)
 {
@@ -693,10 +692,6 @@ int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
 		ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
 					       dmap_end);
 	} while (ret == 0 && retry);
-	if (!dmap_end) {
-		dmap_start = 0;
-		dmap_end = LLONG_MAX;
-	}
 
 	return ret;
 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0b2f8567ca30..bc6c8936c529 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1936,7 +1936,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	if (FUSE_IS_DAX(inode) && is_truncate) {
 		filemap_invalidate_lock(mapping);
 		fault_blocked = true;
-		err = fuse_dax_break_layouts(inode, 0, 0);
+		err = fuse_dax_break_layouts(inode, 0, -1);
 		if (err) {
 			filemap_invalidate_unlock(mapping);
 			return err;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 082ee374f694..cef7a8f75821 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -253,7 +253,7 @@ static int fuse_open(struct inode *inode, struct file *file)
 
 	if (dax_truncate) {
 		filemap_invalidate_lock(inode->i_mapping);
-		err = fuse_dax_break_layouts(inode, 0, 0);
+		err = fuse_dax_break_layouts(inode, 0, -1);
 		if (err)
 			goto out_inode_unlock;
 	}
@@ -2890,7 +2890,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	inode_lock(inode);
 	if (block_faults) {
 		filemap_invalidate_lock(inode->i_mapping);
-		err = fuse_dax_break_layouts(inode, 0, 0);
+		err = fuse_dax_break_layouts(inode, 0, -1);
 		if (err)
 			goto out;
 	}
Alistair Popple Jan. 9, 2025, 4:38 a.m. UTC | #2
On Wed, Jan 08, 2025 at 02:30:24PM -0800, Dan Williams wrote:
> Alistair Popple wrote:
> > FS DAX requires file systems to call into the DAX layout prior to
> > unlinking inodes to ensure there is no ongoing DMA or other remote
> > access to the direct mapped page. The fuse file system implements
> > fuse_dax_break_layouts() to do this which includes a comment
> > indicating that passing dmap_end == 0 leads to unmapping of the whole
> > file.
> > 
> > However this is not true - passing dmap_end == 0 will not unmap
> > anything before dmap_start, and further more
> > dax_layout_busy_page_range() will not scan any of the range to see if
> > there maybe ongoing DMA access to the range.
> 
> It would be useful to clarify that this is bug was found by inspection
> and that there are no known end user reports of trouble but that the
> failure more would look like random fs corruption. The window is hard to
> hit because a block needs to be truncated, reallocated to
> a file, and written to before stale DMA could corrupt it. So that may
> contribute to the fact that fuse-dax users have not reported an issue
> since v5.10.
> 
> > Fix this by checking for dmap_end == 0 in fuse_dax_break_layouts() and
> > pass the entire file range to dax_layout_busy_page_range().
> 
> That's not what this patch does, maybe a rebase error that pushed the
> @dmap_end fixup after the call to dax_layout_busy_page_range?

Ha. Yep. I spotted this when doing the conversion to
dax_layout_busy_page_range() and then had to rebase it into a stand alone patch
for easy review. Obviously I put the check in the wrong spot, although it ends
up in the right spot at the end of the series.

> However, I don't think this is quite the right fix, more below...

Yeah, I like your version better so will respin with that.

> > Signed-off-by: Alistair Popple <apopple@nvidia.com>
> > Fixes: 6ae330cad6ef ("virtiofs: serialize truncate/punch_hole and dax fault path")
> > Cc: Vivek Goyal <vgoyal@redhat.com>
> > 
> > ---
> > 
> > I am not at all familiar with the fuse file system driver so I have no
> > idea if the comment is relevant or not and whether the documented
> > behaviour for dmap_end == 0 is ever relied upon. However this seemed
> > like the safest fix unless someone more familiar with fuse can confirm
> > that dmap_end == 0 is never used.
> 
> It is used in several places and has been wrong since day one. I believe
> the original commit simply misunderstood that
> dax_layout_busy_page_range() semantics are analogous to
> invalidate_inode_pages2_range() semantics in terms of what @start and
> @end mean.
> 
> You can add:
> 
> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> 
> ...if you end up doing a resend, or I will add it on applying to
> nvdimm.git if the rebase does not end up being too prickly.

Looks like I accidentally dropped a PPC fix so will do a respin for that. And
the kernel build bot was complaining about incorrect documentation so will fix
that while I'm at it.

> -- 8< --
> diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
> index c5d1feaa239c..455c4a16080b 100644
> --- a/fs/fuse/dax.c
> +++ b/fs/fuse/dax.c
> @@ -681,7 +681,6 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
>  			0, 0, fuse_wait_dax_page(inode));
>  }
>  
> -/* dmap_end == 0 leads to unmapping of whole file */
>  int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
>  				  u64 dmap_end)
>  {
> @@ -693,10 +692,6 @@ int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
>  		ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
>  					       dmap_end);
>  	} while (ret == 0 && retry);
> -	if (!dmap_end) {
> -		dmap_start = 0;
> -		dmap_end = LLONG_MAX;
> -	}
>  
>  	return ret;
>  }
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index 0b2f8567ca30..bc6c8936c529 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -1936,7 +1936,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
>  	if (FUSE_IS_DAX(inode) && is_truncate) {
>  		filemap_invalidate_lock(mapping);
>  		fault_blocked = true;
> -		err = fuse_dax_break_layouts(inode, 0, 0);
> +		err = fuse_dax_break_layouts(inode, 0, -1);
>  		if (err) {
>  			filemap_invalidate_unlock(mapping);
>  			return err;
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 082ee374f694..cef7a8f75821 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -253,7 +253,7 @@ static int fuse_open(struct inode *inode, struct file *file)
>  
>  	if (dax_truncate) {
>  		filemap_invalidate_lock(inode->i_mapping);
> -		err = fuse_dax_break_layouts(inode, 0, 0);
> +		err = fuse_dax_break_layouts(inode, 0, -1);
>  		if (err)
>  			goto out_inode_unlock;
>  	}
> @@ -2890,7 +2890,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
>  	inode_lock(inode);
>  	if (block_faults) {
>  		filemap_invalidate_lock(inode->i_mapping);
> -		err = fuse_dax_break_layouts(inode, 0, 0);
> +		err = fuse_dax_break_layouts(inode, 0, -1);
>  		if (err)
>  			goto out;
>  	}
diff mbox series

Patch

diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 9abbc2f..c5d1fea 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -693,6 +693,10 @@  int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
 		ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
 					       dmap_end);
 	} while (ret == 0 && retry);
+	if (!dmap_end) {
+		dmap_start = 0;
+		dmap_end = LLONG_MAX;
+	}
 
 	return ret;
 }