diff mbox series

[v2,2/3] iomap: support partial page discard on writeback block mapping failure

Message ID 20201029132325.1663790-3-bfoster@redhat.com
State New
Headers show
Series misc iomap/xfs writeback fixes | expand

Commit Message

Brian Foster Oct. 29, 2020, 1:23 p.m. UTC
iomap writeback mapping failure only calls into ->discard_page() if
the current page has not been added to the ioend. Accordingly, the
XFS callback assumes a full page discard and invalidation. This is
problematic for sub-page block size filesystems where some portion
of a page might have been mapped successfully before a failure to
map a delalloc block occurs. ->discard_page() is not called in that
error scenario and the bio is explicitly failed by iomap via the
error return from ->prepare_ioend(). As a result, the filesystem
leaks delalloc blocks and corrupts the filesystem block counters.

Since XFS is the only user of ->discard_page(), tweak the semantics
to invoke the callback unconditionally on mapping errors and provide
the file offset that failed to map. Update xfs_discard_page() to
discard the corresponding portion of the file and pass the range
along to iomap_invalidatepage(). The latter already properly handles
both full and sub-page scenarios by not changing any iomap or page
state on sub-page invalidations.

Signed-off-by: Brian Foster <bfoster@redhat.com>
---
 fs/iomap/buffered-io.c | 15 ++++++++-------
 fs/xfs/xfs_aops.c      | 13 +++++++------
 include/linux/iomap.h  |  2 +-
 3 files changed, 16 insertions(+), 14 deletions(-)

Comments

Christoph Hellwig Oct. 29, 2020, 3:06 p.m. UTC | #1
Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>
Darrick J. Wong Oct. 29, 2020, 3:27 p.m. UTC | #2
On Thu, Oct 29, 2020 at 09:23:24AM -0400, Brian Foster wrote:
> iomap writeback mapping failure only calls into ->discard_page() if
> the current page has not been added to the ioend. Accordingly, the
> XFS callback assumes a full page discard and invalidation. This is
> problematic for sub-page block size filesystems where some portion
> of a page might have been mapped successfully before a failure to
> map a delalloc block occurs. ->discard_page() is not called in that
> error scenario and the bio is explicitly failed by iomap via the
> error return from ->prepare_ioend(). As a result, the filesystem
> leaks delalloc blocks and corrupts the filesystem block counters.
> 
> Since XFS is the only user of ->discard_page(), tweak the semantics
> to invoke the callback unconditionally on mapping errors and provide
> the file offset that failed to map. Update xfs_discard_page() to
> discard the corresponding portion of the file and pass the range
> along to iomap_invalidatepage(). The latter already properly handles
> both full and sub-page scenarios by not changing any iomap or page
> state on sub-page invalidations.
> 
> Signed-off-by: Brian Foster <bfoster@redhat.com>
> ---
>  fs/iomap/buffered-io.c | 15 ++++++++-------
>  fs/xfs/xfs_aops.c      | 13 +++++++------
>  include/linux/iomap.h  |  2 +-
>  3 files changed, 16 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index bcfc288dba3f..d1f04eabc7e4 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1412,14 +1412,15 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	 * appropriately.
>  	 */
>  	if (unlikely(error)) {
> +		/*
> +		 * Let the filesystem know what portion of the current page
> +		 * failed to map. If the page wasn't been added to ioend, it
> +		 * won't be affected by I/O completion and we must unlock it
> +		 * now.
> +		 */
> +		if (wpc->ops->discard_page)
> +			wpc->ops->discard_page(page, file_offset);
>  		if (!count) {
> -			/*
> -			 * If the current page hasn't been added to ioend, it
> -			 * won't be affected by I/O completions and we must
> -			 * discard and unlock it right here.
> -			 */
> -			if (wpc->ops->discard_page)
> -				wpc->ops->discard_page(page);
>  			ClearPageUptodate(page);
>  			unlock_page(page);
>  			goto done;
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index b35611882ff9..46920c530b20 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -527,13 +527,14 @@ xfs_prepare_ioend(
>   */
>  static void
>  xfs_discard_page(
> -	struct page		*page)
> +	struct page		*page,
> +	loff_t			fileoff)
>  {
>  	struct inode		*inode = page->mapping->host;
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	struct xfs_mount	*mp = ip->i_mount;
> -	loff_t			offset = page_offset(page);
> -	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
> +	unsigned int		pageoff = offset_in_page(fileoff);
> +	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, fileoff);
>  	int			error;
>  
>  	if (XFS_FORCED_SHUTDOWN(mp))
> @@ -541,14 +542,14 @@ xfs_discard_page(
>  
>  	xfs_alert_ratelimited(mp,
>  		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
> -			page, ip->i_ino, offset);
> +			page, ip->i_ino, fileoff);
>  
>  	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
> -			PAGE_SIZE / i_blocksize(inode));
> +			(PAGE_SIZE - pageoff) / i_blocksize(inode));

Er... could you rebase this against 5.10-rc1, please?  willy changed
that line to not use PAGE_SIZE directly.

I /think/ the way to resolve the merge conflict here is to change this
last argument to:

(i_blocks_per_page(page) - pageoff) / i_blocksize(inode)

--D

>  	if (error && !XFS_FORCED_SHUTDOWN(mp))
>  		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
>  out_invalidate:
> -	iomap_invalidatepage(page, 0, PAGE_SIZE);
> +	iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
>  }
>  
>  static const struct iomap_writeback_ops xfs_writeback_ops = {
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 4d1d3c3469e9..36e0ab19210a 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -220,7 +220,7 @@ struct iomap_writeback_ops {
>  	 * Optional, allows the file system to discard state on a page where
>  	 * we failed to submit any I/O.
>  	 */
> -	void (*discard_page)(struct page *page);
> +	void (*discard_page)(struct page *page, loff_t fileoff);
>  };
>  
>  struct iomap_writepage_ctx {
> -- 
> 2.25.4
>
Brian Foster Oct. 29, 2020, 4:07 p.m. UTC | #3
On Thu, Oct 29, 2020 at 08:27:18AM -0700, Darrick J. Wong wrote:
> On Thu, Oct 29, 2020 at 09:23:24AM -0400, Brian Foster wrote:
> > iomap writeback mapping failure only calls into ->discard_page() if
> > the current page has not been added to the ioend. Accordingly, the
> > XFS callback assumes a full page discard and invalidation. This is
> > problematic for sub-page block size filesystems where some portion
> > of a page might have been mapped successfully before a failure to
> > map a delalloc block occurs. ->discard_page() is not called in that
> > error scenario and the bio is explicitly failed by iomap via the
> > error return from ->prepare_ioend(). As a result, the filesystem
> > leaks delalloc blocks and corrupts the filesystem block counters.
> > 
> > Since XFS is the only user of ->discard_page(), tweak the semantics
> > to invoke the callback unconditionally on mapping errors and provide
> > the file offset that failed to map. Update xfs_discard_page() to
> > discard the corresponding portion of the file and pass the range
> > along to iomap_invalidatepage(). The latter already properly handles
> > both full and sub-page scenarios by not changing any iomap or page
> > state on sub-page invalidations.
> > 
> > Signed-off-by: Brian Foster <bfoster@redhat.com>
> > ---
> >  fs/iomap/buffered-io.c | 15 ++++++++-------
> >  fs/xfs/xfs_aops.c      | 13 +++++++------
> >  include/linux/iomap.h  |  2 +-
> >  3 files changed, 16 insertions(+), 14 deletions(-)
> > 
> > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > index bcfc288dba3f..d1f04eabc7e4 100644
> > --- a/fs/iomap/buffered-io.c
> > +++ b/fs/iomap/buffered-io.c
> > @@ -1412,14 +1412,15 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >  	 * appropriately.
> >  	 */
> >  	if (unlikely(error)) {
> > +		/*
> > +		 * Let the filesystem know what portion of the current page
> > +		 * failed to map. If the page wasn't been added to ioend, it
> > +		 * won't be affected by I/O completion and we must unlock it
> > +		 * now.
> > +		 */
> > +		if (wpc->ops->discard_page)
> > +			wpc->ops->discard_page(page, file_offset);
> >  		if (!count) {
> > -			/*
> > -			 * If the current page hasn't been added to ioend, it
> > -			 * won't be affected by I/O completions and we must
> > -			 * discard and unlock it right here.
> > -			 */
> > -			if (wpc->ops->discard_page)
> > -				wpc->ops->discard_page(page);
> >  			ClearPageUptodate(page);
> >  			unlock_page(page);
> >  			goto done;
> > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > index b35611882ff9..46920c530b20 100644
> > --- a/fs/xfs/xfs_aops.c
> > +++ b/fs/xfs/xfs_aops.c
> > @@ -527,13 +527,14 @@ xfs_prepare_ioend(
> >   */
> >  static void
> >  xfs_discard_page(
> > -	struct page		*page)
> > +	struct page		*page,
> > +	loff_t			fileoff)
> >  {
> >  	struct inode		*inode = page->mapping->host;
> >  	struct xfs_inode	*ip = XFS_I(inode);
> >  	struct xfs_mount	*mp = ip->i_mount;
> > -	loff_t			offset = page_offset(page);
> > -	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
> > +	unsigned int		pageoff = offset_in_page(fileoff);
> > +	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, fileoff);
> >  	int			error;
> >  
> >  	if (XFS_FORCED_SHUTDOWN(mp))
> > @@ -541,14 +542,14 @@ xfs_discard_page(
> >  
> >  	xfs_alert_ratelimited(mp,
> >  		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
> > -			page, ip->i_ino, offset);
> > +			page, ip->i_ino, fileoff);
> >  
> >  	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
> > -			PAGE_SIZE / i_blocksize(inode));
> > +			(PAGE_SIZE - pageoff) / i_blocksize(inode));
> 
> Er... could you rebase this against 5.10-rc1, please?  willy changed
> that line to not use PAGE_SIZE directly.
> 

Sure.. (note that there's still a PAGE_SIZE usage in the
iomap_invalidatepage() call).

> I /think/ the way to resolve the merge conflict here is to change this
> last argument to:
> 
> (i_blocks_per_page(page) - pageoff) / i_blocksize(inode)
> 

Hmm... pageoff is bytes so that doesn't look quite right. How about
something like this?

	...
	unsigned int            pageoff = offset_in_page(fileoff);
	xfs_fileoff_t           pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);

	...
        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
				i_blocks_per_page(inode, page) - pageoff_fsb);
	...

Brian

> --D
> 
> >  	if (error && !XFS_FORCED_SHUTDOWN(mp))
> >  		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
> >  out_invalidate:
> > -	iomap_invalidatepage(page, 0, PAGE_SIZE);
> > +	iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
> >  }
> >  
> >  static const struct iomap_writeback_ops xfs_writeback_ops = {
> > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > index 4d1d3c3469e9..36e0ab19210a 100644
> > --- a/include/linux/iomap.h
> > +++ b/include/linux/iomap.h
> > @@ -220,7 +220,7 @@ struct iomap_writeback_ops {
> >  	 * Optional, allows the file system to discard state on a page where
> >  	 * we failed to submit any I/O.
> >  	 */
> > -	void (*discard_page)(struct page *page);
> > +	void (*discard_page)(struct page *page, loff_t fileoff);
> >  };
> >  
> >  struct iomap_writepage_ctx {
> > -- 
> > 2.25.4
> > 
>
Darrick J. Wong Oct. 29, 2020, 4:12 p.m. UTC | #4
On Thu, Oct 29, 2020 at 12:07:32PM -0400, Brian Foster wrote:
> On Thu, Oct 29, 2020 at 08:27:18AM -0700, Darrick J. Wong wrote:
> > On Thu, Oct 29, 2020 at 09:23:24AM -0400, Brian Foster wrote:
> > > iomap writeback mapping failure only calls into ->discard_page() if
> > > the current page has not been added to the ioend. Accordingly, the
> > > XFS callback assumes a full page discard and invalidation. This is
> > > problematic for sub-page block size filesystems where some portion
> > > of a page might have been mapped successfully before a failure to
> > > map a delalloc block occurs. ->discard_page() is not called in that
> > > error scenario and the bio is explicitly failed by iomap via the
> > > error return from ->prepare_ioend(). As a result, the filesystem
> > > leaks delalloc blocks and corrupts the filesystem block counters.
> > > 
> > > Since XFS is the only user of ->discard_page(), tweak the semantics
> > > to invoke the callback unconditionally on mapping errors and provide
> > > the file offset that failed to map. Update xfs_discard_page() to
> > > discard the corresponding portion of the file and pass the range
> > > along to iomap_invalidatepage(). The latter already properly handles
> > > both full and sub-page scenarios by not changing any iomap or page
> > > state on sub-page invalidations.
> > > 
> > > Signed-off-by: Brian Foster <bfoster@redhat.com>
> > > ---
> > >  fs/iomap/buffered-io.c | 15 ++++++++-------
> > >  fs/xfs/xfs_aops.c      | 13 +++++++------
> > >  include/linux/iomap.h  |  2 +-
> > >  3 files changed, 16 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > > index bcfc288dba3f..d1f04eabc7e4 100644
> > > --- a/fs/iomap/buffered-io.c
> > > +++ b/fs/iomap/buffered-io.c
> > > @@ -1412,14 +1412,15 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > >  	 * appropriately.
> > >  	 */
> > >  	if (unlikely(error)) {
> > > +		/*
> > > +		 * Let the filesystem know what portion of the current page
> > > +		 * failed to map. If the page wasn't been added to ioend, it
> > > +		 * won't be affected by I/O completion and we must unlock it
> > > +		 * now.
> > > +		 */
> > > +		if (wpc->ops->discard_page)
> > > +			wpc->ops->discard_page(page, file_offset);
> > >  		if (!count) {
> > > -			/*
> > > -			 * If the current page hasn't been added to ioend, it
> > > -			 * won't be affected by I/O completions and we must
> > > -			 * discard and unlock it right here.
> > > -			 */
> > > -			if (wpc->ops->discard_page)
> > > -				wpc->ops->discard_page(page);
> > >  			ClearPageUptodate(page);
> > >  			unlock_page(page);
> > >  			goto done;
> > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > > index b35611882ff9..46920c530b20 100644
> > > --- a/fs/xfs/xfs_aops.c
> > > +++ b/fs/xfs/xfs_aops.c
> > > @@ -527,13 +527,14 @@ xfs_prepare_ioend(
> > >   */
> > >  static void
> > >  xfs_discard_page(
> > > -	struct page		*page)
> > > +	struct page		*page,
> > > +	loff_t			fileoff)
> > >  {
> > >  	struct inode		*inode = page->mapping->host;
> > >  	struct xfs_inode	*ip = XFS_I(inode);
> > >  	struct xfs_mount	*mp = ip->i_mount;
> > > -	loff_t			offset = page_offset(page);
> > > -	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
> > > +	unsigned int		pageoff = offset_in_page(fileoff);
> > > +	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, fileoff);
> > >  	int			error;
> > >  
> > >  	if (XFS_FORCED_SHUTDOWN(mp))
> > > @@ -541,14 +542,14 @@ xfs_discard_page(
> > >  
> > >  	xfs_alert_ratelimited(mp,
> > >  		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
> > > -			page, ip->i_ino, offset);
> > > +			page, ip->i_ino, fileoff);
> > >  
> > >  	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
> > > -			PAGE_SIZE / i_blocksize(inode));
> > > +			(PAGE_SIZE - pageoff) / i_blocksize(inode));
> > 
> > Er... could you rebase this against 5.10-rc1, please?  willy changed
> > that line to not use PAGE_SIZE directly.
> > 
> 
> Sure.. (note that there's still a PAGE_SIZE usage in the
> iomap_invalidatepage() call).

<nod> I think he has more for 5.11, but in the meantime it's just fixing
the merge conflicts. :)

> > I /think/ the way to resolve the merge conflict here is to change this
> > last argument to:
> > 
> > (i_blocks_per_page(page) - pageoff) / i_blocksize(inode)
> > 
> 
> Hmm... pageoff is bytes so that doesn't look quite right. How about
> something like this?
> 
> 	...
> 	unsigned int            pageoff = offset_in_page(fileoff);
> 	xfs_fileoff_t           pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
> 
> 	...
>         error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
> 				i_blocks_per_page(inode, page) - pageoff_fsb);
> 	...

You could probably combine the two variables, but otherwise that looks
fine to me.

--D

> 
> Brian
> 
> > --D
> > 
> > >  	if (error && !XFS_FORCED_SHUTDOWN(mp))
> > >  		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
> > >  out_invalidate:
> > > -	iomap_invalidatepage(page, 0, PAGE_SIZE);
> > > +	iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
> > >  }
> > >  
> > >  static const struct iomap_writeback_ops xfs_writeback_ops = {
> > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > > index 4d1d3c3469e9..36e0ab19210a 100644
> > > --- a/include/linux/iomap.h
> > > +++ b/include/linux/iomap.h
> > > @@ -220,7 +220,7 @@ struct iomap_writeback_ops {
> > >  	 * Optional, allows the file system to discard state on a page where
> > >  	 * we failed to submit any I/O.
> > >  	 */
> > > -	void (*discard_page)(struct page *page);
> > > +	void (*discard_page)(struct page *page, loff_t fileoff);
> > >  };
> > >  
> > >  struct iomap_writepage_ctx {
> > > -- 
> > > 2.25.4
> > > 
> > 
>
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index bcfc288dba3f..d1f04eabc7e4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1412,14 +1412,15 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * appropriately.
 	 */
 	if (unlikely(error)) {
+		/*
+		 * Let the filesystem know what portion of the current page
+		 * failed to map. If the page wasn't been added to ioend, it
+		 * won't be affected by I/O completion and we must unlock it
+		 * now.
+		 */
+		if (wpc->ops->discard_page)
+			wpc->ops->discard_page(page, file_offset);
 		if (!count) {
-			/*
-			 * If the current page hasn't been added to ioend, it
-			 * won't be affected by I/O completions and we must
-			 * discard and unlock it right here.
-			 */
-			if (wpc->ops->discard_page)
-				wpc->ops->discard_page(page);
 			ClearPageUptodate(page);
 			unlock_page(page);
 			goto done;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b35611882ff9..46920c530b20 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -527,13 +527,14 @@  xfs_prepare_ioend(
  */
 static void
 xfs_discard_page(
-	struct page		*page)
+	struct page		*page,
+	loff_t			fileoff)
 {
 	struct inode		*inode = page->mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	loff_t			offset = page_offset(page);
-	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
+	unsigned int		pageoff = offset_in_page(fileoff);
+	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, fileoff);
 	int			error;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
@@ -541,14 +542,14 @@  xfs_discard_page(
 
 	xfs_alert_ratelimited(mp,
 		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
-			page, ip->i_ino, offset);
+			page, ip->i_ino, fileoff);
 
 	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-			PAGE_SIZE / i_blocksize(inode));
+			(PAGE_SIZE - pageoff) / i_blocksize(inode));
 	if (error && !XFS_FORCED_SHUTDOWN(mp))
 		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
 out_invalidate:
-	iomap_invalidatepage(page, 0, PAGE_SIZE);
+	iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
 }
 
 static const struct iomap_writeback_ops xfs_writeback_ops = {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4d1d3c3469e9..36e0ab19210a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -220,7 +220,7 @@  struct iomap_writeback_ops {
 	 * Optional, allows the file system to discard state on a page where
 	 * we failed to submit any I/O.
 	 */
-	void (*discard_page)(struct page *page);
+	void (*discard_page)(struct page *page, loff_t fileoff);
 };
 
 struct iomap_writepage_ctx {