[23/24] iomap: add support for sub-pagesize buffered I/O without buffer heads

Message ID 20180615130209.1970-24-hch@lst.de (mailing list archive)
State Superseded

Commit Message

Christoph Hellwig June 15, 2018, 1:02 p.m. UTC
After already supporting a simple implementation of buffered writes for
the blocksize == PAGE_SIZE case in the last commit, this adds full support
even for smaller block sizes.  There are three bits of per-block
information in the buffer_head structure that really matter for the iomap
read and write path:

 - uptodate status (BH_uptodate)
 - marked as currently under read I/O (BH_Async_Read)
 - marked as currently under write I/O (BH_Async_Write)

Instead of having new per-block structures this now adds a per-page
structure called struct iomap_page to track this information in a slightly
different form:

 - a bitmap for the per-block uptodate status.  For the worst case of a
   64k page size system this bitmap needs to contain 128 bits.  For the
   typical 4k page size case it only needs 8 bits, although we still
   need a full unsigned long due to the way the atomic bitmap API works.
 - two atomic_t counters are used to track the outstanding read and write
   counts (see the sketch below)
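
For illustration, the per-page tracking boils down to the sketch below
(the real definition is struct iomap_page in include/linux/iomap.h; the
comments here are just annotation):

	struct iomap_page {
		atomic_t	read_count;	/* outstanding reads against this page */
		atomic_t	write_count;	/* outstanding writes against this page */
		/*
		 * One bit per 512-byte block: 64k / 512 = 128 bits in the
		 * worst case, 4k / 512 = 8 bits in the common case, rounded
		 * up to whole unsigned longs by DECLARE_BITMAP().
		 */
		DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
	};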

There is quite a bit of boilerplate code as the buffered I/O path uses
various helper methods, but the actual code is very straightforward.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c            | 262 ++++++++++++++++++++++++++++++++++++++----
 include/linux/iomap.h |  31 +++++
 2 files changed, 272 insertions(+), 21 deletions(-)

Comments

Brian Foster June 19, 2018, 4:52 p.m. UTC | #1
On Fri, Jun 15, 2018 at 03:02:08PM +0200, Christoph Hellwig wrote:
> After already supporting a simple implementation of buffered writes for
> the blocksize == PAGE_SIZE case in the last commit this adds full support
> even for smaller block sizes.   There are three bits of per-block
> information in the buffer_head structure that really matter for the iomap
> read and write path:
> 
>  - uptodate status (BH_uptodate)
>  - marked as currently under read I/O (BH_Async_Read)
>  - marked as currently under write I/O (BH_Async_Write)
> 
> Instead of having new per-block structures this now adds a per-page
> structure called struct iomap_page to track this information in a slightly
> different form:
> 
>  - a bitmap for the per-block uptodate status.  For worst case of a 64k
>    page size system this bitmap needs to contain 128 bits.  For the
>    typical 4k page size case it only needs 8 bits, although we still
>    need a full unsigned long due to the way the atomic bitmap API works.
>  - two atomic_t counters are used to track the outstanding read and write
>    counts
> 
> There is quite a bit of boilerplate code as the buffered I/O path uses
> various helper methods, but the actual code is very straight forward.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap.c            | 262 ++++++++++++++++++++++++++++++++++++++----
>  include/linux/iomap.h |  31 +++++
>  2 files changed, 272 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index a504077bb38f..c59d1922991d 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
...
> @@ -197,7 +322,13 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  
>  	__bio_add_page(ctx->bio, page, plen, poff);
>  done:
> -	return plen;
> +	/*
> +	 * Move the caller beyond our range so that it keeps making progress.
> +	 * For that we have to include any leading non-uptodate ranges, but

Do you mean "leading uptodate ranges" here? E.g., pos is pushed forward
past those ranges we don't have to read, so (pos - orig_pos) reflects
the initial uptodate range while plen reflects the length we have to
read..?
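
(Purely as an illustration: with 1k blocks on a 4k page, orig_pos = 0 and
only block 0 already uptodate, iomap_adjust_read_range() advances pos to
1024 and returns plen = 3072, so the actor returns 1024 + 3072 = 4096 and
the caller moves past the whole page.  If block 3 were uptodate as well,
plen would be truncated to 2048, the actor would return 3072, and the
trailing uptodate block would be picked up in the next iteration.)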

> +	 * we can skip trailing ones as they will be handled in the next
> +	 * iteration.
> +	 */
> +	return pos - orig_pos + plen;
>  }
>  
>  int
...
> @@ -373,21 +581,33 @@ static int
>  __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
>  		struct page *page, struct iomap *iomap)
>  {
> +	struct iomap_page *iop = iomap_page_create(inode, page);
>  	loff_t block_size = i_blocksize(inode);
>  	loff_t block_start = pos & ~(block_size - 1);
>  	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
> -	unsigned poff = block_start & (PAGE_SIZE - 1);
> -	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
> -	unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
> -
> -	WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
> +	unsigned from = pos & (PAGE_SIZE - 1), to = from + len, poff, plen;
> +	int status = 0;
>  
>  	if (PageUptodate(page))
>  		return 0;
> -	if (from <= poff && to >= poff + plen)
> -		return 0;
> -	return iomap_read_page_sync(inode, block_start, page,
> -			poff, plen, from, to, iomap);
> +
> +	do {

Kind of a nit, but this catches my eye and manages to confuse me every
time I look at it. A comment along the lines of:

                /*
		 * Pass in the block aligned start/end so we get back block
		 * aligned/adjusted poff/plen and can compare with unaligned
		 * from/to below.
                 */

... would be nice here, IMO.

> +		iomap_adjust_read_range(inode, iop, &block_start,
> +				block_end - block_start, &poff, &plen);
> +		if (plen == 0)
> +			break;
> +
> +		if ((from > poff && from < poff + plen) ||
> +		    (to > poff && to < poff + plen)) {
> +			status = iomap_read_page_sync(inode, block_start, page,
> +					poff, plen, from, to, iomap);

After taking another look at the buffer head path, it does look like we
have slightly different behavior here. IIUC, the former reads only the
!uptodate blocks that fall along the from/to boundaries. Here, if say
from = 1, to = PAGE_SIZE and the page is fully !uptodate, it looks like
we'd read the entire page worth of blocks (assuming contiguous 512b
blocks, for example). Intentional? Doesn't seem like a big deal, but
could be worth a followup fix.
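
(To make that concrete, and purely as an illustration: with 512b blocks
and a fully !uptodate page, block_start = 0 and block_end = PAGE_SIZE, so
iomap_adjust_read_range() returns poff = 0 and plen = PAGE_SIZE, the
"from > poff && from < poff + plen" check succeeds, and
iomap_read_page_sync() reads the whole page, whereas the buffer head path
would only have read the single block straddling the from boundary.)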

Brian

> +			if (status)
> +				break;
> +		}
> +
> +	} while ((block_start += plen) < block_end);
> +
> +	return status;
>  }
>  
>  static int
> @@ -470,7 +690,7 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
>  	if (unlikely(copied < len && !PageUptodate(page))) {
>  		copied = 0;
>  	} else {
> -		SetPageUptodate(page);
> +		iomap_set_range_uptodate(page, pos & (PAGE_SIZE - 1), len);
>  		iomap_set_page_dirty(page);
>  	}
>  	return __generic_write_end(inode, pos, copied, page);
> @@ -806,7 +1026,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
>  		block_commit_write(page, 0, length);
>  	} else {
>  		WARN_ON_ONCE(!PageUptodate(page));
> -		WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
> +		iomap_page_create(inode, page);
>  	}
>  
>  	return length;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index c2706cfb27c7..60b196c54dd6 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -2,6 +2,9 @@
>  #ifndef LINUX_IOMAP_H
>  #define LINUX_IOMAP_H 1
>  
> +#include <linux/atomic.h>
> +#include <linux/bitmap.h>
> +#include <linux/mm.h>
>  #include <linux/types.h>
>  
>  struct address_space;
> @@ -99,12 +102,40 @@ struct iomap_ops {
>  			ssize_t written, unsigned flags, struct iomap *iomap);
>  };
>  
> +/*
> + * Structure allocate for each page when block size < PAGE_SIZE to track
> + * sub-page uptodate status and I/O completions.
> + */
> +struct iomap_page {
> +	atomic_t		read_count;
> +	atomic_t		write_count;
> +	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
> +};
> +
> +static inline struct iomap_page *to_iomap_page(struct page *page)
> +{
> +	if (page_has_private(page))
> +		return (struct iomap_page *)page_private(page);
> +	return NULL;
> +}
> +
>  ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
>  		const struct iomap_ops *ops);
>  int iomap_readpage(struct page *page, const struct iomap_ops *ops);
>  int iomap_readpages(struct address_space *mapping, struct list_head *pages,
>  		unsigned nr_pages, const struct iomap_ops *ops);
>  int iomap_set_page_dirty(struct page *page);
> +int iomap_is_partially_uptodate(struct page *page, unsigned long from,
> +		unsigned long count);
> +int iomap_releasepage(struct page *page, gfp_t gfp_mask);
> +void iomap_invalidatepage(struct page *page, unsigned int offset,
> +		unsigned int len);
> +#ifdef CONFIG_MIGRATION
> +int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
> +		struct page *page, enum migrate_mode mode);
> +#else
> +#define iomap_migrate_page NULL
> +#endif
>  int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
>  		const struct iomap_ops *ops);
>  int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
> -- 
> 2.17.1
> 
Christoph Hellwig June 20, 2018, 7:56 a.m. UTC | #2
On Tue, Jun 19, 2018 at 12:52:11PM -0400, Brian Foster wrote:
> > +	/*
> > +	 * Move the caller beyond our range so that it keeps making progress.
> > +	 * For that we have to include any leading non-uptodate ranges, but
> 
> Do you mean "leading uptodate ranges" here? E.g., pos is pushed forward
> past those ranges we don't have to read, so (pos - orig_pos) reflects
> the initial uptodate range while plen reflects the length we have to
> read..?

Yes.

> > +
> > +	do {
> 
> Kind of a nit, but this catches my eye and manages to confuse me every
> time I look at it. A comment along the lines of:
> 
>                 /*
> 		 * Pass in the block aligned start/end so we get back block
> 		 * aligned/adjusted poff/plen and can compare with unaligned
> 		 * from/to below.
>                  */
> 
> ... would be nice here, IMO.

Fine with me.

> > +		iomap_adjust_read_range(inode, iop, &block_start,
> > +				block_end - block_start, &poff, &plen);
> > +		if (plen == 0)
> > +			break;
> > +
> > +		if ((from > poff && from < poff + plen) ||
> > +		    (to > poff && to < poff + plen)) {
> > +			status = iomap_read_page_sync(inode, block_start, page,
> > +					poff, plen, from, to, iomap);
> 
> After taking another look at the buffer head path, it does look like we
> have slightly different behavior here. IIUC, the former reads only the
> !uptodate blocks that fall along the from/to boundaries. Here, if say
> from = 1, to = PAGE_SIZE and the page is fully !uptodate, it looks like
> we'd read the entire page worth of blocks (assuming contiguous 512b
> blocks, for example). Intentional? Doesn't seem like a big deal, but
> could be worth a followup fix.

It wasn't actually intentional, but I think it is the right thing
in the end, as it means we'll often do a single read instead of two
separate ones.
Brian Foster June 20, 2018, 2:32 p.m. UTC | #3
Sending again without the attachment... Christoph, let me know if it
didn't hit your mbox at least.

On Wed, Jun 20, 2018 at 09:56:55AM +0200, Christoph Hellwig wrote:
> On Tue, Jun 19, 2018 at 12:52:11PM -0400, Brian Foster wrote:
> > > +	/*
> > > +	 * Move the caller beyond our range so that it keeps making progress.
> > > +	 * For that we have to include any leading non-uptodate ranges, but
> > 
> > Do you mean "leading uptodate ranges" here? E.g., pos is pushed forward
> > past those ranges we don't have to read, so (pos - orig_pos) reflects
> > the initial uptodate range while plen reflects the length we have to
> > read..?
> 
> Yes.
> 
> > > +
> > > +	do {
> > 
> > Kind of a nit, but this catches my eye and manages to confuse me every
> > time I look at it. A comment along the lines of:
> > 
> >                 /*
> > 		 * Pass in the block aligned start/end so we get back block
> > 		 * aligned/adjusted poff/plen and can compare with unaligned
> > 		 * from/to below.
> >                  */
> > 
> > ... would be nice here, IMO.
> 
> Fine with me.
> 
> > > +		iomap_adjust_read_range(inode, iop, &block_start,
> > > +				block_end - block_start, &poff, &plen);
> > > +		if (plen == 0)
> > > +			break;
> > > +
> > > +		if ((from > poff && from < poff + plen) ||
> > > +		    (to > poff && to < poff + plen)) {
> > > +			status = iomap_read_page_sync(inode, block_start, page,
> > > +					poff, plen, from, to, iomap);
> > 
> > After taking another look at the buffer head path, it does look like we
> > have slightly different behavior here. IIUC, the former reads only the
> > !uptodate blocks that fall along the from/to boundaries. Here, if say
> > from = 1, to = PAGE_SIZE and the page is fully !uptodate, it looks like
> > we'd read the entire page worth of blocks (assuming contiguous 512b
> > blocks, for example). Intentional? Doesn't seem like a big deal, but
> > could be worth a followup fix.
> 
> It wasn't actuall intentional, but I actually think it is the right thing
> in then end, as it means we'll often do a single read instead of two
> separate ones.

Ok, but if that's the argument, then shouldn't we also avoid doing two
separate I/Os when the middle range of a write happens to be already
uptodate? Or, for that matter, when the page happens to be sparsely
uptodate for whatever reason?

OTOH, I also do wonder a bit whether that may always be the right thing
if we consider cases like 64k page size arches and whatnot. It seems
like we could end up consuming more bandwidth for reads than we
typically have in the past. That said, unless there's a functional
reason to change this I think it's fine to optimize this path for these
kinds of corner cases in follow-on patches.

Finally, this survived xfstests on a sub-page block size fs but I
managed to hit an fsx error:

Mapped Read: non-zero data past EOF (0x21a1f) page offset 0xc00 is
0xc769

It repeats 100% of the time for me using the attached fsxops file (with
--replay-ops) on XFS w/ -bsize=1k. It doesn't occur without the final
patch to enable sub-page block iomap on XFS.

Brian

Darrick J. Wong June 20, 2018, 4:08 p.m. UTC | #4
On Wed, Jun 20, 2018 at 10:32:53AM -0400, Brian Foster wrote:
> Sending again without the attachment... Christoph, let me know if it
> didn't hit your mbox at least.
> 
> On Wed, Jun 20, 2018 at 09:56:55AM +0200, Christoph Hellwig wrote:
> > On Tue, Jun 19, 2018 at 12:52:11PM -0400, Brian Foster wrote:
> > > > +	/*
> > > > +	 * Move the caller beyond our range so that it keeps making progress.
> > > > +	 * For that we have to include any leading non-uptodate ranges, but
> > > 
> > > Do you mean "leading uptodate ranges" here? E.g., pos is pushed forward
> > > past those ranges we don't have to read, so (pos - orig_pos) reflects
> > > the initial uptodate range while plen reflects the length we have to
> > > read..?
> > 
> > Yes.
> > 
> > > > +
> > > > +	do {
> > > 
> > > Kind of a nit, but this catches my eye and manages to confuse me every
> > > time I look at it. A comment along the lines of:
> > > 
> > >                 /*
> > > 		 * Pass in the block aligned start/end so we get back block
> > > 		 * aligned/adjusted poff/plen and can compare with unaligned
> > > 		 * from/to below.
> > >                  */
> > > 
> > > ... would be nice here, IMO.
> > 
> > Fine with me.
> > 
> > > > +		iomap_adjust_read_range(inode, iop, &block_start,
> > > > +				block_end - block_start, &poff, &plen);
> > > > +		if (plen == 0)
> > > > +			break;
> > > > +
> > > > +		if ((from > poff && from < poff + plen) ||
> > > > +		    (to > poff && to < poff + plen)) {
> > > > +			status = iomap_read_page_sync(inode, block_start, page,
> > > > +					poff, plen, from, to, iomap);
> > > 
> > > After taking another look at the buffer head path, it does look like we
> > > have slightly different behavior here. IIUC, the former reads only the
> > > !uptodate blocks that fall along the from/to boundaries. Here, if say
> > > from = 1, to = PAGE_SIZE and the page is fully !uptodate, it looks like
> > > we'd read the entire page worth of blocks (assuming contiguous 512b
> > > blocks, for example). Intentional? Doesn't seem like a big deal, but
> > > could be worth a followup fix.
> > 
> > It wasn't actuall intentional, but I actually think it is the right thing
> > in then end, as it means we'll often do a single read instead of two
> > separate ones.
> 
> Ok, but if that's the argument, then shouldn't we not be doing two
> separate I/Os if the middle range of a write happens to be already
> uptodate? Or more for that matter, if the page happens to be sparsely
> uptodate for whatever reason..?
> 
> OTOH, I also do wonder a bit whether that may always be the right thing
> if we consider cases like 64k page size arches and whatnot. It seems
> like we could end up consuming more bandwidth for reads than we
> typically have in the past. That said, unless there's a functional
> reason to change this I think it's fine to optimize this path for these
> kinds of corner cases in follow on patches.
> 
> Finally, this survived xfstests on a sub-page block size fs but I
> managed to hit an fsx error:
> 
> Mapped Read: non-zero data past EOF (0x21a1f) page offset 0xc00 is
> 0xc769
> 
> It repeats 100% of the time for me using the attached fsxops file (with
> --replay-ops) on XFS w/ -bsize=1k. It doesn't occur without the final
> patch to enable sub-page block iomap on XFS.

Funny, because I saw the exact same complaint from generic/127 last
night on my development tree that doesn't include hch's patches and was
going to see if I could figure out what's going on.

FWIW it's been happening sporadically for a few weeks now but every time
I've tried to analyze it I (of course) couldn't get it to reproduce. :)

I also ran this series (all of it, including the subpagesize config)
last night and, aside from stumbling over an unrelated locking problem,
it seemed fine....

--D

> Brian
> 
Brian Foster June 20, 2018, 6:12 p.m. UTC | #5
On Wed, Jun 20, 2018 at 09:08:03AM -0700, Darrick J. Wong wrote:
> On Wed, Jun 20, 2018 at 10:32:53AM -0400, Brian Foster wrote:
> > Sending again without the attachment... Christoph, let me know if it
> > didn't hit your mbox at least.
> > 
> > On Wed, Jun 20, 2018 at 09:56:55AM +0200, Christoph Hellwig wrote:
> > > On Tue, Jun 19, 2018 at 12:52:11PM -0400, Brian Foster wrote:
> > > > > +	/*
> > > > > +	 * Move the caller beyond our range so that it keeps making progress.
> > > > > +	 * For that we have to include any leading non-uptodate ranges, but
> > > > 
> > > > Do you mean "leading uptodate ranges" here? E.g., pos is pushed forward
> > > > past those ranges we don't have to read, so (pos - orig_pos) reflects
> > > > the initial uptodate range while plen reflects the length we have to
> > > > read..?
> > > 
> > > Yes.
> > > 
> > > > > +
> > > > > +	do {
> > > > 
> > > > Kind of a nit, but this catches my eye and manages to confuse me every
> > > > time I look at it. A comment along the lines of:
> > > > 
> > > >                 /*
> > > > 		 * Pass in the block aligned start/end so we get back block
> > > > 		 * aligned/adjusted poff/plen and can compare with unaligned
> > > > 		 * from/to below.
> > > >                  */
> > > > 
> > > > ... would be nice here, IMO.
> > > 
> > > Fine with me.
> > > 
> > > > > +		iomap_adjust_read_range(inode, iop, &block_start,
> > > > > +				block_end - block_start, &poff, &plen);
> > > > > +		if (plen == 0)
> > > > > +			break;
> > > > > +
> > > > > +		if ((from > poff && from < poff + plen) ||
> > > > > +		    (to > poff && to < poff + plen)) {
> > > > > +			status = iomap_read_page_sync(inode, block_start, page,
> > > > > +					poff, plen, from, to, iomap);
> > > > 
> > > > After taking another look at the buffer head path, it does look like we
> > > > have slightly different behavior here. IIUC, the former reads only the
> > > > !uptodate blocks that fall along the from/to boundaries. Here, if say
> > > > from = 1, to = PAGE_SIZE and the page is fully !uptodate, it looks like
> > > > we'd read the entire page worth of blocks (assuming contiguous 512b
> > > > blocks, for example). Intentional? Doesn't seem like a big deal, but
> > > > could be worth a followup fix.
> > > 
> > > It wasn't actuall intentional, but I actually think it is the right thing
> > > in then end, as it means we'll often do a single read instead of two
> > > separate ones.
> > 
> > Ok, but if that's the argument, then shouldn't we not be doing two
> > separate I/Os if the middle range of a write happens to be already
> > uptodate? Or more for that matter, if the page happens to be sparsely
> > uptodate for whatever reason..?
> > 
> > OTOH, I also do wonder a bit whether that may always be the right thing
> > if we consider cases like 64k page size arches and whatnot. It seems
> > like we could end up consuming more bandwidth for reads than we
> > typically have in the past. That said, unless there's a functional
> > reason to change this I think it's fine to optimize this path for these
> > kinds of corner cases in follow on patches.
> > 
> > Finally, this survived xfstests on a sub-page block size fs but I
> > managed to hit an fsx error:
> > 
> > Mapped Read: non-zero data past EOF (0x21a1f) page offset 0xc00 is
> > 0xc769
> > 
> > It repeats 100% of the time for me using the attached fsxops file (with
> > --replay-ops) on XFS w/ -bsize=1k. It doesn't occur without the final
> > patch to enable sub-page block iomap on XFS.
> 
> Funny, because I saw the exact same complaint from generic/127 last
> night on my development tree that doesn't include hch's patches and was
> going to see if I could figure out what's going on.
> 
> FWIW it's been happening sporadically for a few weeks now but every time
> I've tried to analyze it I (of course) couldn't get it to reproduce. :)
> 
> I also ran this series (all of it, including the subpagesize config)
> last night and aside from it stumbling over an unrelated locking problem
> seemed fine....
> 

That's interesting. Perhaps it's a pre-existing issue in that case and
the iomap stuff just changes the timing to make it reliably reproducible
on this particular system.

I only ran it a handful of times in both cases and now have lost access
to the server. Once I regain access, I'll try running for longer on
for-next to see if the same thing eventually triggers.

Brian

> --D
> 
> > Brian
> > 
Christoph Hellwig June 21, 2018, 7:53 a.m. UTC | #6
On Wed, Jun 20, 2018 at 10:32:53AM -0400, Brian Foster wrote:
> Sending again without the attachment... Christoph, let me know if it
> didn't hit your mbox at least.

I did get the previous one as well as this one.

Patch

diff --git a/fs/iomap.c b/fs/iomap.c
index a504077bb38f..c59d1922991d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -17,6 +17,7 @@ 
 #include <linux/iomap.h>
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
@@ -104,6 +105,121 @@  iomap_sector(struct iomap *iomap, loff_t pos)
 	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
 }
 
+static struct iomap_page *
+iomap_page_create(struct inode *inode, struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (iop || i_blocksize(inode) == PAGE_SIZE)
+		return iop;
+
+	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
+	atomic_set(&iop->read_count, 0);
+	atomic_set(&iop->write_count, 0);
+	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+	set_page_private(page, (unsigned long)iop);
+	SetPagePrivate(page);
+	return iop;
+}
+
+static void
+iomap_page_release(struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (!iop)
+		return;
+	WARN_ON_ONCE(atomic_read(&iop->read_count));
+	WARN_ON_ONCE(atomic_read(&iop->write_count));
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	kfree(iop);
+}
+
+/*
+ * Calculate the range inside the page that we actually need to read.
+ */
+static void
+iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
+		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
+{
+	unsigned poff = *pos & (PAGE_SIZE - 1);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+
+	if (iop) {
+		unsigned block_size = i_blocksize(inode);
+		unsigned first = poff >> inode->i_blkbits;
+		unsigned last = (poff + plen - 1) >> inode->i_blkbits;
+		unsigned int i;
+
+		/* move forward for each leading block marked uptodate */
+		for (i = first; i <= last; i++) {
+			if (!test_bit(i, iop->uptodate))
+				break;
+			*pos += block_size;
+			poff += block_size;
+			plen -= block_size;
+		}
+
+		/* truncate len if we find any trailing uptodate block(s) */
+		for ( ; i <= last; i++) {
+			if (test_bit(i, iop->uptodate)) {
+				plen -= (last - i + 1) * block_size;
+				break;
+			}
+		}
+	}
+
+	*offp = poff;
+	*lenp = plen;
+}
+
+static void
+iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = off >> inode->i_blkbits;
+	unsigned last = (off + len - 1) >> inode->i_blkbits;
+	unsigned int i;
+	bool uptodate = true;
+
+	if (iop) {
+		for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
+			if (i >= first && i <= last)
+				set_bit(i, iop->uptodate);
+			else if (!test_bit(i, iop->uptodate))
+				uptodate = false;
+		}
+	}
+
+	if (uptodate && !PageError(page))
+		SetPageUptodate(page);
+}
+
+static void
+iomap_read_finish(struct iomap_page *iop, struct page *page)
+{
+	if (!iop || atomic_dec_and_test(&iop->read_count))
+		unlock_page(page);
+}
+
+static void
+iomap_read_page_end_io(struct bio_vec *bvec, int error)
+{
+	struct page *page = bvec->bv_page;
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (unlikely(error)) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
+	}
+
+	iomap_read_finish(iop, page);
+}
+
 static void
 iomap_read_inline_data(struct inode *inode, struct page *page,
 		struct iomap *iomap)
@@ -132,7 +248,7 @@  iomap_read_end_io(struct bio *bio)
 	int i;
 
 	bio_for_each_segment_all(bvec, bio, i)
-		page_endio(bvec->bv_page, false, error);
+		iomap_read_page_end_io(bvec, error);
 	bio_put(bio);
 }
 
@@ -150,18 +266,19 @@  iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 {
 	struct iomap_readpage_ctx *ctx = data;
 	struct page *page = ctx->cur_page;
-	unsigned poff = pos & (PAGE_SIZE - 1);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	bool is_contig = false;
+	loff_t orig_pos = pos;
+	unsigned poff, plen;
 	sector_t sector;
 
-	/* we don't support blocksize < PAGE_SIZE quite yet. */
-	WARN_ON_ONCE(pos != page_offset(page));
-	WARN_ON_ONCE(plen != PAGE_SIZE);
+	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
+	if (plen == 0)
+		goto done;
 
 	if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
 		zero_user(page, poff, plen);
-		SetPageUptodate(page);
+		iomap_set_range_uptodate(page, poff, plen);
 		goto done;
 	}
 
@@ -177,6 +294,14 @@  iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		is_contig = true;
 	}
 
+	/*
+	 * If we start a new segment we need to increase the read count, and we
+	 * need to do so before submitting any previous full bio to make sure
+	 * that we don't prematurely unlock the page.
+	 */
+	if (iop)
+		atomic_inc(&iop->read_count);
+
 	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
 		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -197,7 +322,13 @@  iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 	__bio_add_page(ctx->bio, page, plen, poff);
 done:
-	return plen;
+	/*
+	 * Move the caller beyond our range so that it keeps making progress.
+	 * For that we have to include any leading non-uptodate ranges, but
+	 * we can skip trailing ones as they will be handled in the next
+	 * iteration.
+	 */
+	return pos - orig_pos + plen;
 }
 
 int
@@ -208,8 +339,6 @@  iomap_readpage(struct page *page, const struct iomap_ops *ops)
 	unsigned poff;
 	loff_t ret;
 
-	WARN_ON_ONCE(page_has_buffers(page));
-
 	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
 		ret = iomap_apply(inode, page_offset(page) + poff,
 				PAGE_SIZE - poff, 0, ops, &ctx,
@@ -335,6 +464,84 @@  iomap_readpages(struct address_space *mapping, struct list_head *pages,
 }
 EXPORT_SYMBOL_GPL(iomap_readpages);
 
+int
+iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = from >> inode->i_blkbits;
+	unsigned last = (from + count - 1) >> inode->i_blkbits;
+	unsigned i;
+
+	if (iop) {
+		for (i = first; i <= last; i++)
+			if (!test_bit(i, iop->uptodate))
+				return 0;
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
+
+int
+iomap_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	/*
+	 * mm accommodates an old ext3 case where clean pages might not have had
+	 * the dirty bit cleared. Thus, it can send actual dirty pages to
+	 * ->releasepage() via shrink_active_list(), skip those here.
+	 */
+	if (PageDirty(page) || PageWriteback(page))
+		return 0;
+	iomap_page_release(page);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(iomap_releasepage);
+
+void
+iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
+{
+	/*
+	 * If we are invalidating the entire page, clear the dirty state from it
+	 * and release it to avoid unnecessary buildup of the LRU.
+	 */
+	if (offset == 0 && len == PAGE_SIZE) {
+		WARN_ON_ONCE(PageWriteback(page));
+		cancel_dirty_page(page);
+		iomap_page_release(page);
+	}
+}
+EXPORT_SYMBOL_GPL(iomap_invalidatepage);
+
+#ifdef CONFIG_MIGRATION
+int
+iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode)
+{
+	int ret;
+
+	ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+	if (ret != MIGRATEPAGE_SUCCESS)
+		return ret;
+
+	if (page_has_private(page)) {
+		ClearPagePrivate(page);
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+		SetPagePrivate(newpage);
+	}
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		migrate_page_copy(newpage, page);
+	else
+		migrate_page_states(newpage, page);
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(iomap_migrate_page);
+#endif /* CONFIG_MIGRATION */
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -358,6 +565,7 @@  iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
 
 	if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
 		zero_user_segments(page, poff, from, to, poff + plen);
+		iomap_set_range_uptodate(page, poff, plen);
 		return 0;
 	}
 
@@ -373,21 +581,33 @@  static int
 __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
 		struct page *page, struct iomap *iomap)
 {
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	loff_t block_size = i_blocksize(inode);
 	loff_t block_start = pos & ~(block_size - 1);
 	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
-	unsigned poff = block_start & (PAGE_SIZE - 1);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
-	unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
-
-	WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+	unsigned from = pos & (PAGE_SIZE - 1), to = from + len, poff, plen;
+	int status = 0;
 
 	if (PageUptodate(page))
 		return 0;
-	if (from <= poff && to >= poff + plen)
-		return 0;
-	return iomap_read_page_sync(inode, block_start, page,
-			poff, plen, from, to, iomap);
+
+	do {
+		iomap_adjust_read_range(inode, iop, &block_start,
+				block_end - block_start, &poff, &plen);
+		if (plen == 0)
+			break;
+
+		if ((from > poff && from < poff + plen) ||
+		    (to > poff && to < poff + plen)) {
+			status = iomap_read_page_sync(inode, block_start, page,
+					poff, plen, from, to, iomap);
+			if (status)
+				break;
+		}
+
+	} while ((block_start += plen) < block_end);
+
+	return status;
 }
 
 static int
@@ -470,7 +690,7 @@  __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 	if (unlikely(copied < len && !PageUptodate(page))) {
 		copied = 0;
 	} else {
-		SetPageUptodate(page);
+		iomap_set_range_uptodate(page, pos & (PAGE_SIZE - 1), len);
 		iomap_set_page_dirty(page);
 	}
 	return __generic_write_end(inode, pos, copied, page);
@@ -806,7 +1026,7 @@  iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 		block_commit_write(page, 0, length);
 	} else {
 		WARN_ON_ONCE(!PageUptodate(page));
-		WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+		iomap_page_create(inode, page);
 	}
 
 	return length;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index c2706cfb27c7..60b196c54dd6 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -2,6 +2,9 @@ 
 #ifndef LINUX_IOMAP_H
 #define LINUX_IOMAP_H 1
 
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/mm.h>
 #include <linux/types.h>
 
 struct address_space;
@@ -99,12 +102,40 @@  struct iomap_ops {
 			ssize_t written, unsigned flags, struct iomap *iomap);
 };
 
+/*
+ * Structure allocated for each page when block size < PAGE_SIZE to track
+ * sub-page uptodate status and I/O completions.
+ */
+struct iomap_page {
+	atomic_t		read_count;
+	atomic_t		write_count;
+	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+};
+
+static inline struct iomap_page *to_iomap_page(struct page *page)
+{
+	if (page_has_private(page))
+		return (struct iomap_page *)page_private(page);
+	return NULL;
+}
+
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
 int iomap_readpage(struct page *page, const struct iomap_ops *ops);
 int iomap_readpages(struct address_space *mapping, struct list_head *pages,
 		unsigned nr_pages, const struct iomap_ops *ops);
 int iomap_set_page_dirty(struct page *page);
+int iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count);
+int iomap_releasepage(struct page *page, gfp_t gfp_mask);
+void iomap_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int len);
+#ifdef CONFIG_MIGRATION
+int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode);
+#else
+#define iomap_migrate_page NULL
+#endif
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
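
For context, a minimal sketch of how a filesystem could wire the new
exports into its address_space_operations (not part of this patch; the
example_* names, including example_iomap_ops, are hypothetical):

	#include <linux/fs.h>
	#include <linux/iomap.h>

	/* A real filesystem supplies its own iomap_begin/iomap_end here. */
	extern const struct iomap_ops example_iomap_ops;

	static int example_readpage(struct file *file, struct page *page)
	{
		return iomap_readpage(page, &example_iomap_ops);
	}

	static int example_readpages(struct file *file, struct address_space *mapping,
			struct list_head *pages, unsigned nr_pages)
	{
		return iomap_readpages(mapping, pages, nr_pages, &example_iomap_ops);
	}

	static const struct address_space_operations example_aops = {
		.readpage		= example_readpage,
		.readpages		= example_readpages,
		.set_page_dirty		= iomap_set_page_dirty,
		.releasepage		= iomap_releasepage,
		.invalidatepage		= iomap_invalidatepage,
		.is_partially_uptodate	= iomap_is_partially_uptodate,
		.migratepage		= iomap_migrate_page,
	};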