[v6,08/19] mm: Add readahead address space operation
diff mbox series

Message ID 20200217184613.19668-14-willy@infradead.org
State New
Headers show
Series
  • Change readahead API
Related show

Commit Message

Matthew Wilcox Feb. 17, 2020, 6:45 p.m. UTC
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>

This replaces ->readpages with a saner interface:
 - Return void instead of an ignored error code.
 - Pages are already in the page cache when ->readahead is called.
 - Implementation looks up the pages in the page cache instead of
   having them passed in a linked list.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 Documentation/filesystems/locking.rst |  6 +++++-
 Documentation/filesystems/vfs.rst     | 13 +++++++++++++
 include/linux/fs.h                    |  2 ++
 include/linux/pagemap.h               | 18 ++++++++++++++++++
 mm/readahead.c                        |  8 +++++++-
 5 files changed, 45 insertions(+), 2 deletions(-)

Comments

Dave Chinner Feb. 18, 2020, 6:21 a.m. UTC | #1
On Mon, Feb 17, 2020 at 10:45:54AM -0800, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> This replaces ->readpages with a saner interface:
>  - Return void instead of an ignored error code.
>  - Pages are already in the page cache when ->readahead is called.

Might read better as:

 - Page cache is already populates with locked pages when
   ->readahead is called.

>  - Implementation looks up the pages in the page cache instead of
>    having them passed in a linked list.

Add:

 - cleanup of unused readahead handled by ->readahead caller, not
   the method implementation.

> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  Documentation/filesystems/locking.rst |  6 +++++-
>  Documentation/filesystems/vfs.rst     | 13 +++++++++++++
>  include/linux/fs.h                    |  2 ++
>  include/linux/pagemap.h               | 18 ++++++++++++++++++
>  mm/readahead.c                        |  8 +++++++-
>  5 files changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
> index 5057e4d9dcd1..0ebc4491025a 100644
> --- a/Documentation/filesystems/locking.rst
> +++ b/Documentation/filesystems/locking.rst
> @@ -239,6 +239,7 @@ prototypes::
>  	int (*readpage)(struct file *, struct page *);
>  	int (*writepages)(struct address_space *, struct writeback_control *);
>  	int (*set_page_dirty)(struct page *page);
> +	void (*readahead)(struct readahead_control *);
>  	int (*readpages)(struct file *filp, struct address_space *mapping,
>  			struct list_head *pages, unsigned nr_pages);
>  	int (*write_begin)(struct file *, struct address_space *mapping,
> @@ -271,7 +272,8 @@ writepage:		yes, unlocks (see below)
>  readpage:		yes, unlocks
>  writepages:
>  set_page_dirty		no
> -readpages:
> +readahead:		yes, unlocks
> +readpages:		no
>  write_begin:		locks the page		 exclusive
>  write_end:		yes, unlocks		 exclusive
>  bmap:
> @@ -295,6 +297,8 @@ the request handler (/dev/loop).
>  ->readpage() unlocks the page, either synchronously or via I/O
>  completion.
>  
> +->readahead() unlocks the pages like ->readpage().
> +

"... the pages that I/O is attempted on ..."

>  ->readpages() populates the pagecache with the passed pages and starts
>  I/O against them.  They come unlocked upon I/O completion.
>  
> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> index 7d4d09dd5e6d..81ab30fbe45c 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -706,6 +706,7 @@ cache in your filesystem.  The following members are defined:
>  		int (*readpage)(struct file *, struct page *);
>  		int (*writepages)(struct address_space *, struct writeback_control *);
>  		int (*set_page_dirty)(struct page *page);
> +		void (*readahead)(struct readahead_control *);
>  		int (*readpages)(struct file *filp, struct address_space *mapping,
>  				 struct list_head *pages, unsigned nr_pages);
>  		int (*write_begin)(struct file *, struct address_space *mapping,
> @@ -781,12 +782,24 @@ cache in your filesystem.  The following members are defined:
>  	If defined, it should set the PageDirty flag, and the
>  	PAGECACHE_TAG_DIRTY tag in the radix tree.
>  
> +``readahead``
> +	Called by the VM to read pages associated with the address_space
> +	object.  The pages are consecutive in the page cache and are
> +	locked.  The implementation should decrement the page refcount
> +	after starting I/O on each page.  Usually the page will be
> +	unlocked by the I/O completion handler.  If the function does
> +	not attempt I/O on some pages, the caller will decrement the page
> +	refcount and unlock the pages for you.	Set PageUptodate if the
> +	I/O completes successfully.  Setting PageError on any page will
> +	be ignored; simply unlock the page if an I/O error occurs.
> +
>  ``readpages``
>  	called by the VM to read pages associated with the address_space
>  	object.  This is essentially just a vector version of readpage.
>  	Instead of just one page, several pages are requested.
>  	readpages is only used for read-ahead, so read errors are
>  	ignored.  If anything goes wrong, feel free to give up.
> +	This interface is deprecated; implement readahead instead.

What is the removal schedule for the deprecated interface? 

> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 3613154e79e4..bd4291f78f41 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -665,6 +665,24 @@ static inline void readahead_next(struct readahead_control *rac)
>  #define readahead_for_each(rac, page)					\
>  	for (; (page = readahead_page(rac)); readahead_next(rac))
>  
> +/* The byte offset into the file of this readahead block */
> +static inline loff_t readahead_offset(struct readahead_control *rac)
> +{
> +	return (loff_t)rac->_start * PAGE_SIZE;
> +}

Urk. Didn't an early page use "offset" for the page index? That
was was "mm: Remove 'page_offset' from readahead loop" did, right?

That's just going to cause confusion to have different units for
readahead "offsets"....

> +
> +/* The number of bytes in this readahead block */
> +static inline loff_t readahead_length(struct readahead_control *rac)
> +{
> +	return (loff_t)rac->_nr_pages * PAGE_SIZE;
> +}
> +
> +/* The index of the first page in this readahead block */
> +static inline unsigned int readahead_index(struct readahead_control *rac)
> +{
> +	return rac->_start;
> +}

Based on this, I suspect the earlier patch should use "index" rather
than "offset" when walking the page cache indexes...

> +
>  /* The number of pages in this readahead block */
>  static inline unsigned int readahead_count(struct readahead_control *rac)
>  {
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 9e430daae42f..975ff5e387be 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -121,7 +121,13 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages)
>  
>  	blk_start_plug(&plug);
>  
> -	if (aops->readpages) {
> +	if (aops->readahead) {
> +		aops->readahead(rac);
> +		readahead_for_each(rac, page) {
> +			unlock_page(page);
> +			put_page(page);
> +		}

This needs a comment to explain the unwinding that needs to be done
here. I'm not going to remember in a year's time that this is just
for the pages that weren't submitted by ->readahead....

Cheers,

Dave.
Matthew Wilcox Feb. 18, 2020, 4:10 p.m. UTC | #2
On Tue, Feb 18, 2020 at 05:21:47PM +1100, Dave Chinner wrote:
> On Mon, Feb 17, 2020 at 10:45:54AM -0800, Matthew Wilcox wrote:
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> > 
> > This replaces ->readpages with a saner interface:
> >  - Return void instead of an ignored error code.
> >  - Pages are already in the page cache when ->readahead is called.
> 
> Might read better as:
> 
>  - Page cache is already populates with locked pages when
>    ->readahead is called.

Will do.

> >  - Implementation looks up the pages in the page cache instead of
> >    having them passed in a linked list.
> 
> Add:
> 
>  - cleanup of unused readahead handled by ->readahead caller, not
>    the method implementation.

The readpages caller does that cleanup too, so it's not an advantage
to the readahead interface.

        if (mapping->a_ops->readpages) {
                ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
                /* Clean up the remaining pages */
                put_pages_list(pages);
                goto out;
        }

> >  ``readpages``
> >  	called by the VM to read pages associated with the address_space
> >  	object.  This is essentially just a vector version of readpage.
> >  	Instead of just one page, several pages are requested.
> >  	readpages is only used for read-ahead, so read errors are
> >  	ignored.  If anything goes wrong, feel free to give up.
> > +	This interface is deprecated; implement readahead instead.
> 
> What is the removal schedule for the deprecated interface? 

I mentioned that in the cover letter; once Dave Howells has the fscache
branch merged, I'll do the remaining filesystems.  Should be within the
next couple of merge windows.

> > +/* The byte offset into the file of this readahead block */
> > +static inline loff_t readahead_offset(struct readahead_control *rac)
> > +{
> > +	return (loff_t)rac->_start * PAGE_SIZE;
> > +}
> 
> Urk. Didn't an early page use "offset" for the page index? That
> was was "mm: Remove 'page_offset' from readahead loop" did, right?
> 
> That's just going to cause confusion to have different units for
> readahead "offsets"....

We are ... not consistent anywhere in the VM/VFS with our naming.
Unfortunately.

$ grep -n offset mm/filemap.c 
391: * @start:	offset in bytes where the range starts
...
815:	pgoff_t offset = old->index;
...
2020:	unsigned long offset;      /* offset into pagecache page */
...
2257:	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;

That last one's my favourite.  Not to mention the fine distinction you
and I discussed recently between offset_in_page() and page_offset().

Best of all, even our types encode the ambiguity of an 'offset'.  We have
pgoff_t and loff_t (replacing the earlier off_t).

So, new rule.  'pos' is the number of bytes into a file.  'index' is the
number of PAGE_SIZE pages into a file.  We don't use the word 'offset'
at all.  'length' as a byte count and 'count' as a page count seem like
fine names to me.

> > -	if (aops->readpages) {
> > +	if (aops->readahead) {
> > +		aops->readahead(rac);
> > +		readahead_for_each(rac, page) {
> > +			unlock_page(page);
> > +			put_page(page);
> > +		}
> 
> This needs a comment to explain the unwinding that needs to be done
> here. I'm not going to remember in a year's time that this is just
> for the pages that weren't submitted by ->readahead....

ACK.
John Hubbard Feb. 19, 2020, 12:12 a.m. UTC | #3
On 2/17/20 10:45 AM, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> This replaces ->readpages with a saner interface:
>  - Return void instead of an ignored error code.
>  - Pages are already in the page cache when ->readahead is called.
>  - Implementation looks up the pages in the page cache instead of
>    having them passed in a linked list.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  Documentation/filesystems/locking.rst |  6 +++++-
>  Documentation/filesystems/vfs.rst     | 13 +++++++++++++
>  include/linux/fs.h                    |  2 ++
>  include/linux/pagemap.h               | 18 ++++++++++++++++++
>  mm/readahead.c                        |  8 +++++++-
>  5 files changed, 45 insertions(+), 2 deletions(-)
> 

Looks nice,

    Reviewed-by: John Hubbard <jhubbard@nvidia.com>


thanks,
Dave Chinner Feb. 19, 2020, 1:04 a.m. UTC | #4
On Tue, Feb 18, 2020 at 08:10:04AM -0800, Matthew Wilcox wrote:
> On Tue, Feb 18, 2020 at 05:21:47PM +1100, Dave Chinner wrote:
> > On Mon, Feb 17, 2020 at 10:45:54AM -0800, Matthew Wilcox wrote:
> > > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> > > 
> > > This replaces ->readpages with a saner interface:
> > >  - Return void instead of an ignored error code.
> > >  - Pages are already in the page cache when ->readahead is called.
> > 
> > Might read better as:
> > 
> >  - Page cache is already populates with locked pages when
> >    ->readahead is called.
> 
> Will do.
> 
> > >  - Implementation looks up the pages in the page cache instead of
> > >    having them passed in a linked list.
> > 
> > Add:
> > 
> >  - cleanup of unused readahead handled by ->readahead caller, not
> >    the method implementation.
> 
> The readpages caller does that cleanup too, so it's not an advantage
> to the readahead interface.

Right. I kinda of read the list as "the reasons the new API is sane"
not as "how readahead is different to readpages"....

> > >  ``readpages``
> > >  	called by the VM to read pages associated with the address_space
> > >  	object.  This is essentially just a vector version of readpage.
> > >  	Instead of just one page, several pages are requested.
> > >  	readpages is only used for read-ahead, so read errors are
> > >  	ignored.  If anything goes wrong, feel free to give up.
> > > +	This interface is deprecated; implement readahead instead.
> > 
> > What is the removal schedule for the deprecated interface? 
> 
> I mentioned that in the cover letter; once Dave Howells has the fscache
> branch merged, I'll do the remaining filesystems.  Should be within the
> next couple of merge windows.

Sure, but I like to see actual release tags with the deprecation
notice so that it's obvious to the reader as to whether this is
something new and/or when they can expect it to go away.

> > > +/* The byte offset into the file of this readahead block */
> > > +static inline loff_t readahead_offset(struct readahead_control *rac)
> > > +{
> > > +	return (loff_t)rac->_start * PAGE_SIZE;
> > > +}
> > 
> > Urk. Didn't an early page use "offset" for the page index? That
> > was was "mm: Remove 'page_offset' from readahead loop" did, right?
> > 
> > That's just going to cause confusion to have different units for
> > readahead "offsets"....
> 
> We are ... not consistent anywhere in the VM/VFS with our naming.
> Unfortunately.
> 
> $ grep -n offset mm/filemap.c 
> 391: * @start:	offset in bytes where the range starts
> ...
> 815:	pgoff_t offset = old->index;
> ...
> 2020:	unsigned long offset;      /* offset into pagecache page */
> ...
> 2257:	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
> 
> That last one's my favourite.  Not to mention the fine distinction you
> and I discussed recently between offset_in_page() and page_offset().
> 
> Best of all, even our types encode the ambiguity of an 'offset'.  We have
> pgoff_t and loff_t (replacing the earlier off_t).
> 
> So, new rule.  'pos' is the number of bytes into a file.  'index' is the
> number of PAGE_SIZE pages into a file.  We don't use the word 'offset'
> at all.  'length' as a byte count and 'count' as a page count seem like
> fine names to me.

That sounds very reasonable to me. Another patchset in the making? :)

Cheers,

Dave.
Eric Biggers Feb. 19, 2020, 3:10 a.m. UTC | #5
On Mon, Feb 17, 2020 at 10:45:54AM -0800, Matthew Wilcox wrote:
> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> index 7d4d09dd5e6d..81ab30fbe45c 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -706,6 +706,7 @@ cache in your filesystem.  The following members are defined:
>  		int (*readpage)(struct file *, struct page *);
>  		int (*writepages)(struct address_space *, struct writeback_control *);
>  		int (*set_page_dirty)(struct page *page);
> +		void (*readahead)(struct readahead_control *);
>  		int (*readpages)(struct file *filp, struct address_space *mapping,
>  				 struct list_head *pages, unsigned nr_pages);
>  		int (*write_begin)(struct file *, struct address_space *mapping,
> @@ -781,12 +782,24 @@ cache in your filesystem.  The following members are defined:
>  	If defined, it should set the PageDirty flag, and the
>  	PAGECACHE_TAG_DIRTY tag in the radix tree.
>  
> +``readahead``
> +	Called by the VM to read pages associated with the address_space
> +	object.  The pages are consecutive in the page cache and are
> +	locked.  The implementation should decrement the page refcount
> +	after starting I/O on each page.  Usually the page will be
> +	unlocked by the I/O completion handler.  If the function does
> +	not attempt I/O on some pages, the caller will decrement the page
> +	refcount and unlock the pages for you.	Set PageUptodate if the
> +	I/O completes successfully.  Setting PageError on any page will
> +	be ignored; simply unlock the page if an I/O error occurs.
> +

This is unclear about how "not attempting I/O" works and how that affects who is
responsible for putting and unlocking the pages.  How does the caller know which
pages were not attempted?  Can any arbitrary subset of pages be not attempted,
or just the last N pages?

In the code, the caller actually uses readahead_for_each() to iterate through
and put+unlock the pages.  That implies that ->readahead() must also use
readahead_for_each() as well, in order for the iterator to be advanced
correctly... Right?  And the ownership of each page is transferred to the callee
when the callee advances the iterator past that page.

I don't see how ext4_readahead() and f2fs_readahead() can work at all, given
that they don't advance the iterator.

- Eric
Eric Biggers Feb. 19, 2020, 3:35 a.m. UTC | #6
On Tue, Feb 18, 2020 at 07:10:44PM -0800, Eric Biggers wrote:
> On Mon, Feb 17, 2020 at 10:45:54AM -0800, Matthew Wilcox wrote:
> > diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> > index 7d4d09dd5e6d..81ab30fbe45c 100644
> > --- a/Documentation/filesystems/vfs.rst
> > +++ b/Documentation/filesystems/vfs.rst
> > @@ -706,6 +706,7 @@ cache in your filesystem.  The following members are defined:
> >  		int (*readpage)(struct file *, struct page *);
> >  		int (*writepages)(struct address_space *, struct writeback_control *);
> >  		int (*set_page_dirty)(struct page *page);
> > +		void (*readahead)(struct readahead_control *);
> >  		int (*readpages)(struct file *filp, struct address_space *mapping,
> >  				 struct list_head *pages, unsigned nr_pages);
> >  		int (*write_begin)(struct file *, struct address_space *mapping,
> > @@ -781,12 +782,24 @@ cache in your filesystem.  The following members are defined:
> >  	If defined, it should set the PageDirty flag, and the
> >  	PAGECACHE_TAG_DIRTY tag in the radix tree.
> >  
> > +``readahead``
> > +	Called by the VM to read pages associated with the address_space
> > +	object.  The pages are consecutive in the page cache and are
> > +	locked.  The implementation should decrement the page refcount
> > +	after starting I/O on each page.  Usually the page will be
> > +	unlocked by the I/O completion handler.  If the function does
> > +	not attempt I/O on some pages, the caller will decrement the page
> > +	refcount and unlock the pages for you.	Set PageUptodate if the
> > +	I/O completes successfully.  Setting PageError on any page will
> > +	be ignored; simply unlock the page if an I/O error occurs.
> > +
> 
> This is unclear about how "not attempting I/O" works and how that affects who is
> responsible for putting and unlocking the pages.  How does the caller know which
> pages were not attempted?  Can any arbitrary subset of pages be not attempted,
> or just the last N pages?
> 
> In the code, the caller actually uses readahead_for_each() to iterate through
> and put+unlock the pages.  That implies that ->readahead() must also use
> readahead_for_each() as well, in order for the iterator to be advanced
> correctly... Right?  And the ownership of each page is transferred to the callee
> when the callee advances the iterator past that page.
> 
> I don't see how ext4_readahead() and f2fs_readahead() can work at all, given
> that they don't advance the iterator.
> 

Yep, this patchset immediately crashes on boot with:

BUG: Bad page state in process swapper/0  pfn:02176
page:ffffea00000751d0 refcount:0 mapcount:0 mapping:ffff88807cba0400 index:0x0
ext4_da_aops name:"systemd"
flags: 0x100000000020001(locked|mappedtodisk)
raw: 0100000000020001 dead000000000100 dead000000000122 ffff88807cba0400
raw: 0000000000000000 0000000000000000 00000000ffffffff
page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
bad because of flags: 0x1(locked)
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2-00019-g7203ed9018cb9 #18
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20191223_100556-anatol 04/01/2014
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x7a/0xaa lib/dump_stack.c:118
 bad_page.cold+0x89/0xba mm/page_alloc.c:649
 free_pages_check_bad+0x5d/0x60 mm/page_alloc.c:1050
 free_pages_check mm/page_alloc.c:1059 [inline]
 free_pages_prepare mm/page_alloc.c:1157 [inline]
 free_pcp_prepare+0x1c1/0x200 mm/page_alloc.c:1198
 free_unref_page_prepare mm/page_alloc.c:3011 [inline]
 free_unref_page+0x16/0x70 mm/page_alloc.c:3060
 __put_single_page mm/swap.c:81 [inline]
 __put_page+0x31/0x40 mm/swap.c:115
 put_page include/linux/mm.h:1029 [inline]
 ext4_mpage_readpages+0x778/0x9b0 fs/ext4/readpage.c:405
 ext4_readahead+0x2f/0x40 fs/ext4/inode.c:3242
 read_pages+0x4c/0x200 mm/readahead.c:126
 page_cache_readahead_limit+0x224/0x250 mm/readahead.c:241
 __do_page_cache_readahead mm/readahead.c:266 [inline]
 ra_submit mm/internal.h:62 [inline]
 ondemand_readahead+0x1df/0x4d0 mm/readahead.c:544
 page_cache_sync_readahead+0x2d/0x40 mm/readahead.c:579
 generic_file_buffered_read+0x77e/0xa90 mm/filemap.c:2029
 generic_file_read_iter+0xd4/0x130 mm/filemap.c:2302
 ext4_file_read_iter fs/ext4/file.c:131 [inline]
 ext4_file_read_iter+0x53/0x180 fs/ext4/file.c:114
 call_read_iter include/linux/fs.h:1897 [inline]
 new_sync_read+0x113/0x1a0 fs/read_write.c:414
 __vfs_read+0x21/0x30 fs/read_write.c:427
 vfs_read+0xcb/0x160 fs/read_write.c:461
 kernel_read+0x2c/0x40 fs/read_write.c:440
 prepare_binprm+0x14f/0x190 fs/exec.c:1589
 __do_execve_file.isra.0+0x4c0/0x800 fs/exec.c:1806
 do_execveat_common fs/exec.c:1871 [inline]
 do_execve+0x20/0x30 fs/exec.c:1888
 run_init_process+0xc8/0xcd init/main.c:1279
 try_to_run_init_process+0x10/0x36 init/main.c:1288
 kernel_init+0xac/0xfd init/main.c:1385
 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
Disabling lock debugging due to kernel taint
page:ffffea00000751d0 refcount:0 mapcount:0 mapping:ffff88807cba0400 index:0x0
ext4_da_aops name:"systemd"
flags: 0x100000000020001(locked|mappedtodisk)
raw: 0100000000020001 dead000000000100 dead000000000122 ffff88807cba0400
raw: 0000000000000000 0000000000000000 00000000ffffffff
page dumped because: VM_BUG_ON_PAGE(page_ref_count(page) == 0)


I had to add:

diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index e14841ade6122..cb982088b5225 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -401,8 +401,10 @@ int ext4_mpage_readpages(struct address_space *mapping,
 		else
 			unlock_page(page);
 	next_page:
-		if (rac)
+		if (rac) {
 			put_page(page);
+			readahead_next(rac);
+		}
 	}
 	if (bio)
 		submit_bio(bio);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 87964e4cb6b81..e16b0fe42e2e5 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2238,8 +2238,10 @@ int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac,
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 next_page:
 #endif
-		if (rac)
+		if (rac) {
 			put_page(page);
+			readahead_next(rac);
+		}
 
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 		if (f2fs_compressed_file(inode)) {
Matthew Wilcox Feb. 19, 2020, 4:52 p.m. UTC | #7
On Tue, Feb 18, 2020 at 07:10:44PM -0800, Eric Biggers wrote:
> > +``readahead``
> > +	Called by the VM to read pages associated with the address_space
> > +	object.  The pages are consecutive in the page cache and are
> > +	locked.  The implementation should decrement the page refcount
> > +	after starting I/O on each page.  Usually the page will be
> > +	unlocked by the I/O completion handler.  If the function does
> > +	not attempt I/O on some pages, the caller will decrement the page
> > +	refcount and unlock the pages for you.	Set PageUptodate if the
> > +	I/O completes successfully.  Setting PageError on any page will
> > +	be ignored; simply unlock the page if an I/O error occurs.
> > +
> 
> This is unclear about how "not attempting I/O" works and how that affects who is
> responsible for putting and unlocking the pages.  How does the caller know which
> pages were not attempted?  Can any arbitrary subset of pages be not attempted,
> or just the last N pages?

Changed to:

``readahead``
        Called by the VM to read pages associated with the address_space
        object.  The pages are consecutive in the page cache and are
        locked.  The implementation should decrement the page refcount
        after starting I/O on each page.  Usually the page will be
        unlocked by the I/O completion handler.  If the filesystem decides
        to stop attempting I/O before reaching the end of the readahead
        window, it can simply return.  The caller will decrement the page
        refcount and unlock the remaining pages for you.  Set PageUptodate
        if the I/O completes successfully.  Setting PageError on any page
        will be ignored; simply unlock the page if an I/O error occurs.

Patch
diff mbox series

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 5057e4d9dcd1..0ebc4491025a 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -239,6 +239,7 @@  prototypes::
 	int (*readpage)(struct file *, struct page *);
 	int (*writepages)(struct address_space *, struct writeback_control *);
 	int (*set_page_dirty)(struct page *page);
+	void (*readahead)(struct readahead_control *);
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
 	int (*write_begin)(struct file *, struct address_space *mapping,
@@ -271,7 +272,8 @@  writepage:		yes, unlocks (see below)
 readpage:		yes, unlocks
 writepages:
 set_page_dirty		no
-readpages:
+readahead:		yes, unlocks
+readpages:		no
 write_begin:		locks the page		 exclusive
 write_end:		yes, unlocks		 exclusive
 bmap:
@@ -295,6 +297,8 @@  the request handler (/dev/loop).
 ->readpage() unlocks the page, either synchronously or via I/O
 completion.
 
+->readahead() unlocks the pages like ->readpage().
+
 ->readpages() populates the pagecache with the passed pages and starts
 I/O against them.  They come unlocked upon I/O completion.
 
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 7d4d09dd5e6d..81ab30fbe45c 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -706,6 +706,7 @@  cache in your filesystem.  The following members are defined:
 		int (*readpage)(struct file *, struct page *);
 		int (*writepages)(struct address_space *, struct writeback_control *);
 		int (*set_page_dirty)(struct page *page);
+		void (*readahead)(struct readahead_control *);
 		int (*readpages)(struct file *filp, struct address_space *mapping,
 				 struct list_head *pages, unsigned nr_pages);
 		int (*write_begin)(struct file *, struct address_space *mapping,
@@ -781,12 +782,24 @@  cache in your filesystem.  The following members are defined:
 	If defined, it should set the PageDirty flag, and the
 	PAGECACHE_TAG_DIRTY tag in the radix tree.
 
+``readahead``
+	Called by the VM to read pages associated with the address_space
+	object.  The pages are consecutive in the page cache and are
+	locked.  The implementation should decrement the page refcount
+	after starting I/O on each page.  Usually the page will be
+	unlocked by the I/O completion handler.  If the function does
+	not attempt I/O on some pages, the caller will decrement the page
+	refcount and unlock the pages for you.	Set PageUptodate if the
+	I/O completes successfully.  Setting PageError on any page will
+	be ignored; simply unlock the page if an I/O error occurs.
+
 ``readpages``
 	called by the VM to read pages associated with the address_space
 	object.  This is essentially just a vector version of readpage.
 	Instead of just one page, several pages are requested.
 	readpages is only used for read-ahead, so read errors are
 	ignored.  If anything goes wrong, feel free to give up.
+	This interface is deprecated; implement readahead instead.
 
 ``write_begin``
 	Called by the generic buffered write code to ask the filesystem
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3cd4fe6b845e..d4e2d2964346 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -292,6 +292,7 @@  enum positive_aop_returns {
 struct page;
 struct address_space;
 struct writeback_control;
+struct readahead_control;
 
 /*
  * Write life time hint values.
@@ -375,6 +376,7 @@  struct address_space_operations {
 	 */
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
+	void (*readahead)(struct readahead_control *);
 
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3613154e79e4..bd4291f78f41 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -665,6 +665,24 @@  static inline void readahead_next(struct readahead_control *rac)
 #define readahead_for_each(rac, page)					\
 	for (; (page = readahead_page(rac)); readahead_next(rac))
 
+/* The byte offset into the file of this readahead block */
+static inline loff_t readahead_offset(struct readahead_control *rac)
+{
+	return (loff_t)rac->_start * PAGE_SIZE;
+}
+
+/* The number of bytes in this readahead block */
+static inline loff_t readahead_length(struct readahead_control *rac)
+{
+	return (loff_t)rac->_nr_pages * PAGE_SIZE;
+}
+
+/* The index of the first page in this readahead block */
+static inline unsigned int readahead_index(struct readahead_control *rac)
+{
+	return rac->_start;
+}
+
 /* The number of pages in this readahead block */
 static inline unsigned int readahead_count(struct readahead_control *rac)
 {
diff --git a/mm/readahead.c b/mm/readahead.c
index 9e430daae42f..975ff5e387be 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -121,7 +121,13 @@  static void read_pages(struct readahead_control *rac, struct list_head *pages)
 
 	blk_start_plug(&plug);
 
-	if (aops->readpages) {
+	if (aops->readahead) {
+		aops->readahead(rac);
+		readahead_for_each(rac, page) {
+			unlock_page(page);
+			put_page(page);
+		}
+	} else if (aops->readpages) {
 		aops->readpages(rac->file, rac->mapping, pages,
 				readahead_count(rac));
 		/* Clean up the remaining pages */