Message ID: 20200219210103.32400-10-willy@infradead.org (mailing list archive)
State: New, archived
Series: Change readahead API
On 2/19/20 1:00 PM, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> When populating the page cache for readahead, mappings that use
> ->readpages must populate the page cache themselves as the pages are
> passed on a linked list which would normally be used for the page cache's
> LRU. For mappings that use ->readpage or the upcoming ->readahead method,
> we can put the pages into the page cache as soon as they're allocated,
> which solves a race between readahead and direct IO. It also lets us
> remove the gfp argument from read_pages().
> 
> Use the new readahead_page() API to implement the repeated calls to
> ->readpage(), just like most filesystems will. This iterator also
> supports huge pages, even though none of the filesystems have been
> converted to use them yet.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/pagemap.h | 20 +++++++++++++++++
>  mm/readahead.c          | 48 +++++++++++++++++++++++++----------------
>  2 files changed, 49 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 55fcea0249e6..4989d330fada 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -647,8 +647,28 @@ struct readahead_control {
>  	/* private: use the readahead_* accessors instead */
>  	pgoff_t _index;
>  	unsigned int _nr_pages;
> +	unsigned int _batch_count;
>  };
>  
> +static inline struct page *readahead_page(struct readahead_control *rac)
> +{
> +	struct page *page;
> +
> +	BUG_ON(rac->_batch_count > rac->_nr_pages);
> +	rac->_nr_pages -= rac->_batch_count;
> +	rac->_index += rac->_batch_count;
> +	rac->_batch_count = 0;

Is it intentional, to set rac->_batch_count twice (here, and below)? The
only reason I can see is if a caller needs to use ->_batch_count in the
"return NULL" case, which doesn't seem to come up...

> +
> +	if (!rac->_nr_pages)
> +		return NULL;
> +
> +	page = xa_load(&rac->mapping->i_pages, rac->_index);
> +	VM_BUG_ON_PAGE(!PageLocked(page), page);
> +	rac->_batch_count = hpage_nr_pages(page);
> +
> +	return page;
> +}
> +
>  /* The number of pages in this readahead block */
>  static inline unsigned int readahead_count(struct readahead_control *rac)
>  {
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 83df5c061d33..aaa209559ba2 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -113,15 +113,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
>  
>  EXPORT_SYMBOL(read_cache_pages);
>  
> -static void read_pages(struct readahead_control *rac, struct list_head *pages,
> -		gfp_t gfp)
> +static void read_pages(struct readahead_control *rac, struct list_head *pages)
>  {
>  	const struct address_space_operations *aops = rac->mapping->a_ops;
> +	struct page *page;
>  	struct blk_plug plug;
> -	unsigned page_idx;
>  
>  	if (!readahead_count(rac))
> -		return;
> +		goto out;
>  
>  	blk_start_plug(&plug);
>  
> @@ -130,23 +129,23 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
>  				readahead_count(rac));
>  		/* Clean up the remaining pages */
>  		put_pages_list(pages);
> -		goto out;
> -	}
> -
> -	for (page_idx = 0; page_idx < readahead_count(rac); page_idx++) {
> -		struct page *page = lru_to_page(pages);
> -		list_del(&page->lru);
> -		if (!add_to_page_cache_lru(page, rac->mapping, page->index,
> -				gfp))
> +		rac->_index += rac->_nr_pages;
> +		rac->_nr_pages = 0;
> +	} else {
> +		while ((page = readahead_page(rac))) {
>  			aops->readpage(rac->file, page);
> -		put_page(page);
> +			put_page(page);
> +		}
>  	}
>  
> -out:
>  	blk_finish_plug(&plug);
>  
>  	BUG_ON(!list_empty(pages));
> -	rac->_nr_pages = 0;
> +	BUG_ON(readahead_count(rac));
> +
> +out:
> +	/* If we were called due to a conflicting page, skip over it */

Tiny documentation nit: What if we were *not* called due to a conflicting page?
(And what is a "conflicting page", in this context, btw?) The next line
unconditionally moves the index ahead, so the "if" part of the comment really
confuses me.

> +	rac->_index++;
>  }
>  
>  /*
> @@ -165,9 +164,11 @@ void __do_page_cache_readahead(struct address_space *mapping,
>  	LIST_HEAD(page_pool);
>  	loff_t isize = i_size_read(inode);
>  	gfp_t gfp_mask = readahead_gfp_mask(mapping);
> +	bool use_list = mapping->a_ops->readpages;
>  	struct readahead_control rac = {
>  		.mapping = mapping,
>  		.file = filp,
> +		._index = index,
>  		._nr_pages = 0,
>  	};
>  	unsigned long i;
> @@ -184,6 +185,8 @@ void __do_page_cache_readahead(struct address_space *mapping,
>  		if (index + i > end_index)
>  			break;
>  
> +		BUG_ON(index + i != rac._index + rac._nr_pages);
> +
>  		page = xa_load(&mapping->i_pages, index + i);
>  		if (page && !xa_is_value(page)) {
>  			/*
> @@ -191,15 +194,22 @@ void __do_page_cache_readahead(struct address_space *mapping,
>  			 * contiguous pages before continuing with the next
>  			 * batch.
>  			 */
> -			read_pages(&rac, &page_pool, gfp_mask);
> +			read_pages(&rac, &page_pool);
>  			continue;
>  		}
>  
>  		page = __page_cache_alloc(gfp_mask);
>  		if (!page)
>  			break;
> -		page->index = index + i;
> -		list_add(&page->lru, &page_pool);
> +		if (use_list) {
> +			page->index = index + i;
> +			list_add(&page->lru, &page_pool);
> +		} else if (add_to_page_cache_lru(page, mapping, index + i,
> +				gfp_mask) < 0) {

I still think you'll want to compare against !=0, rather than < 0, here.

> +			put_page(page);
> +			read_pages(&rac, &page_pool);

Doing a read_pages() in the error case is because...actually, I'm not sure yet.
Why do we do this? Effectively it's a retry?

> +			continue;
> +		}
>  		if (i == nr_to_read - lookahead_size)
>  			SetPageReadahead(page);
>  		rac._nr_pages++;
> @@ -210,7 +220,7 @@ void __do_page_cache_readahead(struct address_space *mapping,
>  	 * uptodate then the caller will launch readpage again, and
>  	 * will then handle the error.
>  	 */
> -	read_pages(&rac, &page_pool, gfp_mask);
> +	read_pages(&rac, &page_pool);
>  }
>  
>  /*

Didn't spot any actual errors, just mainly my own questions here. :)

thanks,
On Thu, Feb 20, 2020 at 07:19:58PM -0800, John Hubbard wrote:
> > +static inline struct page *readahead_page(struct readahead_control *rac)
> > +{
> > +	struct page *page;
> > +
> > +	BUG_ON(rac->_batch_count > rac->_nr_pages);
> > +	rac->_nr_pages -= rac->_batch_count;
> > +	rac->_index += rac->_batch_count;
> > +	rac->_batch_count = 0;
> 
> Is it intentional, to set rac->_batch_count twice (here, and below)? The
> only reason I can see is if a caller needs to use ->_batch_count in the
> "return NULL" case, which doesn't seem to come up...

Ah, but it does. Not in this patch, but the next one ...

+	if (aops->readahead) {
+		aops->readahead(rac);
+		/* Clean up the remaining pages */
+		while ((page = readahead_page(rac))) {
+			unlock_page(page);
+			put_page(page);
+		}

In the normal case, the ->readahead method will consume all the pages,
and we need readahead_page() to do nothing if it is called again.

> > +	if (!rac->_nr_pages)
> > +		return NULL;

... admittedly I could do:

	if (!rac->_nr_pages) {
		rac->_batch_count = 0;
		return NULL;
	}

which might be less confusing.

> > @@ -130,23 +129,23 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
> >  				readahead_count(rac));
> >  		/* Clean up the remaining pages */
> >  		put_pages_list(pages);
> > -		goto out;
> > -	}
> > -
> > -	for (page_idx = 0; page_idx < readahead_count(rac); page_idx++) {
> > -		struct page *page = lru_to_page(pages);
> > -		list_del(&page->lru);
> > -		if (!add_to_page_cache_lru(page, rac->mapping, page->index,
> > -				gfp))
> > +		rac->_index += rac->_nr_pages;
> > +		rac->_nr_pages = 0;
> > +	} else {
> > +		while ((page = readahead_page(rac))) {
> >  			aops->readpage(rac->file, page);
> > -		put_page(page);
> > +			put_page(page);
> > +		}
> >  	}
> >  
> > -out:
> >  	blk_finish_plug(&plug);
> >  
> >  	BUG_ON(!list_empty(pages));
> > -	rac->_nr_pages = 0;
> > +	BUG_ON(readahead_count(rac));
> > +
> > +out:
> > +	/* If we were called due to a conflicting page, skip over it */
> 
> Tiny documentation nit: What if we were *not* called due to a conflicting page?
> (And what is a "conflicting page", in this context, btw?) The next line
> unconditionally moves the index ahead, so the "if" part of the comment really
> confuses me.

By the end of the series, read_pages() is called in three places:

1.	if (page && !xa_is_value(page)) {
		read_pages(&rac, &page_pool);

2.	} else if (add_to_page_cache_lru(page, mapping, index + i,
			gfp_mask) < 0) {
		put_page(page);
		read_pages(&rac, &page_pool);

3.	read_pages(&rac, &page_pool);

In the first two cases, there's an existing page in the page cache
(which conflicts with this readahead operation), and so we need to
advance index. In the third case, we're exiting the function, so it
does no harm to advance index one further.

> > +		} else if (add_to_page_cache_lru(page, mapping, index + i,
> > +				gfp_mask) < 0) {
> 
> I still think you'll want to compare against !=0, rather than < 0, here.

I tend to prefer < 0 when checking for an error value in case the function
decides to start using positive numbers to mean something. I don't think
it's a particularly important preference though (after all, returning 1
might mean "failed, but for this weird reason rather than an errno").

> > +			put_page(page);
> > +			read_pages(&rac, &page_pool);
> 
> Doing a read_pages() in the error case is because...actually, I'm not sure yet.
> Why do we do this? Effectively it's a retry?

Same as the reason we call read_pages() if we found a page in the page
cache earlier -- we're sending down a set of pages which are consecutive
in the file's address space, and now we have to skip one. At least one ;-)
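
For readers following the _batch_count exchange above, a rough trace of the
accessor as posted may help. The numbers are purely illustrative and assume an
8-page batch whose second slot is (hypothetically) filled by a 4-page compound
page:

	call 1: _nr_pages = 8, _index = 0 -> returns page at index 0, sets _batch_count = 1
	call 2: _nr_pages = 7, _index = 1 -> returns compound page at index 1, sets _batch_count = 4
	call 3: _nr_pages = 3, _index = 5 -> returns page at index 5, sets _batch_count = 1
	...
	last call: _nr_pages reaches 0 -> returns NULL (and can safely be called again)

Each call first retires the pages handed out by the previous call (subtracting
_batch_count and advancing _index), which is why the field is zeroed on entry
and set again on the way out.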
On 2/20/20 7:43 PM, Matthew Wilcox wrote:
> On Thu, Feb 20, 2020 at 07:19:58PM -0800, John Hubbard wrote:
>>> +static inline struct page *readahead_page(struct readahead_control *rac)
>>> +{
>>> +	struct page *page;
>>> +
>>> +	BUG_ON(rac->_batch_count > rac->_nr_pages);
>>> +	rac->_nr_pages -= rac->_batch_count;
>>> +	rac->_index += rac->_batch_count;
>>> +	rac->_batch_count = 0;
>>
>> Is it intentional, to set rac->_batch_count twice (here, and below)? The
>> only reason I can see is if a caller needs to use ->_batch_count in the
>> "return NULL" case, which doesn't seem to come up...
> 
> Ah, but it does. Not in this patch, but the next one ...
> 
> +	if (aops->readahead) {
> +		aops->readahead(rac);
> +		/* Clean up the remaining pages */
> +		while ((page = readahead_page(rac))) {
> +			unlock_page(page);
> +			put_page(page);
> +		}
> 
> In the normal case, the ->readahead method will consume all the pages,
> and we need readahead_page() to do nothing if it is called again.
> 
>>> +	if (!rac->_nr_pages)
>>> +		return NULL;
> 
> ... admittedly I could do:
> 
> 	if (!rac->_nr_pages) {
> 		rac->_batch_count = 0;
> 		return NULL;
> 	}
> 
> which might be less confusing.

Yes, that would be a nice bit of polish if you end up doing another revision
for other reasons.

>>> @@ -130,23 +129,23 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
>>>  				readahead_count(rac));
>>>  		/* Clean up the remaining pages */
>>>  		put_pages_list(pages);
>>> -		goto out;
>>> -	}
>>> -
>>> -	for (page_idx = 0; page_idx < readahead_count(rac); page_idx++) {
>>> -		struct page *page = lru_to_page(pages);
>>> -		list_del(&page->lru);
>>> -		if (!add_to_page_cache_lru(page, rac->mapping, page->index,
>>> -				gfp))
>>> +		rac->_index += rac->_nr_pages;
>>> +		rac->_nr_pages = 0;
>>> +	} else {
>>> +		while ((page = readahead_page(rac))) {
>>>  			aops->readpage(rac->file, page);
>>> -		put_page(page);
>>> +			put_page(page);
>>> +		}
>>>  	}
>>> 
>>> -out:
>>>  	blk_finish_plug(&plug);
>>> 
>>>  	BUG_ON(!list_empty(pages));
>>> -	rac->_nr_pages = 0;
>>> +	BUG_ON(readahead_count(rac));
>>> +
>>> +out:
>>> +	/* If we were called due to a conflicting page, skip over it */
>>
>> Tiny documentation nit: What if we were *not* called due to a conflicting page?
>> (And what is a "conflicting page", in this context, btw?) The next line
>> unconditionally moves the index ahead, so the "if" part of the comment really
>> confuses me.
> 
> By the end of the series, read_pages() is called in three places:
> 
> 1.	if (page && !xa_is_value(page)) {
> 		read_pages(&rac, &page_pool);
> 
> 2.	} else if (add_to_page_cache_lru(page, mapping, index + i,
> 			gfp_mask) < 0) {
> 		put_page(page);
> 		read_pages(&rac, &page_pool);
> 
> 3.	read_pages(&rac, &page_pool);
> 
> In the first two cases, there's an existing page in the page cache
> (which conflicts with this readahead operation), and so we need to
> advance index. In the third case, we're exiting the function, so it
> does no harm to advance index one further.

OK, I see. As you know, I tend toward maybe over-documenting, but what about
adding just a *few* hints to help new readers, like this approximately (maybe
it should be pared down):

diff --git a/mm/readahead.c b/mm/readahead.c
index 9fb5f77dcf69..0dd5b09c376e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -114,6 +114,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 
 EXPORT_SYMBOL(read_cache_pages);
 
+/*
+ * Read pages into the page cache, OR skip over a page if it is already in the
+ * page cache.
+ */
 static void read_pages(struct readahead_control *rac, struct list_head *pages)
 {
 	const struct address_space_operations *aops = rac->mapping->a_ops;
@@ -152,7 +156,11 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages)
 	BUG_ON(readahead_count(rac));
 
 out:
-	/* If we were called due to a conflicting page, skip over it */
+	/*
+	 * This routine might have been called in order to skip over a page
+	 * that is already in the page cache. And for other cases, the index is
+	 * ignored by the caller. So just increment unconditionally:
+	 */
 	rac->_index++;
 }
 
?

>>> +		} else if (add_to_page_cache_lru(page, mapping, index + i,
>>> +				gfp_mask) < 0) {
>>
>> I still think you'll want to compare against !=0, rather than < 0, here.
> 
> I tend to prefer < 0 when checking for an error value in case the function
> decides to start using positive numbers to mean something. I don't think
> it's a particularly important preference though (after all, returning 1
> might mean "failed, but for this weird reason rather than an errno").
> 
>>> +			put_page(page);
>>> +			read_pages(&rac, &page_pool);
>>
>> Doing a read_pages() in the error case is because...actually, I'm not sure yet.
>> Why do we do this? Effectively it's a retry?
> 
> Same as the reason we call read_pages() if we found a page in the page
> cache earlier -- we're sending down a set of pages which are consecutive
> in the file's address space, and now we have to skip one. At least one ;-)

Got it. Finally. :)

thanks,
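
To tie the exchange above together: a minimal sketch of what a filesystem-side
consumer of the iterator might look like once the upcoming ->readahead hook
lands. myfs_readahead() and myfs_read_one_page() are hypothetical names, not
code from this series; the loop simply mirrors the ->readpage branch of
read_pages() quoted above.

static void myfs_readahead(struct readahead_control *rac)
{
	struct page *page;

	/* Every page is already locked and in the page cache at this point. */
	while ((page = readahead_page(rac))) {
		myfs_read_one_page(rac->file, page);	/* hypothetical per-page read helper */
		put_page(page);		/* drop the reference left over from the allocation */
	}
	/* The batch is consumed; a further readahead_page() call returns NULL. */
}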
On Wed, Feb 19, 2020 at 01:00:48PM -0800, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> When populating the page cache for readahead, mappings that use
> ->readpages must populate the page cache themselves as the pages are
> passed on a linked list which would normally be used for the page cache's
> LRU. For mappings that use ->readpage or the upcoming ->readahead method,
> we can put the pages into the page cache as soon as they're allocated,
> which solves a race between readahead and direct IO. It also lets us
> remove the gfp argument from read_pages().
> 
> Use the new readahead_page() API to implement the repeated calls to
> ->readpage(), just like most filesystems will. This iterator also
> supports huge pages, even though none of the filesystems have been
> converted to use them yet.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/pagemap.h | 20 +++++++++++++++++
>  mm/readahead.c          | 48 +++++++++++++++++++++++++----------------
>  2 files changed, 49 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 55fcea0249e6..4989d330fada 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -647,8 +647,28 @@ struct readahead_control {
>  	/* private: use the readahead_* accessors instead */
>  	pgoff_t _index;
>  	unsigned int _nr_pages;
> +	unsigned int _batch_count;
>  };
>  
> +static inline struct page *readahead_page(struct readahead_control *rac)
> +{
> +	struct page *page;
> +
> +	BUG_ON(rac->_batch_count > rac->_nr_pages);
> +	rac->_nr_pages -= rac->_batch_count;
> +	rac->_index += rac->_batch_count;
> +	rac->_batch_count = 0;
> +
> +	if (!rac->_nr_pages)
> +		return NULL;
> +
> +	page = xa_load(&rac->mapping->i_pages, rac->_index);
> +	VM_BUG_ON_PAGE(!PageLocked(page), page);
> +	rac->_batch_count = hpage_nr_pages(page);
> +
> +	return page;
> +}
> +
>  /* The number of pages in this readahead block */
>  static inline unsigned int readahead_count(struct readahead_control *rac)
>  {
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 83df5c061d33..aaa209559ba2 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -113,15 +113,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
>  
>  EXPORT_SYMBOL(read_cache_pages);
>  
> -static void read_pages(struct readahead_control *rac, struct list_head *pages,
> -		gfp_t gfp)
> +static void read_pages(struct readahead_control *rac, struct list_head *pages)
>  {
>  	const struct address_space_operations *aops = rac->mapping->a_ops;
> +	struct page *page;
>  	struct blk_plug plug;
> -	unsigned page_idx;
>  
>  	if (!readahead_count(rac))
> -		return;
> +		goto out;
>  
>  	blk_start_plug(&plug);
>  
> @@ -130,23 +129,23 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
>  				readahead_count(rac));
>  		/* Clean up the remaining pages */
>  		put_pages_list(pages);
> -		goto out;
> -	}
> -
> -	for (page_idx = 0; page_idx < readahead_count(rac); page_idx++) {
> -		struct page *page = lru_to_page(pages);
> -		list_del(&page->lru);
> -		if (!add_to_page_cache_lru(page, rac->mapping, page->index,
> -				gfp))
> +		rac->_index += rac->_nr_pages;
> +		rac->_nr_pages = 0;
> +	} else {
> +		while ((page = readahead_page(rac))) {
>  			aops->readpage(rac->file, page);
> -		put_page(page);
> +			put_page(page);
> +		}
>  	}
>  
> -out:
>  	blk_finish_plug(&plug);
>  
>  	BUG_ON(!list_empty(pages));
> -	rac->_nr_pages = 0;
> +	BUG_ON(readahead_count(rac));
> +
> +out:
> +	/* If we were called due to a conflicting page, skip over it */
> +	rac->_index++;
>  }
>  
>  /*
> @@ -165,9 +164,11 @@ void __do_page_cache_readahead(struct address_space *mapping,
>  	LIST_HEAD(page_pool);
>  	loff_t isize = i_size_read(inode);
>  	gfp_t gfp_mask = readahead_gfp_mask(mapping);
> +	bool use_list = mapping->a_ops->readpages;

I find this single use variable a little weird. Not a dealbreaker, but just
checking the methods would seem a little more obvious to me.

Except for this and the other nitpick the patch looks good to me:

Reviewed-by: Christoph Hellwig <hch@lst.de>
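
For what it's worth, a sketch of the alternative Christoph is gesturing at
(hypothetical, not posted in this thread): drop the single-use variable and
test the method at the branch itself, leaving the rest of the loop as in the
patch below.

-	bool use_list = mapping->a_ops->readpages;
 	struct readahead_control rac = {
@@
-		if (use_list) {
+		if (mapping->a_ops->readpages) {
 			page->index = index + i;
 			list_add(&page->lru, &page_pool);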
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 55fcea0249e6..4989d330fada 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -647,8 +647,28 @@ struct readahead_control {
 	/* private: use the readahead_* accessors instead */
 	pgoff_t _index;
 	unsigned int _nr_pages;
+	unsigned int _batch_count;
 };
 
+static inline struct page *readahead_page(struct readahead_control *rac)
+{
+	struct page *page;
+
+	BUG_ON(rac->_batch_count > rac->_nr_pages);
+	rac->_nr_pages -= rac->_batch_count;
+	rac->_index += rac->_batch_count;
+	rac->_batch_count = 0;
+
+	if (!rac->_nr_pages)
+		return NULL;
+
+	page = xa_load(&rac->mapping->i_pages, rac->_index);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	rac->_batch_count = hpage_nr_pages(page);
+
+	return page;
+}
+
 /* The number of pages in this readahead block */
 static inline unsigned int readahead_count(struct readahead_control *rac)
 {
diff --git a/mm/readahead.c b/mm/readahead.c
index 83df5c061d33..aaa209559ba2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -113,15 +113,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 
 EXPORT_SYMBOL(read_cache_pages);
 
-static void read_pages(struct readahead_control *rac, struct list_head *pages,
-		gfp_t gfp)
+static void read_pages(struct readahead_control *rac, struct list_head *pages)
 {
 	const struct address_space_operations *aops = rac->mapping->a_ops;
+	struct page *page;
 	struct blk_plug plug;
-	unsigned page_idx;
 
 	if (!readahead_count(rac))
-		return;
+		goto out;
 
 	blk_start_plug(&plug);
 
@@ -130,23 +129,23 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
 				readahead_count(rac));
 		/* Clean up the remaining pages */
 		put_pages_list(pages);
-		goto out;
-	}
-
-	for (page_idx = 0; page_idx < readahead_count(rac); page_idx++) {
-		struct page *page = lru_to_page(pages);
-		list_del(&page->lru);
-		if (!add_to_page_cache_lru(page, rac->mapping, page->index,
-				gfp))
+		rac->_index += rac->_nr_pages;
+		rac->_nr_pages = 0;
+	} else {
+		while ((page = readahead_page(rac))) {
 			aops->readpage(rac->file, page);
-		put_page(page);
+			put_page(page);
+		}
 	}
 
-out:
 	blk_finish_plug(&plug);
 
 	BUG_ON(!list_empty(pages));
-	rac->_nr_pages = 0;
+	BUG_ON(readahead_count(rac));
+
+out:
+	/* If we were called due to a conflicting page, skip over it */
+	rac->_index++;
 }
 
 /*
@@ -165,9 +164,11 @@ void __do_page_cache_readahead(struct address_space *mapping,
 	LIST_HEAD(page_pool);
 	loff_t isize = i_size_read(inode);
 	gfp_t gfp_mask = readahead_gfp_mask(mapping);
+	bool use_list = mapping->a_ops->readpages;
 	struct readahead_control rac = {
 		.mapping = mapping,
 		.file = filp,
+		._index = index,
 		._nr_pages = 0,
 	};
 	unsigned long i;
@@ -184,6 +185,8 @@ void __do_page_cache_readahead(struct address_space *mapping,
 		if (index + i > end_index)
 			break;
 
+		BUG_ON(index + i != rac._index + rac._nr_pages);
+
 		page = xa_load(&mapping->i_pages, index + i);
 		if (page && !xa_is_value(page)) {
 			/*
@@ -191,15 +194,22 @@ void __do_page_cache_readahead(struct address_space *mapping,
 			 * contiguous pages before continuing with the next
 			 * batch.
 			 */
-			read_pages(&rac, &page_pool, gfp_mask);
+			read_pages(&rac, &page_pool);
 			continue;
 		}
 
 		page = __page_cache_alloc(gfp_mask);
 		if (!page)
 			break;
-		page->index = index + i;
-		list_add(&page->lru, &page_pool);
+		if (use_list) {
+			page->index = index + i;
+			list_add(&page->lru, &page_pool);
+		} else if (add_to_page_cache_lru(page, mapping, index + i,
+				gfp_mask) < 0) {
+			put_page(page);
+			read_pages(&rac, &page_pool);
+			continue;
+		}
 		if (i == nr_to_read - lookahead_size)
 			SetPageReadahead(page);
 		rac._nr_pages++;
@@ -210,7 +220,7 @@ void __do_page_cache_readahead(struct address_space *mapping,
 	 * uptodate then the caller will launch readpage again, and
 	 * will then handle the error.
 	 */
-	read_pages(&rac, &page_pool, gfp_mask);
+	read_pages(&rac, &page_pool);
 }
 
 /*