Message ID | 20220607083714.183788-1-apopple@nvidia.com (mailing list archive) |
---|---|
State | New |
Series | mm/filemap.c: Always read one page in do_sync_mmap_readahead() |
Looks good, a nice cleanup. Reviewed-by: William Kucharski <william.kucharski@oracle.com> > On Jun 7, 2022, at 2:37 AM, Alistair Popple <apopple@nvidia.com> wrote: > > filemap_fault() calls do_sync_mmap_readahead() to read pages when no > page is found in the page cache. However do_sync_mmap_readahead() will > not actually read any pages if VM_RAND_READ is specified or if there > have been a lot of page cache misses. > > This means filemap_fault() will see a folio that is not up-to-date which > is treated as an IO error. The IO error handling path happens to make > VM_RAND_READ work as expected though because it retries reading the > page. > > However it would be cleaner if this was handled in > do_sync_mmap_readahead() to match what is done for VM_HUGEPAGE. Also as > do_sync_mmap_readahead() adds the newly allocated folio to the pagecache > and unlocks it this clean-up also allows the FGP_FOR_MMAP flag to be > removed. > > Signed-off-by: Alistair Popple <apopple@nvidia.com> > Suggested-by: Matthew Wilcox <willy@infradead.org> > --- > include/linux/pagemap.h | 7 +++--- > mm/filemap.c | 47 +++++++++++++---------------------------- > 2 files changed, 18 insertions(+), 36 deletions(-) > > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h > index 993994cd943a..e0e0f5e7d4a0 100644 > --- a/include/linux/pagemap.h > +++ b/include/linux/pagemap.h > @@ -505,10 +505,9 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, > #define FGP_WRITE 0x00000008 > #define FGP_NOFS 0x00000010 > #define FGP_NOWAIT 0x00000020 > -#define FGP_FOR_MMAP 0x00000040 > -#define FGP_HEAD 0x00000080 > -#define FGP_ENTRY 0x00000100 > -#define FGP_STABLE 0x00000200 > +#define FGP_HEAD 0x00000040 > +#define FGP_ENTRY 0x00000080 > +#define FGP_STABLE 0x00000100 > > struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, > int fgp_flags, gfp_t gfp); > diff --git a/mm/filemap.c b/mm/filemap.c > index 9a1eef6c5d35..15d7e0a0ad4b 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -1922,9 +1922,6 @@ static void *mapping_get_entry(struct address_space *mapping, pgoff_t index) > * * %FGP_CREAT - If no page is present then a new page is allocated using > * @gfp and added to the page cache and the VM's LRU list. > * The page is returned locked and with an increased refcount. > - * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the > - * page is already in cache. If the page was allocated, unlock it before > - * returning so the caller can do the same dance. > * * %FGP_WRITE - The page will be written to by the caller. > * * %FGP_NOFS - __GFP_FS will get cleared in gfp. > * * %FGP_NOWAIT - Don't get blocked by page lock. > @@ -1993,7 +1990,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, > if (!folio) > return NULL; > > - if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) > + if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) > fgp_flags |= FGP_LOCK; > > /* Init accessed so avoid atomic mark_page_accessed later */ > @@ -2007,13 +2004,6 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, > if (err == -EEXIST) > goto repeat; > } > - > - /* > - * filemap_add_folio locks the page, and for mmap > - * we expect an unlocked page. 
> - */ > - if (folio && (fgp_flags & FGP_FOR_MMAP)) > - folio_unlock(folio); > } > > return folio; > @@ -3011,14 +3001,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) > } > #endif > > - /* If we don't want any read-ahead, don't bother */ > - if (vmf->vma->vm_flags & VM_RAND_READ) > - return fpin; > - if (!ra->ra_pages) > - return fpin; > - > + fpin = maybe_unlock_mmap_for_io(vmf, fpin); > if (vmf->vma->vm_flags & VM_SEQ_READ) { > - fpin = maybe_unlock_mmap_for_io(vmf, fpin); > page_cache_sync_ra(&ractl, ra->ra_pages); > return fpin; > } > @@ -3029,19 +3013,20 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) > WRITE_ONCE(ra->mmap_miss, ++mmap_miss); > > /* > - * Do we miss much more than hit in this file? If so, > - * stop bothering with read-ahead. It will only hurt. > + * mmap read-around. If we don't want any read-ahead or if we miss more > + * than we hit don't bother with read-ahead and just read a single page. > */ > - if (mmap_miss > MMAP_LOTSAMISS) > - return fpin; > + if ((vmf->vma->vm_flags & VM_RAND_READ) || > + !ra->ra_pages || mmap_miss > MMAP_LOTSAMISS) { > + ra->start = vmf->pgoff; > + ra->size = 1; > + ra->async_size = 0; > + } else { > + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); > + ra->size = ra->ra_pages; > + ra->async_size = ra->ra_pages / 4; > + } > > - /* > - * mmap read-around > - */ > - fpin = maybe_unlock_mmap_for_io(vmf, fpin); > - ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); > - ra->size = ra->ra_pages; > - ra->async_size = ra->ra_pages / 4; > ractl._index = ra->start; > page_cache_ra_order(&ractl, ra, 0); > return fpin; > @@ -3145,9 +3130,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) > filemap_invalidate_lock_shared(mapping); > mapping_locked = true; > } > - folio = __filemap_get_folio(mapping, index, > - FGP_CREAT|FGP_FOR_MMAP, > - vmf->gfp_mask); > + folio = filemap_get_folio(mapping, index); > if (!folio) { > if (fpin) > goto out_retry; > -- > 2.35.1 > >
On Tue, Jun 07, 2022 at 06:37:14PM +1000, Alistair Popple wrote:
> ---
>  include/linux/pagemap.h |  7 +++---
>  mm/filemap.c            | 47 +++++++++++++----------------------------
>  2 files changed, 18 insertions(+), 36 deletions(-)

Love the diffstat ;-)

> @@ -3011,14 +3001,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> 	}
>  #endif
>
> -	/* If we don't want any read-ahead, don't bother */
> -	if (vmf->vma->vm_flags & VM_RAND_READ)
> -		return fpin;
> -	if (!ra->ra_pages)
> -		return fpin;
> -
> +	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> 	if (vmf->vma->vm_flags & VM_SEQ_READ) {
> -		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> 		page_cache_sync_ra(&ractl, ra->ra_pages);
> 		return fpin;
> 	}

Good.  Could even pull the maybe_unlock_mmap_for_io() all the way to the
top of the file and remove it from the VM_HUGEPAGE case?

> @@ -3029,19 +3013,20 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> 		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
>
> 	/*
> -	 * Do we miss much more than hit in this file? If so,
> -	 * stop bothering with read-ahead. It will only hurt.
> +	 * mmap read-around. If we don't want any read-ahead or if we miss more
> +	 * than we hit don't bother with read-ahead and just read a single page.
> 	 */
> -	if (mmap_miss > MMAP_LOTSAMISS)
> -		return fpin;
> +	if ((vmf->vma->vm_flags & VM_RAND_READ) ||
> +	    !ra->ra_pages || mmap_miss > MMAP_LOTSAMISS) {
> +		ra->start = vmf->pgoff;
> +		ra->size = 1;
> +		ra->async_size = 0;
> +	} else {

I'd put the:
	/* mmap read-around */
here

> +		ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
> +		ra->size = ra->ra_pages;
> +		ra->async_size = ra->ra_pages / 4;
> +	}
>
> -	/*
> -	 * mmap read-around
> -	 */
> -	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> -	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
> -	ra->size = ra->ra_pages;
> -	ra->async_size = ra->ra_pages / 4;
> 	ractl._index = ra->start;
> 	page_cache_ra_order(&ractl, ra, 0);
> 	return fpin;
> @@ -3145,9 +3130,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
> 		filemap_invalidate_lock_shared(mapping);
> 		mapping_locked = true;
> 	}
> -	folio = __filemap_get_folio(mapping, index,
> -				    FGP_CREAT|FGP_FOR_MMAP,
> -				    vmf->gfp_mask);
> +	folio = filemap_get_folio(mapping, index);
> 	if (!folio) {
> 		if (fpin)
> 			goto out_retry;

I think we also should remove the filemap_invalidate_lock_shared()
here, no?

We also need to handle the !folio case differently.  Before, if it was
gone, that was definitely an OOM.  Now if it's gone it might have been
truncated, or removed due to memory pressure, or it might be an OOM
situation where readahead didn't manage to create the folio.
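A rough sketch of the restructuring suggested above. This is not part of the posted series; it is a skeleton under the assumption that the unlock is hoisted to the top of do_sync_mmap_readahead(), with the existing per-path bodies reduced to comments.

```c
/*
 * Illustrative skeleton only, not the posted patch. With the unlock hoisted
 * to the very top, every readahead path below (including VM_HUGEPAGE) can
 * rely on mmap_lock already having been dropped and no longer needs its own
 * maybe_unlock_mmap_for_io() call.
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
	struct file *fpin = NULL;

	/* Drop mmap_lock once, before any path that may start I/O. */
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (vmf->vma->vm_flags & VM_HUGEPAGE) {
		/* existing huge-page readahead, minus its own unlock call */
		return fpin;
	}
#endif
	if (vmf->vma->vm_flags & VM_SEQ_READ) {
		/* existing sequential readahead path, unchanged */
		return fpin;
	}

	/* read-around / single-page window selection as in the hunk quoted above */
	return fpin;
}
```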
Matthew Wilcox <willy@infradead.org> writes: > On Tue, Jun 07, 2022 at 06:37:14PM +1000, Alistair Popple wrote: >> --- >> include/linux/pagemap.h | 7 +++--- >> mm/filemap.c | 47 +++++++++++++---------------------------- >> 2 files changed, 18 insertions(+), 36 deletions(-) > > Love the diffstat ;-) > >> @@ -3011,14 +3001,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) >> } >> #endif >> >> - /* If we don't want any read-ahead, don't bother */ >> - if (vmf->vma->vm_flags & VM_RAND_READ) >> - return fpin; >> - if (!ra->ra_pages) >> - return fpin; >> - >> + fpin = maybe_unlock_mmap_for_io(vmf, fpin); >> if (vmf->vma->vm_flags & VM_SEQ_READ) { >> - fpin = maybe_unlock_mmap_for_io(vmf, fpin); >> page_cache_sync_ra(&ractl, ra->ra_pages); >> return fpin; >> } > > Good. Could even pull the maybe_unlock_mmap_for_io() all the way to the > top of the file and remove it from the VM_HUGEPAGE case? Good idea. Also while I'm here is there a reason we don't update ra->start or mmap_miss for the VM_HUGEPAGE case? >> @@ -3029,19 +3013,20 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) >> WRITE_ONCE(ra->mmap_miss, ++mmap_miss); >> >> /* >> - * Do we miss much more than hit in this file? If so, >> - * stop bothering with read-ahead. It will only hurt. >> + * mmap read-around. If we don't want any read-ahead or if we miss more >> + * than we hit don't bother with read-ahead and just read a single page. >> */ >> - if (mmap_miss > MMAP_LOTSAMISS) >> - return fpin; >> + if ((vmf->vma->vm_flags & VM_RAND_READ) || >> + !ra->ra_pages || mmap_miss > MMAP_LOTSAMISS) { >> + ra->start = vmf->pgoff; >> + ra->size = 1; >> + ra->async_size = 0; >> + } else { > > I'd put the: > /* mmap read-around */ > here > >> + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); >> + ra->size = ra->ra_pages; >> + ra->async_size = ra->ra_pages / 4; >> + } >> >> - /* >> - * mmap read-around >> - */ >> - fpin = maybe_unlock_mmap_for_io(vmf, fpin); >> - ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); >> - ra->size = ra->ra_pages; >> - ra->async_size = ra->ra_pages / 4; >> ractl._index = ra->start; >> page_cache_ra_order(&ractl, ra, 0); >> return fpin; >> @@ -3145,9 +3130,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) >> filemap_invalidate_lock_shared(mapping); >> mapping_locked = true; >> } >> - folio = __filemap_get_folio(mapping, index, >> - FGP_CREAT|FGP_FOR_MMAP, >> - vmf->gfp_mask); >> + folio = filemap_get_folio(mapping, index); >> if (!folio) { >> if (fpin) >> goto out_retry; > > I think we also should remove the filemap_invalidate_lock_shared() > here, no? Right, afaik filemap_invalidate_lock_shared() is needed when instantiating pages in the page cache during fault, which this patch does via page_cache_ra_order() in do_sync_mmap_readahead() so I think you're right about removing it for filemap_get_folio(). However do_sync_mmap_readahead() is the way normal (ie. !VM_RAND_READ) pages would get instantiated today. So shouldn't filemap_invalidate_lock_shared() be called before do_sync_mmap_readahead() anyway? Or am I missing something? > We also need to handle the !folio case differently. Before, if it was > gone, that was definitely an OOM. Now if it's gone it might have been > truncated, or removed due to memory pressure, or it might be an OOM > situation where readahead didn't manage to create the folio. Good point, thanks for catching that.
Alistair Popple <apopple@nvidia.com> writes: > Matthew Wilcox <willy@infradead.org> writes: > >> On Tue, Jun 07, 2022 at 06:37:14PM +1000, Alistair Popple wrote: >>> --- >>> include/linux/pagemap.h | 7 +++--- >>> mm/filemap.c | 47 +++++++++++++---------------------------- >>> 2 files changed, 18 insertions(+), 36 deletions(-) >> >> Love the diffstat ;-) >> >>> @@ -3011,14 +3001,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) >>> } >>> #endif >>> >>> - /* If we don't want any read-ahead, don't bother */ >>> - if (vmf->vma->vm_flags & VM_RAND_READ) >>> - return fpin; >>> - if (!ra->ra_pages) >>> - return fpin; >>> - >>> + fpin = maybe_unlock_mmap_for_io(vmf, fpin); >>> if (vmf->vma->vm_flags & VM_SEQ_READ) { >>> - fpin = maybe_unlock_mmap_for_io(vmf, fpin); >>> page_cache_sync_ra(&ractl, ra->ra_pages); >>> return fpin; >>> } >> >> Good. Could even pull the maybe_unlock_mmap_for_io() all the way to the >> top of the file and remove it from the VM_HUGEPAGE case? > > Good idea. Also while I'm here is there a reason we don't update > ra->start or mmap_miss for the VM_HUGEPAGE case? > >>> @@ -3029,19 +3013,20 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) >>> WRITE_ONCE(ra->mmap_miss, ++mmap_miss); >>> >>> /* >>> - * Do we miss much more than hit in this file? If so, >>> - * stop bothering with read-ahead. It will only hurt. >>> + * mmap read-around. If we don't want any read-ahead or if we miss more >>> + * than we hit don't bother with read-ahead and just read a single page. >>> */ >>> - if (mmap_miss > MMAP_LOTSAMISS) >>> - return fpin; >>> + if ((vmf->vma->vm_flags & VM_RAND_READ) || >>> + !ra->ra_pages || mmap_miss > MMAP_LOTSAMISS) { >>> + ra->start = vmf->pgoff; >>> + ra->size = 1; >>> + ra->async_size = 0; >>> + } else { >> >> I'd put the: >> /* mmap read-around */ >> here >> >>> + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); >>> + ra->size = ra->ra_pages; >>> + ra->async_size = ra->ra_pages / 4; >>> + } >>> >>> - /* >>> - * mmap read-around >>> - */ >>> - fpin = maybe_unlock_mmap_for_io(vmf, fpin); >>> - ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); >>> - ra->size = ra->ra_pages; >>> - ra->async_size = ra->ra_pages / 4; >>> ractl._index = ra->start; >>> page_cache_ra_order(&ractl, ra, 0); >>> return fpin; >>> @@ -3145,9 +3130,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) >>> filemap_invalidate_lock_shared(mapping); >>> mapping_locked = true; >>> } >>> - folio = __filemap_get_folio(mapping, index, >>> - FGP_CREAT|FGP_FOR_MMAP, >>> - vmf->gfp_mask); >>> + folio = filemap_get_folio(mapping, index); >>> if (!folio) { >>> if (fpin) >>> goto out_retry; >> >> I think we also should remove the filemap_invalidate_lock_shared() >> here, no? > > Right, afaik filemap_invalidate_lock_shared() is needed when > instantiating pages in the page cache during fault, which this patch > does via page_cache_ra_order() in do_sync_mmap_readahead() so I think > you're right about removing it for filemap_get_folio(). > > However do_sync_mmap_readahead() is the way normal (ie. !VM_RAND_READ) > pages would get instantiated today. So shouldn't > filemap_invalidate_lock_shared() be called before > do_sync_mmap_readahead() anyway? Or am I missing something? Never mind. I missed that this is normally done further down the call stack (in page_cache_ra_unbounded()). 
This makes it somewhat annoying to do this clean-up though, because we need to
deal with this case:

	if (unlikely(!folio_test_uptodate(folio))) {
		/*
		 * The page was in cache and uptodate and now it is not.
		 * Strange but possible since we didn't hold the page lock all
		 * the time. Let's drop everything get the invalidate lock and
		 * try again.
		 */
		if (!mapping_locked) {

With this change we need to be able to call do_sync_mmap_readahead() whilst
holding the invalidate_lock, to ensure we can successfully get an uptodate
folio without it being removed by e.g. hole punching when the folio lock is
dropped.

I am experimenting with pulling all the filemap_invalidate_lock_shared()
calls further up the stack, but that creates its own problems.

>> We also need to handle the !folio case differently.  Before, if it was
>> gone, that was definitely an OOM.  Now if it's gone it might have been
>> truncated, or removed due to memory pressure, or it might be an OOM
>> situation where readahead didn't manage to create the folio.
>
> Good point, thanks for catching that.
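To make the !folio concern above concrete, here is a hedged sketch of how filemap_fault() might treat a NULL return once FGP_CREAT is gone. It is not code from the series: `retried` is a made-up local, and the retry_find/out_retry labels are assumed from the existing function.

```c
	/*
	 * Sketch of one possible way filemap_fault() could treat a NULL folio
	 * once FGP_CREAT is dropped. Illustration only, not the posted patch.
	 */
	folio = filemap_get_folio(mapping, index);
	if (!folio) {
		/*
		 * A NULL folio no longer implies an allocation failure: the
		 * folio readahead created may have been truncated or reclaimed
		 * in the meantime, or readahead may have failed to allocate it.
		 */
		if (fpin)
			goto out_retry;		/* re-take mmap_lock, let the caller retry */

		/*
		 * Hypothetical handling: retry the page cache lookup once
		 * (retried is a made-up local) before concluding OOM, since
		 * OOM is now only one of several ways to get here.
		 */
		if (!retried) {
			retried = true;
			goto retry_find;	/* label assumed from the existing function */
		}
		return VM_FAULT_OOM;
	}
```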
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 993994cd943a..e0e0f5e7d4a0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -505,10 +505,9 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_WRITE		0x00000008
 #define FGP_NOFS		0x00000010
 #define FGP_NOWAIT		0x00000020
-#define FGP_FOR_MMAP		0x00000040
-#define FGP_HEAD		0x00000080
-#define FGP_ENTRY		0x00000100
-#define FGP_STABLE		0x00000200
+#define FGP_HEAD		0x00000040
+#define FGP_ENTRY		0x00000080
+#define FGP_STABLE		0x00000100
 
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		int fgp_flags, gfp_t gfp);
diff --git a/mm/filemap.c b/mm/filemap.c
index 9a1eef6c5d35..15d7e0a0ad4b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1922,9 +1922,6 @@ static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
  * * %FGP_CREAT - If no page is present then a new page is allocated using
  *   @gfp and added to the page cache and the VM's LRU list.
  *   The page is returned locked and with an increased refcount.
- * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
- *   page is already in cache. If the page was allocated, unlock it before
- *   returning so the caller can do the same dance.
  * * %FGP_WRITE - The page will be written to by the caller.
  * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
  * * %FGP_NOWAIT - Don't get blocked by page lock.
@@ -1993,7 +1990,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		if (!folio)
 			return NULL;
 
-		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
+		if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
 			fgp_flags |= FGP_LOCK;
 
 		/* Init accessed so avoid atomic mark_page_accessed later */
@@ -2007,13 +2004,6 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 			if (err == -EEXIST)
 				goto repeat;
 		}
-
-		/*
-		 * filemap_add_folio locks the page, and for mmap
-		 * we expect an unlocked page.
-		 */
-		if (folio && (fgp_flags & FGP_FOR_MMAP))
-			folio_unlock(folio);
 	}
 
 	return folio;
@@ -3011,14 +3001,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	}
 #endif
 
-	/* If we don't want any read-ahead, don't bother */
-	if (vmf->vma->vm_flags & VM_RAND_READ)
-		return fpin;
-	if (!ra->ra_pages)
-		return fpin;
-
+	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 	if (vmf->vma->vm_flags & VM_SEQ_READ) {
-		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 		page_cache_sync_ra(&ractl, ra->ra_pages);
 		return fpin;
 	}
@@ -3029,19 +3013,20 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
 
 	/*
-	 * Do we miss much more than hit in this file? If so,
-	 * stop bothering with read-ahead. It will only hurt.
+	 * mmap read-around. If we don't want any read-ahead or if we miss more
+	 * than we hit don't bother with read-ahead and just read a single page.
 	 */
-	if (mmap_miss > MMAP_LOTSAMISS)
-		return fpin;
+	if ((vmf->vma->vm_flags & VM_RAND_READ) ||
+	    !ra->ra_pages || mmap_miss > MMAP_LOTSAMISS) {
+		ra->start = vmf->pgoff;
+		ra->size = 1;
+		ra->async_size = 0;
+	} else {
+		ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
+		ra->size = ra->ra_pages;
+		ra->async_size = ra->ra_pages / 4;
+	}
 
-	/*
-	 * mmap read-around
-	 */
-	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
-	ra->size = ra->ra_pages;
-	ra->async_size = ra->ra_pages / 4;
 	ractl._index = ra->start;
 	page_cache_ra_order(&ractl, ra, 0);
 	return fpin;
@@ -3145,9 +3130,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 		filemap_invalidate_lock_shared(mapping);
 		mapping_locked = true;
 	}
-	folio = __filemap_get_folio(mapping, index,
-				    FGP_CREAT|FGP_FOR_MMAP,
-				    vmf->gfp_mask);
+	folio = filemap_get_folio(mapping, index);
 	if (!folio) {
 		if (fpin)
 			goto out_retry;
filemap_fault() calls do_sync_mmap_readahead() to read pages when no page is
found in the page cache. However, do_sync_mmap_readahead() will not actually
read any pages if VM_RAND_READ is specified or if there have been a lot of
page cache misses.

This means filemap_fault() will see a folio that is not up-to-date, which is
treated as an IO error. The IO error handling path happens to make
VM_RAND_READ work as expected anyway, because it retries reading the page.

However, it would be cleaner if this was handled in do_sync_mmap_readahead()
to match what is done for VM_HUGEPAGE. Also, as do_sync_mmap_readahead() adds
the newly allocated folio to the page cache and unlocks it, this clean-up
allows the FGP_FOR_MMAP flag to be removed.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
---
 include/linux/pagemap.h |  7 +++---
 mm/filemap.c            | 47 +++++++++++++----------------------------
 2 files changed, 18 insertions(+), 36 deletions(-)
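As a worked example of the window selection the patch introduces (a standalone sketch rather than kernel code; the ra_pages value of 32 is only an assumed default): a fault at pgoff 100 with read-around enabled covers pages 84 through 115 with an async window of 8 pages, while a VM_RAND_READ mapping, a zero ra_pages, or too many misses now reads exactly the faulting page.

```c
#include <stdio.h>

/*
 * Mirrors the window selection in the patched do_sync_mmap_readahead():
 * single page for the no-readahead cases, read-around otherwise.
 */
int main(void)
{
	unsigned long ra_pages = 32, pgoff = 100;	/* assumed example values */
	int rand_read = 0, lots_of_misses = 0;		/* flip to 1 to see the fallback */
	unsigned long start, size, async_size;

	if (rand_read || !ra_pages || lots_of_misses) {
		/* single-page read: exactly the faulting page, no async window */
		start = pgoff;
		size = 1;
		async_size = 0;
	} else {
		/* read-around: centre a ra_pages-sized window on the fault */
		start = pgoff > ra_pages / 2 ? pgoff - ra_pages / 2 : 0;
		size = ra_pages;
		async_size = ra_pages / 4;
	}
	printf("start=%lu size=%lu async_size=%lu\n", start, size, async_size);
	/* prints: start=84 size=32 async_size=8 */
	return 0;
}
```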