diff mbox series

[v2,02/18] mm/filemap: Remove dynamically allocated array from filemap_read

Message ID 20201104204219.23810-3-willy@infradead.org (mailing list archive)
State New, archived
Headers show
Series Refactor generic_file_buffered_read | expand

Commit Message

Matthew Wilcox Nov. 4, 2020, 8:42 p.m. UTC
Increasing the batch size runs into diminishing returns.  It's probably
better to make, eg, three calls to filemap_get_pages() than it is to
call into kmalloc().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/filemap.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

Comments

Kent Overstreet Nov. 4, 2020, 9:30 p.m. UTC | #1
On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> Increasing the batch size runs into diminishing returns.  It's probably
> better to make, eg, three calls to filemap_get_pages() than it is to
> call into kmalloc().

I have to disagree. Working with PAGEVEC_SIZE pages is eventually going to be
like working with 4k pages today, and have you actually read the slub code for
the kmalloc fast path? It's _really_ fast, there's no atomic operations and it
doesn't even have to disable preemption - which is why you never see it showing
up in profiles ever since we switched to slub.

It would however be better to have a standard abstraction for this rather than
open coding it - perhaps adding it to the pagevec code. Please don't just drop
it, though.


> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  mm/filemap.c | 15 ++-------------
>  1 file changed, 2 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 23e3781b3aef..bb1c42d0223c 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2429,8 +2429,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
>  	struct file_ra_state *ra = &filp->f_ra;
>  	struct address_space *mapping = filp->f_mapping;
>  	struct inode *inode = mapping->host;
> -	struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
> -	unsigned int nr_pages = min_t(unsigned int, 512,
> +	struct page *pages[PAGEVEC_SIZE];
> +	unsigned int nr_pages = min_t(unsigned int, PAGEVEC_SIZE,
>  			((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
>  			(iocb->ki_pos >> PAGE_SHIFT));
>  	int i, pg_nr, error = 0;
> @@ -2441,14 +2441,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
>  		return 0;
>  	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
>  
> -	if (nr_pages > ARRAY_SIZE(pages_onstack))
> -		pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
> -
> -	if (!pages) {
> -		pages = pages_onstack;
> -		nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
> -	}
> -
>  	do {
>  		cond_resched();
>  
> @@ -2533,9 +2525,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
>  
>  	file_accessed(filp);
>  
> -	if (pages != pages_onstack)
> -		kfree(pages);
> -
>  	return written ? written : error;
>  }
>  EXPORT_SYMBOL_GPL(generic_file_buffered_read);
> -- 
> 2.28.0
>
Amy Parker Nov. 4, 2020, 9:43 p.m. UTC | #2
On Wed, Nov 4, 2020 at 1:34 PM Kent Overstreet
<kent.overstreet@gmail.com> wrote:
>
> On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> > Increasing the batch size runs into diminishing returns.  It's probably
> > better to make, eg, three calls to filemap_get_pages() than it is to
> > call into kmalloc().
>
> I have to disagree. Working with PAGEVEC_SIZE pages is eventually going to be
> like working with 4k pages today, and have you actually read the slub code for
> the kmalloc fast path? It's _really_ fast, there's no atomic operations and it
> doesn't even have to disable preemption - which is why you never see it showing
> up in profiles ever since we switched to slub.

Yeah, kmalloc's fast path is extremely fast. Let's stay off
PAGEVEC_SIZE for now.

>
> It would however be better to have a standard abstraction for this rather than
> open coding it - perhaps adding it to the pagevec code. Please don't just drop
> it, though.
>
>
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > ---
> >  mm/filemap.c | 15 ++-------------
> >  1 file changed, 2 insertions(+), 13 deletions(-)
> >
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 23e3781b3aef..bb1c42d0223c 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -2429,8 +2429,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
> >       struct file_ra_state *ra = &filp->f_ra;
> >       struct address_space *mapping = filp->f_mapping;
> >       struct inode *inode = mapping->host;
> > -     struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
> > -     unsigned int nr_pages = min_t(unsigned int, 512,
> > +     struct page *pages[PAGEVEC_SIZE];
> > +     unsigned int nr_pages = min_t(unsigned int, PAGEVEC_SIZE,
> >                       ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
> >                       (iocb->ki_pos >> PAGE_SHIFT));
> >       int i, pg_nr, error = 0;
> > @@ -2441,14 +2441,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
> >               return 0;
> >       iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
> >
> > -     if (nr_pages > ARRAY_SIZE(pages_onstack))
> > -             pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
> > -
> > -     if (!pages) {
> > -             pages = pages_onstack;
> > -             nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
> > -     }
> > -
> >       do {
> >               cond_resched();
> >
> > @@ -2533,9 +2525,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
> >
> >       file_accessed(filp);
> >
> > -     if (pages != pages_onstack)
> > -             kfree(pages);
> > -
> >       return written ? written : error;
> >  }
> >  EXPORT_SYMBOL_GPL(generic_file_buffered_read);
> > --
> > 2.28.0
> >

Best regards,
Amy Parker
(they/them)
Matthew Wilcox Nov. 5, 2020, 12:13 a.m. UTC | #3
On Wed, Nov 04, 2020 at 04:30:05PM -0500, Kent Overstreet wrote:
> On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> > Increasing the batch size runs into diminishing returns.  It's probably
> > better to make, eg, three calls to filemap_get_pages() than it is to
> > call into kmalloc().
> 
> I have to disagree. Working with PAGEVEC_SIZE pages is eventually going to be
> like working with 4k pages today, and have you actually read the slub code for
> the kmalloc fast path? It's _really_ fast, there's no atomic operations and it
> doesn't even have to disable preemption - which is why you never see it showing
> up in profiles ever since we switched to slub.
> 
> It would however be better to have a standard abstraction for this rather than
> open coding it - perhaps adding it to the pagevec code. Please don't just drop
> it, though.

I have the beginnings of a patch for that, but I got busy with other stuff
and didn't finish it.

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 081d934eda64..b067d8aab874 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -18,13 +18,21 @@ struct page;
 struct address_space;
 
 struct pagevec {
-	unsigned char nr;
-	bool percpu_pvec_drained;
-	struct page *pages[PAGEVEC_SIZE];
+	union {
+		struct {
+			unsigned char sz;
+			unsigned char nr;
+			bool percpu_pvec_drained;
+			struct page *pages[];
+		};
+		void *__p[PAGEVEC_SIZE + 1];
+	};
 };
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
+struct pagevec *pagevec_alloc(unsigned int sz, gfp_t gfp);
+void pagevec_free(struct pagevec *);
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
 				struct address_space *mapping,
 				pgoff_t start, unsigned nr_entries,
@@ -54,6 +62,7 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
 
 static inline void pagevec_init(struct pagevec *pvec)
 {
+	pvec->sz = PAGEVEC_SIZE;
 	pvec->nr = 0;
 	pvec->percpu_pvec_drained = false;
 }
@@ -63,6 +72,11 @@ static inline void pagevec_reinit(struct pagevec *pvec)
 	pvec->nr = 0;
 }
 
+static inline unsigned pagevec_size(struct pagevec *pvec)
+{
+	return pvec->sz;
+}
+
 static inline unsigned pagevec_count(struct pagevec *pvec)
 {
 	return pvec->nr;
Matthew Wilcox Nov. 5, 2020, 4:52 a.m. UTC | #4
On Wed, Nov 04, 2020 at 04:30:05PM -0500, Kent Overstreet wrote:
> On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> > Increasing the batch size runs into diminishing returns.  It's probably
> > better to make, eg, three calls to filemap_get_pages() than it is to
> > call into kmalloc().
> 
> I have to disagree. Working with PAGEVEC_SIZE pages is eventually going to be
> like working with 4k pages today, and have you actually read the slub code for
> the kmalloc fast path? It's _really_ fast, there's no atomic operations and it
> doesn't even have to disable preemption - which is why you never see it showing
> up in profiles ever since we switched to slub.

I've been puzzling over this, and trying to run some benchmarks to figure
it out.  My test VM is too noisy though; the error bars are too large to
get solid data.

There are three reasons why I think we hit diminishing returns:

1. Cost of going into the slab allocator (one alloc, one free).
Maybe that's not as high as I think it is.

2. Let's say the per-page overhead of walking i_pages is 10% of the
CPU time for a 128kB I/O with a batch size of 1.  Increasing the batch
size to 15 means we walk the array 3 times instead of 32 times, or 0.7%
of the CPU time -- total reduction in CPU time of 9.3%.  Increasing the
batch size to 32 means we only walk the array once, which cuts it down
from 10% to 0.3% -- reduction in CPU time of 9.7%.

If we are doing 2MB I/Os (and most applications I've looked at recently
only do 128kB), and the 10% remains constant, then the batch-size-15
case walks the tree 17 times instead of 512 times -- 0.6%, whereas the
batch-size-512 case walks the tree once -- 0.02%.  But that only looks
like an overall savings of 9.98% versus 9.4%.  And is an extra 0.6%
saving worth it?

3. By the time we're doing such large I/Os, we're surely dominated by
memcpy() and not walking the tree.  Even if the file you're working on
is a terabyte in size, the radix tree is only 5 layers deep.  So that's
five pointer dereferences to find the struct page, and they should stay
in cache (maybe they'd fall out to L2, but surely not as far as L3).
And generally radix tree cachelines stay clean so there shouldn't be any
contention on them from other CPUs unless they're dirtying the pages or
writing them back.
Christoph Hellwig Nov. 6, 2020, 8:07 a.m. UTC | #5
On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> Increasing the batch size runs into diminishing returns.  It's probably
> better to make, eg, three calls to filemap_get_pages() than it is to
> call into kmalloc().

Yes, this always looked weird.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Christoph Hellwig Nov. 6, 2020, 8:08 a.m. UTC | #6
On Thu, Nov 05, 2020 at 12:13:02AM +0000, Matthew Wilcox wrote:
> I have the beginnings of a patch for that, but I got busy with other stuff
> and didn't finish it.

If we have numbers that we want larger batch sizes than the current
pagevec PAGEVEC_SIZE this is the way to go instead of special casing
it in one path.
diff mbox series

Patch

diff --git a/mm/filemap.c b/mm/filemap.c
index 23e3781b3aef..bb1c42d0223c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2429,8 +2429,8 @@  ssize_t generic_file_buffered_read(struct kiocb *iocb,
 	struct file_ra_state *ra = &filp->f_ra;
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
-	struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
-	unsigned int nr_pages = min_t(unsigned int, 512,
+	struct page *pages[PAGEVEC_SIZE];
+	unsigned int nr_pages = min_t(unsigned int, PAGEVEC_SIZE,
 			((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
 			(iocb->ki_pos >> PAGE_SHIFT));
 	int i, pg_nr, error = 0;
@@ -2441,14 +2441,6 @@  ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		return 0;
 	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
 
-	if (nr_pages > ARRAY_SIZE(pages_onstack))
-		pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
-
-	if (!pages) {
-		pages = pages_onstack;
-		nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
-	}
-
 	do {
 		cond_resched();
 
@@ -2533,9 +2525,6 @@  ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 	file_accessed(filp);
 
-	if (pages != pages_onstack)
-		kfree(pages);
-
 	return written ? written : error;
 }
 EXPORT_SYMBOL_GPL(generic_file_buffered_read);