
[1/5] mm: add PSI accounting around ->read_folio and ->readahead calls

Message ID 20220910065058.3303831-2-hch@lst.de (mailing list archive)
State New, archived
Series [1/5] mm: add PSI accounting around ->read_folio and ->readahead calls

Commit Message

Christoph Hellwig Sept. 10, 2022, 6:50 a.m. UTC
PSI tries to account for the cost of bringing back in pages discarded by
the MM LRU management.  Currently that accounting is primarily hooked into
the bio submission path, which is a rather bad place for it:

 - it does not actually account I/O for non-block file systems, of which
   we have many
 - it adds overhead and a layering violation to the block layer

Add the accounting to the two places in the core MM code that read
pages into an address space by calling into ->read_folio and ->readahead,
so that the entire file system operation is covered.  This broadens the
coverage and allows the accounting in the block layer to be removed
going forward.

As psi_memstall_enter can deal with nested calls, this will not lead to
double accounting even while the bio annotations are still present.
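
A rough sketch of why the nesting is safe (illustration only, not part of
the patch): psi_memstall_enter() saves the task's current in_memstall state
in the caller-provided flags word, and both enter and leave return early
when that saved state shows the task was already stalled:

	unsigned long mm_flags, bio_flags;

	psi_memstall_enter(&mm_flags);	/* MM-level annotation, starts the stall */
	psi_memstall_enter(&bio_flags);	/* bio-level annotation, no-op while nested */
	/* ... the actual read / bio submission happens here ... */
	psi_memstall_leave(&bio_flags);	/* no-op, saved state says already stalled */
	psi_memstall_leave(&mm_flags);	/* stall accounting ends here */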

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/pagemap.h |  2 ++
 mm/filemap.c            |  7 +++++++
 mm/readahead.c          | 22 ++++++++++++++++++----
 3 files changed, 27 insertions(+), 4 deletions(-)

Comments

Jens Axboe Sept. 10, 2022, 11:34 a.m. UTC | #1
On 9/10/22 12:50 AM, Christoph Hellwig wrote:
> @@ -2390,8 +2392,13 @@ static int filemap_read_folio(struct file *file, filler_t filler,
>  	 * fails.
>  	 */
>  	folio_clear_error(folio);
> +
>  	/* Start the actual read. The read will unlock the page. */
> +	if (unlikely(workingset))
> +		psi_memstall_enter(&pflags);
>  	error = filler(file, folio);
> +	if (unlikely(workingset))
> +		psi_memstall_leave(&pflags);
>  	if (error)
>  		return error;

I think this would read better as:

  	/* Start the actual read. The read will unlock the page. */
	if (unlikely(workingset)) {
		psi_memstall_enter(&pflags);
		error = filler(file, folio);
		psi_memstall_leave(&pflags);
	} else {
		error = filler(file, folio);
	}
  	if (error)
  		return error;
Matthew Wilcox Sept. 10, 2022, 6:26 p.m. UTC | #2
On Sat, Sep 10, 2022 at 08:50:54AM +0200, Christoph Hellwig wrote:
> @@ -480,11 +487,14 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
>  	if (index == mark)
>  		folio_set_readahead(folio);
>  	err = filemap_add_folio(ractl->mapping, folio, index, gfp);
> -	if (err)
> +	if (err) {
>  		folio_put(folio);
> -	else
> -		ractl->_nr_pages += 1UL << order;
> -	return err;
> +		return err;
> +	}
> +
> +	ractl->_nr_pages += 1UL << order;
> +	ractl->_workingset = folio_test_workingset(folio);

I don't have time to look at this properly right now (about to catch a
bus to the plane), but I think this should be |=, not =?
Christoph Hellwig Sept. 12, 2022, 8:33 a.m. UTC | #3
On Sat, Sep 10, 2022 at 07:26:35PM +0100, Matthew Wilcox wrote:
> I don't have time to look at this properly right now (about to catch a
> bus to the plane), but I think this should be |=, not =?

Yes.
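
(For context: ra_alloc_folio() is called once per folio while
page_cache_ra_order() builds up the batch, so a plain assignment would let
the last folio clobber a flag set by an earlier one.  The fixed tail of
the function would presumably match what page_cache_ra_unbounded() already
does:

	ractl->_nr_pages += 1UL << order;
	ractl->_workingset |= folio_test_workingset(folio);	/* |= instead of = */
	return 0;
)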
Christoph Hellwig Sept. 12, 2022, 8:35 a.m. UTC | #4
On Sat, Sep 10, 2022 at 05:34:02AM -0600, Jens Axboe wrote:
> >  	/* Start the actual read. The read will unlock the page. */
> > +	if (unlikely(workingset))
> > +		psi_memstall_enter(&pflags);
> >  	error = filler(file, folio);
> > +	if (unlikely(workingset))
> > +		psi_memstall_leave(&pflags);
> >  	if (error)
> >  		return error;
> 
> I think this would read better as:
> 
>   	/* Start the actual read. The read will unlock the page. */
> 	if (unlikely(workingset)) {
> 		psi_memstall_enter(&pflags);
> 		error = filler(file, folio);
> 		psi_memstall_leave(&pflags);
> 	} else {
> 		error = filler(file, folio);
> 	}
>   	if (error)
>   		return error;

I had it both ways.  For any non-trivial code in the conditionals I
tend to go with your version all the time.  But when it is just a single
line duplicated in both branches, both variants tend to suck, so I can
live with either one.
Johannes Weiner Sept. 14, 2022, 11:41 a.m. UTC | #5
On Sat, Sep 10, 2022 at 08:50:54AM +0200, Christoph Hellwig wrote:
> PSI tries to account for the cost of bringing back in pages discarded by
> the MM LRU management.  Currently that accounting is primarily hooked into
> the bio submission path, which is a rather bad place for it:
> 
>  - it does not actually account I/O for non-block file systems, of which
>    we have many
>  - it adds overhead and a layering violation to the block layer
> 
> Add the accounting to the two places in the core MM code that read
> pages into an address space by calling into ->read_folio and ->readahead,
> so that the entire file system operation is covered.  This broadens the
> coverage and allows the accounting in the block layer to be removed
> going forward.
> 
> As psi_memstall_enter can deal with nested calls, this will not lead to
> double accounting even while the bio annotations are still present.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

This is much cleaner. With the fixlet Willy pointed out:

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

Patch

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0178b2040ea38..201dc7281640b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1173,6 +1173,8 @@  struct readahead_control {
 	pgoff_t _index;
 	unsigned int _nr_pages;
 	unsigned int _batch_count;
+	bool _workingset;
+	unsigned long _pflags;
 };
 
 #define DEFINE_READAHEAD(ractl, f, r, m, i)				\
diff --git a/mm/filemap.c b/mm/filemap.c
index 15800334147b3..c943d1b90cc26 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2382,6 +2382,8 @@  static void filemap_get_read_batch(struct address_space *mapping,
 static int filemap_read_folio(struct file *file, filler_t filler,
 		struct folio *folio)
 {
+	bool workingset = folio_test_workingset(folio);
+	unsigned long pflags;
 	int error;
 
 	/*
@@ -2390,8 +2392,13 @@  static int filemap_read_folio(struct file *file, filler_t filler,
 	 * fails.
 	 */
 	folio_clear_error(folio);
+
 	/* Start the actual read. The read will unlock the page. */
+	if (unlikely(workingset))
+		psi_memstall_enter(&pflags);
 	error = filler(file, folio);
+	if (unlikely(workingset))
+		psi_memstall_leave(&pflags);
 	if (error)
 		return error;
 
diff --git a/mm/readahead.c b/mm/readahead.c
index fdcd28cbd92de..43631c146d6dc 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -122,6 +122,7 @@ 
 #include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
 #include <linux/pagemap.h>
+#include <linux/psi.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
 #include <linux/mm_inline.h>
@@ -152,6 +153,8 @@  static void read_pages(struct readahead_control *rac)
 	if (!readahead_count(rac))
 		return;
 
+	if (unlikely(rac->_workingset))
+		psi_memstall_enter(&rac->_pflags);
 	blk_start_plug(&plug);
 
 	if (aops->readahead) {
@@ -179,6 +182,9 @@  static void read_pages(struct readahead_control *rac)
 	}
 
 	blk_finish_plug(&plug);
+	if (unlikely(rac->_workingset))
+		psi_memstall_leave(&rac->_pflags);
+	rac->_workingset = false;
 
 	BUG_ON(readahead_count(rac));
 }
@@ -252,6 +258,7 @@  void page_cache_ra_unbounded(struct readahead_control *ractl,
 		}
 		if (i == nr_to_read - lookahead_size)
 			folio_set_readahead(folio);
+		ractl->_workingset |= folio_test_workingset(folio);
 		ractl->_nr_pages++;
 	}
 
@@ -480,11 +487,14 @@  static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 	if (index == mark)
 		folio_set_readahead(folio);
 	err = filemap_add_folio(ractl->mapping, folio, index, gfp);
-	if (err)
+	if (err) {
 		folio_put(folio);
-	else
-		ractl->_nr_pages += 1UL << order;
-	return err;
+		return err;
+	}
+
+	ractl->_nr_pages += 1UL << order;
+	ractl->_workingset = folio_test_workingset(folio);
+	return 0;
 }
 
 void page_cache_ra_order(struct readahead_control *ractl,
@@ -826,6 +836,10 @@  void readahead_expand(struct readahead_control *ractl,
 			put_page(page);
 			return;
 		}
+		if (unlikely(PageWorkingset(page)) && !ractl->_workingset) {
+			ractl->_workingset = true;
+			psi_memstall_enter(&ractl->_pflags);
+		}
 		ractl->_nr_pages++;
 		if (ra) {
 			ra->size++;