diff mbox series

filemap: add trace events for get_pages, map_pages, and fault

Message ID 20240618093656.1944210-1-takayas@chromium.org (mailing list archive)
State Superseded
Headers show
Series filemap: add trace events for get_pages, map_pages, and fault | expand

Commit Message

Takaya Saeki June 18, 2024, 9:36 a.m. UTC
To allow precise tracking of page caches accessed, add new tracepoints
that trigger when a process actually accesses them.

The ureadahead program used by ChromeOS traces the disk access of
programs as they start up at boot up. It uses mincore(2) or the
'mm_filemap_add_to_page_cache' trace event to accomplish this. It stores
this information in a "pack" file and on subsequent boots, it will read
the pack file and call readahead(2) on the information so that disk
storage can be loaded into RAM before the applications actually need it.

A problem we see is that due to the kernel's readahead algorithm that
can aggressively pull in more data than needed (to try and accomplish
the same goal) and this data is also recorded. The end result is that
the pack file contains a lot of pages on disk that are never actually
used. Calling readahead(2) on these unused pages can slow down the
system boot up times.

To solve this, add 3 new trace events, get_pages, map_pages, and fault.
These will be used to trace the pages are not only pulled in from disk,
but are actually used by the application. Only those pages will be
stored in the pack file, and this helps out the performance of boot up.

With the combination of these 3 new trace events and
mm_filemap_add_to_page_cache, we observed a reduction in the pack file
by 7.3% - 20% on ChromeOS varying by device.

Signed-off-by: Takaya Saeki <takayas@chromium.org>
---
 include/trace/events/filemap.h | 84 ++++++++++++++++++++++++++++++++++
 mm/filemap.c                   |  4 ++
 2 files changed, 88 insertions(+)

Comments

Steven Rostedt June 18, 2024, 7:53 p.m. UTC | #1
On Tue, 18 Jun 2024 09:36:56 +0000
Takaya Saeki <takayas@chromium.org> wrote:

> To allow precise tracking of page caches accessed, add new tracepoints
> that trigger when a process actually accesses them.
> 
> The ureadahead program used by ChromeOS traces the disk access of
> programs as they start up at boot up. It uses mincore(2) or the
> 'mm_filemap_add_to_page_cache' trace event to accomplish this. It stores
> this information in a "pack" file and on subsequent boots, it will read
> the pack file and call readahead(2) on the information so that disk
> storage can be loaded into RAM before the applications actually need it.
> 
> A problem we see is that due to the kernel's readahead algorithm that
> can aggressively pull in more data than needed (to try and accomplish
> the same goal) and this data is also recorded. The end result is that
> the pack file contains a lot of pages on disk that are never actually
> used. Calling readahead(2) on these unused pages can slow down the
> system boot up times.
> 
> To solve this, add 3 new trace events, get_pages, map_pages, and fault.
> These will be used to trace the pages are not only pulled in from disk,
> but are actually used by the application. Only those pages will be
> stored in the pack file, and this helps out the performance of boot up.
> 
> With the combination of these 3 new trace events and
> mm_filemap_add_to_page_cache, we observed a reduction in the pack file
> by 7.3% - 20% on ChromeOS varying by device.
> 
> Signed-off-by: Takaya Saeki <takayas@chromium.org>

Thanks Takaya.

I just applied this and ran:

 # trace-cmd start -e filemap

did a less and then:

 # trace-cmd show | grep less-901 | grep 13f730
            less-901     [005] .....    72.531607: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x113bad ofs=0 order=0
            less-901     [005] .....    72.531613: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x113bcc ofs=4096 order=0
            less-901     [005] .....    72.531617: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x113b19 ofs=8192 order=0
            less-901     [005] .....    72.531620: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x114970 ofs=12288 order=0
            less-901     [005] .....    72.532039: mm_filemap_get_pages: dev 253:3 ino 13f730 ofs=0 max_ofs=4096
            less-901     [005] .....    72.532104: mm_filemap_get_pages: dev 253:3 ino 13f730 ofs=0 max_ofs=4096
            less-901     [005] .....    72.532107: mm_filemap_get_pages: dev 253:3 ino 13f730 ofs=0 max_ofs=4096
            less-901     [005] .....    72.532497: mm_filemap_fault: dev 253:3 ino 13f730 ofs=196608
            less-901     [005] .....    72.532503: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x111232 ofs=131072 order=0
            less-901     [005] .....    72.532505: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x110099 ofs=135168 order=0
            less-901     [005] .....    72.532506: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11496b ofs=139264 order=0
            less-901     [005] .....    72.532509: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x10b061 ofs=143360 order=0
            less-901     [005] .....    72.532511: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x111230 ofs=147456 order=0
            less-901     [005] .....    72.532512: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x110c88 ofs=151552 order=0
            less-901     [005] .....    72.532515: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x1119ef ofs=155648 order=0
            less-901     [005] .....    72.532518: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x111b49 ofs=159744 order=0
            less-901     [005] .....    72.532519: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11498b ofs=163840 order=0
            less-901     [005] .....    72.532521: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x1149ab ofs=167936 order=0
            less-901     [005] .....    72.532523: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x113bcb ofs=172032 order=0
            less-901     [005] .....    72.532525: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x1119f3 ofs=176128 order=0
            less-901     [005] .....    72.532527: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x1171d0 ofs=180224 order=0
            less-901     [005] .....    72.532529: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x112ae7 ofs=184320 order=0
            less-901     [005] .....    72.532531: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x113bc0 ofs=188416 order=0
            less-901     [005] .....    72.532534: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x111ad8 ofs=192512 order=0
            less-901     [005] .....    72.532535: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x1149b6 ofs=196608 order=0

I'm guessing the above is the kernel "readahead" kicking in.

            less-901     [005] .....    72.533002: mm_filemap_fault: dev 253:3 ino 13f730 ofs=196608
            less-901     [005] .....    72.533237: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=0 max_ofs=16384
            less-901     [005] .....    72.533245: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=176128 max_ofs=196608
            less-901     [005] .....    72.533644: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=0 max_ofs=16384
            less-901     [005] .....    72.533644: mm_filemap_fault: dev 253:3 ino 13f730 ofs=4096
            less-901     [005] .....    72.533652: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x110d59 ofs=16384 order=0
            less-901     [005] .....    72.533655: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x1149a4 ofs=20480 order=0
            less-901     [005] .....    72.533658: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11498c ofs=24576 order=0
            less-901     [005] .....    72.533661: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x114969 ofs=28672 order=0
            less-901     [005] .....    72.533663: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11496c ofs=32768 order=0
            less-901     [005] .....    72.533666: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x113bc8 ofs=36864 order=0
            less-901     [005] .....    72.533668: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x113b23 ofs=40960 order=0
            less-901     [005] .....    72.533671: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11497f ofs=45056 order=0
            less-901     [005] .....    72.533674: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x114981 ofs=49152 order=0
            less-901     [005] .....    72.533676: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x114983 ofs=53248 order=0
            less-901     [005] .....    72.533679: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11124e ofs=57344 order=0
            less-901     [005] .....    72.533682: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x114d97 ofs=61440 order=0
            less-901     [005] .....    72.533684: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x113bbe ofs=65536 order=0
            less-901     [005] .....    72.533687: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x1161ff ofs=69632 order=0
            less-901     [005] .....    72.533690: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11622f ofs=73728 order=0
            less-901     [005] .....    72.533692: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x116554 ofs=77824 order=0
            less-901     [005] .....    72.533695: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x116553 ofs=81920 order=0
            less-901     [005] .....    72.533698: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11626d ofs=86016 order=0
            less-901     [005] .....    72.533700: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11621f ofs=90112 order=0
            less-901     [005] .....    72.533703: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x116564 ofs=94208 order=0
            less-901     [005] .....    72.533706: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x116582 ofs=98304 order=0
            less-901     [005] .....    72.533708: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x116580 ofs=102400 order=0
            less-901     [005] .....    72.533713: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11657e ofs=106496 order=0
            less-901     [005] .....    72.533716: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x11657d ofs=110592 order=0
            less-901     [005] .....    72.533718: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x116237 ofs=114688 order=0
            less-901     [005] .....    72.533721: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x1162a1 ofs=118784 order=0
            less-901     [005] .....    72.533724: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x116569 ofs=122880 order=0
            less-901     [005] .....    72.533726: mm_filemap_add_to_page_cache: dev 253:3 ino 13f730 pfn=0x1161fc ofs=126976 order=0

Same here.

            less-901     [005] .....    72.533769: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=0 max_ofs=16384
            less-901     [005] .....    72.534352: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=0 max_ofs=16384
            less-901     [005] .....    72.534354: mm_filemap_fault: dev 253:3 ino 13f730 ofs=16384
            less-901     [005] .....    72.534358: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=0 max_ofs=16384
            less-901     [005] .....    72.534580: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=20480 max_ofs=81920
            less-901     [005] .....    72.534611: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=126976 max_ofs=172032
            less-901     [005] .....    72.534630: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=32768 max_ofs=94208
            less-901     [005] .....    72.534715: mm_filemap_map_pages: dev 253:3 ino 13f730 ofs=98304 max_ofs=122880

Just to get some numbers:

 # trace-cmd show |grep less-901 | grep 13f730 | cut -d: -f2- |grep add_to_page | sort -u |wc -l
 49

 # trace-cmd show |grep less-901 | grep 13f730 | cut -d: -f2- |grep fault | sort -u |wc -l
 3

 # trace-cmd show |grep less-901 | grep 13f730 | cut -d: -f2- |grep get_pages | sort -u |wc -l
 1

 # trace-cmd show |grep less-901 | grep 13f730 | cut -d: -f2- |grep map_pages | sort -u |wc -l
 6

Note, ureadahead ignores duplicate pages.

If these new events really do only show what is used and not what is
just pulled in, and doesn't miss anything, I can see it bringing down
the number of pages needed to be saved dramatically.

Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>

-- Steve
Takaya Saeki June 19, 2024, 5:57 p.m. UTC | #2
Thanks Steven for trying it out.

> I can see it bringing down the number of pages needed to be saved dramatically.

Yes, I agree.
However, note that wc does not count the size of the page caches
correctly since 'get_map_pages' gives you a range. In your example of
the less command, actually the total sizes of page_cache_size and
get_map_pages look the same.

Instead of less, running a large binary such as Chrome (232MB) gives
us a better example. '# trace-cmd record -e filemap chrome --version'
showed 58% reduction from 42MB to 17MB in my environment.
- Total size of mm_filemap_add_to_page_cache: 42,958,848 bytes
- With mm_filemap_map_pages and mm_filemap_get_pages: 17,993,728 bytes

By the way,  'mm_filemap_map_pages' traces a range requested to
populate, which includes pages that are not in caches yet and thus are
skipped. So, you need to calculate the intersection with
mm_filemap_add_to_page_cache to see page caches that are actually
mapped.
I'm wondering if we should put a trace event per each successful page
mapping event as mm_filemap_add_to_page_cache does, by putting an
event inside the page map loop.
Matthew Wilcox June 19, 2024, 9:03 p.m. UTC | #3
On Tue, Jun 18, 2024 at 09:36:56AM +0000, Takaya Saeki wrote:
> +	TP_printk(
> +		"dev %d:%d ino %lx ofs=%lu max_ofs=%lu",

It seems weird to have a space between dev and %d, but an equals between
ofs and %lu.  I see there is some precedent for this elsewhere, but
there are other places which use ino=.  I'd rather:

		"dev=%d:%d ino=%lx ofs=%lu max_ofs=%lu",

> +	TP_printk(
> +		"dev %d:%d ino %lx ofs=%lu",

Likewise.

> +		MAJOR(__entry->s_dev),
> +		MINOR(__entry->s_dev), __entry->i_ino,
> +		__entry->index << PAGE_SHIFT

This needs to be cast to an loff_t before shifting.
Takaya Saeki June 20, 2024, 4:18 p.m. UTC | #4
Thank you Matthew for taking a look at this!
I fix both points and send a V2 patch.

> This needs to be cast to an loff_t before shifting.

I found that this applies to add_to_page_cache and delete_from_page_cache, too.
I could fix them too, but I didn't since it will change the max value
of those traces from %lu to %lld on 32 bit environments, which would
be a breaking change. However, let me know if you think we should fix
existing events as well.
diff mbox series

Patch

diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h
index 46c89c1e460c..68c705572c4f 100644
--- a/include/trace/events/filemap.h
+++ b/include/trace/events/filemap.h
@@ -56,6 +56,90 @@  DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
 	TP_ARGS(folio)
 	);
 
+DECLARE_EVENT_CLASS(mm_filemap_op_page_cache_range,
+
+	TP_PROTO(
+		struct address_space *mapping,
+		pgoff_t index,
+		pgoff_t last_index
+	),
+
+	TP_ARGS(mapping, index, last_index),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, i_ino)
+		__field(dev_t, s_dev)
+		__field(unsigned long, index)
+		__field(unsigned long, last_index)
+	),
+
+	TP_fast_assign(
+		__entry->i_ino = mapping->host->i_ino;
+		if (mapping->host->i_sb)
+			__entry->s_dev =
+				mapping->host->i_sb->s_dev;
+		else
+			__entry->s_dev = mapping->host->i_rdev;
+		__entry->index = index;
+		__entry->last_index = last_index;
+	),
+
+	TP_printk(
+		"dev %d:%d ino %lx ofs=%lu max_ofs=%lu",
+		MAJOR(__entry->s_dev),
+		MINOR(__entry->s_dev), __entry->i_ino,
+		__entry->index << PAGE_SHIFT,
+		__entry->last_index << PAGE_SHIFT
+	)
+);
+
+DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_get_pages,
+	TP_PROTO(
+		struct address_space *mapping,
+		pgoff_t index,
+		pgoff_t last_index
+	),
+	TP_ARGS(mapping, index, last_index)
+);
+
+DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_map_pages,
+	TP_PROTO(
+		struct address_space *mapping,
+		pgoff_t index,
+		pgoff_t last_index
+	),
+	TP_ARGS(mapping, index, last_index)
+);
+
+TRACE_EVENT(mm_filemap_fault,
+	TP_PROTO(struct address_space *mapping, pgoff_t index),
+
+	TP_ARGS(mapping, index),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, i_ino)
+		__field(dev_t, s_dev)
+		__field(unsigned long, index)
+	),
+
+	TP_fast_assign(
+		__entry->i_ino = mapping->host->i_ino;
+		if (mapping->host->i_sb)
+			__entry->s_dev =
+				mapping->host->i_sb->s_dev;
+		else
+			__entry->s_dev = mapping->host->i_rdev;
+		__entry->index = index;
+	),
+
+	TP_printk(
+		"dev %d:%d ino %lx ofs=%lu",
+		MAJOR(__entry->s_dev),
+		MINOR(__entry->s_dev), __entry->i_ino,
+		__entry->index << PAGE_SHIFT
+	)
+);
+
 TRACE_EVENT(filemap_set_wb_err,
 		TP_PROTO(struct address_space *mapping, errseq_t eseq),
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 876cc64aadd7..39f9d7fb3d2c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2556,6 +2556,7 @@  static int filemap_get_pages(struct kiocb *iocb, size_t count,
 			goto err;
 	}
 
+	trace_mm_filemap_get_pages(mapping, index, last_index);
 	return 0;
 err:
 	if (err < 0)
@@ -3286,6 +3287,8 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 	if (unlikely(index >= max_idx))
 		return VM_FAULT_SIGBUS;
 
+	trace_mm_filemap_fault(mapping, index);
+
 	/*
 	 * Do we have something in the page cache already?
 	 */
@@ -3652,6 +3655,7 @@  vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
 	add_mm_counter(vma->vm_mm, folio_type, rss);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);
 out:
 	rcu_read_unlock();