diff mbox series

[RFC,1/4] mm, page_owner: add folio allocate post callback for struct page_owner to make the owner clearer

Message ID 20231109032521.392217-2-jeff.xie@linux.dev (mailing list archive)
State New
Headers show
Series mm, page_owner: make the owner in page owner clearer | expand

Commit Message

Jeff Xie Nov. 9, 2023, 3:25 a.m. UTC
Add a callback function in struct page_owner to let the slab layer, the
anon/file handler layer, or any other memory-allocating layer implement
what it would like to report.

Signed-off-by: Jeff Xie <jeff.xie@linux.dev>
---
 include/linux/page_owner.h |  9 +++++++++
 mm/page_owner.c            | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

Comments

Matthew Wilcox Nov. 9, 2023, 1:59 p.m. UTC | #1
On Thu, Nov 09, 2023 at 11:25:18AM +0800, Jeff Xie wrote:
> adding a callback function in the struct page_owner to let the slab layer or the
> anon/file handler layer or any other memory-allocated layers to implement what
> they would like to tell.

There's no need to add a callback.  We can tell what a folio is.

> +	if (page_owner->folio_alloc_post_page_owner) {
> +		rcu_read_lock();
> +		tsk = find_task_by_pid_ns(page_owner->pid, &init_pid_ns);
> +		rcu_read_unlock();
> +		ret += page_owner->folio_alloc_post_page_owner(page_folio(page), tsk, page_owner->data,
> +				kbuf + ret, count - ret);
> +	} else
> +		ret += scnprintf(kbuf + ret, count - ret, "OTHER_PAGE\n");

	if (folio_test_slab(folio))
		ret += slab_page_owner_info(folio, kbuf + ret, count - ret);
	else if (folio_test_anon(folio))
		ret += anon_page_owner_info(folio, kbuf + ret, count - ret);
	else if (folio_test_movable(folio))
		ret += null_page_owner_info(folio, kbuf + ret, count - ret);
	else if (folio->mapping)
		ret += file_page_owner_info(folio, kbuf + ret, count - ret);
	else
		ret += null_page_owner_info(folio, kbuf + ret, count - ret);

In this scenario, I have the anon handling ksm pages, but if that's not
desirable, we can add

	else if (folio_test_ksm(folio))
		ret += ksm_page_owner_info(folio, kbuf + ret, count - ret);

right before the folio_test_anon() clause
Jeff Xie Nov. 9, 2023, 3:25 p.m. UTC | #2
Hi Matthew,

On Thu, Nov 9, 2023 at 10:00 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Nov 09, 2023 at 11:25:18AM +0800, Jeff Xie wrote:
> > adding a callback function in the struct page_owner to let the slab layer or the
> > anon/file handler layer or any other memory-allocated layers to implement what
> > they would like to tell.
>
> There's no need to add a callback.  We can tell what a folio is.
>
> > +     if (page_owner->folio_alloc_post_page_owner) {
> > +             rcu_read_lock();
> > +             tsk = find_task_by_pid_ns(page_owner->pid, &init_pid_ns);
> > +             rcu_read_unlock();
> > +             ret += page_owner->folio_alloc_post_page_owner(page_folio(page), tsk, page_owner->data,
> > +                             kbuf + ret, count - ret);
> > +     } else
> > +             ret += scnprintf(kbuf + ret, count - ret, "OTHER_PAGE\n");
>
>         if (folio_test_slab(folio))
>                 ret += slab_page_owner_info(folio, kbuf + ret, count - ret);
>         else if (folio_test_anon(folio))
>                 ret += anon_page_owner_info(folio, kbuf + ret, count - ret);
>         else if (folio_test_movable(folio))
>                 ret += null_page_owner_info(folio, kbuf + ret, count - ret);
>         else if (folio->mapping)
>                 ret += file_page_owner_info(folio, kbuf + ret, count - ret);
>         else
>                 ret += null_page_owner_info(folio, kbuf + ret, count - ret);
>
> In this scenario, I have the anon handling ksm pages, but if that's not
> desirable, we can add
>
>         else if (folio_test_ksm(folio))
>                 ret += ksm_page_owner_info(folio, kbuf + ret, count - ret);
>
> right before the folio_test_anon() clause

Thank you very much for your advice and guidance.
From the perspective of a folio, it cannot obtain information about
all the situations in which folios are allocated.
If we want to determine whether a folio is related to vmalloc or
kernel_stack or the other memory allocation process,
using just a folio parameter is not sufficient. To achieve this goal,
we can add a callback function to provide more extensibility and
information.

for example:

(the following "OTHER_PAGE" will be replaced with the specified
information later)

 Page allocated via order 0, mask
0x2102(__GFP_HIGHMEM|__GFP_NOWARN|__GFP_ZERO), pid 0, tgid 0
(swapper/0), ts 167618849 ns
OTHER_PAGE
PFN 0x4a92 type Unmovable Block 37 type Unmovable Flags
0x1fffc0000000000(node=0|zone=1|lastcpupid=0x3fff)
 post_alloc_hook+0x77/0xf0
 get_page_from_freelist+0x58d/0x14e0
 __alloc_pages+0x1b2/0x380
 __alloc_pages_bulk+0x39f/0x620
 alloc_pages_bulk_array_mempolicy+0x1f4/0x210
 __vmalloc_node_range+0x756/0x870
 __vmalloc_node+0x48/0x60
 gen_pool_add_owner+0x3e/0xb0
 mce_gen_pool_init+0x5a/0x90
 mcheck_cpu_init+0x170/0x4c0
 identify_cpu+0x55f/0x7e0
 arch_cpu_finalize_init+0x10/0x100
 start_kernel+0x517/0x8e0
 x86_64_start_reservations+0x18/0x30
 x86_64_start_kernel+0xc6/0xe0
 secondary_startup_64_no_verify+0x178/0x17b
Matthew Wilcox Nov. 9, 2023, 3:36 p.m. UTC | #3
On Thu, Nov 09, 2023 at 11:25:18PM +0800, Jeff Xie wrote:
> From the perspective of a folio, it cannot obtain information about
> all the situations in which folios are allocated.
> If we want to determine whether a folio is related to vmalloc or
> kernel_stack or the other memory allocation process,
> using just a folio parameter is not sufficient. To achieve this goal,
> we can add a callback function to provide more extensibility and
> information.

But we want that anyway (or at least I do).  You're right that vmalloc
pages are not marked as being vmalloc pages and don't contain the
information about which vmalloc area they belong to.  I've talked about
ways we can add that information to folios in the past, but I have a lot
of other projects I'm working on.  Are you interested in doing that?
Jeff Xie Nov. 9, 2023, 4:04 p.m. UTC | #4
November 9, 2023 at 11:36 PM, "Matthew Wilcox" <willy@infradead.org> wrote:


> 
> On Thu, Nov 09, 2023 at 11:25:18PM +0800, Jeff Xie wrote:
> 
> > 
> > From the perspective of a folio, it cannot obtain information about
> >  all the situations in which folios are allocated.
> >  If we want to determine whether a folio is related to vmalloc or
> >  kernel_stack or the other memory allocation process,
> >  using just a folio parameter is not sufficient. To achieve this goal,
> >  we can add a callback function to provide more extensibility and
> >  information.
> > 
> 
> But we want that anyway (or at least I do). You're right that vmalloc
> pages are not marked as being vmalloc pages and don't contain the
> information about which vmalloc area they belong to. I've talked about
> ways we can add that information to folios in the past, but I have a lot
> of other projects I'm working on. Are you interested in doing that?
>

Certainly, I'm willing to give it a try. If a folio can include vmalloc information
or more information, this is great. I may need to understand the background of why
you proposed this method in the past.

--
Jeff Xie
Matthew Wilcox Nov. 9, 2023, 4:38 p.m. UTC | #5
On Thu, Nov 09, 2023 at 04:04:39PM +0000, jeff.xie@linux.dev wrote:
> November 9, 2023 at 11:36 PM, "Matthew Wilcox" <willy@infradead.org> wrote:
> > But we want that anyway (or at least I do). You're right that vmalloc
> > pages are not marked as being vmalloc pages and don't contain the
> > information about which vmalloc area they belong to. I've talked about
> > ways we can add that information to folios in the past, but I have a lot
> > of other projects I'm working on. Are you interested in doing that?
> >
> 
> Certainly, I'm willing to give it a try. If a folio can include vmalloc information
> or more information, this is great. I may need to understand the background of why
> you proposed this method in the past.

I can't find the proposal now, but it's basically this:

Turn PG_slab into a PG_kernel.  If PG_kernel is set, then other flags
change their meaning.  Flags that should be reusable: writeback,
referenced, uptodate, lru, active, workingset, private, reclaim

One of those flags gets reused to be the new slab.  So, eg
folio_test_slab() becomes:

	return (folio->flags & (PG_kernel | PG_slab)) == (PG_kernel | PG_slab);

Now we have somewhere that we can use for PG_vmalloc (also PG_reserved
can become a PG_kernel sub-flag, freeing up a page flag).

We'd need to change some helpers.  eg folio_mapping() currently does:

        if (unlikely(folio_test_slab(folio)))
                return NULL;

and that should be:

	if (unlikely(folio_test_kernel(folio)))
		return NULL;

With that in place, we can reuse the folio->mapping space to point to
the struct vm_struct that allocated it.

This isn't an easy project and will require a lot of testing.  It has
some upsides, like freeing up a page flag.
diff mbox series

Patch

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 119a0c9d2a8b..71698d82df7c 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -4,6 +4,9 @@ 
 
 #include <linux/jump_label.h>
 
+typedef int (folio_alloc_post_page_owner_t)(struct folio *folio, struct task_struct *tsk,
+			void *data, char *kbuf, size_t count);
+
 #ifdef CONFIG_PAGE_OWNER
 extern struct static_key_false page_owner_inited;
 extern struct page_ext_operations page_owner_ops;
@@ -17,6 +20,8 @@  extern void __set_page_owner_migrate_reason(struct page *page, int reason);
 extern void __dump_page_owner(const struct page *page);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 					pg_data_t *pgdat, struct zone *zone);
+extern void set_folio_alloc_post_page_owner(struct folio *folio,
+		folio_alloc_post_page_owner_t *folio_alloc_post_page_owner, void *data);
 
 static inline void reset_page_owner(struct page *page, unsigned short order)
 {
@@ -72,5 +77,9 @@  static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 static inline void dump_page_owner(const struct page *page)
 {
 }
+static inline void set_folio_alloc_post_page_owner(struct folio *folio,
+		folio_alloc_post_page_owner_t *folio_alloc_post_page_owner, void *data)
+{
+}
 #endif /* CONFIG_PAGE_OWNER */
 #endif /* __LINUX_PAGE_OWNER_H */
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 4f13ce7d2452..4de03a7a10d4 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -32,6 +32,9 @@  struct page_owner {
 	char comm[TASK_COMM_LEN];
 	pid_t pid;
 	pid_t tgid;
+	folio_alloc_post_page_owner_t *folio_alloc_post_page_owner;
+	/* for folio_alloc_post_page_owner function parameter */
+	void *data;
 };
 
 static bool page_owner_enabled __initdata;
@@ -152,6 +155,8 @@  void __reset_page_owner(struct page *page, unsigned short order)
 		page_owner = get_page_owner(page_ext);
 		page_owner->free_handle = handle;
 		page_owner->free_ts_nsec = free_ts_nsec;
+		page_owner->folio_alloc_post_page_owner = NULL;
+		page_owner->data = NULL;
 		page_ext = page_ext_next(page_ext);
 	}
 	page_ext_put(page_ext);
@@ -256,6 +261,8 @@  void __folio_copy_owner(struct folio *newfolio, struct folio *old)
 	new_page_owner->ts_nsec = old_page_owner->ts_nsec;
 	new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
 	strcpy(new_page_owner->comm, old_page_owner->comm);
+	new_page_owner->folio_alloc_post_page_owner = old_page_owner->folio_alloc_post_page_owner;
+	new_page_owner->data = old_page_owner->data;
 
 	/*
 	 * We don't clear the bit on the old folio as it's going to be freed
@@ -272,6 +279,25 @@  void __folio_copy_owner(struct folio *newfolio, struct folio *old)
 	page_ext_put(old_ext);
 }
 
+void set_folio_alloc_post_page_owner(struct folio *folio,
+		folio_alloc_post_page_owner_t *folio_alloc_post_page_owner, void *data)
+{
+	struct page *page;
+	struct page_ext *page_ext;
+	struct page_owner *page_owner;
+
+	page = &folio->page;
+	page_ext = page_ext_get(page);
+	if (unlikely(!page_ext))
+		return;
+
+	page_owner = get_page_owner(page_ext);
+	page_owner->folio_alloc_post_page_owner = folio_alloc_post_page_owner;
+	page_owner->data = data;
+
+	page_ext_put(page_ext);
+}
+
 void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 				       pg_data_t *pgdat, struct zone *zone)
 {
@@ -400,6 +426,7 @@  print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 		depot_stack_handle_t handle)
 {
 	int ret, pageblock_mt, page_mt;
+	struct task_struct *tsk;
 	char *kbuf;
 
 	count = min_t(size_t, count, PAGE_SIZE);
@@ -414,6 +441,15 @@  print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 			page_owner->tgid, page_owner->comm,
 			page_owner->ts_nsec);
 
+	if (page_owner->folio_alloc_post_page_owner) {
+		rcu_read_lock();
+		tsk = find_task_by_pid_ns(page_owner->pid, &init_pid_ns);
+		rcu_read_unlock();
+		ret += page_owner->folio_alloc_post_page_owner(page_folio(page), tsk, page_owner->data,
+				kbuf + ret, count - ret);
+	} else
+		ret += scnprintf(kbuf + ret, count - ret, "OTHER_PAGE\n");
+
 	/* Print information relevant to grouping pages by mobility */
 	pageblock_mt = get_pageblock_migratetype(page);
 	page_mt  = gfp_migratetype(page_owner->gfp_mask);