diff mbox series

[RFC,v3,2/8] mm: aggregate working set information into histograms

Message ID 20240327213108.2384666-3-yuanchu@google.com (mailing list archive)
State New
Headers show
Series mm: workingset reporting | expand

Commit Message

Yuanchu Xie March 27, 2024, 9:31 p.m. UTC
Hierarchically aggregate all memcgs' MGLRU generations and their
page counts into working set page age histograms.
The histograms break down the system's working set per-node,
per-anon/file.

The sysfs interfaces are as follows:
/sys/devices/system/node/nodeX/page_age
	A per-node page age histogram, showing an aggregate of the
	node's lruvecs. The information is extracted from MGLRU's
	per-generation page counters. Reading this file causes a
	hierarchical aging of all lruvecs, scanning pages and creates a
	new generation in each lruvec.
	For example:
	1000 anon=0 file=0
	2000 anon=0 file=0
	100000 anon=5533696 file=5566464
	18446744073709551615 anon=0 file=0

/sys/devices/system/node/nodeX/page_age_interval
	A comma separated list of time in milliseconds that configures
	what the page age histogram uses for aggregation.

Signed-off-by: Yuanchu Xie <yuanchu@google.com>
---
 drivers/base/node.c               |   3 +
 include/linux/mmzone.h            |   4 +
 include/linux/workingset_report.h |  69 +++++
 mm/Kconfig                        |   9 +
 mm/Makefile                       |   1 +
 mm/internal.h                     |   9 +
 mm/memcontrol.c                   |   2 +
 mm/mmzone.c                       |   2 +
 mm/vmscan.c                       |  34 ++-
 mm/workingset_report.c            | 413 ++++++++++++++++++++++++++++++
 10 files changed, 545 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/workingset_report.h
 create mode 100644 mm/workingset_report.c

Comments

Huang, Ying April 9, 2024, 7:18 a.m. UTC | #1
Yuanchu Xie <yuanchu@google.com> writes:

> Hierarchically aggregate all memcgs' MGLRU generations and their
> page counts into working set page age histograms.
> The histograms break down the system's working set per-node,
> per-anon/file.
>
> The sysfs interfaces are as follows:
> /sys/devices/system/node/nodeX/page_age
> 	A per-node page age histogram, showing an aggregate of the
> 	node's lruvecs. The information is extracted from MGLRU's
> 	per-generation page counters. Reading this file causes a
> 	hierarchical aging of all lruvecs, scanning pages and creates a
> 	new generation in each lruvec.
> 	For example:
> 	1000 anon=0 file=0
> 	2000 anon=0 file=0
> 	100000 anon=5533696 file=5566464
> 	18446744073709551615 anon=0 file=0
>
> /sys/devices/system/node/nodeX/page_age_interval
> 	A comma separated list of time in milliseconds that configures
> 	what the page age histogram uses for aggregation.
>
> Signed-off-by: Yuanchu Xie <yuanchu@google.com>
> ---
>  drivers/base/node.c               |   3 +
>  include/linux/mmzone.h            |   4 +
>  include/linux/workingset_report.h |  69 +++++
>  mm/Kconfig                        |   9 +
>  mm/Makefile                       |   1 +
>  mm/internal.h                     |   9 +
>  mm/memcontrol.c                   |   2 +
>  mm/mmzone.c                       |   2 +
>  mm/vmscan.c                       |  34 ++-
>  mm/workingset_report.c            | 413 ++++++++++++++++++++++++++++++
>  10 files changed, 545 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/workingset_report.h
>  create mode 100644 mm/workingset_report.c
>
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 1c05640461dd..4f589b8253f4 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -20,6 +20,7 @@
>  #include <linux/pm_runtime.h>
>  #include <linux/swap.h>
>  #include <linux/slab.h>
> +#include <linux/workingset_report.h>
>  
>  static const struct bus_type node_subsys = {
>  	.name = "node",
> @@ -625,6 +626,7 @@ static int register_node(struct node *node, int num)
>  	} else {
>  		hugetlb_register_node(node);
>  		compaction_register_node(node);
> +		wsr_register_node(node);
>  	}
>  
>  	return error;
> @@ -641,6 +643,7 @@ void unregister_node(struct node *node)
>  {
>  	hugetlb_unregister_node(node);
>  	compaction_unregister_node(node);
> +	wsr_unregister_node(node);
>  	node_remove_accesses(node);
>  	node_remove_caches(node);
>  	device_unregister(&node->dev);
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index a497f189d988..8839931646ee 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -24,6 +24,7 @@
>  #include <linux/local_lock.h>
>  #include <linux/zswap.h>
>  #include <asm/page.h>
> +#include <linux/workingset_report.h>
>  
>  /* Free memory management - zoned buddy allocator.  */
>  #ifndef CONFIG_ARCH_FORCE_MAX_ORDER
> @@ -625,6 +626,9 @@ struct lruvec {
>  	struct lru_gen_mm_state		mm_state;
>  #endif
>  #endif /* CONFIG_LRU_GEN */
> +#ifdef CONFIG_WORKINGSET_REPORT
> +	struct wsr_state	wsr;
> +#endif /* CONFIG_WORKINGSET_REPORT */
>  #ifdef CONFIG_MEMCG
>  	struct pglist_data *pgdat;
>  #endif
> diff --git a/include/linux/workingset_report.h b/include/linux/workingset_report.h
> new file mode 100644
> index 000000000000..0de640cb1ef0
> --- /dev/null
> +++ b/include/linux/workingset_report.h
> @@ -0,0 +1,69 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_WORKINGSET_REPORT_H
> +#define _LINUX_WORKINGSET_REPORT_H
> +
> +#include <linux/types.h>
> +#include <linux/mutex.h>
> +
> +struct mem_cgroup;
> +struct pglist_data;
> +struct node;
> +struct lruvec;
> +
> +#ifdef CONFIG_WORKINGSET_REPORT
> +
> +#define WORKINGSET_REPORT_MIN_NR_BINS 2
> +#define WORKINGSET_REPORT_MAX_NR_BINS 32
> +
> +#define WORKINGSET_INTERVAL_MAX ((unsigned long)-1)
> +#define ANON_AND_FILE 2
> +
> +struct wsr_report_bin {
> +	unsigned long idle_age;
> +	unsigned long nr_pages[ANON_AND_FILE];
> +};
> +
> +struct wsr_report_bins {
> +	unsigned long nr_bins;
> +	/* last bin contains WORKINGSET_INTERVAL_MAX */
> +	struct wsr_report_bin bins[WORKINGSET_REPORT_MAX_NR_BINS];
> +};
> +
> +struct wsr_page_age_histo {
> +	unsigned long timestamp;
> +	struct wsr_report_bins bins;
> +};
> +
> +struct wsr_state {
> +	/* breakdown of workingset by page age */
> +	struct mutex page_age_lock;
> +	struct wsr_page_age_histo *page_age;
> +};
> +
> +void wsr_init(struct lruvec *lruvec);
> +void wsr_destroy(struct lruvec *lruvec);
> +
> +/*
> + * Returns true if the wsr is configured to be refreshed.
> + * The next refresh time is stored in refresh_time.
> + */
> +bool wsr_refresh_report(struct wsr_state *wsr, struct mem_cgroup *root,
> +			struct pglist_data *pgdat);
> +void wsr_register_node(struct node *node);
> +void wsr_unregister_node(struct node *node);
> +#else
> +static inline void wsr_init(struct lruvec *lruvec)
> +{
> +}
> +static inline void wsr_destroy(struct lruvec *lruvec)
> +{
> +}
> +static inline void wsr_register_node(struct node *node)
> +{
> +}
> +static inline void wsr_unregister_node(struct node *node)
> +{
> +}
> +#endif /* CONFIG_WORKINGSET_REPORT */
> +
> +#endif /* _LINUX_WORKINGSET_REPORT_H */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index ffc3a2ba3a8c..212f203b10b9 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1261,6 +1261,15 @@ config LOCK_MM_AND_FIND_VMA
>  config IOMMU_MM_DATA
>  	bool
>  
> +config WORKINGSET_REPORT
> +	bool "Working set reporting"
> +	depends on LRU_GEN && SYSFS
> +	help
> +	  Report system and per-memcg working set to userspace.
> +
> +	  This option exports stats and events giving the user more insight
> +	  into its memory working set.
> +
>  source "mm/damon/Kconfig"
>  
>  endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index e4b5b75aaec9..57093657030d 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -92,6 +92,7 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
>  obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
>  obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
>  obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
> +obj-$(CONFIG_WORKINGSET_REPORT) += workingset_report.o
>  ifdef CONFIG_SWAP
>  obj-$(CONFIG_MEMCG) += swap_cgroup.o
>  endif
> diff --git a/mm/internal.h b/mm/internal.h
> index f309a010d50f..5e0caba64ee4 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -198,12 +198,21 @@ extern unsigned long highest_memmap_pfn;
>  /*
>   * in mm/vmscan.c:
>   */
> +struct scan_control;
>  bool isolate_lru_page(struct page *page);
>  bool folio_isolate_lru(struct folio *folio);
>  void putback_lru_page(struct page *page);
>  void folio_putback_lru(struct folio *folio);
>  extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
>  
> +#ifdef CONFIG_WORKINGSET_REPORT
> +/*
> + * in mm/wsr.c
> + */
> +/* Requires wsr->page_age_lock held */
> +void wsr_refresh_scan(struct lruvec *lruvec);
> +#endif
> +
>  /*
>   * in mm/rmap.c:
>   */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1ed40f9d3a27..2f07141de16c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -65,6 +65,7 @@
>  #include <linux/seq_buf.h>
>  #include <linux/sched/isolation.h>
>  #include <linux/kmemleak.h>
> +#include <linux/workingset_report.h>
>  #include "internal.h"
>  #include <net/sock.h>
>  #include <net/ip.h>
> @@ -5457,6 +5458,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
>  	if (!pn)
>  		return;
>  
> +	wsr_destroy(&pn->lruvec);
>  	free_percpu(pn->lruvec_stats_percpu);
>  	kfree(pn);
>  }
> diff --git a/mm/mmzone.c b/mm/mmzone.c
> index c01896eca736..efca44c1b84b 100644
> --- a/mm/mmzone.c
> +++ b/mm/mmzone.c
> @@ -90,6 +90,8 @@ void lruvec_init(struct lruvec *lruvec)
>  	 */
>  	list_del(&lruvec->lists[LRU_UNEVICTABLE]);
>  
> +	wsr_init(lruvec);
> +
>  	lru_gen_init_lruvec(lruvec);
>  }
>  
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1a7c7d537db6..b694d80ab2d1 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -56,6 +56,7 @@
>  #include <linux/khugepaged.h>
>  #include <linux/rculist_nulls.h>
>  #include <linux/random.h>
> +#include <linux/workingset_report.h>
>  
>  #include <asm/tlbflush.h>
>  #include <asm/div64.h>
> @@ -3815,7 +3816,7 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
>  	return success;
>  }
>  
> -static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
> +bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
>  			       struct scan_control *sc, bool can_swap, bool force_scan)

It appears that this change isn't necessary.

>  {
>  	bool success;
> @@ -5606,6 +5607,8 @@ static int __init init_lru_gen(void)
>  	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
>  		pr_err("lru_gen: failed to create sysfs group\n");
>  
> +	wsr_register_node(NULL);
> +
>  	debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
>  	debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
>  
> @@ -5613,6 +5616,35 @@ static int __init init_lru_gen(void)
>  };
>  late_initcall(init_lru_gen);
>  
> +/******************************************************************************
> + *                          workingset reporting
> + ******************************************************************************/
> +#ifdef CONFIG_WORKINGSET_REPORT
> +void wsr_refresh_scan(struct lruvec *lruvec)
> +{
> +	DEFINE_MAX_SEQ(lruvec);
> +	struct scan_control sc = {
> +		.may_writepage = true,
> +		.may_unmap = true,
> +		.may_swap = true,
> +		.proactive = true,
> +		.reclaim_idx = MAX_NR_ZONES - 1,
> +		.gfp_mask = GFP_KERNEL,
> +	};
> +	unsigned int flags;
> +
> +	set_task_reclaim_state(current, &sc.reclaim_state);
> +	flags = memalloc_noreclaim_save();
> +	/*
> +	 * setting can_swap=true and force_scan=true ensures
> +	 * proper workingset stats when the system cannot swap.
> +	 */
> +	try_to_inc_max_seq(lruvec, max_seq, &sc, true, true);
> +	memalloc_noreclaim_restore(flags);
> +	set_task_reclaim_state(current, NULL);
> +}
> +#endif /* CONFIG_WORKINGSET_REPORT */
> +
>  #else /* !CONFIG_LRU_GEN */
>  
>  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
> diff --git a/mm/workingset_report.c b/mm/workingset_report.c
> new file mode 100644
> index 000000000000..98cdaffcb6b4
> --- /dev/null
> +++ b/mm/workingset_report.c
> @@ -0,0 +1,413 @@
> +// SPDX-License-Identifier: GPL-2.0
> +//
> +#include <linux/export.h>
> +#include <linux/lockdep.h>
> +#include <linux/jiffies.h>
> +#include <linux/kernfs.h>
> +#include <linux/memcontrol.h>
> +#include <linux/rcupdate.h>
> +#include <linux/mutex.h>
> +#include <linux/err.h>
> +#include <linux/atomic.h>
> +#include <linux/node.h>
> +#include <linux/mmzone.h>
> +#include <linux/mm.h>
> +#include <linux/mm_inline.h>
> +#include <linux/workingset_report.h>
> +
> +#include "internal.h"
> +
> +void wsr_init(struct lruvec *lruvec)
> +{
> +	struct wsr_state *wsr = &lruvec->wsr;
> +
> +	memset(wsr, 0, sizeof(*wsr));
> +	mutex_init(&wsr->page_age_lock);
> +}
> +
> +void wsr_destroy(struct lruvec *lruvec)
> +{
> +	struct wsr_state *wsr = &lruvec->wsr;
> +
> +	mutex_destroy(&wsr->page_age_lock);
> +	kfree(wsr->page_age);
> +	memset(wsr, 0, sizeof(*wsr));
> +}
> +
> +static int workingset_report_intervals_parse(char *src,
> +				      struct wsr_report_bins *bins)
> +{
> +	int err = 0, i = 0;
> +	char *cur, *next = strim(src);
> +
> +	if (*next == '\0')
> +		return 0;
> +
> +	while ((cur = strsep(&next, ","))) {
> +		unsigned int interval;
> +
> +		err = kstrtouint(cur, 0, &interval);
> +		if (err)
> +			goto out;
> +
> +		bins->bins[i].idle_age = msecs_to_jiffies(interval);
> +		if (i > 0 && bins->bins[i].idle_age <= bins->bins[i - 1].idle_age) {
> +			err = -EINVAL;
> +			goto out;
> +		}
> +
> +		if (++i == WORKINGSET_REPORT_MAX_NR_BINS) {
> +			err = -ERANGE;
> +			goto out;
> +		}
> +	}
> +
> +	if (i && i < WORKINGSET_REPORT_MIN_NR_BINS - 1) {
> +		err = -ERANGE;
> +		goto out;
> +	}
> +
> +	bins->nr_bins = i;
> +	bins->bins[i].idle_age = WORKINGSET_INTERVAL_MAX;
> +out:
> +	return err ?: i;
> +}
> +
> +static unsigned long get_gen_start_time(const struct lru_gen_folio *lrugen,
> +					unsigned long seq,
> +					unsigned long max_seq,
> +					unsigned long curr_timestamp)
> +{
> +	int younger_gen;
> +
> +	if (seq == max_seq)
> +		return curr_timestamp;
> +	younger_gen = lru_gen_from_seq(seq + 1);
> +	return READ_ONCE(lrugen->timestamps[younger_gen]);
> +}
> +
> +static void collect_page_age_type(const struct lru_gen_folio *lrugen,
> +				  struct wsr_report_bin *bin,
> +				  unsigned long max_seq, unsigned long min_seq,
> +				  unsigned long curr_timestamp, int type)
> +{
> +	unsigned long seq;
> +
> +	for (seq = max_seq; seq + 1 > min_seq; seq--) {
> +		int gen, zone;
> +		unsigned long gen_end, gen_start, size = 0;
> +
> +		gen = lru_gen_from_seq(seq);
> +
> +		for (zone = 0; zone < MAX_NR_ZONES; zone++)
> +			size += max(
> +				READ_ONCE(lrugen->nr_pages[gen][type][zone]),
> +				0L);
> +
> +		gen_start = get_gen_start_time(lrugen, seq, max_seq,
> +					       curr_timestamp);
> +		gen_end = READ_ONCE(lrugen->timestamps[gen]);
> +
> +		while (bin->idle_age != WORKINGSET_INTERVAL_MAX &&
> +		       time_before(gen_end + bin->idle_age, curr_timestamp)) {
> +			unsigned long gen_in_bin = (long)gen_start -
> +						   (long)curr_timestamp +
> +						   (long)bin->idle_age;
> +			unsigned long gen_len = (long)gen_start - (long)gen_end;
> +
> +			if (!gen_len)
> +				break;
> +			if (gen_in_bin) {
> +				unsigned long split_bin =
> +					size / gen_len * gen_in_bin;
> +
> +				bin->nr_pages[type] += split_bin;
> +				size -= split_bin;
> +			}
> +			gen_start = curr_timestamp - bin->idle_age;
> +			bin++;
> +		}
> +		bin->nr_pages[type] += size;
> +	}
> +}
> +
> +/*
> + * proportionally aggregate Multi-gen LRU bins into a working set report
> + * MGLRU generations:
> + * current time
> + * |         max_seq timestamp
> + * |         |     max_seq - 1 timestamp
> + * |         |     |               unbounded
> + * |         |     |               |
> + * --------------------------------
> + * | max_seq | ... | ... | min_seq
> + * --------------------------------
> + *
> + * Bins:
> + *
> + * current time
> + * |       current - idle_age[0]
> + * |       |     current - idle_age[1]
> + * |       |     |               unbounded
> + * |       |     |               |
> + * ------------------------------
> + * | bin 0 | ... | ... | bin n-1
> + * ------------------------------
> + *
> + * Assume the heuristic that pages are in the MGLRU generation
> + * through uniform accesses, so we can aggregate them
> + * proportionally into bins.
> + */
> +static void collect_page_age(struct wsr_page_age_histo *page_age,
> +			     const struct lruvec *lruvec)
> +{
> +	int type;
> +	const struct lru_gen_folio *lrugen = &lruvec->lrugen;
> +	unsigned long curr_timestamp = jiffies;
> +	unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq);
> +	unsigned long min_seq[ANON_AND_FILE] = {
> +		READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_ANON]),
> +		READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_FILE]),
> +	};
> +	struct wsr_report_bins *bins = &page_age->bins;
> +
> +	for (type = 0; type < ANON_AND_FILE; type++) {
> +		struct wsr_report_bin *bin = &bins->bins[0];
> +
> +		collect_page_age_type(lrugen, bin, max_seq, min_seq[type],
> +				      curr_timestamp, type);
> +	}
> +}
> +
> +/* First step: hierarchically scan child memcgs. */
> +static void refresh_scan(struct wsr_state *wsr, struct mem_cgroup *root,
> +			 struct pglist_data *pgdat)
> +{
> +	struct mem_cgroup *memcg;
> +
> +	memcg = mem_cgroup_iter(root, NULL, NULL);
> +	do {
> +		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
> +
> +		wsr_refresh_scan(lruvec);
> +		cond_resched();
> +	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> +}
> +
> +/* Second step: aggregate child memcgs into the page age histogram. */
> +static void refresh_aggregate(struct wsr_page_age_histo *page_age,
> +			      struct mem_cgroup *root,
> +			      struct pglist_data *pgdat)
> +{
> +	struct mem_cgroup *memcg;
> +	struct wsr_report_bin *bin;
> +
> +	/*
> +	 * page_age_intervals should free the page_age struct
> +	 * if no intervals are provided.
> +	 */
> +	VM_WARN_ON_ONCE(page_age->bins.bins[0].idle_age ==
> +			WORKINGSET_INTERVAL_MAX);
> +
> +	for (bin = page_age->bins.bins;
> +	     bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++) {
> +		bin->nr_pages[0] = 0;
> +		bin->nr_pages[1] = 0;
> +	}
> +	/* the last used bin has idle_age == WORKINGSET_INTERVAL_MAX. */
> +	bin->nr_pages[0] = 0;
> +	bin->nr_pages[1] = 0;
> +
> +	memcg = mem_cgroup_iter(root, NULL, NULL);
> +	do {
> +		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
> +
> +		collect_page_age(page_age, lruvec);
> +		cond_resched();
> +	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> +	WRITE_ONCE(page_age->timestamp, jiffies);
> +}
> +
> +bool wsr_refresh_report(struct wsr_state *wsr, struct mem_cgroup *root,
> +			struct pglist_data *pgdat)
> +{
> +	struct wsr_page_age_histo *page_age;
> +
> +	if (!READ_ONCE(wsr->page_age))
> +		return false;
> +
> +	refresh_scan(wsr, root, pgdat);
> +	mutex_lock(&wsr->page_age_lock);
> +	page_age = READ_ONCE(wsr->page_age);
> +	if (page_age)
> +		refresh_aggregate(page_age, root, pgdat);
> +	mutex_unlock(&wsr->page_age_lock);
> +	return !!page_age;
> +}
> +EXPORT_SYMBOL_GPL(wsr_refresh_report);
> +
> +static struct pglist_data *kobj_to_pgdat(struct kobject *kobj)
> +{
> +	int nid = IS_ENABLED(CONFIG_NUMA) ? kobj_to_dev(kobj)->id :
> +					    first_memory_node;
> +
> +	return NODE_DATA(nid);
> +}
> +
> +static struct wsr_state *kobj_to_wsr(struct kobject *kobj)
> +{
> +	return &mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj))->wsr;
> +}
> +
> +static ssize_t page_age_intervals_show(struct kobject *kobj,
> +				       struct kobj_attribute *attr, char *buf)
> +{
> +	int len = 0;
> +	struct wsr_state *wsr = kobj_to_wsr(kobj);
> +
> +	mutex_lock(&wsr->page_age_lock);
> +
> +	if (!!wsr->page_age) {
> +		int i;
> +		int nr_bins = wsr->page_age->bins.nr_bins;
> +
> +		for (i = 0; i < nr_bins; ++i) {
> +			struct wsr_report_bin *bin =
> +				&wsr->page_age->bins.bins[i];
> +
> +			len += sysfs_emit_at(buf, len, "%u",
> +					     jiffies_to_msecs(bin->idle_age));
> +			if (i + 1 < nr_bins)
> +				len += sysfs_emit_at(buf, len, ",");
> +		}
> +	}
> +	len += sysfs_emit_at(buf, len, "\n");
> +
> +	mutex_unlock(&wsr->page_age_lock);
> +	return len;
> +}
> +
> +static ssize_t page_age_intervals_store(struct kobject *kobj,
> +					struct kobj_attribute *attr,
> +					const char *src, size_t len)
> +{
> +	struct wsr_page_age_histo *page_age = NULL, *old;
> +	char *buf = NULL;
> +	int err = 0;
> +	struct wsr_state *wsr = kobj_to_wsr(kobj);
> +
> +	buf = kstrdup(src, GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto failed;
> +	}
> +
> +	page_age =
> +		kzalloc(sizeof(struct wsr_page_age_histo), GFP_KERNEL_ACCOUNT);
> +
> +	if (!page_age) {
> +		err = -ENOMEM;
> +		goto failed;
> +	}
> +
> +	err = workingset_report_intervals_parse(buf, &page_age->bins);
> +	if (err < 0)
> +		goto failed;
> +
> +	if (err == 0) {
> +		kfree(page_age);
> +		page_age = NULL;
> +	}
> +
> +	mutex_lock(&wsr->page_age_lock);
> +	old = xchg(&wsr->page_age, page_age);
> +	mutex_unlock(&wsr->page_age_lock);
> +	kfree(old);
> +	kfree(buf);
> +	return len;
> +failed:
> +	kfree(page_age);
> +	kfree(buf);
> +
> +	return err;
> +}
> +
> +static struct kobj_attribute page_age_intervals_attr =
> +	__ATTR_RW(page_age_intervals);
> +
> +static ssize_t page_age_show(struct kobject *kobj, struct kobj_attribute *attr,
> +			     char *buf)
> +{
> +	struct wsr_report_bin *bin;
> +	int ret = 0;
> +	struct wsr_state *wsr = kobj_to_wsr(kobj);
> +
> +	if (!READ_ONCE(wsr->page_age))
> +		return -EINVAL;
> +
> +	wsr_refresh_report(wsr, NULL, kobj_to_pgdat(kobj));
> +
> +	mutex_lock(&wsr->page_age_lock);
> +	if (!wsr->page_age) {
> +		ret = -EINVAL;
> +		goto unlock;
> +	}
> +
> +	for (bin = wsr->page_age->bins.bins;
> +	     bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++)
> +		ret += sysfs_emit_at(buf, ret, "%u anon=%lu file=%lu\n",
> +				     jiffies_to_msecs(bin->idle_age),
> +				     bin->nr_pages[0] * PAGE_SIZE,
> +				     bin->nr_pages[1] * PAGE_SIZE);
> +
> +	ret += sysfs_emit_at(buf, ret, "%lu anon=%lu file=%lu\n",
> +			     WORKINGSET_INTERVAL_MAX,
> +			     bin->nr_pages[0] * PAGE_SIZE,
> +			     bin->nr_pages[1] * PAGE_SIZE);
> +
> +unlock:
> +	mutex_unlock(&wsr->page_age_lock);
> +	return ret;
> +}
> +
> +static struct kobj_attribute page_age_attr = __ATTR_RO(page_age);
> +
> +static struct attribute *workingset_report_attrs[] = {
> +	&page_age_intervals_attr.attr, &page_age_attr.attr, NULL
> +};
> +
> +static const struct attribute_group workingset_report_attr_group = {
> +	.name = "workingset_report",
> +	.attrs = workingset_report_attrs,
> +};
> +
> +void wsr_register_node(struct node *node)
> +{
> +	struct kobject *kobj = node ? &node->dev.kobj : mm_kobj;
> +	struct wsr_state *wsr;
> +
> +	if (IS_ENABLED(CONFIG_NUMA) && !node)
> +		return;
> +
> +	wsr = kobj_to_wsr(kobj);
> +
> +	if (sysfs_create_group(kobj, &workingset_report_attr_group)) {
> +		pr_warn("WSR failed to created group");
> +		return;
> +	}
> +}
> +EXPORT_SYMBOL_GPL(wsr_register_node);
> +
> +void wsr_unregister_node(struct node *node)
> +{
> +	struct kobject *kobj = &node->dev.kobj;
> +	struct wsr_state *wsr;
> +
> +	if (IS_ENABLED(CONFIG_NUMA) && !node)
> +		return;
> +
> +	wsr = kobj_to_wsr(kobj);
> +	sysfs_remove_group(kobj, &workingset_report_attr_group);
> +	wsr_destroy(mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj)));
> +}
> +EXPORT_SYMBOL_GPL(wsr_unregister_node);

--
Best Regards,
Huang, Ying
diff mbox series

Patch

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 1c05640461dd..4f589b8253f4 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -20,6 +20,7 @@ 
 #include <linux/pm_runtime.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <linux/workingset_report.h>
 
 static const struct bus_type node_subsys = {
 	.name = "node",
@@ -625,6 +626,7 @@  static int register_node(struct node *node, int num)
 	} else {
 		hugetlb_register_node(node);
 		compaction_register_node(node);
+		wsr_register_node(node);
 	}
 
 	return error;
@@ -641,6 +643,7 @@  void unregister_node(struct node *node)
 {
 	hugetlb_unregister_node(node);
 	compaction_unregister_node(node);
+	wsr_unregister_node(node);
 	node_remove_accesses(node);
 	node_remove_caches(node);
 	device_unregister(&node->dev);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a497f189d988..8839931646ee 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -24,6 +24,7 @@ 
 #include <linux/local_lock.h>
 #include <linux/zswap.h>
 #include <asm/page.h>
+#include <linux/workingset_report.h>
 
 /* Free memory management - zoned buddy allocator.  */
 #ifndef CONFIG_ARCH_FORCE_MAX_ORDER
@@ -625,6 +626,9 @@  struct lruvec {
 	struct lru_gen_mm_state		mm_state;
 #endif
 #endif /* CONFIG_LRU_GEN */
+#ifdef CONFIG_WORKINGSET_REPORT
+	struct wsr_state	wsr;
+#endif /* CONFIG_WORKINGSET_REPORT */
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
diff --git a/include/linux/workingset_report.h b/include/linux/workingset_report.h
new file mode 100644
index 000000000000..0de640cb1ef0
--- /dev/null
+++ b/include/linux/workingset_report.h
@@ -0,0 +1,69 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_WORKINGSET_REPORT_H
+#define _LINUX_WORKINGSET_REPORT_H
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+
+struct mem_cgroup;
+struct pglist_data;
+struct node;
+struct lruvec;
+
+#ifdef CONFIG_WORKINGSET_REPORT
+
+#define WORKINGSET_REPORT_MIN_NR_BINS 2
+#define WORKINGSET_REPORT_MAX_NR_BINS 32
+
+#define WORKINGSET_INTERVAL_MAX ((unsigned long)-1)
+#define ANON_AND_FILE 2
+
+struct wsr_report_bin {
+	unsigned long idle_age;
+	unsigned long nr_pages[ANON_AND_FILE];
+};
+
+struct wsr_report_bins {
+	unsigned long nr_bins;
+	/* last bin contains WORKINGSET_INTERVAL_MAX */
+	struct wsr_report_bin bins[WORKINGSET_REPORT_MAX_NR_BINS];
+};
+
+struct wsr_page_age_histo {
+	unsigned long timestamp;
+	struct wsr_report_bins bins;
+};
+
+struct wsr_state {
+	/* breakdown of workingset by page age */
+	struct mutex page_age_lock;
+	struct wsr_page_age_histo *page_age;
+};
+
+void wsr_init(struct lruvec *lruvec);
+void wsr_destroy(struct lruvec *lruvec);
+
+/*
+ * Returns true if the wsr is configured to be refreshed.
+ * The next refresh time is stored in refresh_time.
+ */
+bool wsr_refresh_report(struct wsr_state *wsr, struct mem_cgroup *root,
+			struct pglist_data *pgdat);
+void wsr_register_node(struct node *node);
+void wsr_unregister_node(struct node *node);
+#else
+static inline void wsr_init(struct lruvec *lruvec)
+{
+}
+static inline void wsr_destroy(struct lruvec *lruvec)
+{
+}
+static inline void wsr_register_node(struct node *node)
+{
+}
+static inline void wsr_unregister_node(struct node *node)
+{
+}
+#endif /* CONFIG_WORKINGSET_REPORT */
+
+#endif /* _LINUX_WORKINGSET_REPORT_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index ffc3a2ba3a8c..212f203b10b9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1261,6 +1261,15 @@  config LOCK_MM_AND_FIND_VMA
 config IOMMU_MM_DATA
 	bool
 
+config WORKINGSET_REPORT
+	bool "Working set reporting"
+	depends on LRU_GEN && SYSFS
+	help
+	  Report system and per-memcg working set to userspace.
+
+	  This option exports stats and events giving the user more insight
+	  into its memory working set.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index e4b5b75aaec9..57093657030d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -92,6 +92,7 @@  obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_WORKINGSET_REPORT) += workingset_report.o
 ifdef CONFIG_SWAP
 obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif
diff --git a/mm/internal.h b/mm/internal.h
index f309a010d50f..5e0caba64ee4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -198,12 +198,21 @@  extern unsigned long highest_memmap_pfn;
 /*
  * in mm/vmscan.c:
  */
+struct scan_control;
 bool isolate_lru_page(struct page *page);
 bool folio_isolate_lru(struct folio *folio);
 void putback_lru_page(struct page *page);
 void folio_putback_lru(struct folio *folio);
 extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
 
+#ifdef CONFIG_WORKINGSET_REPORT
+/*
+ * in mm/wsr.c
+ */
+/* Requires wsr->page_age_lock held */
+void wsr_refresh_scan(struct lruvec *lruvec);
+#endif
+
 /*
  * in mm/rmap.c:
  */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1ed40f9d3a27..2f07141de16c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -65,6 +65,7 @@ 
 #include <linux/seq_buf.h>
 #include <linux/sched/isolation.h>
 #include <linux/kmemleak.h>
+#include <linux/workingset_report.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -5457,6 +5458,7 @@  static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return;
 
+	wsr_destroy(&pn->lruvec);
 	free_percpu(pn->lruvec_stats_percpu);
 	kfree(pn);
 }
diff --git a/mm/mmzone.c b/mm/mmzone.c
index c01896eca736..efca44c1b84b 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -90,6 +90,8 @@  void lruvec_init(struct lruvec *lruvec)
 	 */
 	list_del(&lruvec->lists[LRU_UNEVICTABLE]);
 
+	wsr_init(lruvec);
+
 	lru_gen_init_lruvec(lruvec);
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1a7c7d537db6..b694d80ab2d1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,7 @@ 
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/workingset_report.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3815,7 +3816,7 @@  static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	return success;
 }
 
-static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 			       struct scan_control *sc, bool can_swap, bool force_scan)
 {
 	bool success;
@@ -5606,6 +5607,8 @@  static int __init init_lru_gen(void)
 	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
 		pr_err("lru_gen: failed to create sysfs group\n");
 
+	wsr_register_node(NULL);
+
 	debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
 	debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
 
@@ -5613,6 +5616,35 @@  static int __init init_lru_gen(void)
 };
 late_initcall(init_lru_gen);
 
+/******************************************************************************
+ *                          workingset reporting
+ ******************************************************************************/
+#ifdef CONFIG_WORKINGSET_REPORT
+void wsr_refresh_scan(struct lruvec *lruvec)
+{
+	DEFINE_MAX_SEQ(lruvec);
+	struct scan_control sc = {
+		.may_writepage = true,
+		.may_unmap = true,
+		.may_swap = true,
+		.proactive = true,
+		.reclaim_idx = MAX_NR_ZONES - 1,
+		.gfp_mask = GFP_KERNEL,
+	};
+	unsigned int flags;
+
+	set_task_reclaim_state(current, &sc.reclaim_state);
+	flags = memalloc_noreclaim_save();
+	/*
+	 * setting can_swap=true and force_scan=true ensures
+	 * proper workingset stats when the system cannot swap.
+	 */
+	try_to_inc_max_seq(lruvec, max_seq, &sc, true, true);
+	memalloc_noreclaim_restore(flags);
+	set_task_reclaim_state(current, NULL);
+}
+#endif /* CONFIG_WORKINGSET_REPORT */
+
 #else /* !CONFIG_LRU_GEN */
 
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
diff --git a/mm/workingset_report.c b/mm/workingset_report.c
new file mode 100644
index 000000000000..98cdaffcb6b4
--- /dev/null
+++ b/mm/workingset_report.c
@@ -0,0 +1,413 @@ 
+// SPDX-License-Identifier: GPL-2.0
+//
+#include <linux/export.h>
+#include <linux/lockdep.h>
+#include <linux/jiffies.h>
+#include <linux/kernfs.h>
+#include <linux/memcontrol.h>
+#include <linux/rcupdate.h>
+#include <linux/mutex.h>
+#include <linux/err.h>
+#include <linux/atomic.h>
+#include <linux/node.h>
+#include <linux/mmzone.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/workingset_report.h>
+
+#include "internal.h"
+
+void wsr_init(struct lruvec *lruvec)
+{
+	struct wsr_state *wsr = &lruvec->wsr;
+
+	memset(wsr, 0, sizeof(*wsr));
+	mutex_init(&wsr->page_age_lock);
+}
+
+void wsr_destroy(struct lruvec *lruvec)
+{
+	struct wsr_state *wsr = &lruvec->wsr;
+
+	mutex_destroy(&wsr->page_age_lock);
+	kfree(wsr->page_age);
+	memset(wsr, 0, sizeof(*wsr));
+}
+
+static int workingset_report_intervals_parse(char *src,
+				      struct wsr_report_bins *bins)
+{
+	int err = 0, i = 0;
+	char *cur, *next = strim(src);
+
+	if (*next == '\0')
+		return 0;
+
+	while ((cur = strsep(&next, ","))) {
+		unsigned int interval;
+
+		err = kstrtouint(cur, 0, &interval);
+		if (err)
+			goto out;
+
+		bins->bins[i].idle_age = msecs_to_jiffies(interval);
+		if (i > 0 && bins->bins[i].idle_age <= bins->bins[i - 1].idle_age) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (++i == WORKINGSET_REPORT_MAX_NR_BINS) {
+			err = -ERANGE;
+			goto out;
+		}
+	}
+
+	if (i && i < WORKINGSET_REPORT_MIN_NR_BINS - 1) {
+		err = -ERANGE;
+		goto out;
+	}
+
+	bins->nr_bins = i;
+	bins->bins[i].idle_age = WORKINGSET_INTERVAL_MAX;
+out:
+	return err ?: i;
+}
+
+static unsigned long get_gen_start_time(const struct lru_gen_folio *lrugen,
+					unsigned long seq,
+					unsigned long max_seq,
+					unsigned long curr_timestamp)
+{
+	int younger_gen;
+
+	if (seq == max_seq)
+		return curr_timestamp;
+	younger_gen = lru_gen_from_seq(seq + 1);
+	return READ_ONCE(lrugen->timestamps[younger_gen]);
+}
+
+static void collect_page_age_type(const struct lru_gen_folio *lrugen,
+				  struct wsr_report_bin *bin,
+				  unsigned long max_seq, unsigned long min_seq,
+				  unsigned long curr_timestamp, int type)
+{
+	unsigned long seq;
+
+	for (seq = max_seq; seq + 1 > min_seq; seq--) {
+		int gen, zone;
+		unsigned long gen_end, gen_start, size = 0;
+
+		gen = lru_gen_from_seq(seq);
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++)
+			size += max(
+				READ_ONCE(lrugen->nr_pages[gen][type][zone]),
+				0L);
+
+		gen_start = get_gen_start_time(lrugen, seq, max_seq,
+					       curr_timestamp);
+		gen_end = READ_ONCE(lrugen->timestamps[gen]);
+
+		while (bin->idle_age != WORKINGSET_INTERVAL_MAX &&
+		       time_before(gen_end + bin->idle_age, curr_timestamp)) {
+			unsigned long gen_in_bin = (long)gen_start -
+						   (long)curr_timestamp +
+						   (long)bin->idle_age;
+			unsigned long gen_len = (long)gen_start - (long)gen_end;
+
+			if (!gen_len)
+				break;
+			if (gen_in_bin) {
+				unsigned long split_bin =
+					size / gen_len * gen_in_bin;
+
+				bin->nr_pages[type] += split_bin;
+				size -= split_bin;
+			}
+			gen_start = curr_timestamp - bin->idle_age;
+			bin++;
+		}
+		bin->nr_pages[type] += size;
+	}
+}
+
+/*
+ * proportionally aggregate Multi-gen LRU bins into a working set report
+ * MGLRU generations:
+ * current time
+ * |         max_seq timestamp
+ * |         |     max_seq - 1 timestamp
+ * |         |     |               unbounded
+ * |         |     |               |
+ * --------------------------------
+ * | max_seq | ... | ... | min_seq
+ * --------------------------------
+ *
+ * Bins:
+ *
+ * current time
+ * |       current - idle_age[0]
+ * |       |     current - idle_age[1]
+ * |       |     |               unbounded
+ * |       |     |               |
+ * ------------------------------
+ * | bin 0 | ... | ... | bin n-1
+ * ------------------------------
+ *
+ * Assume the heuristic that pages are in the MGLRU generation
+ * through uniform accesses, so we can aggregate them
+ * proportionally into bins.
+ */
+static void collect_page_age(struct wsr_page_age_histo *page_age,
+			     const struct lruvec *lruvec)
+{
+	int type;
+	const struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	unsigned long curr_timestamp = jiffies;
+	unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq);
+	unsigned long min_seq[ANON_AND_FILE] = {
+		READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_ANON]),
+		READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_FILE]),
+	};
+	struct wsr_report_bins *bins = &page_age->bins;
+
+	for (type = 0; type < ANON_AND_FILE; type++) {
+		struct wsr_report_bin *bin = &bins->bins[0];
+
+		collect_page_age_type(lrugen, bin, max_seq, min_seq[type],
+				      curr_timestamp, type);
+	}
+}
+
+/* First step: hierarchically scan child memcgs. */
+static void refresh_scan(struct wsr_state *wsr, struct mem_cgroup *root,
+			 struct pglist_data *pgdat)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = mem_cgroup_iter(root, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+		wsr_refresh_scan(lruvec);
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+}
+
+/* Second step: aggregate child memcgs into the page age histogram. */
+static void refresh_aggregate(struct wsr_page_age_histo *page_age,
+			      struct mem_cgroup *root,
+			      struct pglist_data *pgdat)
+{
+	struct mem_cgroup *memcg;
+	struct wsr_report_bin *bin;
+
+	/*
+	 * page_age_intervals should free the page_age struct
+	 * if no intervals are provided.
+	 */
+	VM_WARN_ON_ONCE(page_age->bins.bins[0].idle_age ==
+			WORKINGSET_INTERVAL_MAX);
+
+	for (bin = page_age->bins.bins;
+	     bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++) {
+		bin->nr_pages[0] = 0;
+		bin->nr_pages[1] = 0;
+	}
+	/* the last used bin has idle_age == WORKINGSET_INTERVAL_MAX. */
+	bin->nr_pages[0] = 0;
+	bin->nr_pages[1] = 0;
+
+	memcg = mem_cgroup_iter(root, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+		collect_page_age(page_age, lruvec);
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+	WRITE_ONCE(page_age->timestamp, jiffies);
+}
+
+bool wsr_refresh_report(struct wsr_state *wsr, struct mem_cgroup *root,
+			struct pglist_data *pgdat)
+{
+	struct wsr_page_age_histo *page_age;
+
+	if (!READ_ONCE(wsr->page_age))
+		return false;
+
+	refresh_scan(wsr, root, pgdat);
+	mutex_lock(&wsr->page_age_lock);
+	page_age = READ_ONCE(wsr->page_age);
+	if (page_age)
+		refresh_aggregate(page_age, root, pgdat);
+	mutex_unlock(&wsr->page_age_lock);
+	return !!page_age;
+}
+EXPORT_SYMBOL_GPL(wsr_refresh_report);
+
+static struct pglist_data *kobj_to_pgdat(struct kobject *kobj)
+{
+	int nid = IS_ENABLED(CONFIG_NUMA) ? kobj_to_dev(kobj)->id :
+					    first_memory_node;
+
+	return NODE_DATA(nid);
+}
+
+static struct wsr_state *kobj_to_wsr(struct kobject *kobj)
+{
+	return &mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj))->wsr;
+}
+
+static ssize_t page_age_intervals_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *buf)
+{
+	int len = 0;
+	struct wsr_state *wsr = kobj_to_wsr(kobj);
+
+	mutex_lock(&wsr->page_age_lock);
+
+	if (!!wsr->page_age) {
+		int i;
+		int nr_bins = wsr->page_age->bins.nr_bins;
+
+		for (i = 0; i < nr_bins; ++i) {
+			struct wsr_report_bin *bin =
+				&wsr->page_age->bins.bins[i];
+
+			len += sysfs_emit_at(buf, len, "%u",
+					     jiffies_to_msecs(bin->idle_age));
+			if (i + 1 < nr_bins)
+				len += sysfs_emit_at(buf, len, ",");
+		}
+	}
+	len += sysfs_emit_at(buf, len, "\n");
+
+	mutex_unlock(&wsr->page_age_lock);
+	return len;
+}
+
+static ssize_t page_age_intervals_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *src, size_t len)
+{
+	struct wsr_page_age_histo *page_age = NULL, *old;
+	char *buf = NULL;
+	int err = 0;
+	struct wsr_state *wsr = kobj_to_wsr(kobj);
+
+	buf = kstrdup(src, GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	page_age =
+		kzalloc(sizeof(struct wsr_page_age_histo), GFP_KERNEL_ACCOUNT);
+
+	if (!page_age) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	err = workingset_report_intervals_parse(buf, &page_age->bins);
+	if (err < 0)
+		goto failed;
+
+	if (err == 0) {
+		kfree(page_age);
+		page_age = NULL;
+	}
+
+	mutex_lock(&wsr->page_age_lock);
+	old = xchg(&wsr->page_age, page_age);
+	mutex_unlock(&wsr->page_age_lock);
+	kfree(old);
+	kfree(buf);
+	return len;
+failed:
+	kfree(page_age);
+	kfree(buf);
+
+	return err;
+}
+
+static struct kobj_attribute page_age_intervals_attr =
+	__ATTR_RW(page_age_intervals);
+
+static ssize_t page_age_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	struct wsr_report_bin *bin;
+	int ret = 0;
+	struct wsr_state *wsr = kobj_to_wsr(kobj);
+
+	if (!READ_ONCE(wsr->page_age))
+		return -EINVAL;
+
+	wsr_refresh_report(wsr, NULL, kobj_to_pgdat(kobj));
+
+	mutex_lock(&wsr->page_age_lock);
+	if (!wsr->page_age) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	for (bin = wsr->page_age->bins.bins;
+	     bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++)
+		ret += sysfs_emit_at(buf, ret, "%u anon=%lu file=%lu\n",
+				     jiffies_to_msecs(bin->idle_age),
+				     bin->nr_pages[0] * PAGE_SIZE,
+				     bin->nr_pages[1] * PAGE_SIZE);
+
+	ret += sysfs_emit_at(buf, ret, "%lu anon=%lu file=%lu\n",
+			     WORKINGSET_INTERVAL_MAX,
+			     bin->nr_pages[0] * PAGE_SIZE,
+			     bin->nr_pages[1] * PAGE_SIZE);
+
+unlock:
+	mutex_unlock(&wsr->page_age_lock);
+	return ret;
+}
+
+static struct kobj_attribute page_age_attr = __ATTR_RO(page_age);
+
+static struct attribute *workingset_report_attrs[] = {
+	&page_age_intervals_attr.attr, &page_age_attr.attr, NULL
+};
+
+static const struct attribute_group workingset_report_attr_group = {
+	.name = "workingset_report",
+	.attrs = workingset_report_attrs,
+};
+
+void wsr_register_node(struct node *node)
+{
+	struct kobject *kobj = node ? &node->dev.kobj : mm_kobj;
+	struct wsr_state *wsr;
+
+	if (IS_ENABLED(CONFIG_NUMA) && !node)
+		return;
+
+	wsr = kobj_to_wsr(kobj);
+
+	if (sysfs_create_group(kobj, &workingset_report_attr_group)) {
+		pr_warn("WSR failed to created group");
+		return;
+	}
+}
+EXPORT_SYMBOL_GPL(wsr_register_node);
+
+void wsr_unregister_node(struct node *node)
+{
+	struct kobject *kobj = &node->dev.kobj;
+	struct wsr_state *wsr;
+
+	if (IS_ENABLED(CONFIG_NUMA) && !node)
+		return;
+
+	wsr = kobj_to_wsr(kobj);
+	sysfs_remove_group(kobj, &workingset_report_attr_group);
+	wsr_destroy(mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj)));
+}
+EXPORT_SYMBOL_GPL(wsr_unregister_node);