
[RFC,1/2] psi: introduce memory.pressure.stat

Message ID 20220801004205.1593100-1-ran.xiaokai@zte.com.cn (mailing list archive)
State New
Series [RFC,1/2] psi: introduce memory.pressure.stat

Commit Message

CGEL Aug. 1, 2022, 12:42 a.m. UTC
From: cgel <cgel@zte.com.cn>

Currently, psi memory pressure accounts for all memory stalls in the
system, but does not provide detailed information on why a stall
happens. This patch introduces a cgroup knob, memory.pressure.stat,
which reports detailed stall information for each memory stall cause,
together with a corresponding proc interface.

For a cgroup, add memory.pressure.stat, which shows:
kswapd: avg10=0.00 avg60=0.00 avg300=0.00 total=0
direct reclaim: avg10=0.00 avg60=0.00 avg300=0.12 total=42356
kcompacted: avg10=0.00 avg60=0.00 avg300=0.00 total=0
direct compact: avg10=0.00 avg60=0.00 avg300=0.00 total=0
cgroup reclaim: avg10=0.00 avg60=0.00 avg300=0.00 total=0
workingset thrashing:   avg10=0.00 avg60=0.00 avg300=0.00 total=0

System wide, a proc file is introduced as pressure/memory_stat, with
the same format as the cgroup interface.

With this detailed information, if the system is stalled because of
kcompacted for example, compaction_proactiveness can be raised so that
proactive compaction kicks in earlier.

Signed-off-by: cgel <cgel@zte.com.cn>
---
 include/linux/psi.h       |   7 +--
 include/linux/psi_types.h |  34 +++++++++++++
 kernel/cgroup/cgroup.c    |  11 ++++
 kernel/sched/psi.c        | 126 +++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 168 insertions(+), 10 deletions(-)

Comments

Jonathan Cameron Aug. 1, 2022, 4:16 p.m. UTC | #1
On Mon,  1 Aug 2022 00:42:04 +0000
cgel.zte@gmail.com wrote:

> From: cgel <cgel@zte.com.cn>
> 
Trivial things noticed in passing (not a full review!)

Please spell check patch descriptions.

> For now psi memory pressure account for all the mem stall in the
> system, And didnot provide a detailed information why the stall
did not
> happens. This patch introduce a cgroupu knob memory.pressure.stat,
> it tells the detailed stall information of all memory events and it
> format and the corresponding proc interface.
> 
> for the cgroup, add memory.pressure.stat and it shows:
> kswapd: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> direct reclaim: avg10=0.00 avg60=0.00 avg300=0.12 total=42356
> kcompacted: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> direct compact: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> cgroup reclaim: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> workingset thrashing:   avg10=0.00 avg60=0.00 avg300=0.00 total=0
> 
> for the system wide, a proc file introduced as pressure/memory_stat
> and the format is the same as the cgroup interface.
> 
> With this detaled information, for example, if the system is stalled

detailed

> because of kcompacted, compaction_proactiveness can be promoted so
> pro-compaction can be involved earlier.
> 
> Signed-off-by: cgel <cgel@zte.com.cn>
> ---
>  include/linux/psi.h       |   7 +--
>  include/linux/psi_types.h |  34 +++++++++++++
>  kernel/cgroup/cgroup.c    |  11 ++++
>  kernel/sched/psi.c        | 126 +++++++++++++++++++++++++++++++++++++++++++---
>  4 files changed, 168 insertions(+), 10 deletions(-)
> 
...

> diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> index 07aaf9b..194ea78 100644
> --- a/include/linux/psi_types.h
> +++ b/include/linux/psi_types.h
> @@ -9,6 +9,8 @@
>  
>  #ifdef CONFIG_PSI
>  
> +#define PSI_MASK(x)	((1UL << (x))-1)
> +
>  /* Tracked task states */
>  enum psi_task_count {
>  	NR_IOWAIT,
> @@ -22,6 +24,10 @@ enum psi_task_count {
>  #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
>  #define TSK_RUNNING	(1 << NR_RUNNING)
>  
> +#define TSK_COUNT_MASK	PSI_MASK(NR_PSI_TASK_COUNTS)
> +#define TSK_COUNT_SHIFT	8
> +
> +
One blank line is probably enough.

>  /* Resources that workloads could be stalled on */
>  enum psi_res {
>  	PSI_IO,
> @@ -53,6 +59,27 @@ enum psi_aggregators {
>  	NR_PSI_AGGREGATORS,
>  };


> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 806fc9d..b50ab92 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -3613,6 +3613,13 @@ static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
>  
>  	return psi_show(seq, psi, PSI_MEM);
>  }

Almost certainly want blank lines around this.  I'm not sure what the reasoning
behind not having it for the other functions in this block is though...

> +static int cgroup_memory_pressure_stat_show(struct seq_file *seq, void *v)
> +{
> +	struct cgroup *cgroup = seq_css(seq)->cgroup;
> +	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
> +
> +	return psi_mem_pressure_stat_show(seq, psi);
> +}
>  static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
>  {
>  	struct cgroup *cgroup = seq_css(seq)->cgroup;
> @@ -4930,6 +4937,10 @@ static struct cftype cgroup_base_files[] = {
>  		.poll = cgroup_pressure_poll,
>  		.release = cgroup_pressure_release,
>  	},
> +	{
> +		.name = "memory.pressure.stat",
> +		.seq_show = cgroup_memory_pressure_stat_show,
> +	},
>  	{
>  		.name = "cpu.pressure",
>  		.seq_show = cgroup_cpu_pressure_show,
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 9154e74..072d535 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -279,6 +279,35 @@ static void get_recent_times(struct psi_group *group, int cpu,
..

>  	/*
> @@ -714,7 +770,7 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
>  			state_mask |= (1 << s);
>  	}
>  	groupc->state_mask = state_mask;
> -

Probably better to leave the white space unless there is a strong reason to drop
the blank line.

> +	groupc->state_memstall = state_memstall;
>  	write_seqcount_end(&groupc->seq);
>  
>  	return state_mask;
..

>  static int psi_io_show(struct seq_file *m, void *v)
>  {
>  	return psi_show(m, &psi_system, PSI_IO);
> @@ -998,7 +1101,10 @@ static int psi_memory_open(struct inode *inode, struct file *file)
>  {
>  	return single_open(file, psi_memory_show, NULL);
>  }
> -

Fix this.  It's noise in this patch and the white space is probably desirable anyway.

> +static int psi_memory_stat_open(struct inode *inode, struct file *file)
> +{
> +	return single_open(file, psi_mem_pressure_stat_show, NULL);
> +}
>  static int psi_cpu_open(struct inode *inode, struct file *file)
>  {
>  	return single_open(file, psi_cpu_show, NULL);
> @@ -1271,7 +1377,12 @@ static const struct file_operations psi_memory_fops = {
>  	.poll           = psi_fop_poll,
>  	.release        = psi_fop_release,
>  };
> -

And this

> +static const struct file_operations psi_memory_stat_fops = {
> +	.open           = psi_memory_stat_open,
> +	.read           = seq_read,
> +	.llseek         = seq_lseek,
> +	.release        = psi_fop_release,
> +};
>  static const struct file_operations psi_cpu_fops = {
>  	.open           = psi_cpu_open,
>  	.read           = seq_read,
> @@ -1286,6 +1397,7 @@ static int __init psi_proc_init(void)
>  	proc_mkdir("pressure", NULL);
>  	proc_create("pressure/io", 0, NULL, &psi_io_fops);
>  	proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
> +	proc_create("pressure/memory_stat", 0, NULL, &psi_memory_stat_fops);
>  	proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
>  	return 0;
>  }
Johannes Weiner Aug. 3, 2022, 1:55 p.m. UTC | #2
On Mon, Aug 01, 2022 at 12:42:04AM +0000, cgel.zte@gmail.com wrote:
> From: cgel <cgel@zte.com.cn>
> 
> For now psi memory pressure account for all the mem stall in the
> system, And didnot provide a detailed information why the stall
> happens. This patch introduce a cgroupu knob memory.pressure.stat,
> it tells the detailed stall information of all memory events and it
> format and the corresponding proc interface.
> 
> for the cgroup, add memory.pressure.stat and it shows:
> kswapd: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> direct reclaim: avg10=0.00 avg60=0.00 avg300=0.12 total=42356
> kcompacted: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> direct compact: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> cgroup reclaim: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> workingset thrashing:   avg10=0.00 avg60=0.00 avg300=0.00 total=0
> 
> for the system wide, a proc file introduced as pressure/memory_stat
> and the format is the same as the cgroup interface.
> 
> With this detaled information, for example, if the system is stalled
> because of kcompacted, compaction_proactiveness can be promoted so
> pro-compaction can be involved earlier.
> 
> Signed-off-by: cgel <cgel@zte.com.cn>

> @@ -64,9 +91,11 @@ struct psi_group_cpu {
>  
>  	/* Aggregate pressure state derived from the tasks */
>  	u32 state_mask;
> +	u32 state_memstall;
>  
>  	/* Period time sampling buckets for each state of interest (ns) */
>  	u32 times[NR_PSI_STATES];
> +	u32 times_mem[PSI_MEM_STATES];

This doubles the psi cache footprint on every context switch, wakeup,
sleep, etc. in the scheduler. You're also adding more branches to
those same paths. It'll measurably affect everybody who is using psi.

Yet, in the years of using psi in production myself, I've never felt
the need for what this patch provides. There are event counters for
everything that contributes to pressure, and it's never been hard to
rootcause spikes. There are also things like bpftrace that let you
identify who is stalling for how long in order to do one-off tuning
and systems introspection.
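
As a minimal sketch of that event-counter approach (assuming only
long-standing /proc/vmstat fields such as pgscan_kswapd, pgscan_direct,
compact_stall and pswpin; the field list and sampling interval are
purely illustrative), a userspace monitor could look roughly like this:

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Return the current value of one /proc/vmstat counter, 0 if absent. */
static unsigned long long vmstat_read(const char *name)
{
	char key[64];
	unsigned long long val, ret = 0;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 0;
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (!strcmp(key, name)) {
			ret = val;
			break;
		}
	}
	fclose(f);
	return ret;
}

int main(void)
{
	static const char *counters[] = {
		"pgscan_kswapd", "pgscan_direct", "compact_stall", "pswpin",
	};
	unsigned long long prev[4] = { 0 };

	for (;;) {
		for (int i = 0; i < 4; i++) {
			unsigned long long cur = vmstat_read(counters[i]);

			/* Delta since the last sample attributes activity by source. */
			printf("%-16s +%llu\n", counters[i], cur - prev[i]);
			prev[i] = cur;
		}
		printf("\n");
		sleep(10);
	}
	return 0;
}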

For this to get merged, it needs a better explanation of the usecase
that requires this information to be broadly available all the time.
And it needs to bring down the impact on everybody else who doesn't
want this - either by reducing the footprint or by making the feature
optional.
CGEL Aug. 17, 2022, 2:59 a.m. UTC | #3
On Wed, Aug 03, 2022 at 09:55:39AM -0400, Johannes Weiner wrote:
> On Mon, Aug 01, 2022 at 12:42:04AM +0000, cgel.zte@gmail.com wrote:
> > From: cgel <cgel@zte.com.cn>
> > 
> > For now psi memory pressure account for all the mem stall in the
> > system, And didnot provide a detailed information why the stall
> > happens. This patch introduce a cgroupu knob memory.pressure.stat,
> > it tells the detailed stall information of all memory events and it
> > format and the corresponding proc interface.
> > 
> > for the cgroup, add memory.pressure.stat and it shows:
> > kswapd: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> > direct reclaim: avg10=0.00 avg60=0.00 avg300=0.12 total=42356
> > kcompacted: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> > direct compact: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> > cgroup reclaim: avg10=0.00 avg60=0.00 avg300=0.00 total=0
> > workingset thrashing:   avg10=0.00 avg60=0.00 avg300=0.00 total=0
> > 
> > for the system wide, a proc file introduced as pressure/memory_stat
> > and the format is the same as the cgroup interface.
> > 
> > With this detaled information, for example, if the system is stalled
> > because of kcompacted, compaction_proactiveness can be promoted so
> > pro-compaction can be involved earlier.
> > 
> > Signed-off-by: cgel <cgel@zte.com.cn>
> 
> > @@ -64,9 +91,11 @@ struct psi_group_cpu {
> >  
> >  	/* Aggregate pressure state derived from the tasks */
> >  	u32 state_mask;
> > +	u32 state_memstall;
> >  
> >  	/* Period time sampling buckets for each state of interest (ns) */
> >  	u32 times[NR_PSI_STATES];
> > +	u32 times_mem[PSI_MEM_STATES];
> 
> This doubles the psi cache footprint on every context switch, wakeup,
> sleep, etc. in the scheduler. You're also adding more branches to
> those same paths. It'll measurably affect everybody who is using psi.
> 
> Yet, in the years of using psi in production myself, I've never felt
> the need for what this patch provides. There are event counters for
> everything that contributes to pressure, and it's never been hard to
> rootcause spikes. There are also things like bpftrace that let you
> identify who is stalling for how long in order to do one-off tuning
> and systems introspection.
> 
We think this patch is not for root-causing spikes; it is meant for automatic
memory optimization alongside oomd, especially for sysctl tuning. For example,
if we see a lot of direct reclaim pressure, the automatic optimization program
might turn up watermark_scale_factor.
The basic idea is that this patch gives the user a simple interface to see what
kind of memory pressure the system is suffering from, and to tune the system at
a finer grain. It can provide data for the user to adjust watermark_boost_factor,
extfrag_threshold, compaction_proactiveness, transparent_hugepage/defrag,
swappiness, vfs_cache_pressure, and madvise(), which was not easy to do
before.
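
To make the intended consumer a bit more concrete, a minimal sketch of
such an optimization program could look like the following (the
/proc/pressure/memory_stat path and the "direct reclaim" label come from
this RFC; the avg10 threshold of 5.0 and the watermark_scale_factor
value of 200 are purely illustrative):

#include <stdio.h>
#include <string.h>

/* Illustrative threshold; a real policy would be workload specific. */
#define DRECLAIM_AVG10_THRESHOLD	5.0

/* Parse avg10 from the "direct reclaim" line of the proposed interface. */
static double read_dreclaim_avg10(void)
{
	char line[256];
	double avg10 = 0.0;
	FILE *f = fopen("/proc/pressure/memory_stat", "r");

	if (!f)
		return 0.0;
	while (fgets(line, sizeof(line), f)) {
		char *p;

		if (strncmp(line, "direct reclaim", 14))
			continue;
		p = strstr(line, "avg10=");
		if (p)
			sscanf(p, "avg10=%lf", &avg10);
		break;
	}
	fclose(f);
	return avg10;
}

static void bump_watermark_scale_factor(void)
{
	FILE *f = fopen("/proc/sys/vm/watermark_scale_factor", "w");

	if (!f)
		return;
	fprintf(f, "%d\n", 200);	/* illustrative value */
	fclose(f);
}

int main(void)
{
	/* Too much direct reclaim stall: make kswapd start reclaiming earlier. */
	if (read_dreclaim_avg10() > DRECLAIM_AVG10_THRESHOLD)
		bump_watermark_scale_factor();
	return 0;
}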

It's not easy for an automatic optimization program to use tools like bpftrace
or ftrace for this.

We could also use a CONFIG_PSI_XX option or a boot parameter to turn this
feature on/off, to avoid the additional footprint for users who do not need it.
CGEL Sept. 6, 2022, 11:40 a.m. UTC | #4
On Wed, Aug 17, 2022 at 02:59:48AM +0000, CGEL wrote:
> On Wed, Aug 03, 2022 at 09:55:39AM -0400, Johannes Weiner wrote:
> > On Mon, Aug 01, 2022 at 12:42:04AM +0000, cgel.zte@gmail.com wrote:
> > 
> > This doubles the psi cache footprint on every context switch, wakeup,
> > sleep, etc. in the scheduler. You're also adding more branches to
> > those same paths. It'll measurably affect everybody who is using psi.
> > 
> > Yet, in the years of using psi in production myself, I've never felt
> > the need for what this patch provides. There are event counters for
> > everything that contributes to pressure, and it's never been hard to
> > rootcause spikes. There are also things like bpftrace that let you
> > identify who is stalling for how long in order to do one-off tuning
> > and systems introspection.
> > 
> We think this patch is not for root-causing spikes; it is meant for automatic
> memory optimization alongside oomd, especially for sysctl tuning. For example,
> if we see a lot of direct reclaim pressure, the automatic optimization program
> might turn up watermark_scale_factor.
> The basic idea is that this patch gives the user a simple interface to see what
> kind of memory pressure the system is suffering from, and to tune the system at
> a finer grain. It can provide data for the user to adjust watermark_boost_factor,
> extfrag_threshold, compaction_proactiveness, transparent_hugepage/defrag,
> swappiness, vfs_cache_pressure, and madvise(), which was not easy to do
> before.
> 
> It's not easy for an automatic optimization program to use tools like bpftrace
> or ftrace for this.
> 
> We could also use a CONFIG_PSI_XX option or a boot parameter to turn this
> feature on/off, to avoid the additional footprint for users who do not need it.

Hi
   Do you think this is practicable?
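
For what it's worth, a minimal sketch of such a switch, mirroring the
existing psi= boot parameter pattern in kernel/sched/psi.c (the
psi_memstat parameter name and static key below are hypothetical, not
part of this series), could be:

/* In kernel/sched/psi.c: off by default, opted in on the command line. */
static bool psi_memstat_enable;
DEFINE_STATIC_KEY_FALSE(psi_memstat_enabled);

static int __init setup_psi_memstat(char *str)
{
	return kstrtobool(str, &psi_memstat_enable) == 0;
}
__setup("psi_memstat=", setup_psi_memstat);

/* Called from psi_init(): flip the key only when the user asked for it. */
	if (psi_memstat_enable)
		static_branch_enable(&psi_memstat_enabled);

/*
 * And in record_times(), the per-cause bucket is only touched when enabled,
 * so the default cost on the scheduler fast paths is a patched-out branch.
 */
	if (static_branch_unlikely(&psi_memstat_enabled))
		groupc->times_mem[state_memstall] += delta;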

Patch

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7b3de73..163da43 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -19,10 +19,11 @@  void psi_init(void);
 void psi_task_change(struct task_struct *task, int clear, int set);
 
 void psi_memstall_tick(struct task_struct *task, int cpu);
-void psi_memstall_enter(unsigned long *flags);
-void psi_memstall_leave(unsigned long *flags);
+void psi_memstall_enter(unsigned long *flags, int mem_state);
+void psi_memstall_leave(unsigned long *flags, int mem_state);
 
 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
+int psi_mem_pressure_stat_show(struct seq_file *m, void *v);
 
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgrp);
@@ -41,7 +42,7 @@  __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
 
 static inline void psi_init(void) {}
 
-static inline void psi_memstall_enter(unsigned long *flags) {}
+static inline void psi_memstall_enter(unsigned long *flags, int mem_state) {}
 static inline void psi_memstall_leave(unsigned long *flags) {}
 
 #ifdef CONFIG_CGROUPS
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 07aaf9b..194ea78 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -9,6 +9,8 @@ 
 
 #ifdef CONFIG_PSI
 
+#define PSI_MASK(x)	((1UL << (x))-1)
+
 /* Tracked task states */
 enum psi_task_count {
 	NR_IOWAIT,
@@ -22,6 +24,10 @@  enum psi_task_count {
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
 
+#define TSK_COUNT_MASK	PSI_MASK(NR_PSI_TASK_COUNTS)
+#define TSK_COUNT_SHIFT	8
+
+
 /* Resources that workloads could be stalled on */
 enum psi_res {
 	PSI_IO,
@@ -53,6 +59,27 @@  enum psi_aggregators {
 	NR_PSI_AGGREGATORS,
 };
 
+/* Causes for mem pressure */
+enum psi_memstall_states {
+	PSI_MEM_KSWAPD,
+	PSI_MEM_DRECALAIM,
+	PSI_MEM_KCOMPACTED,
+	PSI_MEM_DCOMPACT,
+	PSI_MEM_CGROUP,
+	PSI_MEM_SWAP,
+	PSI_MEM_WORKINGSET,
+	PSI_MEM_STATES,
+};
+
+#define TSK_MEMSTALL_SHIFT		8
+#define TSK_MEMSTALL_KSWAPD 	(1 << (PSI_MEM_KSWAPD + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_DRECLAIM 	(1 << (PSI_MEM_KCOMPACTED + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_KCOMPACTED 	(1 << (PSI_MEM_DCOMPACT + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_DCOMPACT 	(1 << (PSI_MEM_CGROUP + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_CGROUP 	(1 << (PSI_MEM_DRECALAIM + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_WORKINGSET 		(1 << (PSI_MEM_WORKINGSET + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_MASK	(PSI_MASK(TSK_MEMSTALL_SHIFT) << TSK_COUNT_SHIFT)
+
 struct psi_group_cpu {
 	/* 1st cacheline updated by the scheduler */
 
@@ -64,9 +91,11 @@  struct psi_group_cpu {
 
 	/* Aggregate pressure state derived from the tasks */
 	u32 state_mask;
+	u32 state_memstall;
 
 	/* Period time sampling buckets for each state of interest (ns) */
 	u32 times[NR_PSI_STATES];
+	u32 times_mem[PSI_MEM_STATES];
 
 	/* Time of last task change in this group (rq_clock) */
 	u64 state_start;
@@ -76,6 +105,7 @@  struct psi_group_cpu {
 	/* Delta detection against the sampling buckets */
 	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
 			____cacheline_aligned_in_smp;
+	u32 times_mem_prev[PSI_MEM_STATES];
 };
 
 /* PSI growth tracking window */
@@ -144,6 +174,10 @@  struct psi_group {
 	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
 	unsigned long avg[NR_PSI_STATES - 1][3];
 
+	u64 total_mems[PSI_MEM_STATES - 1];
+	unsigned long avg_mems[PSI_MEM_STATES - 1][3];
+	u64 avg_total_mems[PSI_MEM_STATES - 1];
+
 	/* Monitor work control */
 	atomic_t poll_scheduled;
 	struct kthread_worker __rcu *poll_kworker;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 806fc9d..b50ab92 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3613,6 +3613,13 @@  static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
 
 	return psi_show(seq, psi, PSI_MEM);
 }
+static int cgroup_memory_pressure_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgroup = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+	return psi_mem_pressure_stat_show(seq, psi);
+}
 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgroup = seq_css(seq)->cgroup;
@@ -4930,6 +4937,10 @@  static struct cftype cgroup_base_files[] = {
 		.poll = cgroup_pressure_poll,
 		.release = cgroup_pressure_release,
 	},
+	{
+		.name = "memory.pressure.stat",
+		.seq_show = cgroup_memory_pressure_stat_show,
+	},
 	{
 		.name = "cpu.pressure",
 		.seq_show = cgroup_cpu_pressure_show,
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 9154e74..072d535 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -279,6 +279,35 @@  static void get_recent_times(struct psi_group *group, int cpu,
 	}
 }
 
+static void get_recent_mem_times(struct psi_group *group, int cpu, u32 *times_mem)
+{
+	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	u64 now, state_start;
+	enum psi_memstall_states s;
+	unsigned int seq;
+	u32 state_mask;
+
+	do {
+		seq = read_seqcount_begin(&groupc->seq);
+		now = cpu_clock(cpu);
+		memcpy(times_mem, groupc->times_mem, sizeof(groupc->times_mem));
+		state_mask = groupc->state_mask;
+		state_start = groupc->state_start;
+	} while (read_seqcount_retry(&groupc->seq, seq));
+
+	for (s = 0; s < PSI_MEM_STATES; s++) {
+		u32 delta;
+
+		if (state_mask & (1 << s))
+			times_mem[s] += now - state_start;
+
+		delta = times_mem[s] - groupc->times_mem_prev[s];
+		groupc->times_mem_prev[s] = times_mem[s];
+
+		times_mem[s] = delta;
+	}
+}
+
 static void calc_avgs(unsigned long avg[3], int missed_periods,
 		      u64 time, u64 period)
 {
@@ -304,6 +333,7 @@  static void collect_percpu_times(struct psi_group *group,
 				 u32 *pchanged_states)
 {
 	u64 deltas[NR_PSI_STATES - 1] = { 0, };
+	u64 delta_mems[PSI_MEM_STATES - 1] = { 0, };
 	unsigned long nonidle_total = 0;
 	u32 changed_states = 0;
 	int cpu;
@@ -319,11 +349,16 @@  static void collect_percpu_times(struct psi_group *group,
 	 */
 	for_each_possible_cpu(cpu) {
 		u32 times[NR_PSI_STATES];
+		u32 times_mem[PSI_MEM_STATES];
+
 		u32 nonidle;
 		u32 cpu_changed_states;
 
 		get_recent_times(group, cpu, aggregator, times,
 				&cpu_changed_states);
+		if (times[PSI_MEM_SOME])
+			get_recent_mem_times(group, cpu, times_mem);
+
 		changed_states |= cpu_changed_states;
 
 		nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
@@ -350,6 +385,10 @@  static void collect_percpu_times(struct psi_group *group,
 		group->total[aggregator][s] +=
 				div_u64(deltas[s], max(nonidle_total, 1UL));
 
+	for (s = 0; s < PSI_MEM_STATES - 1; s++)
+		group->total_mems[s] +=
+				div_u64(delta_mems[s], max(nonidle_total, 1UL));
+
 	if (pchanged_states)
 		*pchanged_states = changed_states;
 }
@@ -404,6 +443,16 @@  static u64 update_averages(struct psi_group *group, u64 now)
 		calc_avgs(group->avg[s], missed_periods, sample, period);
 	}
 
+	for (s = 0; s < PSI_MEM_STATES - 1; s++) {
+		u32 sample;
+
+		sample = group->total_mems[s] - group->avg_total_mems[s];
+		if (sample > period)
+			sample = period;
+		group->avg_total_mems[s] += sample;
+		calc_avgs(group->avg_mems[s], missed_periods, sample, period);
+	}
+
 	return avg_next_update;
 }
 
@@ -628,6 +677,7 @@  static void record_times(struct psi_group_cpu *groupc, int cpu,
 {
 	u32 delta;
 	u64 now;
+	int state_memstall = groupc->state_memstall;
 
 	now = cpu_clock(cpu);
 	delta = now - groupc->state_start;
@@ -641,6 +691,7 @@  static void record_times(struct psi_group_cpu *groupc, int cpu,
 
 	if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
 		groupc->times[PSI_MEM_SOME] += delta;
+		groupc->times_mem[state_memstall] += delta;
 		if (groupc->state_mask & (1 << PSI_MEM_FULL))
 			groupc->times[PSI_MEM_FULL] += delta;
 		else if (memstall_tick) {
@@ -676,7 +727,12 @@  static u32 psi_group_change(struct psi_group *group, int cpu,
 	unsigned int t, m;
 	enum psi_states s;
 	u32 state_mask = 0;
+	u32 state_memstall = 0;
 
+	if (set & TSK_MEMSTALL) {
+		state_memstall = set & TSK_MEMSTALL_MASK;
+		set &= TSK_COUNT_MASK;
+	}
 	groupc = per_cpu_ptr(group->pcpu, cpu);
 
 	/*
@@ -714,7 +770,7 @@  static u32 psi_group_change(struct psi_group *group, int cpu,
 			state_mask |= (1 << s);
 	}
 	groupc->state_mask = state_mask;
-
+	groupc->state_memstall = state_memstall;
 	write_seqcount_end(&groupc->seq);
 
 	return state_mask;
@@ -810,7 +866,7 @@  void psi_memstall_tick(struct task_struct *task, int cpu)
  * Marks the calling task as being stalled due to a lack of memory,
  * such as waiting for a refault or performing reclaim.
  */
-void psi_memstall_enter(unsigned long *flags)
+void psi_memstall_enter(unsigned long *flags, int mem_state)
 {
 	struct rq_flags rf;
 	struct rq *rq;
@@ -829,7 +885,7 @@  void psi_memstall_enter(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->flags |= PF_MEMSTALL;
-	psi_task_change(current, 0, TSK_MEMSTALL);
+	psi_task_change(current, 0, TSK_MEMSTALL | mem_state);
 
 	rq_unlock_irq(rq, &rf);
 }
@@ -840,7 +896,7 @@  void psi_memstall_enter(unsigned long *flags)
  *
  * Marks the calling task as no longer stalled due to lack of memory.
  */
-void psi_memstall_leave(unsigned long *flags)
+void psi_memstall_leave(unsigned long *flags, int mem_state)
 {
 	struct rq_flags rf;
 	struct rq *rq;
@@ -858,7 +914,7 @@  void psi_memstall_leave(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->flags &= ~PF_MEMSTALL;
-	psi_task_change(current, TSK_MEMSTALL, 0);
+	psi_task_change(current, TSK_MEMSTALL | mem_state, 0);
 
 	rq_unlock_irq(rq, &rf);
 }
@@ -974,6 +1030,53 @@  int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 	return 0;
 }
 
+const char * const memstall_text[] = {
+	"kswapd",
+	"direct reclaim",
+	"kcompacted",
+	"direct compact",
+	"cgroup reclaim",
+	"swap",
+	"workingset",
+};
+
+int psi_mem_pressure_stat_show(struct seq_file *m, void *v)
+{
+	int s;
+	u64 now;
+	struct psi_group *group = &psi_system;
+
+	if (static_branch_likely(&psi_disabled))
+		return -EOPNOTSUPP;
+
+	mutex_lock(&group->avgs_lock);
+	now = sched_clock();
+	collect_percpu_times(group, PSI_AVGS, NULL);
+	if (now >= group->avg_next_update)
+		group->avg_next_update = update_averages(group, now);
+	mutex_unlock(&group->avgs_lock);
+
+	for (s = 0; s < PSI_MEM_STATES; s++) {
+		unsigned long avg[3];
+		u64 total;
+		int w;
+
+		for (w = 0; w < 3; w++)
+			avg[w] = group->avg_mems[s][w];
+
+		total = div_u64(group->total_mems[PSI_AVGS], NSEC_PER_USEC);
+
+		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
+			   memstall_text[s],
+			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
+			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
+			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
+			   total);
+	}
+
+	return 0;
+}
+
 static int psi_io_show(struct seq_file *m, void *v)
 {
 	return psi_show(m, &psi_system, PSI_IO);
@@ -998,7 +1101,10 @@  static int psi_memory_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, psi_memory_show, NULL);
 }
-
+static int psi_memory_stat_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psi_mem_pressure_stat_show, NULL);
+}
 static int psi_cpu_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, psi_cpu_show, NULL);
@@ -1271,7 +1377,12 @@  static const struct file_operations psi_memory_fops = {
 	.poll           = psi_fop_poll,
 	.release        = psi_fop_release,
 };
-
+static const struct file_operations psi_memory_stat_fops = {
+	.open           = psi_memory_stat_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = psi_fop_release,
+};
 static const struct file_operations psi_cpu_fops = {
 	.open           = psi_cpu_open,
 	.read           = seq_read,
@@ -1286,6 +1397,7 @@  static int __init psi_proc_init(void)
 	proc_mkdir("pressure", NULL);
 	proc_create("pressure/io", 0, NULL, &psi_io_fops);
 	proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
+	proc_create("pressure/memory_stat", 0, NULL, &psi_memory_stat_fops);
 	proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
 	return 0;
 }