
[V2] mm: compaction: support triggering of proactive compaction by user

Message ID 1621345058-26676-1-git-send-email-charante@codeaurora.org (mailing list archive)
State New, archived
Series [V2] mm: compaction: support triggering of proactive compaction by user

Commit Message

Charan Teja Kalla May 18, 2021, 1:37 p.m. UTC
The proactive compaction[1] is triggered every 500 msec and runs
compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9)
pages, based on the value set in sysctl.compaction_proactiveness.
Triggering compaction every 500 msec in search of
COMPACTION_HPAGE_ORDER pages is not needed for all applications,
especially on embedded system use cases which may have only a few MBs
of RAM. Enabling proactive compaction in its current state will end up
running almost all the time on such systems.

On the other hand, proactive compaction can still be very useful for
getting a set of higher-order pages in a controllable manner
(controlled via sysctl.compaction_proactiveness). Thus, on systems
where always enabling proactive compaction may prove unnecessary, the
user can trigger it from user space with a write to its sysctl
interface. As an example, say an app launcher decides to launch a
memory-heavy application which can be launched faster if it gets more
higher-order pages; the launcher can then prepare the system in
advance by triggering proactive compaction from userspace.

This triggering of proactive compaction is done on a write to
sysctl.compaction_proactiveness by the user.

[1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a

Signed-off-by: Charan Teja Reddy <charante@codeaurora.org>
---
Changes in V2:
    - Removed the /proc interface trigger for proactive compaction.
    - The intention is the same: add a way for the user to trigger proactive compaction.

Changes in V1:
    - https://lore.kernel.org/lkml/1619098678-8501-1-git-send-email-charante@codeaurora.org/

 include/linux/compaction.h |  2 ++
 include/linux/mmzone.h     |  1 +
 kernel/sysctl.c            |  2 +-
 mm/compaction.c            | 35 ++++++++++++++++++++++++++++++++---
 4 files changed, 36 insertions(+), 4 deletions(-)

Comments

Chu,Kaiping May 19, 2021, 1:41 a.m. UTC | #1
> -----Original Message-----
> From: charante=codeaurora.org@mg.codeaurora.org
> <charante=codeaurora.org@mg.codeaurora.org> On Behalf Of Charan Teja Reddy
> Sent: May 18, 2021 21:38
> To: akpm@linux-foundation.org; mcgrof@kernel.org;
> keescook@chromium.org; yzaikin@google.com; vbabka@suse.cz;
> nigupta@nvidia.com; bhe@redhat.com; mateusznosek0@gmail.com;
> sh_def@163.com; iamjoonsoo.kim@lge.com; vinmenon@codeaurora.org
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> linux-fsdevel@vger.kernel.org; Charan Teja Reddy <charante@codeaurora.org>
> Subject: [PATCH V2] mm: compaction: support triggering of proactive
> compaction by user
> 
> The proactive compaction[1] is triggered every 500 msec and runs
> compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9)
> pages, based on the value set in sysctl.compaction_proactiveness.
> Triggering compaction every 500 msec in search of
> COMPACTION_HPAGE_ORDER pages is not needed for all applications,
> especially on embedded system use cases which may have only a few MBs
> of RAM. Enabling proactive compaction in its current state will end up
> running almost all the time on such systems.
> 
> On the other hand, proactive compaction can still be very useful for
> getting a set of higher-order pages in a controllable manner
> (controlled via sysctl.compaction_proactiveness). Thus, on systems
> where always enabling proactive compaction may prove unnecessary, the
> user can trigger it from user space with a write to its sysctl
> interface. As an example, say an app launcher decides to launch a
> memory-heavy application which can be launched faster if it gets more
> higher-order pages; the launcher can then prepare the system in
> advance by triggering proactive compaction from userspace.
> 
> This triggering of proactive compaction is done on a write to
> sysctl.compaction_proactiveness by the user.

If you want to trigger compaction from userspace, you can use "echo 1 > /proc/sys/vm/compact_memory"; there is no need to be so complex.

> 
> [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a
> 
> Signed-off-by: Charan Teja Reddy <charante@codeaurora.org>
> ---
> Changes in V2:
>     - Removed the /proc interface trigger for proactive compaction.
>     - The intention is the same: add a way for the user to trigger proactive compaction.
> 
> Changes in V1:
>     - https://lore.kernel.org/lkml/1619098678-8501-1-git-send-email-charante@codeaurora.org/
> 
>  include/linux/compaction.h |  2 ++
>  include/linux/mmzone.h     |  1 +
>  kernel/sysctl.c            |  2 +-
>  mm/compaction.c            | 35 ++++++++++++++++++++++++++++++++---
>  4 files changed, 36 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> index 4221888..04d5d9f 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order)
>  extern unsigned int sysctl_compaction_proactiveness;
>  extern int sysctl_compaction_handler(struct ctl_table *table, int write,
>  			void *buffer, size_t *length, loff_t *ppos);
> +extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table,
> +		int write, void *buffer, size_t *length, loff_t *ppos);
>  extern int sysctl_extfrag_threshold;
>  extern int sysctl_compact_unevictable_allowed;
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 0d53eba..9455809 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -815,6 +815,7 @@ typedef struct pglist_data {
>  	enum zone_type kcompactd_highest_zoneidx;
>  	wait_queue_head_t kcompactd_wait;
>  	struct task_struct *kcompactd;
> +	bool proactive_compact_trigger;
>  #endif
>  	/*
>  	 * This is a per-node reserve of pages that are not available
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 14edf84..bed2fad 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -2840,7 +2840,7 @@ static struct ctl_table vm_table[] = {
>  		.data		= &sysctl_compaction_proactiveness,
>  		.maxlen		= sizeof(sysctl_compaction_proactiveness),
>  		.mode		= 0644,
> -		.proc_handler	= proc_dointvec_minmax,
> +		.proc_handler	= compaction_proactiveness_sysctl_handler,
>  		.extra1		= SYSCTL_ZERO,
>  		.extra2		= &one_hundred,
>  	},
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 84fde27..9056693 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -2708,6 +2708,30 @@ static void compact_nodes(void)
>   */
>  unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
> 
> +int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
> +		void *buffer, size_t *length, loff_t *ppos)
> +{
> +	int rc, nid;
> +
> +	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (rc)
> +		return rc;
> +
> +	if (write && sysctl_compaction_proactiveness) {
> +		for_each_online_node(nid) {
> +			pg_data_t *pgdat = NODE_DATA(nid);
> +
> +			if (pgdat->proactive_compact_trigger)
> +				continue;
> +
> +			pgdat->proactive_compact_trigger = true;
> +			wake_up_interruptible(&pgdat->kcompactd_wait);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * This is the entry point for compacting all nodes via
>   * /proc/sys/vm/compact_memory
> @@ -2752,7 +2776,8 @@ void compaction_unregister_node(struct node *node)
> 
>  static inline bool kcompactd_work_requested(pg_data_t *pgdat)
>  {
> -	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
> +	return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
> +		pgdat->proactive_compact_trigger;
>  }
> 
>  static bool kcompactd_node_suitable(pg_data_t *pgdat)
> @@ -2905,7 +2930,8 @@ static int kcompactd(void *p)
>  		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
>  		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
>  			kcompactd_work_requested(pgdat),
> -			msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
> +			msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC)) &&
> +			!pgdat->proactive_compact_trigger) {
> 
>  			psi_memstall_enter(&pflags);
>  			kcompactd_do_work(pgdat);
> @@ -2919,7 +2945,7 @@ static int kcompactd(void *p)
> 
>  			if (proactive_defer) {
>  				proactive_defer--;
> -				continue;
> +				goto loop;
>  			}
>  			prev_score = fragmentation_score_node(pgdat);
>  			proactive_compact_node(pgdat);
> @@ -2931,6 +2957,9 @@ static int kcompactd(void *p)
>  			proactive_defer = score < prev_score ?
>  					0 : 1 << COMPACT_MAX_DEFER_SHIFT;
>  		}
> +loop:
> +		if (pgdat->proactive_compact_trigger)
> +			pgdat->proactive_compact_trigger = false;
>  	}
> 
>  	return 0;
> --
> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
> member of the Code Aurora Forum, hosted by The Linux Foundation
Charan Teja Kalla May 19, 2021, 10:19 a.m. UTC | #2
Thanks Kaiping for your review comments!!

On 5/19/2021 7:11 AM, Chu,Kaiping wrote:
>> This triggering of proactive compaction is done on a write to
>> sysctl.compaction_proactiveness by the user.
> If you want to trigger compaction from userspace, you can use "echo 1 > /proc/sys/vm/compact_memory"; there is no need to be so complex.

1) compact_memory is intended as a debug interface. Moreover, we can't
issue compaction in a controlled manner with it, as a write to this
node triggers full node compaction. This patch aims at users who want
to do compaction in a controlled manner from user space. An example
user is an app launcher preparing the system before launching a
memory-hungry app.

2) Also, with the current implementation of proactive compaction, say
a user sets sysctl.compaction_proactiveness: the value can take effect
only at the next HPAGE_FRAG_CHECK_INTERVAL_MSEC (500 msec) wakeup.
IOW, proactive compaction can run with the new settings only after up
to 500 msec, so the user has to wait 500 msec after writing a value to
compaction_proactiveness before assuming that the value has come into
effect. Say a user wants to launch a gaming application which has high
memory requirements, and whose launch time is proportional to the
available higher-order pages. What they can do to get a larger number
of such pages is set compaction_proactiveness to a higher value,
proceed with launching the application, and once it finishes, restore
the proactiveness to its original value. But with the current
implementation, the value set may never come into effect at all
because of the 500 msec delay. Thus, the patch also handles the
scenario of requiring proactive compaction to run immediately once the
user sets compaction_proactiveness.

Maybe I need to update the commit log to be even clearer about why we
can't use 'compact_memory', and about the requirement to run proactive
compaction immediately once the user changes compaction_proactiveness.

Charan Teja Kalla May 25, 2021, 2:20 p.m. UTC | #3
Gentle ping.

Thanks,
Charan

On 5/18/2021 7:07 PM, Charan Teja Reddy wrote:
> The proactive compaction[1] is triggered every 500 msec and runs
> compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9)
> pages, based on the value set in sysctl.compaction_proactiveness.
> Triggering compaction every 500 msec in search of
> COMPACTION_HPAGE_ORDER pages is not needed for all applications,
> especially on embedded system use cases which may have only a few MBs
> of RAM. Enabling proactive compaction in its current state will end up
> running almost all the time on such systems.
> 
> On the other hand, proactive compaction can still be very useful for
> getting a set of higher-order pages in a controllable manner
> (controlled via sysctl.compaction_proactiveness). Thus, on systems
> where always enabling proactive compaction may prove unnecessary, the
> user can trigger it from user space with a write to its sysctl
> interface. As an example, say an app launcher decides to launch a
> memory-heavy application which can be launched faster if it gets more
> higher-order pages; the launcher can then prepare the system in
> advance by triggering proactive compaction from userspace.
> 
> This triggering of proactive compaction is done on a write to
> sysctl.compaction_proactiveness by the user.
> 
> [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a
> 
> Signed-off-by: Charan Teja Reddy <charante@codeaurora.org>
> ---
> Changes in V2:
>     - Removed the /proc interface trigger for proactive compaction.
>     - The intention is the same: add a way for the user to trigger proactive compaction.
> 
> Changes in V1:
>     - https://lore.kernel.org/lkml/1619098678-8501-1-git-send-email-charante@codeaurora.org/
> 
>  include/linux/compaction.h |  2 ++
>  include/linux/mmzone.h     |  1 +
>  kernel/sysctl.c            |  2 +-
>  mm/compaction.c            | 35 ++++++++++++++++++++++++++++++++---
>  4 files changed, 36 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> index 4221888..04d5d9f 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order)
>  extern unsigned int sysctl_compaction_proactiveness;
>  extern int sysctl_compaction_handler(struct ctl_table *table, int write,
>  			void *buffer, size_t *length, loff_t *ppos);
> +extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table,
> +		int write, void *buffer, size_t *length, loff_t *ppos);
>  extern int sysctl_extfrag_threshold;
>  extern int sysctl_compact_unevictable_allowed;
>  
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 0d53eba..9455809 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -815,6 +815,7 @@ typedef struct pglist_data {
>  	enum zone_type kcompactd_highest_zoneidx;
>  	wait_queue_head_t kcompactd_wait;
>  	struct task_struct *kcompactd;
> +	bool proactive_compact_trigger;
>  #endif
>  	/*
>  	 * This is a per-node reserve of pages that are not available
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 14edf84..bed2fad 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -2840,7 +2840,7 @@ static struct ctl_table vm_table[] = {
>  		.data		= &sysctl_compaction_proactiveness,
>  		.maxlen		= sizeof(sysctl_compaction_proactiveness),
>  		.mode		= 0644,
> -		.proc_handler	= proc_dointvec_minmax,
> +		.proc_handler	= compaction_proactiveness_sysctl_handler,
>  		.extra1		= SYSCTL_ZERO,
>  		.extra2		= &one_hundred,
>  	},
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 84fde27..9056693 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -2708,6 +2708,30 @@ static void compact_nodes(void)
>   */
>  unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
>  
> +int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
> +		void *buffer, size_t *length, loff_t *ppos)
> +{
> +	int rc, nid;
> +
> +	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (rc)
> +		return rc;
> +
> +	if (write && sysctl_compaction_proactiveness) {
> +		for_each_online_node(nid) {
> +			pg_data_t *pgdat = NODE_DATA(nid);
> +
> +			if (pgdat->proactive_compact_trigger)
> +				continue;
> +
> +			pgdat->proactive_compact_trigger = true;
> +			wake_up_interruptible(&pgdat->kcompactd_wait);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * This is the entry point for compacting all nodes via
>   * /proc/sys/vm/compact_memory
> @@ -2752,7 +2776,8 @@ void compaction_unregister_node(struct node *node)
>  
>  static inline bool kcompactd_work_requested(pg_data_t *pgdat)
>  {
> -	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
> +	return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
> +		pgdat->proactive_compact_trigger;
>  }
>  
>  static bool kcompactd_node_suitable(pg_data_t *pgdat)
> @@ -2905,7 +2930,8 @@ static int kcompactd(void *p)
>  		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
>  		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
>  			kcompactd_work_requested(pgdat),
> -			msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
> +			msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC)) &&
> +			!pgdat->proactive_compact_trigger) {
>  
>  			psi_memstall_enter(&pflags);
>  			kcompactd_do_work(pgdat);
> @@ -2919,7 +2945,7 @@ static int kcompactd(void *p)
>  
>  			if (proactive_defer) {
>  				proactive_defer--;
> -				continue;
> +				goto loop;
>  			}
>  			prev_score = fragmentation_score_node(pgdat);
>  			proactive_compact_node(pgdat);
> @@ -2931,6 +2957,9 @@ static int kcompactd(void *p)
>  			proactive_defer = score < prev_score ?
>  					0 : 1 << COMPACT_MAX_DEFER_SHIFT;
>  		}
> +loop:
> +		if (pgdat->proactive_compact_trigger)
> +			pgdat->proactive_compact_trigger = false;
>  	}
>  
>  	return 0;
Nitin Gupta May 25, 2021, 8:35 p.m. UTC | #4
> -----Original Message-----
> From: charante=codeaurora.org@mg.codeaurora.org
> <charante=codeaurora.org@mg.codeaurora.org> On Behalf Of Charan Teja Reddy
> Sent: Tuesday, May 18, 2021 6:38 AM
> To: akpm@linux-foundation.org; mcgrof@kernel.org; keescook@chromium.org;
> yzaikin@google.com; vbabka@suse.cz; Nitin Gupta <nigupta@nvidia.com>;
> bhe@redhat.com; mateusznosek0@gmail.com; sh_def@163.com;
> iamjoonsoo.kim@lge.com; vinmenon@codeaurora.org
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> linux-fsdevel@vger.kernel.org; Charan Teja Reddy <charante@codeaurora.org>
> Subject: [PATCH V2] mm: compaction: support triggering of proactive
> compaction by user
> 
> The proactive compaction[1] is triggered every 500 msec and runs
> compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9)
> pages, based on the value set in sysctl.compaction_proactiveness.
> Triggering compaction every 500 msec in search of
> COMPACTION_HPAGE_ORDER pages is not needed for all applications,
> especially on embedded system use cases which may have only a few MBs
> of RAM. Enabling proactive compaction in its current state will end up
> running almost all the time on such systems.
> 

You can disable proactive compaction by setting sysctl.compaction_proactiveness to 0.


> On the other hand, proactive compaction can still be very useful for
> getting a set of higher-order pages in a controllable manner
> (controlled via sysctl.compaction_proactiveness). Thus, on systems
> where always enabling proactive compaction may prove unnecessary, the
> user can trigger it from user space with a write to its sysctl
> interface. As an example, say an app launcher decides to launch a
> memory-heavy application which can be launched faster if it gets more
> higher-order pages; the launcher can then prepare the system in
> advance by triggering proactive compaction from userspace.
> 

You can always do: echo 1 > /proc/sys/vm/compact_memory
On a small system, this should not take much time.

Hijacking proactive compaction for one-off compaction (say, before a large app launch)
does not sound right to me.

Thanks,
Nitin
Charan Teja Kalla May 27, 2021, 9:28 a.m. UTC | #5
Thanks Nitin for your inputs!!

On 5/26/2021 2:05 AM, Nitin Gupta wrote:
>> The proactive compaction[1] is triggered every 500 msec and runs
>> compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9)
>> pages, based on the value set in sysctl.compaction_proactiveness.
>> Triggering compaction every 500 msec in search of
>> COMPACTION_HPAGE_ORDER pages is not needed for all applications,
>> especially on embedded system use cases which may have only a few MBs
>> of RAM. Enabling proactive compaction in its current state will end up
>> running almost all the time on such systems.
>>
> You can disable proactive compaction by setting sysctl.compaction_proactiveness to 0.

Agree. But proactive compaction has its own uses too, like knowing
when to stop compaction instead of simply doing full node compaction,
so we don't want to disable it always.

> 
> 
>> As an example, say an app
>> launcher decides to launch a memory-heavy application which can be
>> launched faster if it gets more higher-order pages; the launcher can
>> prepare the system in advance by triggering proactive compaction
>> from userspace.
>>
> You can always do: echo 1 > /proc/sys/vm/compact_memory
> On a small system, this should not take much time.

Hmm... On a 3GB Snapdragon system, we have observed a write to
compact_memory taking a peak time of 400+ msec; it could be that
MIGRATE_SYNC on a full node causes this peak. That is a lot of time.


> 
> Hijacking proactive compaction for one-off compaction (say, before a large app launch)
> does not sound right to me.

Actually, we are using proactive compaction to 'just prepare the
system before asking for huge memory', as compact_memory can take
longer and is not controllable the way proactive compaction is.

In V1 of this patch, we actually created a /proc interface (similar to
compact_memory), providing a way to trigger proactive compaction from
user space: https://lore.kernel.org/patchwork/patch/1417064/. But
since this involved adding a new /proc interface, in V2 we implemented
an alternative way to do it.

Another problem, I think, this patch tries to address is that, in the
existing implementation, it is not guaranteed that the user-set value
of compaction_proactiveness takes effect until at least
HPAGE_FRAG_CHECK_INTERVAL_MSEC (500 msec) has elapsed, right? Does
this seem correct, given that we have provided this user interface but
have not specified anywhere when the value becomes effective (it comes
into effect only at the next kcompactd wakeup for proactive
compaction)?

Consider the below test case, where a user thinks the application they
are going to run is performance critical and thus decides to do the
following steps:
1) Save the present compaction_proactiveness (say it is zero, thus
disabled).
2) Set compaction_proactiveness to 100.
3) Allocate memory for the application.
4) Restore compaction_proactiveness (set to disabled again).
5) Expect that proactive compaction has at least tried to run.

First, is a user doing the above steps valid?
If yes, then we should guarantee to the user that proactive compaction
at least tried to run when they changed the proactiveness.
If not, I feel we should document that 'once the user has changed
compaction_proactiveness, they need to wait at least
HPAGE_FRAG_CHECK_INTERVAL_MSEC before assuming that the value they set
is effective and that proactive compaction has tried to run on it'.
Doesn't this seem okay?
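
For concreteness, a minimal userspace sketch of steps 1)-4) above
(illustrative only: the path and values are just the ones from this
example, and most error handling is elided):

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/vm/compaction_proactiveness";
	FILE *f;
	int saved;

	/* 1) Save the present proactiveness value. */
	f = fopen(path, "r");
	if (!f || fscanf(f, "%d", &saved) != 1)
		return 1;
	fclose(f);

	/* 2) Set it to 100. With this patch, the write itself wakes
	 * kcompactd for an immediate proactive compaction run. */
	f = fopen(path, "w");
	if (!f)
		return 1;
	fprintf(f, "100\n");
	fclose(f);

	/* 3) Allocate memory for / launch the application here. */

	/* 4) Restore the saved proactiveness value. */
	f = fopen(path, "w");
	if (!f)
		return 1;
	fprintf(f, "%d\n", saved);
	fclose(f);

	return 0;
}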

--Charan
Nitin Gupta May 27, 2021, 11:52 p.m. UTC | #6
> -----Original Message-----
> From: charante=codeaurora.org@mg.codeaurora.org
> <charante=codeaurora.org@mg.codeaurora.org> On Behalf Of Charan Teja Kalla
> Sent: Thursday, May 27, 2021 2:28 AM
> To: Nitin Gupta <nigupta@nvidia.com>; akpm@linux-foundation.org;
> mcgrof@kernel.org; keescook@chromium.org; yzaikin@google.com;
> vbabka@suse.cz; bhe@redhat.com; mateusznosek0@gmail.com;
> sh_def@163.com; iamjoonsoo.kim@lge.com; vinmenon@codeaurora.org
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> linux-fsdevel@vger.kernel.org
> Subject: Re: [PATCH V2] mm: compaction: support triggering of proactive
> compaction by user
> 
> Thanks Nitin for your inputs!!
> 
> On 5/26/2021 2:05 AM, Nitin Gupta wrote:
> >> The proactive compaction[1] is triggered every 500 msec and runs
> >> compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9)
> >> pages, based on the value set in sysctl.compaction_proactiveness.
> >> Triggering compaction every 500 msec in search of
> >> COMPACTION_HPAGE_ORDER pages is not needed for all applications,
> >> especially on embedded system use cases which may have only a few MBs
> >> of RAM. Enabling proactive compaction in its current state will end up
> >> running almost all the time on such systems.
> >>
> > You can disable proactive compaction by setting
> > sysctl.compaction_proactiveness to 0.
> 
> Agree. But proactive compaction has its own uses too, like knowing
> when to stop compaction instead of simply doing full node compaction,
> so we don't want to disable it always.
> 
> >
> >
> >> As an example, say an app
> >> launcher decides to launch a memory-heavy application which can be
> >> launched faster if it gets more higher-order pages; the launcher can
> >> prepare the system in advance by triggering proactive compaction
> >> from userspace.
> >>
> > You can always do: echo 1 > /proc/sys/vm/compact_memory
> > On a small system, this should not take much time.
> 
> Hmm... On a 3GB Snapdragon system, we have observed a write to
> compact_memory taking a peak time of 400+ msec; it could be that
> MIGRATE_SYNC on a full node causes this peak. That is a lot of time.
> 
>
> >
> > Hijacking proactive compaction for one-off compaction (say, before a
> > large app launch) does not sound right to me.
> 
> Actually, we are using proactive compaction to 'just prepare the
> system before asking for huge memory', as compact_memory can take
> longer and is not controllable the way proactive compaction is.
> 
> In V1 of this patch, we actually created a /proc interface (similar
> to compact_memory), providing a way to trigger proactive compaction
> from user space: https://lore.kernel.org/patchwork/patch/1417064/.
> But since this involved adding a new /proc interface, in V2 we
> implemented an alternative way to do it.
> 
> Another problem, I think, this patch tries to address is that, in the
> existing implementation, it is not guaranteed that the user-set value
> of compaction_proactiveness takes effect until at least
> HPAGE_FRAG_CHECK_INTERVAL_MSEC (500 msec) has elapsed, right? Does
> this seem correct, given that we have provided this user interface
> but have not specified anywhere when the value becomes effective (it
> comes into effect only at the next kcompactd wakeup for proactive
> compaction)?
> 
> Consider the below test case, where a user thinks the application
> they are going to run is performance critical and thus decides to do
> the following steps:
> 1) Save the present compaction_proactiveness (say it is zero, thus
> disabled).
> 2) Set compaction_proactiveness to 100.
> 3) Allocate memory for the application.
> 4) Restore compaction_proactiveness (set to disabled again).
> 5) Expect that proactive compaction has at least tried to run.
> 
> First, is a user doing the above steps valid?
> If yes, then we should guarantee to the user that proactive
> compaction at least tried to run when they changed the proactiveness.
> If not, I feel we should document that 'once the user has changed
> compaction_proactiveness, they need to wait at least
> HPAGE_FRAG_CHECK_INTERVAL_MSEC before assuming that the value they
> set is effective and that proactive compaction has tried to run on
> it'. Doesn't this seem okay?

Proactive compaction does not guarantee that the kernel will be able
to achieve the fragmentation targets implied by the
compaction_proactiveness sysctl. It also does not guarantee how much
time it will take to reach the desired fragmentation levels (if that
is possible at all). Also, HPAGE_FRAG_CHECK_INTERVAL_MSEC is the
maximum sleep time; the actual delay depends on the relative timing of
your sysctl write and the timeout.

The intent of proactive compaction is to maintain desired
fragmentation levels wrt the default hugepage size in the background,
so we don't have to pay the latency cost associated with on-demand
compaction. I don't like the idea of forcing a round of compaction on
a sysctl write. Maybe add a Kconfig parameter for setting
HPAGE_FRAG_CHECK_INTERVAL_MSEC to, say, 1 msec?

Thanks,
Nitin
Vlastimil Babka May 28, 2021, 3:19 p.m. UTC | #7
+CC linux-api

On 5/18/21 3:37 PM, Charan Teja Reddy wrote:
> The proactive compaction[1] is triggered every 500 msec and runs
> compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9)
> pages, based on the value set in sysctl.compaction_proactiveness.
> Triggering compaction every 500 msec in search of
> COMPACTION_HPAGE_ORDER pages is not needed for all applications,
> especially on embedded system use cases which may have only a few MBs
> of RAM. Enabling proactive compaction in its current state will end up
> running almost all the time on such systems.
> 
> On the other hand, proactive compaction can still be very useful for
> getting a set of higher-order pages in a controllable manner
> (controlled via sysctl.compaction_proactiveness). Thus, on systems
> where always enabling proactive compaction may prove unnecessary, the
> user can trigger it from user space with a write to its sysctl
> interface. As an example, say an app launcher decides to launch a
> memory-heavy application which can be launched faster if it gets more
> higher-order pages; the launcher can then prepare the system in
> advance by triggering proactive compaction from userspace.
> 
> This triggering of proactive compaction is done on a write to
> sysctl.compaction_proactiveness by the user.
> 
> [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a
> 
> Signed-off-by: Charan Teja Reddy <charante@codeaurora.org>

Cancelling all current sleeps immediately when the controlling variable changes
doesn't sound wrong to me.
A question below:

> ---
> Changes in V2:
>     - Removed the /proc interface trigger for proactive compaction.
>     - The intention is the same: add a way for the user to trigger proactive compaction.
> 
> Changes in V1:
>     - https://lore.kernel.org/lkml/1619098678-8501-1-git-send-email-charante@codeaurora.org/
> 
>  include/linux/compaction.h |  2 ++
>  include/linux/mmzone.h     |  1 +
>  kernel/sysctl.c            |  2 +-
>  mm/compaction.c            | 35 ++++++++++++++++++++++++++++++++---
>  4 files changed, 36 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> index 4221888..04d5d9f 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order)
>  extern unsigned int sysctl_compaction_proactiveness;
>  extern int sysctl_compaction_handler(struct ctl_table *table, int write,
>  			void *buffer, size_t *length, loff_t *ppos);
> +extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table,
> +		int write, void *buffer, size_t *length, loff_t *ppos);
>  extern int sysctl_extfrag_threshold;
>  extern int sysctl_compact_unevictable_allowed;
>  
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 0d53eba..9455809 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -815,6 +815,7 @@ typedef struct pglist_data {
>  	enum zone_type kcompactd_highest_zoneidx;
>  	wait_queue_head_t kcompactd_wait;
>  	struct task_struct *kcompactd;
> +	bool proactive_compact_trigger;
>  #endif
>  	/*
>  	 * This is a per-node reserve of pages that are not available
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 14edf84..bed2fad 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -2840,7 +2840,7 @@ static struct ctl_table vm_table[] = {
>  		.data		= &sysctl_compaction_proactiveness,
>  		.maxlen		= sizeof(sysctl_compaction_proactiveness),
>  		.mode		= 0644,
> -		.proc_handler	= proc_dointvec_minmax,
> +		.proc_handler	= compaction_proactiveness_sysctl_handler,
>  		.extra1		= SYSCTL_ZERO,
>  		.extra2		= &one_hundred,
>  	},
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 84fde27..9056693 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -2708,6 +2708,30 @@ static void compact_nodes(void)
>   */
>  unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
>  
> +int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
> +		void *buffer, size_t *length, loff_t *ppos)
> +{
> +	int rc, nid;
> +
> +	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (rc)
> +		return rc;
> +
> +	if (write && sysctl_compaction_proactiveness) {
> +		for_each_online_node(nid) {
> +			pg_data_t *pgdat = NODE_DATA(nid);
> +
> +			if (pgdat->proactive_compact_trigger)
> +				continue;
> +
> +			pgdat->proactive_compact_trigger = true;
> +			wake_up_interruptible(&pgdat->kcompactd_wait);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * This is the entry point for compacting all nodes via
>   * /proc/sys/vm/compact_memory
> @@ -2752,7 +2776,8 @@ void compaction_unregister_node(struct node *node)
>  
>  static inline bool kcompactd_work_requested(pg_data_t *pgdat)
>  {
> -	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
> +	return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
> +		pgdat->proactive_compact_trigger;
>  }
>  
>  static bool kcompactd_node_suitable(pg_data_t *pgdat)
> @@ -2905,7 +2930,8 @@ static int kcompactd(void *p)
>  		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
>  		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
>  			kcompactd_work_requested(pgdat),
> -			msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
> +			msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC)) &&
> +			!pgdat->proactive_compact_trigger) {
>  
>  			psi_memstall_enter(&pflags);
>  			kcompactd_do_work(pgdat);
> @@ -2919,7 +2945,7 @@ static int kcompactd(void *p)
>  
>  			if (proactive_defer) {
>  				proactive_defer--;
> -				continue;
> +				goto loop;

I don't understand this part. If we kick kcompactd from the sysctl handler
because we are changing proactiveness, shouldn't we also discard any accumulated
defer score?

>  			}
>  			prev_score = fragmentation_score_node(pgdat);
>  			proactive_compact_node(pgdat);
> @@ -2931,6 +2957,9 @@ static int kcompactd(void *p)
>  			proactive_defer = score < prev_score ?
>  					0 : 1 << COMPACT_MAX_DEFER_SHIFT;
>  		}
> +loop:
> +		if (pgdat->proactive_compact_trigger)
> +			pgdat->proactive_compact_trigger = false;
>  	}
>  
>  	return 0;
Charan Teja Kalla May 28, 2021, 5:02 p.m. UTC | #8
Thanks Vlastimil for your inputs!!

On 5/28/2021 8:49 PM, Vlastimil Babka wrote:
> On 5/18/21 3:37 PM, Charan Teja Reddy wrote:
>> The proactive compaction[1] is triggered every 500 msec and runs
>> compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9)
>> pages, based on the value set in sysctl.compaction_proactiveness.
>> Triggering compaction every 500 msec in search of
>> COMPACTION_HPAGE_ORDER pages is not needed for all applications,
>> especially on embedded system use cases which may have only a few MBs
>> of RAM. Enabling proactive compaction in its current state will end up
>> running almost all the time on such systems.
>> This triggering of proactive compaction is done on a write to
>> sysctl.compaction_proactiveness by the user.
>>
>> [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a
>>
>> Signed-off-by: Charan Teja Reddy <charante@codeaurora.org>
> Cancelling all current sleeps immediately when the controlling variable changes
> doesn't sound wrong to me.

Agree that cancelling sleeps is not wrong here.

> A question below:
> 
>> ---
>> Changes in V2:
>>     - Removed the /proc interface trigger for proactive compaction.
>>     - The intention is the same: add a way for the user to trigger proactive compaction.
>>  			if (proactive_defer) {
>>  				proactive_defer--;
>> -				continue;
>> +				goto loop;
> I don't understand this part. If we kick kcompactd from the sysctl handler
> because we are changing proactiveness, shouldn't we also discard any accumulated
> defer score?

Yes, we should discard the accumulated defer score when the user
changes the proactiveness, even when writing a higher proactiveness
value than the one for which it was deferred. Will raise V3 for this.
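
As a sketch of what that V3 change could look like (hypothetical, not
from any posted patch: it assumes the proactive_defer counter moves
from a local variable in kcompactd() into struct pglist_data so the
sysctl handler can reach it), the handler could clear the defer score
on the same path that sets the trigger:

	/* Hypothetical V3 sketch inside
	 * compaction_proactiveness_sysctl_handler(): a new user-supplied
	 * proactiveness value discards any accumulated defer score. */
	if (write && sysctl_compaction_proactiveness) {
		for_each_online_node(nid) {
			pg_data_t *pgdat = NODE_DATA(nid);

			if (pgdat->proactive_compact_trigger)
				continue;

			/* Assumed new field replacing the local counter. */
			pgdat->proactive_defer = 0;
			pgdat->proactive_compact_trigger = true;
			wake_up_interruptible(&pgdat->kcompactd_wait);
		}
	}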


--Charan
Charan Teja Kalla May 29, 2021, 2:57 a.m. UTC | #9
Thanks Nitin for your inputs.

On 5/28/2021 5:22 AM, Nitin Gupta wrote:
>> First, is a user doing the above steps valid?
>> If yes, then we should guarantee to the user that proactive
>> compaction at least tried to run when they changed the proactiveness.
>> If not, I feel we should document that 'once the user has changed
>> compaction_proactiveness, they need to wait at least
>> HPAGE_FRAG_CHECK_INTERVAL_MSEC before assuming that the value they
>> set is effective and that proactive compaction has tried to run on
>> it'. Doesn't this seem okay?
> Proactive compaction does not guarantee that the kernel will be able to
> achieve the fragmentation targets implied by the compaction_proactiveness
> sysctl. It also does not guarantee how much time it will take to reach
> the desired fragmentation levels (if that is possible at all).

Shouldn't we add these lines to the Documentation? Will raise a patch
if that is fine.


> Maybe add a Kconfig parameter for setting
> HPAGE_FRAG_CHECK_INTERVAL_MSEC to, say, 1 msec?

I really don't have a use case for making
HPAGE_FRAG_CHECK_INTERVAL_MSEC a config option. But should we make it
a Kconfig option and let the user decide how aggressively proactive
compaction should do the job of system defragmentation on their
system? The selection would be limited to the range of 10 msec to
500 msec, defaulting to 500 msec.
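
For illustration only, a minimal sketch of what that could look like
on the C side (the CONFIG_ symbol name is assumed here, not taken from
any posted patch; the Kconfig entry itself would carry the 10..500
range and the 500 default):

/* Hypothetical sketch: let Kconfig select the proactive compaction
 * check interval, falling back to the current 500 msec default. */
#ifdef CONFIG_HPAGE_FRAG_CHECK_INTERVAL_MSEC
#define HPAGE_FRAG_CHECK_INTERVAL_MSEC CONFIG_HPAGE_FRAG_CHECK_INTERVAL_MSEC
#else
#define HPAGE_FRAG_CHECK_INTERVAL_MSEC 500
#endif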

--Thanks

Patch

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 4221888..04d5d9f 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -84,6 +84,8 @@  static inline unsigned long compact_gap(unsigned int order)
 extern unsigned int sysctl_compaction_proactiveness;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void *buffer, size_t *length, loff_t *ppos);
+extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table,
+		int write, void *buffer, size_t *length, loff_t *ppos);
 extern int sysctl_extfrag_threshold;
 extern int sysctl_compact_unevictable_allowed;
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0d53eba..9455809 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -815,6 +815,7 @@  typedef struct pglist_data {
 	enum zone_type kcompactd_highest_zoneidx;
 	wait_queue_head_t kcompactd_wait;
 	struct task_struct *kcompactd;
+	bool proactive_compact_trigger;
 #endif
 	/*
 	 * This is a per-node reserve of pages that are not available
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 14edf84..bed2fad 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2840,7 +2840,7 @@  static struct ctl_table vm_table[] = {
 		.data		= &sysctl_compaction_proactiveness,
 		.maxlen		= sizeof(sysctl_compaction_proactiveness),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= compaction_proactiveness_sysctl_handler,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
diff --git a/mm/compaction.c b/mm/compaction.c
index 84fde27..9056693 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2708,6 +2708,30 @@  static void compact_nodes(void)
  */
 unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
 
+int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *length, loff_t *ppos)
+{
+	int rc, nid;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write && sysctl_compaction_proactiveness) {
+		for_each_online_node(nid) {
+			pg_data_t *pgdat = NODE_DATA(nid);
+
+			if (pgdat->proactive_compact_trigger)
+				continue;
+
+			pgdat->proactive_compact_trigger = true;
+			wake_up_interruptible(&pgdat->kcompactd_wait);
+		}
+	}
+
+	return 0;
+}
+
 /*
  * This is the entry point for compacting all nodes via
  * /proc/sys/vm/compact_memory
@@ -2752,7 +2776,8 @@  void compaction_unregister_node(struct node *node)
 
 static inline bool kcompactd_work_requested(pg_data_t *pgdat)
 {
-	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+	return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
+		pgdat->proactive_compact_trigger;
 }
 
 static bool kcompactd_node_suitable(pg_data_t *pgdat)
@@ -2905,7 +2930,8 @@  static int kcompactd(void *p)
 		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
 		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
 			kcompactd_work_requested(pgdat),
-			msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+			msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC)) &&
+			!pgdat->proactive_compact_trigger) {
 
 			psi_memstall_enter(&pflags);
 			kcompactd_do_work(pgdat);
@@ -2919,7 +2945,7 @@  static int kcompactd(void *p)
 
 			if (proactive_defer) {
 				proactive_defer--;
-				continue;
+				goto loop;
 			}
 			prev_score = fragmentation_score_node(pgdat);
 			proactive_compact_node(pgdat);
@@ -2931,6 +2957,9 @@  static int kcompactd(void *p)
 			proactive_defer = score < prev_score ?
 					0 : 1 << COMPACT_MAX_DEFER_SHIFT;
 		}
+loop:
+		if (pgdat->proactive_compact_trigger)
+			pgdat->proactive_compact_trigger = false;
 	}
 
 	return 0;