mm/compaction: let proactive compaction order be configurable

Message ID 1618218330-50591-1-git-send-email-chukaiping@baidu.com (mailing list archive)
State New, archived
Series mm/compaction: let proactive compaction order be configurable

Commit Message

Chu,Kaiping April 12, 2021, 9:05 a.m. UTC
Currently the proactive compaction order is fixed to
COMPACTION_HPAGE_ORDER (9). That is fine on most machines with plenty
of normal 4KB memory, but it is too high for machines with little
normal memory, for example machines where most of the memory is
configured as 1GB hugetlbfs huge pages. On these machines the max
order of free pages is often below 9, and it stays below 9 even with
aggressive compaction. This causes proactive compaction to be
triggered very frequently. On these machines we only care about
orders of 3 or 4. This patch exports the order as a sysctl and makes
it configurable by the user; the default value is still
COMPACTION_HPAGE_ORDER.

Signed-off-by: chukaiping <chukaiping@baidu.com>
---
 include/linux/compaction.h |    1 +
 kernel/sysctl.c            |   10 ++++++++++
 mm/compaction.c            |    7 ++++---
 3 files changed, 15 insertions(+), 3 deletions(-)
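
For readers who want to try the knob, a minimal userspace sketch of tuning it.
The /proc/sys/vm/compaction_order path is my reading of where the kernel/sysctl.c
hunk below lands the entry (next to compaction_proactiveness); it is not spelled
out in the changelog.

#include <stdio.h>

int main(void)
{
	/* Hypothetical path, assuming the new entry sits in the vm table */
	FILE *f = fopen("/proc/sys/vm/compaction_order", "w");

	if (!f) {
		perror("open compaction_order");
		return 1;
	}
	/* On machines dominated by 1GB hugetlbfs pages only low orders matter */
	fprintf(f, "%d\n", 4);
	fclose(f);
	return 0;
}

The same value could of course be written with sysctl once the patch is applied.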

Comments

Oleksandr Natalenko April 12, 2021, 4:57 p.m. UTC | #1
Hello.

On Mon, Apr 12, 2021 at 05:05:30PM +0800, chukaiping wrote:
> Currently the proactive compaction order is fixed to
> COMPACTION_HPAGE_ORDER(9), it's OK in most machines with lots of
> normal 4KB memory, but it's too high for the machines with small
> normal memory, for example the machines with most memory configured
> as 1GB hugetlbfs huge pages. In these machines the max order of
> free pages is often below 9, and it's always below 9 even with hard
> compaction. This will lead to proactive compaction be triggered very
> frequently. In these machines we only care about order of 3 or 4.
> This patch export the oder to proc and let it configurable
> by user, and the default value is still COMPACTION_HPAGE_ORDER.
> 
> Signed-off-by: chukaiping <chukaiping@baidu.com>
> ---
>  include/linux/compaction.h |    1 +
>  kernel/sysctl.c            |   10 ++++++++++
>  mm/compaction.c            |    7 ++++---
>  3 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> index ed4070e..151ccd1 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -83,6 +83,7 @@ static inline unsigned long compact_gap(unsigned int order)
>  #ifdef CONFIG_COMPACTION
>  extern int sysctl_compact_memory;
>  extern unsigned int sysctl_compaction_proactiveness;
> +extern unsigned int sysctl_compaction_order;
>  extern int sysctl_compaction_handler(struct ctl_table *table, int write,
>  			void *buffer, size_t *length, loff_t *ppos);
>  extern int sysctl_extfrag_threshold;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 62fbd09..277df31 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -114,6 +114,7 @@
>  static int __maybe_unused neg_one = -1;
>  static int __maybe_unused two = 2;
>  static int __maybe_unused four = 4;
> +static int __maybe_unused ten = 10;

^^ does the upper limit have to be hard-coded like this?

>  static unsigned long zero_ul;
>  static unsigned long one_ul = 1;
>  static unsigned long long_max = LONG_MAX;
> @@ -2871,6 +2872,15 @@ int proc_do_static_key(struct ctl_table *table, int write,
>  		.extra2		= &one_hundred,
>  	},
>  	{
> +		.procname       = "compaction_order",
> +		.data           = &sysctl_compaction_order,
> +		.maxlen         = sizeof(sysctl_compaction_order),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec_minmax,
> +		.extra1         = SYSCTL_ZERO,

I wonder what happens if this knob is set to 0. Have you tested such a
corner case?

> +		.extra2         = &ten,
> +	},
> +	{
>  		.procname	= "extfrag_threshold",
>  		.data		= &sysctl_extfrag_threshold,
>  		.maxlen		= sizeof(int),
> diff --git a/mm/compaction.c b/mm/compaction.c
> index e04f447..a192996 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1925,16 +1925,16 @@ static bool kswapd_is_running(pg_data_t *pgdat)
>  
>  /*
>   * A zone's fragmentation score is the external fragmentation wrt to the
> - * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
> + * sysctl_compaction_order. It returns a value in the range [0, 100].
>   */
>  static unsigned int fragmentation_score_zone(struct zone *zone)
>  {
> -	return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
> +	return extfrag_for_order(zone, sysctl_compaction_order);
>  }
>  
>  /*
>   * A weighted zone's fragmentation score is the external fragmentation
> - * wrt to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It
> + * wrt to the sysctl_compaction_order scaled by the zone's size. It
>   * returns a value in the range [0, 100].
>   *
>   * The scaling factor ensures that proactive compaction focuses on larger
> @@ -2666,6 +2666,7 @@ static void compact_nodes(void)
>   * background. It takes values in the range [0, 100].
>   */
>  unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
> +unsigned int __read_mostly sysctl_compaction_order = COMPACTION_HPAGE_ORDER;
>  
>  /*
>   * This is the entry point for compacting all nodes via
> -- 
> 1.7.1
>
David Rientjes April 12, 2021, 6:26 p.m. UTC | #2
On Mon, 12 Apr 2021, chukaiping wrote:

> Currently the proactive compaction order is fixed to
> COMPACTION_HPAGE_ORDER(9), it's OK in most machines with lots of
> normal 4KB memory, but it's too high for the machines with small
> normal memory, for example the machines with most memory configured
> as 1GB hugetlbfs huge pages. In these machines the max order of
> free pages is often below 9, and it's always below 9 even with hard
> compaction. This will lead to proactive compaction be triggered very
> frequently. In these machines we only care about order of 3 or 4.
> This patch export the oder to proc and let it configurable
> by user, and the default value is still COMPACTION_HPAGE_ORDER.
> 

I'm curious why you have proactive compaction enabled at all in this case?

The order-9 threshold is likely to optimize for hugepage availability, but 
in your setup it appears that's not a goal.

So what benefit does proactive compaction provide if only done for order-3 
or order-4?

> Signed-off-by: chukaiping <chukaiping@baidu.com>
> ---
>  include/linux/compaction.h |    1 +
>  kernel/sysctl.c            |   10 ++++++++++
>  mm/compaction.c            |    7 ++++---
>  3 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> index ed4070e..151ccd1 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -83,6 +83,7 @@ static inline unsigned long compact_gap(unsigned int order)
>  #ifdef CONFIG_COMPACTION
>  extern int sysctl_compact_memory;
>  extern unsigned int sysctl_compaction_proactiveness;
> +extern unsigned int sysctl_compaction_order;
>  extern int sysctl_compaction_handler(struct ctl_table *table, int write,
>  			void *buffer, size_t *length, loff_t *ppos);
>  extern int sysctl_extfrag_threshold;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 62fbd09..277df31 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -114,6 +114,7 @@
>  static int __maybe_unused neg_one = -1;
>  static int __maybe_unused two = 2;
>  static int __maybe_unused four = 4;
> +static int __maybe_unused ten = 10;
>  static unsigned long zero_ul;
>  static unsigned long one_ul = 1;
>  static unsigned long long_max = LONG_MAX;
> @@ -2871,6 +2872,15 @@ int proc_do_static_key(struct ctl_table *table, int write,
>  		.extra2		= &one_hundred,
>  	},
>  	{
> +		.procname       = "compaction_order",
> +		.data           = &sysctl_compaction_order,
> +		.maxlen         = sizeof(sysctl_compaction_order),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec_minmax,
> +		.extra1         = SYSCTL_ZERO,
> +		.extra2         = &ten,
> +	},
> +	{
>  		.procname	= "extfrag_threshold",
>  		.data		= &sysctl_extfrag_threshold,
>  		.maxlen		= sizeof(int),
> diff --git a/mm/compaction.c b/mm/compaction.c
> index e04f447..a192996 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1925,16 +1925,16 @@ static bool kswapd_is_running(pg_data_t *pgdat)
>  
>  /*
>   * A zone's fragmentation score is the external fragmentation wrt to the
> - * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
> + * sysctl_compaction_order. It returns a value in the range [0, 100].
>   */
>  static unsigned int fragmentation_score_zone(struct zone *zone)
>  {
> -	return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
> +	return extfrag_for_order(zone, sysctl_compaction_order);
>  }
>  
>  /*
>   * A weighted zone's fragmentation score is the external fragmentation
> - * wrt to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It
> + * wrt to the sysctl_compaction_order scaled by the zone's size. It
>   * returns a value in the range [0, 100].
>   *
>   * The scaling factor ensures that proactive compaction focuses on larger
> @@ -2666,6 +2666,7 @@ static void compact_nodes(void)
>   * background. It takes values in the range [0, 100].
>   */
>  unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
> +unsigned int __read_mostly sysctl_compaction_order = COMPACTION_HPAGE_ORDER;
>  
>  /*
>   * This is the entry point for compacting all nodes via
> -- 
> 1.7.1
> 
>
kernel test robot April 12, 2021, 6:53 p.m. UTC | #3
Hi chukaiping,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kees/for-next/pstore]
[also build test ERROR on linus/master v5.12-rc7 next-20210412]
[cannot apply to hnaz-linux-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/chukaiping/mm-compaction-let-proactive-compaction-order-configurable/20210412-172336
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/pstore
config: powerpc-cell_defconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/a203321bf356e9514ca678c96119df72d6bfa803
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review chukaiping/mm-compaction-let-proactive-compaction-order-configurable/20210412-172336
        git checkout a203321bf356e9514ca678c96119df72d6bfa803
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from arch/powerpc/include/asm/mmu.h:149,
                    from arch/powerpc/include/asm/lppaca.h:46,
                    from arch/powerpc/include/asm/paca.h:17,
                    from arch/powerpc/include/asm/current.h:13,
                    from include/linux/sched.h:12,
                    from include/linux/ratelimit.h:6,
                    from include/linux/dev_printk.h:16,
                    from include/linux/device.h:15,
                    from include/linux/node.h:18,
                    from include/linux/cpu.h:17,
                    from mm/compaction.c:11:
>> arch/powerpc/include/asm/page.h:39:28: error: initializer element is not constant
      39 | #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
         |                            ^
   mm/compaction.c:66:32: note: in expansion of macro 'HUGETLB_PAGE_ORDER'
      66 | #define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER
         |                                ^~~~~~~~~~~~~~~~~~
   mm/compaction.c:2669:54: note: in expansion of macro 'COMPACTION_HPAGE_ORDER'
    2669 | unsigned int __read_mostly sysctl_compaction_order = COMPACTION_HPAGE_ORDER;
         |                                                      ^~~~~~~~~~~~~~~~~~~~~~


vim +39 arch/powerpc/include/asm/page.h

5cd16ee934eafc include/asm-powerpc/page.h      Michael Ellerman 2005-11-11  25  
41151e77a4d96e arch/powerpc/include/asm/page.h Becky Bruce      2011-06-28  26  #ifndef __ASSEMBLY__
c5710cd2073503 arch/powerpc/include/asm/page.h Christophe Leroy 2019-04-26  27  #ifndef CONFIG_HUGETLB_PAGE
41151e77a4d96e arch/powerpc/include/asm/page.h Becky Bruce      2011-06-28  28  #define HPAGE_SHIFT PAGE_SHIFT
c5710cd2073503 arch/powerpc/include/asm/page.h Christophe Leroy 2019-04-26  29  #elif defined(CONFIG_PPC_BOOK3S_64)
c5710cd2073503 arch/powerpc/include/asm/page.h Christophe Leroy 2019-04-26  30  extern unsigned int hpage_shift;
c5710cd2073503 arch/powerpc/include/asm/page.h Christophe Leroy 2019-04-26  31  #define HPAGE_SHIFT hpage_shift
c5710cd2073503 arch/powerpc/include/asm/page.h Christophe Leroy 2019-04-26  32  #elif defined(CONFIG_PPC_8xx)
c5710cd2073503 arch/powerpc/include/asm/page.h Christophe Leroy 2019-04-26  33  #define HPAGE_SHIFT		19	/* 512k pages */
c5710cd2073503 arch/powerpc/include/asm/page.h Christophe Leroy 2019-04-26  34  #elif defined(CONFIG_PPC_FSL_BOOK3E)
c5710cd2073503 arch/powerpc/include/asm/page.h Christophe Leroy 2019-04-26  35  #define HPAGE_SHIFT		22	/* 4M pages */
41151e77a4d96e arch/powerpc/include/asm/page.h Becky Bruce      2011-06-28  36  #endif
41151e77a4d96e arch/powerpc/include/asm/page.h Becky Bruce      2011-06-28  37  #define HPAGE_SIZE		((1UL) << HPAGE_SHIFT)
41151e77a4d96e arch/powerpc/include/asm/page.h Becky Bruce      2011-06-28  38  #define HPAGE_MASK		(~(HPAGE_SIZE - 1))
41151e77a4d96e arch/powerpc/include/asm/page.h Becky Bruce      2011-06-28 @39  #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
41151e77a4d96e arch/powerpc/include/asm/page.h Becky Bruce      2011-06-28  40  #define HUGE_MAX_HSTATE		(MMU_PAGE_COUNT-1)
41151e77a4d96e arch/powerpc/include/asm/page.h Becky Bruce      2011-06-28  41  #endif
41151e77a4d96e arch/powerpc/include/asm/page.h Becky Bruce      2011-06-28  42  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
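
Both robot reports point at the same root cause: on powerpc and ia64, HPAGE_SHIFT
expands to a runtime variable (hpage_shift), so COMPACTION_HPAGE_ORDER is not a
compile-time constant and cannot be used as a file-scope initializer. A small
standalone C sketch of the same failure mode, with made-up names and values for
illustration only:

#include <stdio.h>

/* Stand-in for the arch-specific runtime value; anything derived from a
 * variable is not a constant expression in C. */
unsigned int hpage_shift = 21;

#define PAGE_SHIFT 12
#define HUGETLB_PAGE_ORDER (hpage_shift - PAGE_SHIFT)

/* The analogue of the patch's static initializer; if uncommented, gcc
 * rejects it with "error: initializer element is not constant":
 *
 *     unsigned int sysctl_compaction_order = HUGETLB_PAGE_ORDER;
 */
unsigned int sysctl_compaction_order;

int main(void)
{
	/* Assigning at runtime is fine; in the kernel the default would have
	 * to be set from an init path rather than a static initializer. */
	sysctl_compaction_order = HUGETLB_PAGE_ORDER;
	printf("compaction order = %u\n", sysctl_compaction_order);
	return 0;
}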
kernel test robot April 12, 2021, 7:22 p.m. UTC | #4
Hi chukaiping,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kees/for-next/pstore]
[also build test ERROR on linus/master v5.12-rc7 next-20210412]
[cannot apply to hnaz-linux-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/chukaiping/mm-compaction-let-proactive-compaction-order-configurable/20210412-172336
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/pstore
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/a203321bf356e9514ca678c96119df72d6bfa803
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review chukaiping/mm-compaction-let-proactive-compaction-order-configurable/20210412-172336
        git checkout a203321bf356e9514ca678c96119df72d6bfa803
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=ia64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from arch/ia64/include/asm/ptrace.h:46,
                    from arch/ia64/include/asm/processor.h:20,
                    from arch/ia64/include/asm/thread_info.h:12,
                    from include/linux/thread_info.h:58,
                    from include/asm-generic/preempt.h:5,
                    from ./arch/ia64/include/generated/asm/preempt.h:1,
                    from include/linux/preempt.h:78,
                    from include/linux/rcupdate.h:27,
                    from include/linux/rculist.h:11,
                    from include/linux/pid.h:5,
                    from include/linux/sched.h:14,
                    from include/linux/ratelimit.h:6,
                    from include/linux/dev_printk.h:16,
                    from include/linux/device.h:15,
                    from include/linux/node.h:18,
                    from include/linux/cpu.h:17,
                    from mm/compaction.c:11:
>> arch/ia64/include/asm/page.h:153:29: error: initializer element is not constant
     153 | # define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
         |                             ^
   mm/compaction.c:66:32: note: in expansion of macro 'HUGETLB_PAGE_ORDER'
      66 | #define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER
         |                                ^~~~~~~~~~~~~~~~~~
   mm/compaction.c:2669:54: note: in expansion of macro 'COMPACTION_HPAGE_ORDER'
    2669 | unsigned int __read_mostly sysctl_compaction_order = COMPACTION_HPAGE_ORDER;
         |                                                      ^~~~~~~~~~~~~~~~~~~~~~


vim +153 arch/ia64/include/asm/page.h

^1da177e4c3f41 include/asm-ia64/page.h Linus Torvalds 2005-04-16  149  
^1da177e4c3f41 include/asm-ia64/page.h Linus Torvalds 2005-04-16  150  #ifdef CONFIG_HUGETLB_PAGE
^1da177e4c3f41 include/asm-ia64/page.h Linus Torvalds 2005-04-16  151  # define htlbpage_to_page(x)	(((unsigned long) REGION_NUMBER(x) << 61)			\
^1da177e4c3f41 include/asm-ia64/page.h Linus Torvalds 2005-04-16  152  				 | (REGION_OFFSET(x) >> (HPAGE_SHIFT-PAGE_SHIFT)))
^1da177e4c3f41 include/asm-ia64/page.h Linus Torvalds 2005-04-16 @153  # define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
^1da177e4c3f41 include/asm-ia64/page.h Linus Torvalds 2005-04-16  154  extern unsigned int hpage_shift;
^1da177e4c3f41 include/asm-ia64/page.h Linus Torvalds 2005-04-16  155  #endif
^1da177e4c3f41 include/asm-ia64/page.h Linus Torvalds 2005-04-16  156  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Chu,Kaiping April 14, 2021, 1:42 a.m. UTC | #5
Hi Oleksandr,
Please see my answer inline.

BR,
Chu Kaiping

-----Original Message-----
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Sent: April 13, 2021 0:58
To: Chu,Kaiping <chukaiping@baidu.com>
Cc: mcgrof@kernel.org; keescook@chromium.org; yzaikin@google.com; akpm@linux-foundation.org; linux-kernel@vger.kernel.org; linux-fsdevel@vger.kernel.org; linux-mm@kvack.org
Subject: Re: [PATCH] mm/compaction:let proactive compaction order configurable

Hello.

On Mon, Apr 12, 2021 at 05:05:30PM +0800, chukaiping wrote:
> Currently the proactive compaction order is fixed to 
> COMPACTION_HPAGE_ORDER(9), it's OK in most machines with lots of 
> normal 4KB memory, but it's too high for the machines with small 
> normal memory, for example the machines with most memory configured as 
> 1GB hugetlbfs huge pages. In these machines the max order of free 
> pages is often below 9, and it's always below 9 even with hard 
> compaction. This will lead to proactive compaction be triggered very 
> frequently. In these machines we only care about order of 3 or 4.
> This patch export the oder to proc and let it configurable by user, 
> and the default value is still COMPACTION_HPAGE_ORDER.
> 
> Signed-off-by: chukaiping <chukaiping@baidu.com>
> ---
>  include/linux/compaction.h |    1 +
>  kernel/sysctl.c            |   10 ++++++++++
>  mm/compaction.c            |    7 ++++---
>  3 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h 
> index ed4070e..151ccd1 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -83,6 +83,7 @@ static inline unsigned long compact_gap(unsigned int 
> order)  #ifdef CONFIG_COMPACTION  extern int sysctl_compact_memory;  
> extern unsigned int sysctl_compaction_proactiveness;
> +extern unsigned int sysctl_compaction_order;
>  extern int sysctl_compaction_handler(struct ctl_table *table, int write,
>  			void *buffer, size_t *length, loff_t *ppos);  extern int 
> sysctl_extfrag_threshold; diff --git a/kernel/sysctl.c 
> b/kernel/sysctl.c index 62fbd09..277df31 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -114,6 +114,7 @@
>  static int __maybe_unused neg_one = -1;  static int __maybe_unused 
> two = 2;  static int __maybe_unused four = 4;
> +static int __maybe_unused ten = 10;

^^ does the upper limit have to be hard-coded like this?
--> The max order of the buddy allocator is defined by MAX_ORDER; I will change the limit to MAX_ORDER in the next patch.
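
A rough kernel-style sketch (my illustration only, not a posted revision) of what
the sysctl entry could look like with the limit tied to MAX_ORDER instead of the
literal ten:

	/* hypothetical follow-up; MAX_ORDER defaults to 11, so valid orders
	 * are 0..MAX_ORDER-1, which matches the current hard-coded 10 */
	static int max_compaction_order = MAX_ORDER - 1;
	...
	{
		.procname       = "compaction_order",
		.data           = &sysctl_compaction_order,
		.maxlen         = sizeof(sysctl_compaction_order),
		.mode           = 0644,
		.proc_handler   = proc_dointvec_minmax,
		.extra1         = SYSCTL_ZERO,
		.extra2         = &max_compaction_order,
	},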

>  static unsigned long zero_ul;
>  static unsigned long one_ul = 1;
>  static unsigned long long_max = LONG_MAX;
> @@ -2871,6 +2872,15 @@ int proc_do_static_key(struct ctl_table *table, int write,
>  		.extra2		= &one_hundred,
>  	},
>  	{
> +		.procname       = "compaction_order",
> +		.data           = &sysctl_compaction_order,
> +		.maxlen         = sizeof(sysctl_compaction_order),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec_minmax,
> +		.extra1         = SYSCTL_ZERO,

I wonder what happens if this knob is set to 0. Have you tested such a
corner case?
--> In theory, 0 is also a configurable value, but the fragmentation index at order 0 is always 0, so it won't trigger any proactive compaction. I have tested this corner case: with the order set to 0 there is no error, but proactive compaction simply never happens.

> +		.extra2         = &ten,
> +	},
> +	{
>  		.procname	= "extfrag_threshold",
>  		.data		= &sysctl_extfrag_threshold,
>  		.maxlen		= sizeof(int),
> diff --git a/mm/compaction.c b/mm/compaction.c
> index e04f447..a192996 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1925,16 +1925,16 @@ static bool kswapd_is_running(pg_data_t *pgdat)
>  
>  /*
>   * A zone's fragmentation score is the external fragmentation wrt to the
> - * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
> + * sysctl_compaction_order. It returns a value in the range [0, 100].
>   */
>  static unsigned int fragmentation_score_zone(struct zone *zone)
>  {
> -	return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
> +	return extfrag_for_order(zone, sysctl_compaction_order);
>  }
>  
>  /*
>   * A weighted zone's fragmentation score is the external fragmentation
> - * wrt to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It
> + * wrt to the sysctl_compaction_order scaled by the zone's size. It
>   * returns a value in the range [0, 100].
>   *
>   * The scaling factor ensures that proactive compaction focuses on larger
> @@ -2666,6 +2666,7 @@ static void compact_nodes(void)
>   * background. It takes values in the range [0, 100].
>   */
>  unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
> +unsigned int __read_mostly sysctl_compaction_order = COMPACTION_HPAGE_ORDER;
>  
>  /*
>   * This is the entry point for compacting all nodes via
> -- 
> 1.7.1
>
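
To make the fragmentation-score discussion above concrete, here is a small
standalone C sketch (my approximation of what extfrag_for_order() computes, fed
with made-up per-order free counts rather than a real zone): the score is the
percentage of free memory sitting in blocks smaller than the target order, which
is why it is always 0 at order 0, and why scoring against order 9 stays high on a
machine that rarely has any order-9 blocks.

#include <stdio.h>

#define MAX_ORDER 11

/* Approximate per-zone fragmentation score for a given order, using the
 * per-order free block counts (one row of /proc/buddyinfo). */
static unsigned int extfrag_score(const unsigned long nr_free[MAX_ORDER],
				  unsigned int order)
{
	unsigned long total = 0, suitable = 0;
	unsigned int o;

	for (o = 0; o < MAX_ORDER; o++) {
		total += nr_free[o] << o;		/* all free pages */
		if (o >= order)
			suitable += nr_free[o] << o;	/* usable at this order */
	}
	if (!total)
		return 0;
	return (unsigned int)((total - suitable) * 100 / total);
}

int main(void)
{
	/* Made-up counts for a zone with very few high-order blocks left */
	unsigned long nr_free[MAX_ORDER] = {
		4096, 2048, 1024, 512, 128, 32, 8, 2, 1, 0, 0
	};

	printf("score at order 0: %u\n", extfrag_score(nr_free, 0));
	printf("score at order 4: %u\n", extfrag_score(nr_free, 4));
	printf("score at order 9: %u\n", extfrag_score(nr_free, 9));
	return 0;
}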
Chu,Kaiping April 14, 2021, 1:56 a.m. UTC | #6
Hi Rientjes,
In our case we don't care about the allocation latency of transparent huge pages, but proactive compaction is still really useful to us. Without it, the kernel only does memory compaction when a high-order allocation is about to fail, which is too late: when the machine is under heavy load, many processes may trigger compaction at the same time, which leads to serious lock contention and makes the machine very slow.
Doing proactive compaction from time to time keeps the fragmentation index at a low level and reduces the soft lockup rate.
The order of 3 or 4 is only an empirical value; we may change it according to the machine load.

BR,
Chu Kaiping

-----Original Message-----
From: David Rientjes <rientjes@google.com>
Sent: April 13, 2021 2:26
To: Chu,Kaiping <chukaiping@baidu.com>
Cc: mcgrof@kernel.org; keescook@chromium.org; yzaikin@google.com; akpm@linux-foundation.org; linux-kernel@vger.kernel.org; linux-fsdevel@vger.kernel.org; linux-mm@kvack.org
Subject: Re: [PATCH] mm/compaction:let proactive compaction order configurable

On Mon, 12 Apr 2021, chukaiping wrote:

> Currently the proactive compaction order is fixed to 
> COMPACTION_HPAGE_ORDER(9), it's OK in most machines with lots of 
> normal 4KB memory, but it's too high for the machines with small 
> normal memory, for example the machines with most memory configured as 
> 1GB hugetlbfs huge pages. In these machines the max order of free 
> pages is often below 9, and it's always below 9 even with hard 
> compaction. This will lead to proactive compaction be triggered very 
> frequently. In these machines we only care about order of 3 or 4.
> This patch export the oder to proc and let it configurable by user, 
> and the default value is still COMPACTION_HPAGE_ORDER.
> 

I'm curious why you have proactive compaction enabled at all in this case?

The order-9 threshold is likely to optimize for hugepage availability, but in your setup it appears that's not a goal.

So what benefit does proactive compaction provide if only done for order-3 or order-4?

> Signed-off-by: chukaiping <chukaiping@baidu.com>
> ---
>  include/linux/compaction.h |    1 +
>  kernel/sysctl.c            |   10 ++++++++++
>  mm/compaction.c            |    7 ++++---
>  3 files changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h 
> index ed4070e..151ccd1 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -83,6 +83,7 @@ static inline unsigned long compact_gap(unsigned int 
> order)  #ifdef CONFIG_COMPACTION  extern int sysctl_compact_memory;  
> extern unsigned int sysctl_compaction_proactiveness;
> +extern unsigned int sysctl_compaction_order;
>  extern int sysctl_compaction_handler(struct ctl_table *table, int write,
>  			void *buffer, size_t *length, loff_t *ppos);  extern int 
> sysctl_extfrag_threshold; diff --git a/kernel/sysctl.c 
> b/kernel/sysctl.c index 62fbd09..277df31 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -114,6 +114,7 @@
>  static int __maybe_unused neg_one = -1;  static int __maybe_unused 
> two = 2;  static int __maybe_unused four = 4;
> +static int __maybe_unused ten = 10;
>  static unsigned long zero_ul;
>  static unsigned long one_ul = 1;
>  static unsigned long long_max = LONG_MAX; @@ -2871,6 +2872,15 @@ int 
> proc_do_static_key(struct ctl_table *table, int write,
>  		.extra2		= &one_hundred,
>  	},
>  	{
> +		.procname       = "compaction_order",
> +		.data           = &sysctl_compaction_order,
> +		.maxlen         = sizeof(sysctl_compaction_order),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec_minmax,
> +		.extra1         = SYSCTL_ZERO,
> +		.extra2         = &ten,
> +	},
> +	{
>  		.procname	= "extfrag_threshold",
>  		.data		= &sysctl_extfrag_threshold,
>  		.maxlen		= sizeof(int),
> diff --git a/mm/compaction.c b/mm/compaction.c index e04f447..a192996 
> 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1925,16 +1925,16 @@ static bool kswapd_is_running(pg_data_t 
> *pgdat)
>  
>  /*
>   * A zone's fragmentation score is the external fragmentation wrt to 
> the
> - * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
> + * sysctl_compaction_order. It returns a value in the range [0, 100].
>   */
>  static unsigned int fragmentation_score_zone(struct zone *zone)  {
> -	return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
> +	return extfrag_for_order(zone, sysctl_compaction_order);
>  }
>  
>  /*
>   * A weighted zone's fragmentation score is the external 
> fragmentation
> - * wrt to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It
> + * wrt to the sysctl_compaction_order scaled by the zone's size. It
>   * returns a value in the range [0, 100].
>   *
>   * The scaling factor ensures that proactive compaction focuses on 
> larger @@ -2666,6 +2666,7 @@ static void compact_nodes(void)
>   * background. It takes values in the range [0, 100].
>   */
>  unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
> +unsigned int __read_mostly sysctl_compaction_order = 
> +COMPACTION_HPAGE_ORDER;
>  
>  /*
>   * This is the entry point for compacting all nodes via
> --
> 1.7.1
> 
>

Patch

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index ed4070e..151ccd1 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -83,6 +83,7 @@  static inline unsigned long compact_gap(unsigned int order)
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
 extern unsigned int sysctl_compaction_proactiveness;
+extern unsigned int sysctl_compaction_order;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void *buffer, size_t *length, loff_t *ppos);
 extern int sysctl_extfrag_threshold;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09..277df31 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -114,6 +114,7 @@ 
 static int __maybe_unused neg_one = -1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
+static int __maybe_unused ten = 10;
 static unsigned long zero_ul;
 static unsigned long one_ul = 1;
 static unsigned long long_max = LONG_MAX;
@@ -2871,6 +2872,15 @@  int proc_do_static_key(struct ctl_table *table, int write,
 		.extra2		= &one_hundred,
 	},
 	{
+		.procname       = "compaction_order",
+		.data           = &sysctl_compaction_order,
+		.maxlen         = sizeof(sysctl_compaction_order),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = &ten,
+	},
+	{
 		.procname	= "extfrag_threshold",
 		.data		= &sysctl_extfrag_threshold,
 		.maxlen		= sizeof(int),
diff --git a/mm/compaction.c b/mm/compaction.c
index e04f447..a192996 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1925,16 +1925,16 @@  static bool kswapd_is_running(pg_data_t *pgdat)
 
 /*
  * A zone's fragmentation score is the external fragmentation wrt to the
- * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
+ * sysctl_compaction_order. It returns a value in the range [0, 100].
  */
 static unsigned int fragmentation_score_zone(struct zone *zone)
 {
-	return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+	return extfrag_for_order(zone, sysctl_compaction_order);
 }
 
 /*
  * A weighted zone's fragmentation score is the external fragmentation
- * wrt to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It
+ * wrt to the sysctl_compaction_order scaled by the zone's size. It
  * returns a value in the range [0, 100].
  *
  * The scaling factor ensures that proactive compaction focuses on larger
@@ -2666,6 +2666,7 @@  static void compact_nodes(void)
  * background. It takes values in the range [0, 100].
  */
 unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+unsigned int __read_mostly sysctl_compaction_order = COMPACTION_HPAGE_ORDER;
 
 /*
  * This is the entry point for compacting all nodes via