
[RFC] mm/madvise: implement MADV_STOCKPILE (kswapd from user space)

Message ID 155895155861.2824.318013775811596173.stgit@buzz (mailing list archive)
State New, archived
Series [RFC] mm/madvise: implement MADV_STOCKPILE (kswapd from user space)

Commit Message

Konstantin Khlebnikov May 27, 2019, 10:05 a.m. UTC
Memory cgroups have no background memory reclaimer. Reclaim after passing
the high limit blocks the task, because it runs synchronously from task-work.

This implements manual, kswapd-style memory reclaim initiated from user space.
It reclaims both physical memory and cgroup pages. It runs in the context of
the task that calls madvise(), so CPU time is accounted correctly.

Interface:

ret = madvise(ptr, size, MADV_STOCKPILE)

Returns:
  0         - ok, free memory >= size
  -EINVAL   - not supported
  -ENOMEM   - not enough memory/cgroup limit
  -EINTR    - interrupted by pending signal
  -EAGAIN   - cannot reclaim enough memory

The 'size' argument is interpreted as the amount of free memory required.
The implementation triggers direct reclaim while the amount of free memory
is lower than that size. The 'ptr' argument could later point into a vma for
specifying a NUMA allocation policy; right now it must be NULL.

Usage scenario: an independent thread or a standalone daemon estimates the
rate of allocations and calls MADV_STOCKPILE in a loop to keep free pages
prepared, as sketched below. The allocation fast path then avoids the
latency induced by direct reclaim.
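
A rough sketch of such a loop (illustration only; allocator_bytes_allocated()
is a placeholder for whatever counter the allocator exports, and the 2x
headroom / 10ms interval policy is arbitrary):

#include <sys/mman.h>
#include <errno.h>
#include <err.h>
#include <stdint.h>
#include <unistd.h>

#ifndef MADV_STOCKPILE
#define MADV_STOCKPILE	20	/* value used by this patch */
#endif

/* Hypothetical hook: bytes handed out by the allocator so far. */
extern uint64_t allocator_bytes_allocated(void);

static void *stockpile_thread(void *arg)
{
	uint64_t prev = allocator_bytes_allocated();

	for (;;) {
		uint64_t now = allocator_bytes_allocated();
		/* keep twice the last interval's allocations as free headroom */
		size_t watermark = 2 * (now - prev);

		prev = now;
		if (watermark &&
		    madvise(NULL, watermark, MADV_STOCKPILE) && errno != EAGAIN)
			warn("MADV_STOCKPILE");
		usleep(10 * 1000);
	}
	return arg;
}

One such thread per job is enough: the kernel reclaims in this thread's
context, so the CPU time is charged to the right cgroup.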

We are using this embedded into a memory allocator based on MADV_FREE.
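
A simplified sketch of that pairing (illustration only, not our production
allocator):

#include <string.h>
#include <sys/mman.h>

/* Free path: hand the page-aligned span back lazily instead of unmapping. */
static void span_free(void *ptr, size_t len)
{
	madvise(ptr, len, MADV_FREE);
}

/*
 * Reuse path: writing to the span either cancels the lazy free or refaults
 * fresh zero pages if the kernel already took them.  The stockpile thread
 * above keeps enough free memory around so that neither case has to enter
 * direct reclaim.
 */
static void span_reuse(void *ptr, size_t len)
{
	memset(ptr, 0, len);
}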


Demonstration in a memory cgroup with a 1G limit:

touch zero
truncate -s 5G zero

Without stockpile:

perf stat -e vmscan:* md5sum zero

 Performance counter stats for 'md5sum zero':

                 0      vmscan:mm_vmscan_kswapd_sleep
                 0      vmscan:mm_vmscan_kswapd_wake
                 0      vmscan:mm_vmscan_wakeup_kswapd
                 0      vmscan:mm_vmscan_direct_reclaim_begin
             10147      vmscan:mm_vmscan_memcg_reclaim_begin
                 0      vmscan:mm_vmscan_memcg_softlimit_reclaim_begin
                 0      vmscan:mm_vmscan_direct_reclaim_end
             10147      vmscan:mm_vmscan_memcg_reclaim_end
                 0      vmscan:mm_vmscan_memcg_softlimit_reclaim_end
             99910      vmscan:mm_shrink_slab_start
             99910      vmscan:mm_shrink_slab_end
             39654      vmscan:mm_vmscan_lru_isolate
                 0      vmscan:mm_vmscan_writepage
             39652      vmscan:mm_vmscan_lru_shrink_inactive
                 2      vmscan:mm_vmscan_lru_shrink_active
             19982      vmscan:mm_vmscan_inactive_list_is_low

      10.886832585 seconds time elapsed

       8.928366000 seconds user
       1.935212000 seconds sys

With stockpile:

stockpile 100 10 &   # up to 100M every 10ms
perf stat -e vmscan:* md5sum zero

 Performance counter stats for 'md5sum zero':

                 0      vmscan:mm_vmscan_kswapd_sleep
                 0      vmscan:mm_vmscan_kswapd_wake
                 0      vmscan:mm_vmscan_wakeup_kswapd
                 0      vmscan:mm_vmscan_direct_reclaim_begin
                 0      vmscan:mm_vmscan_memcg_reclaim_begin
                 0      vmscan:mm_vmscan_memcg_softlimit_reclaim_begin
                 0      vmscan:mm_vmscan_direct_reclaim_end
                 0      vmscan:mm_vmscan_memcg_reclaim_end
                 0      vmscan:mm_vmscan_memcg_softlimit_reclaim_end
                 0      vmscan:mm_shrink_slab_start
                 0      vmscan:mm_shrink_slab_end
                 0      vmscan:mm_vmscan_lru_isolate
                 0      vmscan:mm_vmscan_writepage
                 0      vmscan:mm_vmscan_lru_shrink_inactive
                 0      vmscan:mm_vmscan_lru_shrink_active
                 0      vmscan:mm_vmscan_inactive_list_is_low

      10.469776675 seconds time elapsed

       8.976261000 seconds user
       1.491378000 seconds sys

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 include/linux/memcontrol.h             |    6 +++++
 include/uapi/asm-generic/mman-common.h |    2 ++
 mm/madvise.c                           |   39 ++++++++++++++++++++++++++++++
 mm/memcontrol.c                        |   41 ++++++++++++++++++++++++++++++++
 tools/vm/Makefile                      |    2 +-
 tools/vm/stockpile.c                   |   30 +++++++++++++++++++++++
 6 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 tools/vm/stockpile.c

Comments

Michal Hocko May 27, 2019, 2:12 p.m. UTC | #1
[Cc linux-api. Please always cc this list when proposing a new user
 visible api. Keeping the rest of the email intact for reference]

On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote:
> Memory cgroup has no background memory reclaimer. Reclaiming after passing
> high-limit blocks task because works synchronously in task-work.
> 
> This implements manual kswapd-style memory reclaim initiated by userspace.
> It reclaims both physical memory and cgroup pages. It works in context of
> task who calls syscall madvise thus cpu time is accounted correctly.
> 
> Interface:
> 
> ret = madvise(ptr, size, MADV_STOCKPILE)
> 
> Returns:
>   0         - ok, free memory >= size
>   -EINVAL   - not supported
>   -ENOMEM   - not enough memory/cgroup limit
>   -EINTR    - interrupted by pending signal
>   -EAGAIN   - cannot reclaim enough memory
> 
> Argument 'size' is interpreted size of required free memory.
> Implementation triggers direct reclaim until amount of free memory is
> lower than that size. Argument 'ptr' could points to vma for specifying
> numa allocation policy, right now should be NULL.
> 
> Usage scenario: independent thread or standalone daemon estimates rate of
> allocations and calls MADV_STOCKPILE in loop to prepare free pages.
> Thus fast path avoids allocation latency induced by direct reclaim.
> 
> We are using this embedded into memory allocator based on MADV_FREE.
> 
> 
> Demonstration in memory cgroup with limit 1G:
> 
> touch zero
> truncate -s 5G zero
> 
> Without stockpile:
> 
> perf stat -e vmscan:* md5sum zero
> 
>  Performance counter stats for 'md5sum zero':
> 
>                  0      vmscan:mm_vmscan_kswapd_sleep
>                  0      vmscan:mm_vmscan_kswapd_wake
>                  0      vmscan:mm_vmscan_wakeup_kswapd
>                  0      vmscan:mm_vmscan_direct_reclaim_begin
>              10147      vmscan:mm_vmscan_memcg_reclaim_begin
>                  0      vmscan:mm_vmscan_memcg_softlimit_reclaim_begin
>                  0      vmscan:mm_vmscan_direct_reclaim_end
>              10147      vmscan:mm_vmscan_memcg_reclaim_end
>                  0      vmscan:mm_vmscan_memcg_softlimit_reclaim_end
>              99910      vmscan:mm_shrink_slab_start
>              99910      vmscan:mm_shrink_slab_end
>              39654      vmscan:mm_vmscan_lru_isolate
>                  0      vmscan:mm_vmscan_writepage
>              39652      vmscan:mm_vmscan_lru_shrink_inactive
>                  2      vmscan:mm_vmscan_lru_shrink_active
>              19982      vmscan:mm_vmscan_inactive_list_is_low
> 
>       10.886832585 seconds time elapsed
> 
>        8.928366000 seconds user
>        1.935212000 seconds sys
> 
> With stockpile:
> 
> stockpile 100 10 &   # up to 100M every 10ms
> perf stat -e vmscan:* md5sum zero
> 
>  Performance counter stats for 'md5sum zero':
> 
>                  0      vmscan:mm_vmscan_kswapd_sleep
>                  0      vmscan:mm_vmscan_kswapd_wake
>                  0      vmscan:mm_vmscan_wakeup_kswapd
>                  0      vmscan:mm_vmscan_direct_reclaim_begin
>                  0      vmscan:mm_vmscan_memcg_reclaim_begin
>                  0      vmscan:mm_vmscan_memcg_softlimit_reclaim_begin
>                  0      vmscan:mm_vmscan_direct_reclaim_end
>                  0      vmscan:mm_vmscan_memcg_reclaim_end
>                  0      vmscan:mm_vmscan_memcg_softlimit_reclaim_end
>                  0      vmscan:mm_shrink_slab_start
>                  0      vmscan:mm_shrink_slab_end
>                  0      vmscan:mm_vmscan_lru_isolate
>                  0      vmscan:mm_vmscan_writepage
>                  0      vmscan:mm_vmscan_lru_shrink_inactive
>                  0      vmscan:mm_vmscan_lru_shrink_active
>                  0      vmscan:mm_vmscan_inactive_list_is_low
> 
>       10.469776675 seconds time elapsed
> 
>        8.976261000 seconds user
>        1.491378000 seconds sys
> 
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> ---
>  include/linux/memcontrol.h             |    6 +++++
>  include/uapi/asm-generic/mman-common.h |    2 ++
>  mm/madvise.c                           |   39 ++++++++++++++++++++++++++++++
>  mm/memcontrol.c                        |   41 ++++++++++++++++++++++++++++++++
>  tools/vm/Makefile                      |    2 +-
>  tools/vm/stockpile.c                   |   30 +++++++++++++++++++++++
>  6 files changed, 119 insertions(+), 1 deletion(-)
>  create mode 100644 tools/vm/stockpile.c
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index bc74d6a4407c..25325f18ad55 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -517,6 +517,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
>  }
>  
>  void mem_cgroup_handle_over_high(void);
> +int mem_cgroup_stockpile(unsigned long goal_pages);
>  
>  unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
>  
> @@ -968,6 +969,11 @@ static inline void mem_cgroup_handle_over_high(void)
>  {
>  }
>  
> +static inline int mem_cgroup_stockpile(unsigned long goal_page)
> +{
> +	return 0;
> +}
> +
>  static inline void mem_cgroup_enter_user_fault(void)
>  {
>  }
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index abd238d0f7a4..675145864fee 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -64,6 +64,8 @@
>  #define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
>  #define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
>  
> +#define MADV_STOCKPILE	20		/* stockpile free pages */
> +
>  /* compatibility flags */
>  #define MAP_FILE	0
>  
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 628022e674a7..f908b08ecc9f 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -686,6 +686,41 @@ static int madvise_inject_error(int behavior,
>  }
>  #endif
>  
> +static long madvise_stockpile(unsigned long start, size_t len)
> +{
> +	unsigned long goal_pages, progress;
> +	struct zonelist *zonelist;
> +	int ret;
> +
> +	if (start)
> +		return -EINVAL;
> +
> +	goal_pages = len >> PAGE_SHIFT;
> +
> +	if (goal_pages > totalram_pages() - totalreserve_pages)
> +		return -ENOMEM;
> +
> +	ret = mem_cgroup_stockpile(goal_pages);
> +	if (ret)
> +		return ret;
> +
> +	/* TODO: use vma mempolicy */
> +	zonelist = node_zonelist(numa_node_id(), GFP_HIGHUSER);
> +
> +	while (global_zone_page_state(NR_FREE_PAGES) <
> +			goal_pages + totalreserve_pages) {
> +
> +		if (signal_pending(current))
> +			return -EINTR;
> +
> +		progress = try_to_free_pages(zonelist, 0, GFP_HIGHUSER, NULL);
> +		if (!progress)
> +			return -EAGAIN;
> +	}
> +
> +	return 0;
> +}
> +
>  static long
>  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
>  		unsigned long start, unsigned long end, int behavior)
> @@ -728,6 +763,7 @@ madvise_behavior_valid(int behavior)
>  	case MADV_DODUMP:
>  	case MADV_WIPEONFORK:
>  	case MADV_KEEPONFORK:
> +	case MADV_STOCKPILE:
>  #ifdef CONFIG_MEMORY_FAILURE
>  	case MADV_SOFT_OFFLINE:
>  	case MADV_HWPOISON:
> @@ -834,6 +870,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
>  		return madvise_inject_error(behavior, start, start + len_in);
>  #endif
>  
> +	if (behavior == MADV_STOCKPILE)
> +		return madvise_stockpile(start, len);
> +
>  	write = madvise_need_mmap_write(behavior);
>  	if (write) {
>  		if (down_write_killable(&current->mm->mmap_sem))
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e50a2db5b4ff..dc23dc6bbeb3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2276,6 +2276,47 @@ void mem_cgroup_handle_over_high(void)
>  	current->memcg_nr_pages_over_high = 0;
>  }
>  
> +int mem_cgroup_stockpile(unsigned long goal_pages)
> +{
> +	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> +	unsigned long limit, nr_free, progress;
> +	struct mem_cgroup *memcg, *pos;
> +	int ret = 0;
> +
> +	pos = memcg = get_mem_cgroup_from_mm(current->mm);
> +
> +retry:
> +	if (signal_pending(current)) {
> +		ret = -EINTR;
> +		goto out;
> +	}
> +
> +	limit = min(pos->memory.max, pos->high);
> +	if (goal_pages > limit) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	nr_free = limit - page_counter_read(&pos->memory);
> +	if ((long)nr_free < (long)goal_pages) {
> +		progress = try_to_free_mem_cgroup_pages(pos,
> +				goal_pages - nr_free, GFP_HIGHUSER, true);
> +		if (progress || nr_retries--)
> +			goto retry;
> +		ret = -EAGAIN;
> +		goto out;
> +	}
> +
> +	nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> +	pos = parent_mem_cgroup(pos);
> +	if (pos)
> +		goto retry;
> +
> +out:
> +	css_put(&memcg->css);
> +	return ret;
> +}
> +
>  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  		      unsigned int nr_pages)
>  {
> diff --git a/tools/vm/Makefile b/tools/vm/Makefile
> index 20f6cf04377f..e5b5bc0d9421 100644
> --- a/tools/vm/Makefile
> +++ b/tools/vm/Makefile
> @@ -1,7 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0
>  # Makefile for vm tools
>  #
> -TARGETS=page-types slabinfo page_owner_sort
> +TARGETS=page-types slabinfo page_owner_sort stockpile
>  
>  LIB_DIR = ../lib/api
>  LIBS = $(LIB_DIR)/libapi.a
> diff --git a/tools/vm/stockpile.c b/tools/vm/stockpile.c
> new file mode 100644
> index 000000000000..245e24f293ec
> --- /dev/null
> +++ b/tools/vm/stockpile.c
> @@ -0,0 +1,30 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <sys/mman.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <err.h>
> +#include <errno.h>
> +
> +#ifndef MADV_STOCKPILE
> +# define MADV_STOCKPILE	20
> +#endif
> +
> +int main(int argc, char **argv)
> +{
> +	int interval;
> +	size_t size;
> +	int ret;
> +
> +	if (argc != 3)
> +		errx(1, "usage: %s <size_mb> <interval_ms>", argv[0]);
> +
> +	size = atol(argv[1]) << 20;
> +	interval = atoi(argv[2]) * 1000;
> +
> +	while (1) {
> +		ret = madvise(NULL, size, MADV_STOCKPILE);
> +		if (ret && errno != EAGAIN)
> +			err(2, "madvise(NULL, %zu, MADV_STOCKPILE)", size);
> +		usleep(interval);
> +	}
> +}
>
Michal Hocko May 27, 2019, 2:21 p.m. UTC | #2
On Mon 27-05-19 16:12:23, Michal Hocko wrote:
> [Cc linux-api. Please always cc this list when proposing a new user
>  visible api. Keeping the rest of the email intact for reference]
> 
> On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote:
[...]
> > This implements manual kswapd-style memory reclaim initiated by userspace.
> > It reclaims both physical memory and cgroup pages. It works in context of
> > task who calls syscall madvise thus cpu time is accounted correctly.

I do not follow. Does this mean that the madvise always reclaims from
the memcg the process is member of?
Konstantin Khlebnikov May 27, 2019, 2:30 p.m. UTC | #3
On 27.05.2019 17:21, Michal Hocko wrote:
> On Mon 27-05-19 16:12:23, Michal Hocko wrote:
>> [Cc linux-api. Please always cc this list when proposing a new user
>>   visible api. Keeping the rest of the email intact for reference]
>>
>> On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote:
> [...]
>>> This implements manual kswapd-style memory reclaim initiated by userspace.
>>> It reclaims both physical memory and cgroup pages. It works in context of
>>> task who calls syscall madvise thus cpu time is accounted correctly.
> 
> I do not follow. Does this mean that the madvise always reclaims from
> the memcg the process is member of?
> 

First it reclaims in its own memcg while limit - usage < requested.
Then it repeats this in the parent memcg, and so on. Finally it pokes the
global direct reclaimer while system-wide free memory is less than requested.

So, if the machine is divided into containers without overcommit, global
reclaim will never happen - the memcg reclaim will free enough memory.
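
Roughly, in pseudocode (error handling and retry limits omitted):

madvise(NULL, size, MADV_STOCKPILE):
	for (memcg = current memcg; memcg; memcg = parent(memcg)) {
		if (size > min(max limit, high limit))
			return -ENOMEM;
		while (limit(memcg) - usage(memcg) < size)
			try_to_free_mem_cgroup_pages(memcg, ...);
	}
	while (global free pages < size + totalreserve_pages)
		try_to_free_pages(...);
	return 0;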
Michal Hocko May 27, 2019, 2:39 p.m. UTC | #4
On Mon 27-05-19 16:21:56, Michal Hocko wrote:
> On Mon 27-05-19 16:12:23, Michal Hocko wrote:
> > [Cc linux-api. Please always cc this list when proposing a new user
> >  visible api. Keeping the rest of the email intact for reference]
> > 
> > On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote:
> [...]
> > > This implements manual kswapd-style memory reclaim initiated by userspace.
> > > It reclaims both physical memory and cgroup pages. It works in context of
> > > task who calls syscall madvise thus cpu time is accounted correctly.
> 
> I do not follow. Does this mean that the madvise always reclaims from
> the memcg the process is member of?

OK, I've had a quick look at the implementation (the semantics should be
clear from the patch description btw.) and it goes all the way up the
hierarchy and finally tries to impose the same limit on the global state.
This doesn't really make much sense to me. For a few reasons.

First of all it breaks isolation where one subgroup can influence a
different hierarchy via parent reclaim.

I also have a problem with conflating the global and memcg states. Does
it really make any sense to have the same target for the global state
as for a per-memcg one? How are you supposed to use this interface to
shrink a particular memcg, or to handle the global situation with a
proportional distribution of pressure to all memcgs?

There also doesn't seem to be anything about a security model for this
operation. There is no capability check from a quick look. Is it really
safe to expose such functionality to a common user?

Last but not least, I am not really convinced that madvise is a proper
interface. It stretches the API which is address range based and it has
per-process implications.
Konstantin Khlebnikov May 28, 2019, 6:25 a.m. UTC | #5
On 27.05.2019 17:39, Michal Hocko wrote:
> On Mon 27-05-19 16:21:56, Michal Hocko wrote:
>> On Mon 27-05-19 16:12:23, Michal Hocko wrote:
>>> [Cc linux-api. Please always cc this list when proposing a new user
>>>   visible api. Keeping the rest of the email intact for reference]
>>>
>>> On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote:
>> [...]
>>>> This implements manual kswapd-style memory reclaim initiated by userspace.
>>>> It reclaims both physical memory and cgroup pages. It works in context of
>>>> task who calls syscall madvise thus cpu time is accounted correctly.
>>
>> I do not follow. Does this mean that the madvise always reclaims from
>> the memcg the process is member of?
> 
> OK, I've had a quick look at the implementation (the semantic should be
> clear from the patch descrition btw.) and it goes all the way up the
> hierarchy and finally try to impose the same limit to the global state.
> This doesn't really make much sense to me. For few reasons.
> 
> First of all it breaks isolation where one subgroup can influence a
> different hierarchy via parent reclaim.

madvise(NULL, size, MADV_STOCKPILE) is the same as allocating memory and
freeing it immediately, but without pinning the memory or provoking oom.

So there shouldn't be any isolation or security issues.

At least it should probably be limited to a portion of the limit (like half)
instead of the whole limit as it does now.

> 
> I also have a problem with conflating the global and memcg states. Does
> it really make any sense to have the same target to the global state
> as per-memcg? How are you supposed to use this interface to shrink a
> particular memcg or for the global situation with a proportional
> distribution to all memcgs?

For now this is out of my use case. This could be done in userspace
with multiple daemons in different contexts and a connection between them.
In this case each daemon should apply pressure only at its own level.

Also, the kernel could remember the static pressure applied from each cgroup,
which fades away as memory is allocated, and each call would add this
pressure to its own request to cooperate with neighbours. But right now I
don't know how to implement this without over-engineering. A pure userspace
solution looks much better.

> 
> There also doens't seem to be anything about security model for this
> operation. There is no capability check from a quick look. Is it really
> safe to expose such a functionality for a common user?

Yep, it seems safe. This is the same as memory allocation and freeing.

> 
> Last but not least, I am not really convinced that madvise is a proper
> interface. It stretches the API which is address range based and it has
> per-process implications.
> 

Well, this is a bit silly, but the semantics could be explained as preparation
for memory allocation via faulting into a region. And since it doesn't need
to know the exact range, the starting address could be arbitrary.

Also we employ MADV_POPULATE, which implements batched faults into a range
for robust memory allocation and serves as an undo for MADV_FREE. Will publish later.
Michal Hocko May 28, 2019, 6:51 a.m. UTC | #6
On Tue 28-05-19 09:25:13, Konstantin Khlebnikov wrote:
> On 27.05.2019 17:39, Michal Hocko wrote:
> > On Mon 27-05-19 16:21:56, Michal Hocko wrote:
> > > On Mon 27-05-19 16:12:23, Michal Hocko wrote:
> > > > [Cc linux-api. Please always cc this list when proposing a new user
> > > >   visible api. Keeping the rest of the email intact for reference]
> > > > 
> > > > On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote:
> > > [...]
> > > > > This implements manual kswapd-style memory reclaim initiated by userspace.
> > > > > It reclaims both physical memory and cgroup pages. It works in context of
> > > > > task who calls syscall madvise thus cpu time is accounted correctly.
> > > 
> > > I do not follow. Does this mean that the madvise always reclaims from
> > > the memcg the process is member of?
> > 
> > OK, I've had a quick look at the implementation (the semantic should be
> > clear from the patch descrition btw.) and it goes all the way up the
> > hierarchy and finally try to impose the same limit to the global state.
> > This doesn't really make much sense to me. For few reasons.
> > 
> > First of all it breaks isolation where one subgroup can influence a
> > different hierarchy via parent reclaim.
> 
> madvise(NULL, size, MADV_STOCKPILE) is the same as memory allocation and
> freeing immediately, but without pinning memory and provoking oom.
>
> So, there is shouldn't be any isolation or security issues.
> 
> At least probably it should be limited with portion of limit (like half)
> instead of whole limit as it does now.

I do not think so. If a process is running inside a memcg then it is
subject to a limit and that implies isolation. What you are
proposing here is to allow escaping that restriction, unless I am missing
something. Just consider the following setup:

		root (total memory = 2G)
		 / \
           (1G) A   B (1G)
                   / \
           (500M) C   D (500M)

all of them used up close to the limit, and a process inside D requests
shrinking to 250M. Unless I am misunderstanding, this implementation
will shrink D, B and root to 250M (which means reclaiming C and A as well)
and then globally if that was not sufficient. So you have effectively
allowed D to "allocate" 1.75G of memory, right?
 
> > 
> > I also have a problem with conflating the global and memcg states. Does
> > it really make any sense to have the same target to the global state
> > as per-memcg? How are you supposed to use this interface to shrink a
> > particular memcg or for the global situation with a proportional
> > distribution to all memcgs?
> 
> For now this is out of my use cease. This could be done in userspace
> with multiple daemons in different contexts and connection between them.
> In this case each daemon should apply pressure only its own level.

Do you expect all daemons to agree on their shrinking target? Could you
elaborate? I simply do not see how this can work with memcgs lower in
the hierarchy having a smaller limit than their parents.
Konstantin Khlebnikov May 28, 2019, 7:30 a.m. UTC | #7
On 28.05.2019 9:51, Michal Hocko wrote:
> On Tue 28-05-19 09:25:13, Konstantin Khlebnikov wrote:
>> On 27.05.2019 17:39, Michal Hocko wrote:
>>> On Mon 27-05-19 16:21:56, Michal Hocko wrote:
>>>> On Mon 27-05-19 16:12:23, Michal Hocko wrote:
>>>>> [Cc linux-api. Please always cc this list when proposing a new user
>>>>>    visible api. Keeping the rest of the email intact for reference]
>>>>>
>>>>> On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote:
>>>> [...]
>>>>>> This implements manual kswapd-style memory reclaim initiated by userspace.
>>>>>> It reclaims both physical memory and cgroup pages. It works in context of
>>>>>> task who calls syscall madvise thus cpu time is accounted correctly.
>>>>
>>>> I do not follow. Does this mean that the madvise always reclaims from
>>>> the memcg the process is member of?
>>>
>>> OK, I've had a quick look at the implementation (the semantic should be
>>> clear from the patch descrition btw.) and it goes all the way up the
>>> hierarchy and finally try to impose the same limit to the global state.
>>> This doesn't really make much sense to me. For few reasons.
>>>
>>> First of all it breaks isolation where one subgroup can influence a
>>> different hierarchy via parent reclaim.
>>
>> madvise(NULL, size, MADV_STOCKPILE) is the same as memory allocation and
>> freeing immediately, but without pinning memory and provoking oom.
>>
>> So, there is shouldn't be any isolation or security issues.
>>
>> At least probably it should be limited with portion of limit (like half)
>> instead of whole limit as it does now.
> 
> I do not think so. If a process is running inside a memcg then it is
> a subject of a limit and that implies an isolation. What you are
> proposing here is to allow escaping that restriction unless I am missing
> something. Just consider the following setup
> 
> 		root (total memory = 2G)
> 		 / \
>             (1G) A   B (1G)
>                     / \
>             (500M) C   D (500M)
> 
> all of them used up close to the limit and a process inside D requests
> shrinking to 250M. Unless I am misunderstanding this implementation
> will shrink D, B root to 250M (which means reclaiming C and A as well)
> and then globally if that was not sufficient. So you have allowed D to
> "allocate" 1,75G of memory effectively, right?

It does not shrink by 'size' - it reclaims only while usage + size > limit.
So after reclaiming 250M in D (which also uncharges those pages from B and
the root), all other levels will already have 250M free.

Of course there might be a race because the reclaimer works on one level
at a time. Probably it should start from the innermost level at each iteration.

>   
>>>
>>> I also have a problem with conflating the global and memcg states. Does
>>> it really make any sense to have the same target to the global state
>>> as per-memcg? How are you supposed to use this interface to shrink a
>>> particular memcg or for the global situation with a proportional
>>> distribution to all memcgs?
>>
>> For now this is out of my use cease. This could be done in userspace
>> with multiple daemons in different contexts and connection between them.
>> In this case each daemon should apply pressure only its own level.
> 
> Do you expect all daemons to agree on their shrinking target? Could you
> elaborate? I simply do not see how this can work with memcgs lower in
> the hierarchy having a smaller limit than their parents.
> 

Daemons could distribute pressure among leaves and propagate it into parents.
Together with low-limit this gives enough control over pressure distribution.
Michal Hocko May 28, 2019, 7:38 a.m. UTC | #8
On Tue 28-05-19 10:30:12, Konstantin Khlebnikov wrote:
> On 28.05.2019 9:51, Michal Hocko wrote:
> > On Tue 28-05-19 09:25:13, Konstantin Khlebnikov wrote:
> > > On 27.05.2019 17:39, Michal Hocko wrote:
> > > > On Mon 27-05-19 16:21:56, Michal Hocko wrote:
> > > > > On Mon 27-05-19 16:12:23, Michal Hocko wrote:
> > > > > > [Cc linux-api. Please always cc this list when proposing a new user
> > > > > >    visible api. Keeping the rest of the email intact for reference]
> > > > > > 
> > > > > > On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote:
> > > > > [...]
> > > > > > > This implements manual kswapd-style memory reclaim initiated by userspace.
> > > > > > > It reclaims both physical memory and cgroup pages. It works in context of
> > > > > > > task who calls syscall madvise thus cpu time is accounted correctly.
> > > > > 
> > > > > I do not follow. Does this mean that the madvise always reclaims from
> > > > > the memcg the process is member of?
> > > > 
> > > > OK, I've had a quick look at the implementation (the semantic should be
> > > > clear from the patch descrition btw.) and it goes all the way up the
> > > > hierarchy and finally try to impose the same limit to the global state.
> > > > This doesn't really make much sense to me. For few reasons.
> > > > 
> > > > First of all it breaks isolation where one subgroup can influence a
> > > > different hierarchy via parent reclaim.
> > > 
> > > madvise(NULL, size, MADV_STOCKPILE) is the same as memory allocation and
> > > freeing immediately, but without pinning memory and provoking oom.
> > > 
> > > So, there is shouldn't be any isolation or security issues.
> > > 
> > > At least probably it should be limited with portion of limit (like half)
> > > instead of whole limit as it does now.
> > 
> > I do not think so. If a process is running inside a memcg then it is
> > a subject of a limit and that implies an isolation. What you are
> > proposing here is to allow escaping that restriction unless I am missing
> > something. Just consider the following setup
> > 
> > 		root (total memory = 2G)
> > 		 / \
> >             (1G) A   B (1G)
> >                     / \
> >             (500M) C   D (500M)
> > 
> > all of them used up close to the limit and a process inside D requests
> > shrinking to 250M. Unless I am misunderstanding this implementation
> > will shrink D, B root to 250M (which means reclaiming C and A as well)
> > and then globally if that was not sufficient. So you have allowed D to
> > "allocate" 1,75G of memory effectively, right?
> 
> It shrinks not 'size' memory - only while usage + size > limit.
> So, after reclaiming 250M in D all other levels will have 250M free.

Could you define the exact semantic? Ideally something for the manual
page please?
Konstantin Khlebnikov May 28, 2019, 8:04 a.m. UTC | #9
On 28.05.2019 10:38, Michal Hocko wrote:
> On Tue 28-05-19 10:30:12, Konstantin Khlebnikov wrote:
>> On 28.05.2019 9:51, Michal Hocko wrote:
>>> On Tue 28-05-19 09:25:13, Konstantin Khlebnikov wrote:
>>>> On 27.05.2019 17:39, Michal Hocko wrote:
>>>>> On Mon 27-05-19 16:21:56, Michal Hocko wrote:
>>>>>> On Mon 27-05-19 16:12:23, Michal Hocko wrote:
>>>>>>> [Cc linux-api. Please always cc this list when proposing a new user
>>>>>>>     visible api. Keeping the rest of the email intact for reference]
>>>>>>>
>>>>>>> On Mon 27-05-19 13:05:58, Konstantin Khlebnikov wrote:
>>>>>> [...]
>>>>>>>> This implements manual kswapd-style memory reclaim initiated by userspace.
>>>>>>>> It reclaims both physical memory and cgroup pages. It works in context of
>>>>>>>> task who calls syscall madvise thus cpu time is accounted correctly.
>>>>>>
>>>>>> I do not follow. Does this mean that the madvise always reclaims from
>>>>>> the memcg the process is member of?
>>>>>
>>>>> OK, I've had a quick look at the implementation (the semantic should be
>>>>> clear from the patch descrition btw.) and it goes all the way up the
>>>>> hierarchy and finally try to impose the same limit to the global state.
>>>>> This doesn't really make much sense to me. For few reasons.
>>>>>
>>>>> First of all it breaks isolation where one subgroup can influence a
>>>>> different hierarchy via parent reclaim.
>>>>
>>>> madvise(NULL, size, MADV_STOCKPILE) is the same as memory allocation and
>>>> freeing immediately, but without pinning memory and provoking oom.
>>>>
>>>> So, there is shouldn't be any isolation or security issues.
>>>>
>>>> At least probably it should be limited with portion of limit (like half)
>>>> instead of whole limit as it does now.
>>>
>>> I do not think so. If a process is running inside a memcg then it is
>>> a subject of a limit and that implies an isolation. What you are
>>> proposing here is to allow escaping that restriction unless I am missing
>>> something. Just consider the following setup
>>>
>>> 		root (total memory = 2G)
>>> 		 / \
>>>              (1G) A   B (1G)
>>>                      / \
>>>              (500M) C   D (500M)
>>>
>>> all of them used up close to the limit and a process inside D requests
>>> shrinking to 250M. Unless I am misunderstanding this implementation
>>> will shrink D, B root to 250M (which means reclaiming C and A as well)
>>> and then globally if that was not sufficient. So you have allowed D to
>>> "allocate" 1,75G of memory effectively, right?
>>
>> It shrinks not 'size' memory - only while usage + size > limit.
>> So, after reclaiming 250M in D all other levels will have 250M free.
> 
> Could you define the exact semantic? Ideally something for the manual
> page please?
> 

Like kswapd, which works with thresholds of free memory, this one reclaims
until 'free' (i.e. memory which could be allocated without invoking
direct reclaim of any kind) is lower than the passed 'size' argument.

Thus right after madvise(NULL, size, MADV_STOCKPILE), 'size' bytes
could be allocated in this memory cgroup without extra latency from
the reclaimer, if there are no other memory consumers.

Reclaimed memory is simply put onto the free lists of the common buddy
allocator; there are no reserves for a particular task or cgroup.

If the overall memory allocation rate is smooth, without rough spikes, then
calling MADV_STOCKPILE periodically in a loop provides enough room for
allocations and eliminates direct reclaim from all other tasks.
As a result this eliminates unpredictable delays caused by
direct reclaim in random places.
Michal Hocko May 28, 2019, 8:42 a.m. UTC | #10
On Tue 28-05-19 11:04:46, Konstantin Khlebnikov wrote:
> On 28.05.2019 10:38, Michal Hocko wrote:
[...]
> > Could you define the exact semantic? Ideally something for the manual
> > page please?
> > 
> 
> Like kswapd which works with thresholds of free memory this one reclaims
> until 'free' (i.e. memory which could be allocated without invoking
> direct recliam of any kind) is lower than passed 'size' argument.

s@lower@higher@ I guess

> Thus right after madvise(NULL, size, MADV_STOCKPILE) 'size' bytes
> could be allocated in this memory cgroup without extra latency from
> reclaimer if there is no other memory consumers.
> 
> Reclaimed memory is simply put into free lists in common buddy allocator,
> there is no reserves for particular task or cgroup.
> 
> If overall memory allocation rate is smooth without rough spikes then
> calling MADV_STOCKPILE in loop periodically provides enough room for
> allocations and eliminates direct reclaim from all other tasks.
> As a result this eliminates unpredictable delays caused by
> direct reclaim in random places.

OK, this makes it clearer to me. Thanks for the clarification!
I had clearly misunderstood and misinterpreted the target as the reclaim
target rather than the free memory target. Sorry about the confusion.
I still think that this looks like an abuse of madvise, but if there
is a wider consensus that this is acceptable I will not stand in the way.
Konstantin Khlebnikov May 28, 2019, 8:58 a.m. UTC | #11
On 28.05.2019 11:42, Michal Hocko wrote:
> On Tue 28-05-19 11:04:46, Konstantin Khlebnikov wrote:
>> On 28.05.2019 10:38, Michal Hocko wrote:
> [...]
>>> Could you define the exact semantic? Ideally something for the manual
>>> page please?
>>>
>>
>> Like kswapd which works with thresholds of free memory this one reclaims
>> until 'free' (i.e. memory which could be allocated without invoking
>> direct recliam of any kind) is lower than passed 'size' argument.
> 
> s@lower@higher@ I guess

Yep. My wording is still bad.
The 'size' argument should be called 'watermark' or 'threshold'.

I.e. reclaim while 'free' memory is lower than the passed 'threshold'.
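
Something like this for the manual page, perhaps:

MADV_STOCKPILE
	addr must be NULL; length is a free-memory watermark, not the
	length of a range.  Reclaim memory in the calling process's
	memory cgroup, in its ancestors, and finally system-wide until
	at least length bytes can be allocated without entering direct
	reclaim.  Nothing is reserved: the freed pages go back to the
	buddy allocator and may be consumed by any task.  Returns 0 on
	success, -EINTR on a pending signal, -EAGAIN if that much
	memory cannot be reclaimed, and -ENOMEM if length exceeds the
	cgroup limit or the usable RAM.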

> 
>> Thus right after madvise(NULL, size, MADV_STOCKPILE) 'size' bytes
>> could be allocated in this memory cgroup without extra latency from
>> reclaimer if there is no other memory consumers.
>>
>> Reclaimed memory is simply put into free lists in common buddy allocator,
>> there is no reserves for particular task or cgroup.
>>
>> If overall memory allocation rate is smooth without rough spikes then
>> calling MADV_STOCKPILE in loop periodically provides enough room for
>> allocations and eliminates direct reclaim from all other tasks.
>> As a result this eliminates unpredictable delays caused by
>> direct reclaim in random places.
> 
> OK, this makes it more clear to me. Thanks for the clarification!
> I have clearly misunderstood and misinterpreted target as the reclaim
> target rather than free memory target.  Sorry about the confusion.
> I sill think that this looks like an abuse of the madvise but if there
> is a wider consensus this is acceptable I will not stand in the way.
>
Shakeel Butt May 28, 2019, 2:56 p.m. UTC | #12
On Tue, May 28, 2019 at 1:42 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Tue 28-05-19 11:04:46, Konstantin Khlebnikov wrote:
> > On 28.05.2019 10:38, Michal Hocko wrote:
> [...]
> > > Could you define the exact semantic? Ideally something for the manual
> > > page please?
> > >
> >
> > Like kswapd which works with thresholds of free memory this one reclaims
> > until 'free' (i.e. memory which could be allocated without invoking
> > direct recliam of any kind) is lower than passed 'size' argument.
>
> s@lower@higher@ I guess
>
> > Thus right after madvise(NULL, size, MADV_STOCKPILE) 'size' bytes
> > could be allocated in this memory cgroup without extra latency from
> > reclaimer if there is no other memory consumers.
> >
> > Reclaimed memory is simply put into free lists in common buddy allocator,
> > there is no reserves for particular task or cgroup.
> >
> > If overall memory allocation rate is smooth without rough spikes then
> > calling MADV_STOCKPILE in loop periodically provides enough room for
> > allocations and eliminates direct reclaim from all other tasks.
> > As a result this eliminates unpredictable delays caused by
> > direct reclaim in random places.
>
> OK, this makes it more clear to me. Thanks for the clarification!
> I have clearly misunderstood and misinterpreted target as the reclaim
> target rather than free memory target.  Sorry about the confusion.
> I sill think that this looks like an abuse of the madvise but if there
> is a wider consensus this is acceptable I will not stand in the way.
>
>

I agree with Michal that madvise does not seem like the right API for
this use case, i.e. 'proactive reclaim'.

This is conflating memcg and global proactive reclaim. There are
use cases which would prefer centralized control of system-wide
proactive reclaim, because system-level memory overcommit is
controlled by the admin. Decoupling global and per-memcg proactive
reclaim would allow a mechanism that implements both use cases (yours
and this one).

The madvise() approach requires that the proactive reclaim process
be in the target memcg. I think a memcg interface instead of madvise
is better, as it will allow the job owner to control the CPU resources
of the proactive reclaim. With madvise, the proactive reclaim has to
share CPU with the target sub-task of the job (or do some tricks with
the hierarchy).

The current implementation is polling-based. I think a reactive
approach based on some watermarks would be better. Polling may be fine
for servers, but for power-restricted devices a reactive approach is
preferable.

The current implementation bypasses PSI for global reclaim.
However, I am not sure how PSI should interact with proactive reclaim
in general.

thanks,
Shakeel

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bc74d6a4407c..25325f18ad55 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -517,6 +517,7 @@  unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
 }
 
 void mem_cgroup_handle_over_high(void);
+int mem_cgroup_stockpile(unsigned long goal_pages);
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 
@@ -968,6 +969,11 @@  static inline void mem_cgroup_handle_over_high(void)
 {
 }
 
+static inline int mem_cgroup_stockpile(unsigned long goal_page)
+{
+	return 0;
+}
+
 static inline void mem_cgroup_enter_user_fault(void)
 {
 }
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index abd238d0f7a4..675145864fee 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -64,6 +64,8 @@ 
 #define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
 #define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
 
+#define MADV_STOCKPILE	20		/* stockpile free pages */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 628022e674a7..f908b08ecc9f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -686,6 +686,41 @@  static int madvise_inject_error(int behavior,
 }
 #endif
 
+static long madvise_stockpile(unsigned long start, size_t len)
+{
+	unsigned long goal_pages, progress;
+	struct zonelist *zonelist;
+	int ret;
+
+	if (start)
+		return -EINVAL;
+
+	goal_pages = len >> PAGE_SHIFT;
+
+	if (goal_pages > totalram_pages() - totalreserve_pages)
+		return -ENOMEM;
+
+	ret = mem_cgroup_stockpile(goal_pages);
+	if (ret)
+		return ret;
+
+	/* TODO: use vma mempolicy */
+	zonelist = node_zonelist(numa_node_id(), GFP_HIGHUSER);
+
+	while (global_zone_page_state(NR_FREE_PAGES) <
+			goal_pages + totalreserve_pages) {
+
+		if (signal_pending(current))
+			return -EINTR;
+
+		progress = try_to_free_pages(zonelist, 0, GFP_HIGHUSER, NULL);
+		if (!progress)
+			return -EAGAIN;
+	}
+
+	return 0;
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -728,6 +763,7 @@  madvise_behavior_valid(int behavior)
 	case MADV_DODUMP:
 	case MADV_WIPEONFORK:
 	case MADV_KEEPONFORK:
+	case MADV_STOCKPILE:
 #ifdef CONFIG_MEMORY_FAILURE
 	case MADV_SOFT_OFFLINE:
 	case MADV_HWPOISON:
@@ -834,6 +870,9 @@  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 		return madvise_inject_error(behavior, start, start + len_in);
 #endif
 
+	if (behavior == MADV_STOCKPILE)
+		return madvise_stockpile(start, len);
+
 	write = madvise_need_mmap_write(behavior);
 	if (write) {
 		if (down_write_killable(&current->mm->mmap_sem))
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e50a2db5b4ff..dc23dc6bbeb3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2276,6 +2276,47 @@  void mem_cgroup_handle_over_high(void)
 	current->memcg_nr_pages_over_high = 0;
 }
 
+int mem_cgroup_stockpile(unsigned long goal_pages)
+{
+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	unsigned long limit, nr_free, progress;
+	struct mem_cgroup *memcg, *pos;
+	int ret = 0;
+
+	pos = memcg = get_mem_cgroup_from_mm(current->mm);
+
+retry:
+	if (signal_pending(current)) {
+		ret = -EINTR;
+		goto out;
+	}
+
+	limit = min(pos->memory.max, pos->high);
+	if (goal_pages > limit) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	nr_free = limit - page_counter_read(&pos->memory);
+	if ((long)nr_free < (long)goal_pages) {
+		progress = try_to_free_mem_cgroup_pages(pos,
+				goal_pages - nr_free, GFP_HIGHUSER, true);
+		if (progress || nr_retries--)
+			goto retry;
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	pos = parent_mem_cgroup(pos);
+	if (pos)
+		goto retry;
+
+out:
+	css_put(&memcg->css);
+	return ret;
+}
+
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		      unsigned int nr_pages)
 {
diff --git a/tools/vm/Makefile b/tools/vm/Makefile
index 20f6cf04377f..e5b5bc0d9421 100644
--- a/tools/vm/Makefile
+++ b/tools/vm/Makefile
@@ -1,7 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for vm tools
 #
-TARGETS=page-types slabinfo page_owner_sort
+TARGETS=page-types slabinfo page_owner_sort stockpile
 
 LIB_DIR = ../lib/api
 LIBS = $(LIB_DIR)/libapi.a
diff --git a/tools/vm/stockpile.c b/tools/vm/stockpile.c
new file mode 100644
index 000000000000..245e24f293ec
--- /dev/null
+++ b/tools/vm/stockpile.c
@@ -0,0 +1,30 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <err.h>
+#include <errno.h>
+
+#ifndef MADV_STOCKPILE
+# define MADV_STOCKPILE	20
+#endif
+
+int main(int argc, char **argv)
+{
+	int interval;
+	size_t size;
+	int ret;
+
+	if (argc != 3)
+		errx(1, "usage: %s <size_mb> <interval_ms>", argv[0]);
+
+	size = atol(argv[1]) << 20;
+	interval = atoi(argv[2]) * 1000;
+
+	while (1) {
+		ret = madvise(NULL, size, MADV_STOCKPILE);
+		if (ret && errno != EAGAIN)
+			err(2, "madvise(NULL, %zu, MADV_STOCKPILE)", size);
+		usleep(interval);
+	}
+}