
memory cgroup pagecache and inode problem

Message ID 20190120231551.213847-1-shakeelb@google.com (mailing list archive)
State New, archived

Commit Message

Shakeel Butt Jan. 20, 2019, 11:15 p.m. UTC
On Wed, Jan 16, 2019 at 9:07 PM Yang Shi <shy828301@gmail.com> wrote:
...
> > > You mean it solves the problem by retrying more times?  Actually, I'm
> > > not sure if you have swap set up in your test, but force_empty does
> > > swap if swap is on. This may mean it can't reclaim all the page cache
> > > in 5 retries.  I have a patch within that series to skip swap.
> >
> > Basically yes, retrying solves the problem. But compared to immediate retries, a retry scheduled a few seconds later is much more effective.
>
> This suggests that doing force_empty in a worker may in fact be more
> effective. Not sure whether this is enough to convince Johannes.
>

From what I understand, what we actually want is to force_empty an
offlined memcg. How about we change the semantics of force_empty on
root_mem_cgroup? Currently force_empty on root_mem_cgroup returns
-EINVAL. Instead, let's do force_empty on all offlined memcgs when
the user does force_empty on root_mem_cgroup. Something like the
following.

---
 mm/memcontrol.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

Comments

Shakeel Butt Jan. 20, 2019, 11:20 p.m. UTC | #1
On Sun, Jan 20, 2019 at 3:16 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Wed, Jan 16, 2019 at 9:07 PM Yang Shi <shy828301@gmail.com> wrote:
> ...
> > > > You mean it solves the problem by retrying more times?  Actually, I'm
> > > > not sure if you have swap set up in your test, but force_empty does
> > > > swap if swap is on. This may mean it can't reclaim all the page cache
> > > > in 5 retries.  I have a patch within that series to skip swap.
> > >
> > > Basically yes, retrying solves the problem. But compared to immediate retries, a retry scheduled a few seconds later is much more effective.
> >
> > This suggests that doing force_empty in a worker may in fact be more
> > effective. Not sure whether this is enough to convince Johannes.
> >
>
> From what I understand, what we actually want is to force_empty an
> offlined memcg. How about we change the semantics of force_empty on
> root_mem_cgroup? Currently force_empty on root_mem_cgroup returns
> -EINVAL. Instead, let's do force_empty on all offlined memcgs when
> the user does force_empty on root_mem_cgroup. Something like the
> following.
>

Basically, we don't need to add more complexity to the kernel (async
workers, timeouts, workqueues) to run force_empty if we expose a way
to force_empty offlined memcgs.
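
For illustration, driving the proposed interface from userspace could
look like the sketch below. This is an assumption, not part of the
patch: it presumes a cgroup v1 memory hierarchy mounted at
/sys/fs/cgroup/memory, and the "test" group name is made up.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd;

	/* A memcg that a workload filled with page cache, now unused. */
	if (mkdir("/sys/fs/cgroup/memory/test", 0755) && errno != EEXIST) {
		perror("mkdir");
		return 1;
	}

	/* ... run the workload inside "test", then move its tasks out ... */

	/* rmdir offlines the memcg; its page cache can outlive it. */
	if (rmdir("/sys/fs/cgroup/memory/test"))
		perror("rmdir");

	/*
	 * Under the proposed semantics, a write to the root group's
	 * memory.force_empty reclaims all offlined memcgs.
	 */
	fd = open("/sys/fs/cgroup/memory/memory.force_empty", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) < 0)
		perror("write");
	close(fd);
	return 0;
}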

> ---
>  mm/memcontrol.c | 22 +++++++++++++++-------
>  1 file changed, 15 insertions(+), 7 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a4ac554be7e8..51daa2935c41 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2898,14 +2898,16 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
>   *
>   * Caller is responsible for holding css reference for memcg.
>   */
> -static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> +static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool online)
>  {
>         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
>
>         /* we call try-to-free pages for make this cgroup empty */
> -       lru_add_drain_all();
>
> -       drain_all_stock(memcg);
> +       if (online) {
> +               lru_add_drain_all();
> +               drain_all_stock(memcg);
> +       }
>
>         /* try to free all pages in this cgroup */
>         while (nr_retries && page_counter_read(&memcg->memory)) {
> @@ -2915,7 +2917,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
>                         return -EINTR;
>
>                 progress = try_to_free_mem_cgroup_pages(memcg, 1,
> -                                                       GFP_KERNEL, true);
> +                                                       GFP_KERNEL, online);
>                 if (!progress) {
>                         nr_retries--;
>                         /* maybe some writeback is necessary */
> @@ -2932,10 +2934,16 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
>                                             loff_t off)
>  {
>         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +       struct mem_cgroup *mi;
>
> -       if (mem_cgroup_is_root(memcg))
> -               return -EINVAL;
> -       return mem_cgroup_force_empty(memcg) ?: nbytes;
> +       if (mem_cgroup_is_root(memcg)) {
> +               for_each_mem_cgroup_tree(mi, memcg) {
> +                       if (!mem_cgroup_online(mi))
> +                               mem_cgroup_force_empty(mi, false);
> +               }
> +               return 0;
> +       }
> +       return mem_cgroup_force_empty(memcg, true) ?: nbytes;
>  }
>
>  static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
> --
> 2.20.1.321.g9e740568ce-goog
>
Michal Hocko Jan. 21, 2019, 10:27 a.m. UTC | #2
On Sun 20-01-19 15:15:51, Shakeel Butt wrote:
> On Wed, Jan 16, 2019 at 9:07 PM Yang Shi <shy828301@gmail.com> wrote:
> ...
> > > > You mean it solves the problem by retrying more times?  Actually, I'm
> > > > not sure if you have swap set up in your test, but force_empty does
> > > > swap if swap is on. This may mean it can't reclaim all the page cache
> > > > in 5 retries.  I have a patch within that series to skip swap.
> > >
> > > Basically yes, retrying solves the problem. But compared to immediate retries, a retry scheduled a few seconds later is much more effective.
> >
> > This suggests that doing force_empty in a worker may in fact be more
> > effective. Not sure whether this is enough to convince Johannes.
> >
> 
> From what I understand, what we actually want is to force_empty an
> offlined memcg. How about we change the semantics of force_empty on
> root_mem_cgroup? Currently force_empty on root_mem_cgroup returns
> -EINVAL. Instead, let's do force_empty on all offlined memcgs when
> the user does force_empty on root_mem_cgroup. Something like the
> following.

No, I do not think we want to make the root memcg somehow special here.
I do recognize two things here:
1) people seem to want to have control over when a specific cgroup
gets reclaimed (basically force_empty)
2) people would like the above to happen when a memcg is offlined

The first part is not present in v2, and we should discuss whether we
want to expose it; it hasn't been added so far due to a lack of use
cases. The latter is already discussed in [1], so let's continue there.

[1] http://lkml.kernel.org/r/1547061285-100329-1-git-send-email-yang.shi@linux.alibaba.com
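
As a rough sketch of that second direction (my assumption of the shape,
not the actual patch in [1]): reclaim could be driven from the memcg
offline callback, gated by a hypothetical per-memcg "wipe_on_offline"
knob.

/* Sketch only; "wipe_on_offline" is a hypothetical field/knob. */
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/* ... existing offlining work ... */

	/* Try to drop everything the group still has charged. */
	if (memcg->wipe_on_offline)
		try_to_free_mem_cgroup_pages(memcg,
					     page_counter_read(&memcg->memory),
					     GFP_KERNEL, true);
}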

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a4ac554be7e8..51daa2935c41 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2898,14 +2898,16 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
  *
  * Caller is responsible for holding css reference for memcg.
  */
-static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool online)
 {
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 
 	/* we call try-to-free pages for make this cgroup empty */
-	lru_add_drain_all();
 
-	drain_all_stock(memcg);
+	if (online) {
+		lru_add_drain_all();
+		drain_all_stock(memcg);
+	}
 
 	/* try to free all pages in this cgroup */
 	while (nr_retries && page_counter_read(&memcg->memory)) {
@@ -2915,7 +2917,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 			return -EINTR;
 
 		progress = try_to_free_mem_cgroup_pages(memcg, 1,
-							GFP_KERNEL, true);
+							GFP_KERNEL, online);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -2932,10 +2934,16 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
 					    loff_t off)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct mem_cgroup *mi;
 
-	if (mem_cgroup_is_root(memcg))
-		return -EINVAL;
-	return mem_cgroup_force_empty(memcg) ?: nbytes;
+	if (mem_cgroup_is_root(memcg)) {
+		for_each_mem_cgroup_tree(mi, memcg) {
+			if (!mem_cgroup_online(mi))
+				mem_cgroup_force_empty(mi, false);
+		}
+		return 0;
+	}
+	return mem_cgroup_force_empty(memcg, true) ?: nbytes;
 }
 
 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,