
[v8,2/9] hugetlb_cgroup: add interface for charge/uncharge hugetlb reservations

Message ID 20191030013701.39647-2-almasrymina@google.com (mailing list archive)
State New
Series [v8,1/9] hugetlb_cgroup: Add hugetlb_cgroup reservation counter

Commit Message

Mina Almasry Oct. 30, 2019, 1:36 a.m. UTC
Augments hugetlb_cgroup_charge_cgroup to be able to charge either the
hugetlb usage counter or the hugetlb reservation counter.

Adds a new interface to uncharge a hugetlb_cgroup counter via
hugetlb_cgroup_uncharge_counter.

Integrates the counter with hugetlb_cgroup, via hugetlb_cgroup_init,
hugetlb_cgroup_have_usage, and hugetlb_cgroup_css_offline.

Signed-off-by: Mina Almasry <almasrymina@google.com>

---
 include/linux/hugetlb_cgroup.h |  67 +++++++++++++---------
 mm/hugetlb.c                   |  17 +++---
 mm/hugetlb_cgroup.c            | 100 +++++++++++++++++++++++++--------
 3 files changed, 130 insertions(+), 54 deletions(-)

--
2.24.0.rc1.363.gb1bccd3e3d-goog

Comments

Mike Kravetz Nov. 8, 2019, 12:57 a.m. UTC | #1
On 10/29/19 6:36 PM, Mina Almasry wrote:
> Augments hugetlb_cgroup_charge_cgroup to be able to charge either the
> hugetlb usage counter or the hugetlb reservation counter.
> 
> Adds a new interface to uncharge a hugetlb_cgroup counter via
> hugetlb_cgroup_uncharge_counter.
> 
> Integrates the counter with hugetlb_cgroup, via hugetlb_cgroup_init,
> hugetlb_cgroup_have_usage, and hugetlb_cgroup_css_offline.
> 
> Signed-off-by: Mina Almasry <almasrymina@google.com>
> 
> ---
>  include/linux/hugetlb_cgroup.h |  67 +++++++++++++---------
>  mm/hugetlb.c                   |  17 +++---
>  mm/hugetlb_cgroup.c            | 100 +++++++++++++++++++++++++--------
>  3 files changed, 130 insertions(+), 54 deletions(-)
> 
> diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
> index 063962f6dfc6a..1bb58a63af586 100644
> --- a/include/linux/hugetlb_cgroup.h
> +++ b/include/linux/hugetlb_cgroup.h
> @@ -22,27 +22,35 @@ struct hugetlb_cgroup;
>   * Minimum page order trackable by hugetlb cgroup.
>   * At least 3 pages are necessary for all the tracking information.
>   */
> -#define HUGETLB_CGROUP_MIN_ORDER	2
> +#define HUGETLB_CGROUP_MIN_ORDER 3

Correct me if I'm misremembering, but I think the reason you changed this was
so that you could use page[3].private.  Correct?
In that case isn't page[3] the last page of an order 2 allocation?
If my understanding is correct, then leave HUGETLB_CGROUP_MIN_ORDER as is
and update the preceding comment to say that at least 4 pages are necessary.
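For reference, a sketch of the suggested fix (the define stays as-is;
only the comment changes to match the page[3].private usage):

	/*
	 * Minimum page order trackable by hugetlb cgroup.
	 * At least 4 pages are necessary for all the tracking information:
	 * page[2].private holds the usage (fault) cgroup pointer and
	 * page[3].private holds the reservation cgroup pointer.  An order-2
	 * compound page already has 4 subpages (page[0]..page[3]), so the
	 * minimum order can stay at 2.
	 */
	#define HUGETLB_CGROUP_MIN_ORDER	2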

> 
>  #ifdef CONFIG_CGROUP_HUGETLB
> 
> -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
> +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
> +							      bool reserved)
>  {
>  	VM_BUG_ON_PAGE(!PageHuge(page), page);
> 
>  	if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
>  		return NULL;
> -	return (struct hugetlb_cgroup *)page[2].private;
> +	if (reserved)
> +		return (struct hugetlb_cgroup *)page[3].private;
> +	else
> +		return (struct hugetlb_cgroup *)page[2].private;
>  }
> 
> -static inline
> -int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
> +static inline int set_hugetlb_cgroup(struct page *page,
> +				     struct hugetlb_cgroup *h_cg,
> +				     bool reservation)
>  {
>  	VM_BUG_ON_PAGE(!PageHuge(page), page);
> 
>  	if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
>  		return -1;
> -	page[2].private	= (unsigned long)h_cg;
> +	if (reservation)
> +		page[3].private = (unsigned long)h_cg;
> +	else
> +		page[2].private = (unsigned long)h_cg;
>  	return 0;
>  }
> 
> @@ -52,26 +60,33 @@ static inline bool hugetlb_cgroup_disabled(void)
>  }
> 
>  extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
> -					struct hugetlb_cgroup **ptr);
> +					struct hugetlb_cgroup **ptr,
> +					bool reserved);
>  extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
>  					 struct hugetlb_cgroup *h_cg,
> -					 struct page *page);
> +					 struct page *page, bool reserved);
>  extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
> -					 struct page *page);
> +					 struct page *page, bool reserved);
> +
>  extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
> -					   struct hugetlb_cgroup *h_cg);
> +					   struct hugetlb_cgroup *h_cg,
> +					   bool reserved);
> +extern void hugetlb_cgroup_uncharge_counter(struct page_counter *p,
> +					    unsigned long nr_pages);
> +
>  extern void hugetlb_cgroup_file_init(void) __init;
>  extern void hugetlb_cgroup_migrate(struct page *oldhpage,
>  				   struct page *newhpage);
> 
>  #else
> -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
> +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
> +							      bool reserved)
>  {
>  	return NULL;
>  }
> 
> -static inline
> -int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
> +static inline int set_hugetlb_cgroup(struct page *page,
> +				     struct hugetlb_cgroup *h_cg, bool reserved)
>  {
>  	return 0;
>  }
> @@ -81,28 +96,30 @@ static inline bool hugetlb_cgroup_disabled(void)
>  	return true;
>  }
> 
> -static inline int
> -hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
> -			     struct hugetlb_cgroup **ptr)
> +static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
> +					       struct hugetlb_cgroup **ptr,
> +					       bool reserved)
>  {
>  	return 0;
>  }
> 
> -static inline void
> -hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
> -			     struct hugetlb_cgroup *h_cg,
> -			     struct page *page)
> +static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
> +						struct hugetlb_cgroup *h_cg,
> +						struct page *page,
> +						bool reserved)
>  {
>  }
> 
> -static inline void
> -hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page)
> +static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
> +						struct page *page,
> +						bool reserved)
>  {
>  }
> 
> -static inline void
> -hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
> -			       struct hugetlb_cgroup *h_cg)
> +static inline void hugetlb_cgroup_uncharge_cgroup(int idx,
> +						  unsigned long nr_pages,
> +						  struct hugetlb_cgroup *h_cg,
> +						  bool reserved)
>  {
>  }
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 37195cd60a345..325d5454bf168 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1140,7 +1140,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
>  				1 << PG_active | 1 << PG_private |
>  				1 << PG_writeback);
>  	}
> -	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
> +	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page, false), page);

Shouldn't we add a VM_BUG_ON_PAGE for reserved as well?
Oh, I see it is done in a later patch.
I guess it is not here as there are no users of reserved yet?
Same observation in other places in hugetlb.c below.
Since you add the API changes for reserved here, as well as define
page[3].private to store info, I don't think it would hurt to add
the initialization and checking and cleanup here as well.
Thoughts?
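(Concretely, the extra check being suggested, which a later patch in the
series does add, would presumably be a second assertion next to the
existing one:)

	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page, false), page);
	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page, true), page);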

>  	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
>  	set_page_refcounted(page);
>  	if (hstate_is_gigantic(h)) {
> @@ -1250,8 +1250,8 @@ void free_huge_page(struct page *page)
> 
>  	spin_lock(&hugetlb_lock);
>  	clear_page_huge_active(page);
> -	hugetlb_cgroup_uncharge_page(hstate_index(h),
> -				     pages_per_huge_page(h), page);
> +	hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h),
> +				     page, false);
>  	if (restore_reserve)
>  		h->resv_huge_pages++;
> 
> @@ -1277,7 +1277,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
>  	INIT_LIST_HEAD(&page->lru);
>  	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
>  	spin_lock(&hugetlb_lock);
> -	set_hugetlb_cgroup(page, NULL);
> +	set_hugetlb_cgroup(page, NULL, false);
>  	h->nr_huge_pages++;
>  	h->nr_huge_pages_node[nid]++;
>  	spin_unlock(&hugetlb_lock);
> @@ -2063,7 +2063,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  			gbl_chg = 1;
>  	}
> 
> -	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
> +	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg,
> +					   false);
>  	if (ret)
>  		goto out_subpool_put;
> 
> @@ -2087,7 +2088,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  		list_move(&page->lru, &h->hugepage_activelist);
>  		/* Fall through */
>  	}
> -	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
> +	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page,
> +				     false);
>  	spin_unlock(&hugetlb_lock);
> 
>  	set_page_private(page, (unsigned long)spool);
> @@ -2111,7 +2113,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  	return page;
> 
>  out_uncharge_cgroup:
> -	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
> +	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg,
> +				       false);
>  out_subpool_put:
>  	if (map_chg || avoid_reserve)
>  		hugepage_subpool_put_pages(spool, 1);
> diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
> index 1ed4448ca41d3..854117513979b 100644
> --- a/mm/hugetlb_cgroup.c
> +++ b/mm/hugetlb_cgroup.c
> @@ -73,8 +73,12 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
>  	int idx;
> 
>  	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
> -		if (page_counter_read(&h_cg->hugepage[idx]))
> +		if (page_counter_read(
> +			    hugetlb_cgroup_get_counter(h_cg, idx, true)) ||
> +		    page_counter_read(
> +			    hugetlb_cgroup_get_counter(h_cg, idx, false))) {
>  			return true;
> +		}
>  	}
>  	return false;
>  }
> @@ -85,18 +89,32 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
>  	int idx;
> 
>  	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
> -		struct page_counter *counter = &h_cgroup->hugepage[idx];
>  		struct page_counter *parent = NULL;

Should we perhaps rename 'parent' to 'fault_parent' to be consistent?
That makes me wonder whether the naming in the previous patch should
be more explicit.  Make the existing names explicitly contain 'fault' as
the new names contain 'reservation'.
Just a thought.
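As context for the hunks below: they lean heavily on
hugetlb_cgroup_get_counter(), presumably introduced in patch 1/9 of this
series. Judging from the hugepage[] and reserved_hugepage[] fields used
elsewhere in this patch, its shape is likely along these lines (a sketch,
not code from the series):

	static inline struct page_counter *
	hugetlb_cgroup_get_counter(struct hugetlb_cgroup *h_cg, int idx,
				   bool reserved)
	{
		/* Pick the reservation or the fault (usage) counter. */
		if (reserved)
			return &h_cg->reserved_hugepage[idx];
		return &h_cg->hugepage[idx];
	}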

> +		struct page_counter *reserved_parent = NULL;
>  		unsigned long limit;
>  		int ret;
> 
> -		if (parent_h_cgroup)
> -			parent = &parent_h_cgroup->hugepage[idx];
> -		page_counter_init(counter, parent);
> +		if (parent_h_cgroup) {
> +			parent = hugetlb_cgroup_get_counter(parent_h_cgroup,
> +							    idx, false);
> +			reserved_parent = hugetlb_cgroup_get_counter(
> +				parent_h_cgroup, idx, true);
> +		}
> +		page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
> +							     false),
> +				  parent);
> +		page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
> +							     true),
> +				  reserved_parent);
> 
>  		limit = round_down(PAGE_COUNTER_MAX,
>  				   1 << huge_page_order(&hstates[idx]));
> -		ret = page_counter_set_max(counter, limit);
> +
> +		ret = page_counter_set_max(
> +			hugetlb_cgroup_get_counter(h_cgroup, idx, false),
> +			limit);
> +		ret = page_counter_set_max(
> +			hugetlb_cgroup_get_counter(h_cgroup, idx, true), limit);
>  		VM_BUG_ON(ret);
>  	}
>  }
> @@ -126,6 +144,26 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
>  	kfree(h_cgroup);
>  }
> 
> +static void hugetlb_cgroup_move_parent_reservation(int idx,
> +						   struct hugetlb_cgroup *h_cg)
> +{
> +	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
> +
> +	/* Move the reservation counters. */
> +	if (!parent_hugetlb_cgroup(h_cg)) {
> +		parent = root_h_cgroup;
> +		/* root has no limit */
> +		page_counter_charge(
> +			&root_h_cgroup->reserved_hugepage[idx],
> +			page_counter_read(
> +				hugetlb_cgroup_get_counter(h_cg, idx, true)));
> +	}
> +
> +	/* Take the pages off the local counter */
> +	page_counter_cancel(
> +		hugetlb_cgroup_get_counter(h_cg, idx, true),
> +		page_counter_read(hugetlb_cgroup_get_counter(h_cg, idx, true)));
> +}

I know next to nothing about cgroups and am just comparing this to the
existing hugetlb_cgroup_move_parent() routine.  hugetlb_cgroup_move_parent
updates the cgroup pointer in each page being moved.  Do we need to do
something similar for reservations being moved (move pointer in reservation)?
Mina Almasry Nov. 8, 2019, 11:48 p.m. UTC | #2
On Thu, Nov 7, 2019 at 4:57 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 10/29/19 6:36 PM, Mina Almasry wrote:
> > Augments hugetlb_cgroup_charge_cgroup to be able to charge either the
> > hugetlb usage counter or the hugetlb reservation counter.
> >
> > Adds a new interface to uncharge a hugetlb_cgroup counter via
> > hugetlb_cgroup_uncharge_counter.
> >
> > Integrates the counter with hugetlb_cgroup, via hugetlb_cgroup_init,
> > hugetlb_cgroup_have_usage, and hugetlb_cgroup_css_offline.
> >
> > Signed-off-by: Mina Almasry <almasrymina@google.com>
> >
> > ---
> >  include/linux/hugetlb_cgroup.h |  67 +++++++++++++---------
> >  mm/hugetlb.c                   |  17 +++---
> >  mm/hugetlb_cgroup.c            | 100 +++++++++++++++++++++++++--------
> >  3 files changed, 130 insertions(+), 54 deletions(-)
> >
> > diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
> > index 063962f6dfc6a..1bb58a63af586 100644
> > --- a/include/linux/hugetlb_cgroup.h
> > +++ b/include/linux/hugetlb_cgroup.h
> > @@ -22,27 +22,35 @@ struct hugetlb_cgroup;
> >   * Minimum page order trackable by hugetlb cgroup.
> >   * At least 3 pages are necessary for all the tracking information.
> >   */
> > -#define HUGETLB_CGROUP_MIN_ORDER     2
> > +#define HUGETLB_CGROUP_MIN_ORDER 3
>
> Correct me if I'm misremembering, but I think the reason you changed this was
> so that you could use page[3].private.  Correct?
> In that case isn't page[3] the last page of an order 2 allocation?
> If my understanding is correct, then leave HUGETLB_CGROUP_MIN_ORDER as is
> and update the preceding comment to say that at least 4 pages are necessary.
>

Yes, I just misunderstood what MIN_ORDER means. I'll revert the code change.

> >
> >  #ifdef CONFIG_CGROUP_HUGETLB
> >
> > -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
> > +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
> > +                                                           bool reserved)
> >  {
> >       VM_BUG_ON_PAGE(!PageHuge(page), page);
> >
> >       if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
> >               return NULL;
> > -     return (struct hugetlb_cgroup *)page[2].private;
> > +     if (reserved)
> > +             return (struct hugetlb_cgroup *)page[3].private;
> > +     else
> > +             return (struct hugetlb_cgroup *)page[2].private;
> >  }
> >
> > -static inline
> > -int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
> > +static inline int set_hugetlb_cgroup(struct page *page,
> > +                                  struct hugetlb_cgroup *h_cg,
> > +                                  bool reservation)
> >  {
> >       VM_BUG_ON_PAGE(!PageHuge(page), page);
> >
> >       if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
> >               return -1;
> > -     page[2].private = (unsigned long)h_cg;
> > +     if (reservation)
> > +             page[3].private = (unsigned long)h_cg;
> > +     else
> > +             page[2].private = (unsigned long)h_cg;
> >       return 0;
> >  }
> >
> > @@ -52,26 +60,33 @@ static inline bool hugetlb_cgroup_disabled(void)
> >  }
> >
> >  extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
> > -                                     struct hugetlb_cgroup **ptr);
> > +                                     struct hugetlb_cgroup **ptr,
> > +                                     bool reserved);
> >  extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
> >                                        struct hugetlb_cgroup *h_cg,
> > -                                      struct page *page);
> > +                                      struct page *page, bool reserved);
> >  extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
> > -                                      struct page *page);
> > +                                      struct page *page, bool reserved);
> > +
> >  extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
> > -                                        struct hugetlb_cgroup *h_cg);
> > +                                        struct hugetlb_cgroup *h_cg,
> > +                                        bool reserved);
> > +extern void hugetlb_cgroup_uncharge_counter(struct page_counter *p,
> > +                                         unsigned long nr_pages);
> > +
> >  extern void hugetlb_cgroup_file_init(void) __init;
> >  extern void hugetlb_cgroup_migrate(struct page *oldhpage,
> >                                  struct page *newhpage);
> >
> >  #else
> > -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
> > +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
> > +                                                           bool reserved)
> >  {
> >       return NULL;
> >  }
> >
> > -static inline
> > -int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
> > +static inline int set_hugetlb_cgroup(struct page *page,
> > +                                  struct hugetlb_cgroup *h_cg, bool reserved)
> >  {
> >       return 0;
> >  }
> > @@ -81,28 +96,30 @@ static inline bool hugetlb_cgroup_disabled(void)
> >       return true;
> >  }
> >
> > -static inline int
> > -hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
> > -                          struct hugetlb_cgroup **ptr)
> > +static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
> > +                                            struct hugetlb_cgroup **ptr,
> > +                                            bool reserved)
> >  {
> >       return 0;
> >  }
> >
> > -static inline void
> > -hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
> > -                          struct hugetlb_cgroup *h_cg,
> > -                          struct page *page)
> > +static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
> > +                                             struct hugetlb_cgroup *h_cg,
> > +                                             struct page *page,
> > +                                             bool reserved)
> >  {
> >  }
> >
> > -static inline void
> > -hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page)
> > +static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
> > +                                             struct page *page,
> > +                                             bool reserved)
> >  {
> >  }
> >
> > -static inline void
> > -hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
> > -                            struct hugetlb_cgroup *h_cg)
> > +static inline void hugetlb_cgroup_uncharge_cgroup(int idx,
> > +                                               unsigned long nr_pages,
> > +                                               struct hugetlb_cgroup *h_cg,
> > +                                               bool reserved)
> >  {
> >  }
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 37195cd60a345..325d5454bf168 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1140,7 +1140,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
> >                               1 << PG_active | 1 << PG_private |
> >                               1 << PG_writeback);
> >       }
> > -     VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
> > +     VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page, false), page);
>
> Shouldn't we add a VM_BUG_ON_PAGE for reserved as well?
> Oh, I see it is done in a later patch.
> I guess it is not here as there are no users of reserved yet?

Yes precisely.

> Same observation in other places in hugetlb.c below.
> Since you add the API changes for reserved here, as well as define
> page[3].private to store info, I don't think it would hurt to add
> the initialization and checking and cleanup here as well.
> Thoughts?
>

Yes that makes sense as well. I'll take a look.

> >       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
> >       set_page_refcounted(page);
> >       if (hstate_is_gigantic(h)) {
> > @@ -1250,8 +1250,8 @@ void free_huge_page(struct page *page)
> >
> >       spin_lock(&hugetlb_lock);
> >       clear_page_huge_active(page);
> > -     hugetlb_cgroup_uncharge_page(hstate_index(h),
> > -                                  pages_per_huge_page(h), page);
> > +     hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h),
> > +                                  page, false);
> >       if (restore_reserve)
> >               h->resv_huge_pages++;
> >
> > @@ -1277,7 +1277,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> >       INIT_LIST_HEAD(&page->lru);
> >       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
> >       spin_lock(&hugetlb_lock);
> > -     set_hugetlb_cgroup(page, NULL);
> > +     set_hugetlb_cgroup(page, NULL, false);
> >       h->nr_huge_pages++;
> >       h->nr_huge_pages_node[nid]++;
> >       spin_unlock(&hugetlb_lock);
> > @@ -2063,7 +2063,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> >                       gbl_chg = 1;
> >       }
> >
> > -     ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
> > +     ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg,
> > +                                        false);
> >       if (ret)
> >               goto out_subpool_put;
> >
> > @@ -2087,7 +2088,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> >               list_move(&page->lru, &h->hugepage_activelist);
> >               /* Fall through */
> >       }
> > -     hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
> > +     hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page,
> > +                                  false);
> >       spin_unlock(&hugetlb_lock);
> >
> >       set_page_private(page, (unsigned long)spool);
> > @@ -2111,7 +2113,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> >       return page;
> >
> >  out_uncharge_cgroup:
> > -     hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
> > +     hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg,
> > +                                    false);
> >  out_subpool_put:
> >       if (map_chg || avoid_reserve)
> >               hugepage_subpool_put_pages(spool, 1);
> > diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
> > index 1ed4448ca41d3..854117513979b 100644
> > --- a/mm/hugetlb_cgroup.c
> > +++ b/mm/hugetlb_cgroup.c
> > @@ -73,8 +73,12 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
> >       int idx;
> >
> >       for (idx = 0; idx < hugetlb_max_hstate; idx++) {
> > -             if (page_counter_read(&h_cg->hugepage[idx]))
> > +             if (page_counter_read(
> > +                         hugetlb_cgroup_get_counter(h_cg, idx, true)) ||
> > +                 page_counter_read(
> > +                         hugetlb_cgroup_get_counter(h_cg, idx, false))) {
> >                       return true;
> > +             }
> >       }
> >       return false;
> >  }
> > @@ -85,18 +89,32 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
> >       int idx;
> >
> >       for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
> > -             struct page_counter *counter = &h_cgroup->hugepage[idx];
> >               struct page_counter *parent = NULL;
>
> Should we perhaps rename 'parent' to 'fault_parent' to be consistent?

Yes that makes sense; will do.

> That makes me wonder whether the naming in the previous patch should
> be more explicit.  Make the existing names explicitly contain 'fault' as
> the new names contain 'reservation'.
> Just a thought.
>

You mean change the names of the actual user-facing files? I'm all for
better names but that would break existing users that read/write the
hugetlb_cgroup.2MB.usage_in_bytes/limit_in_bytes files, and so I would
assume that's a no-go.

> > +             struct page_counter *reserved_parent = NULL;
> >               unsigned long limit;
> >               int ret;
> >
> > -             if (parent_h_cgroup)
> > -                     parent = &parent_h_cgroup->hugepage[idx];
> > -             page_counter_init(counter, parent);
> > +             if (parent_h_cgroup) {
> > +                     parent = hugetlb_cgroup_get_counter(parent_h_cgroup,
> > +                                                         idx, false);
> > +                     reserved_parent = hugetlb_cgroup_get_counter(
> > +                             parent_h_cgroup, idx, true);
> > +             }
> > +             page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
> > +                                                          false),
> > +                               parent);
> > +             page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
> > +                                                          true),
> > +                               reserved_parent);
> >
> >               limit = round_down(PAGE_COUNTER_MAX,
> >                                  1 << huge_page_order(&hstates[idx]));
> > -             ret = page_counter_set_max(counter, limit);
> > +
> > +             ret = page_counter_set_max(
> > +                     hugetlb_cgroup_get_counter(h_cgroup, idx, false),
> > +                     limit);
> > +             ret = page_counter_set_max(
> > +                     hugetlb_cgroup_get_counter(h_cgroup, idx, true), limit);
> >               VM_BUG_ON(ret);
> >       }
> >  }
> > @@ -126,6 +144,26 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
> >       kfree(h_cgroup);
> >  }
> >
> > +static void hugetlb_cgroup_move_parent_reservation(int idx,
> > +                                                struct hugetlb_cgroup *h_cg)
> > +{
> > +     struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
> > +
> > +     /* Move the reservation counters. */
> > +     if (!parent_hugetlb_cgroup(h_cg)) {
> > +             parent = root_h_cgroup;
> > +             /* root has no limit */
> > +             page_counter_charge(
> > +                     &root_h_cgroup->reserved_hugepage[idx],
> > +                     page_counter_read(
> > +                             hugetlb_cgroup_get_counter(h_cg, idx, true)));
> > +     }
> > +
> > +     /* Take the pages off the local counter */
> > +     page_counter_cancel(
> > +             hugetlb_cgroup_get_counter(h_cg, idx, true),
> > +             page_counter_read(hugetlb_cgroup_get_counter(h_cg, idx, true)));
> > +}
>
> I know next to nothing about cgroups and am just comparing this to the
> existing hugetlb_cgroup_move_parent() routine.  hugetlb_cgroup_move_parent
> updates the cgroup pointer in each page being moved.  Do we need to do
> something similar for reservations being moved (move pointer in reservation)?
>

Oh, good catch. Yes I need to be doing that. I should probably
consolidate those routines so the code doesn't miss things like this.

Thanks again for reviewing!

> --
> Mike Kravetz
>
> >
> >  /*
> >   * Should be called with hugetlb_lock held.
> > @@ -142,7 +180,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
> >       struct hugetlb_cgroup *page_hcg;
> >       struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
> >
> > -     page_hcg = hugetlb_cgroup_from_page(page);
> > +     page_hcg = hugetlb_cgroup_from_page(page, false);
> >       /*
> >        * We can have pages in active list without any cgroup
> >        * ie, hugepage with less than 3 pages. We can safely
> > @@ -161,7 +199,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
> >       /* Take the pages off the local counter */
> >       page_counter_cancel(counter, nr_pages);
> >
> > -     set_hugetlb_cgroup(page, parent);
> > +     set_hugetlb_cgroup(page, parent, false);
> >  out:
> >       return;
> >  }
> > @@ -180,6 +218,7 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
> >       do {
> >               for_each_hstate(h) {
> >                       spin_lock(&hugetlb_lock);
> > +                     hugetlb_cgroup_move_parent_reservation(idx, h_cg);
> >                       list_for_each_entry(page, &h->hugepage_activelist, lru)
> >                               hugetlb_cgroup_move_parent(idx, h_cg, page);
> >
> > @@ -191,7 +230,7 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
> >  }
> >
> >  int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
> > -                              struct hugetlb_cgroup **ptr)
> > +                              struct hugetlb_cgroup **ptr, bool reserved)
> >  {
> >       int ret = 0;
> >       struct page_counter *counter;
> > @@ -214,8 +253,11 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
> >       }
> >       rcu_read_unlock();
> >
> > -     if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
> > +     if (!page_counter_try_charge(hugetlb_cgroup_get_counter(h_cg, idx,
> > +                                                             reserved),
> > +                                  nr_pages, &counter)) {
> >               ret = -ENOMEM;
> > +     }
> >       css_put(&h_cg->css);
> >  done:
> >       *ptr = h_cg;
> > @@ -225,12 +267,12 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
> >  /* Should be called with hugetlb_lock held */
> >  void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
> >                                 struct hugetlb_cgroup *h_cg,
> > -                               struct page *page)
> > +                               struct page *page, bool reserved)
> >  {
> >       if (hugetlb_cgroup_disabled() || !h_cg)
> >               return;
> >
> > -     set_hugetlb_cgroup(page, h_cg);
> > +     set_hugetlb_cgroup(page, h_cg, reserved);
> >       return;
> >  }
> >
> > @@ -238,23 +280,26 @@ void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
> >   * Should be called with hugetlb_lock held
> >   */
> >  void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
> > -                               struct page *page)
> > +                               struct page *page, bool reserved)
> >  {
> >       struct hugetlb_cgroup *h_cg;
> >
> >       if (hugetlb_cgroup_disabled())
> >               return;
> >       lockdep_assert_held(&hugetlb_lock);
> > -     h_cg = hugetlb_cgroup_from_page(page);
> > +     h_cg = hugetlb_cgroup_from_page(page, reserved);
> >       if (unlikely(!h_cg))
> >               return;
> > -     set_hugetlb_cgroup(page, NULL);
> > -     page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
> > +     set_hugetlb_cgroup(page, NULL, reserved);
> > +
> > +     page_counter_uncharge(hugetlb_cgroup_get_counter(h_cg, idx, reserved),
> > +                           nr_pages);
> > +
> >       return;
> >  }
> >
> >  void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
> > -                                 struct hugetlb_cgroup *h_cg)
> > +                                 struct hugetlb_cgroup *h_cg, bool reserved)
> >  {
> >       if (hugetlb_cgroup_disabled() || !h_cg)
> >               return;
> > @@ -262,8 +307,17 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
> >       if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
> >               return;
> >
> > -     page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
> > -     return;
> > +     page_counter_uncharge(hugetlb_cgroup_get_counter(h_cg, idx, reserved),
> > +                           nr_pages);
> > +}
> > +
> > +void hugetlb_cgroup_uncharge_counter(struct page_counter *p,
> > +                                  unsigned long nr_pages)
> > +{
> > +     if (hugetlb_cgroup_disabled() || !p)
> > +             return;
> > +
> > +     page_counter_uncharge(p, nr_pages);
> >  }
> >
> >  static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
> > @@ -475,6 +529,7 @@ void __init hugetlb_cgroup_file_init(void)
> >  void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
> >  {
> >       struct hugetlb_cgroup *h_cg;
> > +     struct hugetlb_cgroup *h_cg_reservation;
> >       struct hstate *h = page_hstate(oldhpage);
> >
> >       if (hugetlb_cgroup_disabled())
> > @@ -482,11 +537,12 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
> >
> >       VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
> >       spin_lock(&hugetlb_lock);
> > -     h_cg = hugetlb_cgroup_from_page(oldhpage);
> > -     set_hugetlb_cgroup(oldhpage, NULL);
> > +     h_cg = hugetlb_cgroup_from_page(oldhpage, false);
> > +     h_cg_reservation = hugetlb_cgroup_from_page(oldhpage, true);
> > +     set_hugetlb_cgroup(oldhpage, NULL, false);
> >
> >       /* move the h_cg details to new cgroup */
> > -     set_hugetlb_cgroup(newhpage, h_cg);
> > +     set_hugetlb_cgroup(newhpage, h_cg_reservation, true);
> >       list_move(&newhpage->lru, &h->hugepage_activelist);
> >       spin_unlock(&hugetlb_lock);
> >       return;
> > --
> > 2.24.0.rc1.363.gb1bccd3e3d-goog
> >
Mike Kravetz Nov. 9, 2019, 12:01 a.m. UTC | #3
On 11/8/19 3:48 PM, Mina Almasry wrote:
> On Thu, Nov 7, 2019 at 4:57 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>
>> On 10/29/19 6:36 PM, Mina Almasry wrote:
>>> @@ -22,27 +22,35 @@ struct hugetlb_cgroup;
>>>   * Minimum page order trackable by hugetlb cgroup.
>>>   * At least 3 pages are necessary for all the tracking information.
>>>   */
>>> -#define HUGETLB_CGROUP_MIN_ORDER     2
>>> +#define HUGETLB_CGROUP_MIN_ORDER 3
>>
>> Correct me if I'm misremembering, but I think the reason you changed this was
>> so that you could use page[3].private.  Correct?
>> In that case isn't page[3] the last page of an order 2 allocation?
>> If my understanding is correct, then leave HUGETLB_CGROUP_MIN_ORDER as is
>> and update the preceding comment to say that at least 4 pages are necessary.
>>
> 
> Yes, I just misunderstood what MIN_ORDER means. I'll revert the code change.

But, do update the comment please.

<snip>
>>> @@ -85,18 +89,32 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
>>>       int idx;
>>>
>>>       for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
>>> -             struct page_counter *counter = &h_cgroup->hugepage[idx];
>>>               struct page_counter *parent = NULL;
>>
>> Should we perhaps rename 'parent' to 'fault_parent' to be consistent?
> 
> Yes that makes sense; will do.
> 
>> That makes me wonder whether the naming in the previous patch should
>> be more explicit.  Make the existing names explicitly contain 'fault' as
>> the new names contain 'reservation'.
>> Just a thought.
>>
> 
> You mean change the names of the actual user-facing files? I'm all for
> better names but that would break existing users that read/write the
> hugetlb_cgroup.2MB.usage_in_bytes/limit_in_bytes files, and so I would
> assume that's a no-go.
> 

I was thinking about internal variables/definitions such as:

+enum {
+ /* Tracks hugetlb memory faulted in. */
+ HUGETLB_RES_USAGE,
+ /* Tracks hugetlb memory reserved. */
+ HUGETLB_RES_RESERVATION_USAGE,
+ /* Limit for hugetlb memory faulted in. */
+ HUGETLB_RES_LIMIT,
+ /* Limit for hugetlb memory reserved. */
+ HUGETLB_RES_RESERVATION_LIMIT,
+ /* Max usage for hugetlb memory faulted in. */
+ HUGETLB_RES_MAX_USAGE,
+ /* Max usage for hugetlb memory reserved. */
+ HUGETLB_RES_RESERVATION_MAX_USAGE,
+ /* Faulted memory accounting fail count. */
+ HUGETLB_RES_FAILCNT,
+ /* Reserved memory accounting fail count. */
+ HUGETLB_RES_RESERVATION_FAILCNT,
+ HUGETLB_RES_NULL,
+ HUGETLB_RES_MAX,
+};

But, I guess the existing definitions (such as HUGETLB_RES_LIMIT) correspond
closely to the externally visible name.  In that case, you should leave them
as is and ignore my comment.

<snip>
>>> @@ -126,6 +144,26 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
>>>       kfree(h_cgroup);
>>>  }
>>>
>>> +static void hugetlb_cgroup_move_parent_reservation(int idx,
>>> +                                                struct hugetlb_cgroup *h_cg)
>>> +{
>>> +     struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
>>> +
>>> +     /* Move the reservation counters. */
>>> +     if (!parent_hugetlb_cgroup(h_cg)) {
>>> +             parent = root_h_cgroup;
>>> +             /* root has no limit */
>>> +             page_counter_charge(
>>> +                     &root_h_cgroup->reserved_hugepage[idx],
>>> +                     page_counter_read(
>>> +                             hugetlb_cgroup_get_counter(h_cg, idx, true)));
>>> +     }
>>> +
>>> +     /* Take the pages off the local counter */
>>> +     page_counter_cancel(
>>> +             hugetlb_cgroup_get_counter(h_cg, idx, true),
>>> +             page_counter_read(hugetlb_cgroup_get_counter(h_cg, idx, true)));
>>> +}
>>
>> I know next to nothing about cgroups and am just comparing this to the
>> existing hugetlb_cgroup_move_parent() routine.  hugetlb_cgroup_move_parent
>> updates the cgroup pointer in each page being moved.  Do we need to do
>> something similar for reservations being moved (move pointer in reservation)?
>>
> 
> Oh, good catch. Yes I need to be doing that. I should probably
> consolidate those routines so the code doesn't miss things like this.

This might get a bit ugly/complicated?  Seems like you will need to examine
all hugetlbfs inodes and vma's mapping those inodes.
Mina Almasry Nov. 9, 2019, 12:40 a.m. UTC | #4
On Fri, Nov 8, 2019 at 4:01 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 11/8/19 3:48 PM, Mina Almasry wrote:
> > On Thu, Nov 7, 2019 at 4:57 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >>
> >> On 10/29/19 6:36 PM, Mina Almasry wrote:
> >>> @@ -22,27 +22,35 @@ struct hugetlb_cgroup;
> >>>   * Minimum page order trackable by hugetlb cgroup.
> >>>   * At least 3 pages are necessary for all the tracking information.
> >>>   */
> >>> -#define HUGETLB_CGROUP_MIN_ORDER     2
> >>> +#define HUGETLB_CGROUP_MIN_ORDER 3
> >>
> >> Correct me if I'm misremembering, but I think the reason you changed this was
> >> so that you could use page[3].private.  Correct?
> >> In that case isn't page[3] the last page of an order 2 allocation?
> >> If my understanding is correct, then leave HUGETLB_CGROUP_MIN_ORDER as is
> >> and update the preceding comment to say that at least 4 pages are necessary.
> >>
> >
> > Yes, I just misunderstood what MIN_ORDER means. I'll revert the code change.
>
> But, do update the comment please.
>

Will do.

> <snip>
> >>> @@ -85,18 +89,32 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
> >>>       int idx;
> >>>
> >>>       for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
> >>> -             struct page_counter *counter = &h_cgroup->hugepage[idx];
> >>>               struct page_counter *parent = NULL;
> >>
> >> Should we perhaps rename 'parent' to 'fault_parent' to be consistent?
> >
> > Yes that makes sense; will do.
> >
> >> That makes me wonder whether the naming in the previous patch should
> >> be more explicit.  Make the existing names explicitly contain 'fault' as
> >> the new names contain 'reservation'.
> >> Just a thought.
> >>
> >
> > You mean change the names of the actual user-facing files? I'm all for
> > better names but that would break existing users that read/write the
> > hugetlb_cgroup.2MB.usage_in_bytes/limit_in_bytes files, and so I would
> > assume that's a no-go.
> >
>
> I was thinking about internal variables/definitions such as:
>
> +enum {
> + /* Tracks hugetlb memory faulted in. */
> + HUGETLB_RES_USAGE,
> + /* Tracks hugetlb memory reserved. */
> + HUGETLB_RES_RESERVATION_USAGE,
> + /* Limit for hugetlb memory faulted in. */
> + HUGETLB_RES_LIMIT,
> + /* Limit for hugetlb memory reserved. */
> + HUGETLB_RES_RESERVATION_LIMIT,
> + /* Max usage for hugetlb memory faulted in. */
> + HUGETLB_RES_MAX_USAGE,
> + /* Max usage for hugetlb memory reserved. */
> + HUGETLB_RES_RESERVATION_MAX_USAGE,
> + /* Faulted memory accounting fail count. */
> + HUGETLB_RES_FAILCNT,
> + /* Reserved memory accounting fail count. */
> + HUGETLB_RES_RESERVATION_FAILCNT,
> + HUGETLB_RES_NULL,
> + HUGETLB_RES_MAX,
> +};
>
> But, I guess the existing definitions (such as HUGETLB_RES_LIMIT) correspond
> closely to the externally visible name.  In that case, you should leave them
> as is and ignore my comment.
>
> <snip>
> >>> @@ -126,6 +144,26 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
> >>>       kfree(h_cgroup);
> >>>  }
> >>>
> >>> +static void hugetlb_cgroup_move_parent_reservation(int idx,
> >>> +                                                struct hugetlb_cgroup *h_cg)
> >>> +{
> >>> +     struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
> >>> +
> >>> +     /* Move the reservation counters. */
> >>> +     if (!parent_hugetlb_cgroup(h_cg)) {
> >>> +             parent = root_h_cgroup;
> >>> +             /* root has no limit */
> >>> +             page_counter_charge(
> >>> +                     &root_h_cgroup->reserved_hugepage[idx],
> >>> +                     page_counter_read(
> >>> +                             hugetlb_cgroup_get_counter(h_cg, idx, true)));
> >>> +     }
> >>> +
> >>> +     /* Take the pages off the local counter */
> >>> +     page_counter_cancel(
> >>> +             hugetlb_cgroup_get_counter(h_cg, idx, true),
> >>> +             page_counter_read(hugetlb_cgroup_get_counter(h_cg, idx, true)));
> >>> +}
> >>
> >> I know next to nothing about cgroups and am just comparing this to the
> >> existing hugetlb_cgroup_move_parent() routine.  hugetlb_cgroup_move_parent
> >> updates the cgroup pointer in each page being moved.  Do we need to do
> >> something similar for reservations being moved (move pointer in reservation)?
> >>
> >
> > Oh, good catch. Yes I need to be doing that. I should probably
> > consolidate those routines so the code doesn't miss things like this.
>
> This might get a bit ugly/complicated?  Seems like you will need to examine
> all hugetlbfs inodes and vma's mapping those inodes.
>

Hmm yes on closer look it does seem like this is not straightforward.
I'll write a test that does this reparenting so I can start running
into the issue and poke for solutions. Off the top of my head, I think
maybe we can just not reparent the hugetlb reservations - the
hugetlb_cgroup stays alive until all its memory is uncharged. That
shouldn't be too bad. Today, I think memcg doesn't reparent memory
when it gets offlined.

I'll poke at this a bit and come back with suggestions, you may want
to hold off reviewing the rest of the patches until then.

> --
> Mike Kravetz
Mike Kravetz Nov. 9, 2019, 12:46 a.m. UTC | #5
On 11/8/19 4:40 PM, Mina Almasry wrote:
> On Fri, Nov 8, 2019 at 4:01 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>
>> On 11/8/19 3:48 PM, Mina Almasry wrote:
>>> On Thu, Nov 7, 2019 at 4:57 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>>
>>>> On 10/29/19 6:36 PM, Mina Almasry wrote:
>>>>> @@ -22,27 +22,35 @@ struct hugetlb_cgroup;
>>>>>   * Minimum page order trackable by hugetlb cgroup.
>>>>>   * At least 3 pages are necessary for all the tracking information.
>>>>>   */
>>>>> -#define HUGETLB_CGROUP_MIN_ORDER     2
>>>>> +#define HUGETLB_CGROUP_MIN_ORDER 3
>>>>
>>>> Correct me if I'm misremembering, but I think the reason you changed this was
>>>> so that you could use page[3].private.  Correct?
>>>> In that case isn't page[3] the last page of an order 2 allocation?
>>>> If my understanding is correct, then leave HUGETLB_CGROUP_MIN_ORDER as is
>>>> and update the preceding comment to say that at least 4 pages are necessary.
>>>>
>>>
>>> Yes, I just misunderstood what MIN_ORDER means. I'll revert the code change.
>>
>> But, do update the comment please.
>>
> 
> Will do.
> 
>> <snip>
>>>>> @@ -85,18 +89,32 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
>>>>>       int idx;
>>>>>
>>>>>       for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
>>>>> -             struct page_counter *counter = &h_cgroup->hugepage[idx];
>>>>>               struct page_counter *parent = NULL;
>>>>
>>>> Should we perhaps rename 'parent' to 'fault_parent' to be consistent?
>>>
>>> Yes that makes sense; will do.
>>>
>>>> That makes me wonder whether the naming in the previous patch should
>>>> be more explicit.  Make the existing names explicitly contain 'fault' as
>>>> the new names contain 'reservation'.
>>>> Just a thought.
>>>>
>>>
>>> You mean change the names of the actual user-facing files? I'm all for
>>> better names but that would break existing users that read/write the
>>> hugetlb_cgroup.2MB.usage_in_bytes/limit_in_bytes files, and so I would
>>> assume that's a no-go.
>>>
>>
>> I was thinking about internal variables/definitions such as:
>>
>> +enum {
>> + /* Tracks hugetlb memory faulted in. */
>> + HUGETLB_RES_USAGE,
>> + /* Tracks hugetlb memory reserved. */
>> + HUGETLB_RES_RESERVATION_USAGE,
>> + /* Limit for hugetlb memory faulted in. */
>> + HUGETLB_RES_LIMIT,
>> + /* Limit for hugetlb memory reserved. */
>> + HUGETLB_RES_RESERVATION_LIMIT,
>> + /* Max usage for hugetlb memory faulted in. */
>> + HUGETLB_RES_MAX_USAGE,
>> + /* Max usage for hugetlb memory reserved. */
>> + HUGETLB_RES_RESERVATION_MAX_USAGE,
>> + /* Faulted memory accounting fail count. */
>> + HUGETLB_RES_FAILCNT,
>> + /* Reserved memory accounting fail count. */
>> + HUGETLB_RES_RESERVATION_FAILCNT,
>> + HUGETLB_RES_NULL,
>> + HUGETLB_RES_MAX,
>> +};
>>
>> But, I guess the existing definitions (such as HUGETLB_RES_LIMIT) correspond
>> closely to the externally visible name.  In that case, you should leave them
>> as is and ignore my comment.
>>
>> <snip>
>>>>> @@ -126,6 +144,26 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
>>>>>       kfree(h_cgroup);
>>>>>  }
>>>>>
>>>>> +static void hugetlb_cgroup_move_parent_reservation(int idx,
>>>>> +                                                struct hugetlb_cgroup *h_cg)
>>>>> +{
>>>>> +     struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
>>>>> +
>>>>> +     /* Move the reservation counters. */
>>>>> +     if (!parent_hugetlb_cgroup(h_cg)) {
>>>>> +             parent = root_h_cgroup;
>>>>> +             /* root has no limit */
>>>>> +             page_counter_charge(
>>>>> +                     &root_h_cgroup->reserved_hugepage[idx],
>>>>> +                     page_counter_read(
>>>>> +                             hugetlb_cgroup_get_counter(h_cg, idx, true)));
>>>>> +     }
>>>>> +
>>>>> +     /* Take the pages off the local counter */
>>>>> +     page_counter_cancel(
>>>>> +             hugetlb_cgroup_get_counter(h_cg, idx, true),
>>>>> +             page_counter_read(hugetlb_cgroup_get_counter(h_cg, idx, true)));
>>>>> +}
>>>>
>>>> I know next to nothing about cgroups and am just comparing this to the
>>>> existing hugetlb_cgroup_move_parent() routine.  hugetlb_cgroup_move_parent
>>>> updates the cgroup pointer in each page being moved.  Do we need to do
>>>> something similar for reservations being moved (move pointer in reservation)?
>>>>
>>>
>>> Oh, good catch. Yes I need to be doing that. I should probably
>>> consolidate those routines so the code doesn't miss things like this.
>>
>> This might get a bit ugly/complicated?  Seems like you will need to examine
>> all hugetlbfs inodes and vma's mapping those inodes.
>>
> 
> Hmm yes on closer look it does seem like this is not straightforward.
> I'll write a test that does this reparenting so I can start running
> into the issue and poke for solutions. Off the top of my head, I think
> maybe we can just not reparent the hugetlb reservations - the
> hugetlb_cgroup stays alive until all its memory is uncharged. That
> shouldn't be too bad. Today, I think memcg doesn't reparent memory
> when it gets offlined.
> 
> I'll poke at this a bit and come back with suggestions, you may want
> to hold off reviewing the rest of the patches until then.


Ok, if we start considering what the correct cgroup reparenting semantics
should be, it would be good to get input from others with more cgroup
experience.
Mina Almasry Nov. 25, 2019, 8:26 p.m. UTC | #6
On Fri, Nov 8, 2019 at 4:46 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 11/8/19 4:40 PM, Mina Almasry wrote:
> > On Fri, Nov 8, 2019 at 4:01 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >>
> >> On 11/8/19 3:48 PM, Mina Almasry wrote:
> >>> On Thu, Nov 7, 2019 at 4:57 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >>>>
> >>>> On 10/29/19 6:36 PM, Mina Almasry wrote:
> >>>>> @@ -22,27 +22,35 @@ struct hugetlb_cgroup;
> >>>>>   * Minimum page order trackable by hugetlb cgroup.
> >>>>>   * At least 3 pages are necessary for all the tracking information.
> >>>>>   */
> >>>>> -#define HUGETLB_CGROUP_MIN_ORDER     2
> >>>>> +#define HUGETLB_CGROUP_MIN_ORDER 3
> >>>>
> >>>> Correct me if I'm misremembering, but I think the reason you changed this was
> >>>> so that you could use page[3].private.  Correct?
> >>>> In that case isn't page[3] the last page of an order 2 allocation?
> >>>> If my understanding is correct, then leave HUGETLB_CGROUP_MIN_ORDER as is
> >>>> and update the preceding comment to say that at least 4 pages are necessary.
> >>>>
> >>>
> >>> Yes, I just misunderstood what MIN_ORDER means. I'll revert the code change.
> >>
> >> But, do update the comment please.
> >>
> >
> > Will do.
> >
> >> <snip>
> >>>>> @@ -85,18 +89,32 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
> >>>>>       int idx;
> >>>>>
> >>>>>       for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
> >>>>> -             struct page_counter *counter = &h_cgroup->hugepage[idx];
> >>>>>               struct page_counter *parent = NULL;
> >>>>
> >>>> Should we perhaps rename 'parent' to 'fault_parent' to be consistent?
> >>>
> >>> Yes that makes sense; will do.
> >>>
> >>>> That makes me wonder whether the naming in the previous patch should
> >>>> be more explicit.  Make the existing names explicitly contain 'fault' as
> >>>> the new names contain 'reservation'.
> >>>> Just a thought.
> >>>>
> >>>
> >>> You mean change the names of the actual user-facing files? I'm all for
> >>> better names but that would break existing users that read/write the
> >>> hugetlb_cgroup.2MB.usage_in_bytes/limit_in_bytes files, and so I would
> >>> assume that's a no-go.
> >>>
> >>
> >> I was thinking about internal variables/definitions such as:
> >>
> >> +enum {
> >> + /* Tracks hugetlb memory faulted in. */
> >> + HUGETLB_RES_USAGE,
> >> + /* Tracks hugetlb memory reserved. */
> >> + HUGETLB_RES_RESERVATION_USAGE,
> >> + /* Limit for hugetlb memory faulted in. */
> >> + HUGETLB_RES_LIMIT,
> >> + /* Limit for hugetlb memory reserved. */
> >> + HUGETLB_RES_RESERVATION_LIMIT,
> >> + /* Max usage for hugetlb memory faulted in. */
> >> + HUGETLB_RES_MAX_USAGE,
> >> + /* Max usage for hugetlb memory reserved. */
> >> + HUGETLB_RES_RESERVATION_MAX_USAGE,
> >> + /* Faulted memory accounting fail count. */
> >> + HUGETLB_RES_FAILCNT,
> >> + /* Reserved memory accounting fail count. */
> >> + HUGETLB_RES_RESERVATION_FAILCNT,
> >> + HUGETLB_RES_NULL,
> >> + HUGETLB_RES_MAX,
> >> +};
> >>
> >> But, I guess the existing definitions (such as HUGETLB_RES_LIMIT) correspond
> >> closely to the externally visible name.  In that case, you should leave them
> >> as is and ignore my comment.
> >>
> >> <snip>
> >>>>> @@ -126,6 +144,26 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
> >>>>>       kfree(h_cgroup);
> >>>>>  }
> >>>>>
> >>>>> +static void hugetlb_cgroup_move_parent_reservation(int idx,
> >>>>> +                                                struct hugetlb_cgroup *h_cg)
> >>>>> +{
> >>>>> +     struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
> >>>>> +
> >>>>> +     /* Move the reservation counters. */
> >>>>> +     if (!parent_hugetlb_cgroup(h_cg)) {
> >>>>> +             parent = root_h_cgroup;
> >>>>> +             /* root has no limit */
> >>>>> +             page_counter_charge(
> >>>>> +                     &root_h_cgroup->reserved_hugepage[idx],
> >>>>> +                     page_counter_read(
> >>>>> +                             hugetlb_cgroup_get_counter(h_cg, idx, true)));
> >>>>> +     }
> >>>>> +
> >>>>> +     /* Take the pages off the local counter */
> >>>>> +     page_counter_cancel(
> >>>>> +             hugetlb_cgroup_get_counter(h_cg, idx, true),
> >>>>> +             page_counter_read(hugetlb_cgroup_get_counter(h_cg, idx, true)));
> >>>>> +}
> >>>>
> >>>> I know next to nothing about cgroups and am just comparing this to the
> >>>> existing hugetlb_cgroup_move_parent() routine.  hugetlb_cgroup_move_parent
> >>>> updates the cgroup pointer in each page being moved.  Do we need to do
> >>>> something similar for reservations being moved (move pointer in reservation)?
> >>>>
> >>>
> >>> Oh, good catch. Yes I need to be doing that. I should probably
> >>> consolidate those routines so the code doesn't miss things like this.
> >>
> >> This might get a bit ugly/complicated?  Seems like you will need to examine
> >> all hugetlbfs inodes and vma's mapping those inodes.
> >>
> >
> > Hmm yes on closer look it does seem like this is not straightforward.
> > I'll write a test that does this reparenting so I can start running
> > into the issue and poke for solutions. Off the top of my head, I think
> > maybe we can just not reparent the hugetlb reservations - the
> > hugetlb_cgroup stays alive until all its memory is uncharged. That
> > shouldn't be too bad. Today, I think memcg doesn't reparent memory
> > when it gets offlined.
> >
> > I'll poke at this a bit and come back with suggestions, you may want
> > to hold off reviewing the rest of the patches until then.
>
>
> Ok, if we start considering what the correct cgroup reparenting semantics
> should be, it would be good to get input from others with more cgroup
> experience.
>

So I looked into this and prototyped a couple of solutions:

1. We could reparent the hugetlb reservations using the same approach
that today we use to reparent hugetlb faults. Basically, today faulted
hugetlb pages live on hstate->hugepage_activelist. When a hugetlb
cgroup gets offlined, this list is traversed, and any pages on it
that point to the cgroup being offlined are reparented. hugetlb_lock
is used to make sure cgroup offlining doesn't race with a page being
freed. I can add another list, but one that has pointers to the
reservations made. When the cgroup is being offlined, it traverses
this list, and reparents any reservations (which will need to acquire
the proper resv_map->lock to do the reparenting). hugetlb_lock also
needs to be acquired here to make sure that resv_map release doesn't
race with another thread reparenting the memory in that resv map.

Pros: Maintains current parenting behavior, and makes sure that
reparenting of reservations works exactly the same way as reparenting
of hugetlb faults.
Cons: Code is a bit complex. There may be subtle object lifetime bugs,
since I'm not 100% sure acquiring hugetlb_lock removes all races.
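A rough sketch of option 1 (the resv_list/cgroup_node fields and the h_cg
back pointer are purely illustrative, not from this series):

	/* Reparent reservations, mirroring hugetlb_cgroup_move_parent(). */
	static void
	hugetlb_cgroup_move_parent_reservations(struct hugetlb_cgroup *h_cg)
	{
		struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
		struct resv_map *resv, *tmp;

		lockdep_assert_held(&hugetlb_lock);

		if (!parent)
			parent = root_h_cgroup;

		/* Walk every resv_map still charged to this cgroup. */
		list_for_each_entry_safe(resv, tmp, &h_cg->resv_list,
					 cgroup_node) {
			spin_lock(&resv->lock);
			/* Repoint the reservation's charge at the parent. */
			resv->h_cg = parent;
			list_move(&resv->cgroup_node, &parent->resv_list);
			spin_unlock(&resv->lock);
		}
	}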

2. We could just not reparent hugetlb reservations. I.e. on hugetlb
cgroup offlining, the hugetlb faults get reparented (which maintains
current user-facing behavior), but hugetlb reservation charges remain
charged to the hugetlb cgroup. The cgroup lives as a zombie until all
the reservations are uncharged.

Pros: Much easier implementation. Converges behavior of memcg and
hugetlb cgroup, since memcg also doesn't reparent memory charged to
it.
Cons: Behavior change, as hugetlb cgroups will become zombies if there
are reservations charged to them. I've discussed this offlist with
Shakeel, and AFAICT there is absolutely no user-facing behavior change
from zombie cgroups, unless the user is specifically checking for
zombies.

I'm torn between these 2 options right now, but leaning towards #2. I
think I will propose #2 in a patch for review, and if anyone is broken
by that (again, my understanding is that this is very unlikely), then I'll
propose a patch that reverts the changes in #2 and implements the
changes in #1.

Any feedback from Shakeel or other people with cgroup expertise
(especially for hugetlb cgroup or memcg) is very useful here.
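The heart of option 2 is just reference counting: pin the cgroup on
charge and drop the pin on uncharge, so an offlined cgroup lingers as a
zombie until its last reservation goes away. A minimal sketch, assuming
the uncharge path gets handed the cgroup itself rather than the bare
page_counter (placement here is hypothetical):

	/* Charge side: pin the cgroup for the reservation's lifetime. */
	css_get(&h_cg->css);

	/* Uncharge side: the last css_put releases the (possibly
	 * offlined, zombie) cgroup. */
	page_counter_uncharge(hugetlb_cgroup_get_counter(h_cg, idx, true),
			      nr_pages);
	css_put(&h_cg->css);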

> --
> Mike Kravetz
Mike Kravetz Nov. 26, 2019, 12:05 a.m. UTC | #7
On 11/25/19 12:26 PM, Mina Almasry wrote:
> On Fri, Nov 8, 2019 at 4:46 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>
>> On 11/8/19 4:40 PM, Mina Almasry wrote:
>>> On Fri, Nov 8, 2019 at 4:01 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>>
>>>> On 11/8/19 3:48 PM, Mina Almasry wrote:
>>>>> On Thu, Nov 7, 2019 at 4:57 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>>>>
>>>>>> On 10/29/19 6:36 PM, Mina Almasry wrote:
>>>>>>>
>>>>>>> +static void hugetlb_cgroup_move_parent_reservation(int idx,
>>>>>>> +                                                struct hugetlb_cgroup *h_cg)
>>>>>>> +{
>>>>>>> +     struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
>>>>>>> +
>>>>>>> +     /* Move the reservation counters. */
>>>>>>> +     if (!parent_hugetlb_cgroup(h_cg)) {
>>>>>>> +             parent = root_h_cgroup;
>>>>>>> +             /* root has no limit */
>>>>>>> +             page_counter_charge(
>>>>>>> +                     &root_h_cgroup->reserved_hugepage[idx],
>>>>>>> +                     page_counter_read(
>>>>>>> +                             hugetlb_cgroup_get_counter(h_cg, idx, true)));
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     /* Take the pages off the local counter */
>>>>>>> +     page_counter_cancel(
>>>>>>> +             hugetlb_cgroup_get_counter(h_cg, idx, true),
>>>>>>> +             page_counter_read(hugetlb_cgroup_get_counter(h_cg, idx, true)));
>>>>>>> +}
>>>>>>
>>>>>> I know next to nothing about cgroups and am just comparing this to the
>>>>>> existing hugetlb_cgroup_move_parent() routine.  hugetlb_cgroup_move_parent
>>>>>> updates the cgroup pointer in each page being moved.  Do we need to do
>>>>>> something similar for reservations being moved (move pointer in reservation)?
>>>>>>
>>>>>
>>>>> Oh, good catch. Yes I need to be doing that. I should probably
>>>>> consolidate those routines so the code doesn't miss things like this.
>>>>
>>>> This might get a bit ugly/complicated?  Seems like you will need to examine
>>>> all hugetlbfs inodes and vma's mapping those inodes.
>>>>
>>>
>>> Hmm yes on closer look it does seem like this is not straightforward.
>>> I'll write a test that does this reparenting so I can start running
>>> into the issue and poke for solutions. Off the top of my head, I think
>>> maybe we can just not reparent the hugetlb reservations - the
>>> hugetlb_cgroup stays alive until all its memory is uncharged. That
>>> shouldn't be too bad. Today, I think memcg doesn't reparent memory
>>> when it gets offlined.
>>>
>>> I'll poke at this a bit and come back with suggestions, you may want
>>> to hold off reviewing the rest of the patches until then.
>>
>>
>> Ok, if we start considering what the correct cgroup reparenting semantics
>> should be, it would be good to get input from others with more cgroup
>> experience.
>>
> 
> So I looked into this and prototyped a couple of solutions:
> 
> 1. We could reparent the hugetlb reservations using the same approach
> we use today to reparent hugetlb faults. Basically, today faulted
> hugetlb pages live on hstate->hugepage_activelist. When a hugetlb
> cgroup gets offlined, this list is traversed and any pages on it
> that point to the cgroup being offlined are reparented. hugetlb_lock
> is used to make sure cgroup offlining doesn't race with a page being
> freed. I can add another list, one that holds pointers to the
> reservations made. When the cgroup is being offlined, it traverses
> this list and reparents any reservations (which will need to acquire
> the proper resv_map->lock to do the reparenting). hugetlb_lock also
> needs to be acquired here to make sure that resv_map release doesn't
> race with another thread reparenting the memory in that resv_map.
> 
> Pros: Maintains the current reparenting behavior, and makes sure that
> reparenting of reservations works exactly the same way as reparenting
> of hugetlb faults.
> Cons: The code is a bit complex, and there may be subtle object
> lifetime bugs, since I'm not 100% sure acquiring hugetlb_lock removes
> all races.
> 
> 2. We could just not reparent hugetlb reservations. I.e. on hugetlb
> cgroup offlining, the hugetlb fault charges get reparented (which
> maintains the current user-facing behavior), but hugetlb reservation
> charges remain charged to the hugetlb cgroup. The cgroup lives on as a
> zombie until all the reservations are uncharged.
> 
> Pros: Much easier implementation. Converges behavior of memcg and
> hugetlb cgroup, since memcg also doesn't reparent memory charged to
> it.
> Cons: Behavior change, as hugetlb cgroups will become zombies if there
> are reservations charged to them. I've discussed this offlist with
> Shakeel, and AFAICT there is absolutely no user-facing behavior change
> from zombie cgroups, unless the user is specifically checking for
> zombies.
> 
> I'm torn between these 2 options right now, but leaning towards #2. I
> think I will propose #2 in a patch for review, and if anyone is broken
> by that (again, my understanding is that this is very unlikely), then
> I'll propose a patch that reverts the changes in #2 and implements the
> changes in #1.

I of course like option #2 because it introduces fewer (if any) additional
changes to the hugetlb reservation code for non-cgroup users. :)

> Any feedback from Shakeel or other people with cgroup expertise
> (especially for hugetlb cgroup or memcg) is very useful here.

Yes, that would be very helpful.

Patch

diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 063962f6dfc6a..1bb58a63af586 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -22,27 +22,35 @@  struct hugetlb_cgroup;
  * Minimum page order trackable by hugetlb cgroup.
  * At least 3 pages are necessary for all the tracking information.
  */
-#define HUGETLB_CGROUP_MIN_ORDER	2
+#define HUGETLB_CGROUP_MIN_ORDER 3

 #ifdef CONFIG_CGROUP_HUGETLB

-static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
+static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
+							      bool reserved)
 {
 	VM_BUG_ON_PAGE(!PageHuge(page), page);

 	if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
 		return NULL;
-	return (struct hugetlb_cgroup *)page[2].private;
+	if (reserved)
+		return (struct hugetlb_cgroup *)page[3].private;
+	else
+		return (struct hugetlb_cgroup *)page[2].private;
 }

-static inline
-int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
+static inline int set_hugetlb_cgroup(struct page *page,
+				     struct hugetlb_cgroup *h_cg,
+				     bool reservation)
 {
 	VM_BUG_ON_PAGE(!PageHuge(page), page);

 	if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
 		return -1;
-	page[2].private	= (unsigned long)h_cg;
+	if (reservation)
+		page[3].private = (unsigned long)h_cg;
+	else
+		page[2].private = (unsigned long)h_cg;
 	return 0;
 }

@@ -52,26 +60,33 @@  static inline bool hugetlb_cgroup_disabled(void)
 }

 extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
-					struct hugetlb_cgroup **ptr);
+					struct hugetlb_cgroup **ptr,
+					bool reserved);
 extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
 					 struct hugetlb_cgroup *h_cg,
-					 struct page *page);
+					 struct page *page, bool reserved);
 extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
-					 struct page *page);
+					 struct page *page, bool reserved);
+
 extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
-					   struct hugetlb_cgroup *h_cg);
+					   struct hugetlb_cgroup *h_cg,
+					   bool reserved);
+extern void hugetlb_cgroup_uncharge_counter(struct page_counter *p,
+					    unsigned long nr_pages);
+
 extern void hugetlb_cgroup_file_init(void) __init;
 extern void hugetlb_cgroup_migrate(struct page *oldhpage,
 				   struct page *newhpage);

 #else
-static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
+static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
+							      bool reserved)
 {
 	return NULL;
 }

-static inline
-int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
+static inline int set_hugetlb_cgroup(struct page *page,
+				     struct hugetlb_cgroup *h_cg, bool reserved)
 {
 	return 0;
 }
@@ -81,28 +96,30 @@  static inline bool hugetlb_cgroup_disabled(void)
 	return true;
 }

-static inline int
-hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
-			     struct hugetlb_cgroup **ptr)
+static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+					       struct hugetlb_cgroup **ptr,
+					       bool reserved)
 {
 	return 0;
 }

-static inline void
-hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
-			     struct hugetlb_cgroup *h_cg,
-			     struct page *page)
+static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+						struct hugetlb_cgroup *h_cg,
+						struct page *page,
+						bool reserved)
 {
 }

-static inline void
-hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page)
+static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+						struct page *page,
+						bool reserved)
 {
 }

-static inline void
-hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
-			       struct hugetlb_cgroup *h_cg)
+static inline void hugetlb_cgroup_uncharge_cgroup(int idx,
+						  unsigned long nr_pages,
+						  struct hugetlb_cgroup *h_cg,
+						  bool reserved)
 {
 }

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 37195cd60a345..325d5454bf168 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1140,7 +1140,7 @@  static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_active | 1 << PG_private |
 				1 << PG_writeback);
 	}
-	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page, false), page);
 	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
 	set_page_refcounted(page);
 	if (hstate_is_gigantic(h)) {
@@ -1250,8 +1250,8 @@  void free_huge_page(struct page *page)

 	spin_lock(&hugetlb_lock);
 	clear_page_huge_active(page);
-	hugetlb_cgroup_uncharge_page(hstate_index(h),
-				     pages_per_huge_page(h), page);
+	hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h),
+				     page, false);
 	if (restore_reserve)
 		h->resv_huge_pages++;

@@ -1277,7 +1277,7 @@  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 	spin_lock(&hugetlb_lock);
-	set_hugetlb_cgroup(page, NULL);
+	set_hugetlb_cgroup(page, NULL, false);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
@@ -2063,7 +2063,8 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 			gbl_chg = 1;
 	}

-	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg,
+					   false);
 	if (ret)
 		goto out_subpool_put;

@@ -2087,7 +2088,8 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 		list_move(&page->lru, &h->hugepage_activelist);
 		/* Fall through */
 	}
-	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page,
+				     false);
 	spin_unlock(&hugetlb_lock);

 	set_page_private(page, (unsigned long)spool);
@@ -2111,7 +2113,8 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;

 out_uncharge_cgroup:
-	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg,
+				       false);
 out_subpool_put:
 	if (map_chg || avoid_reserve)
 		hugepage_subpool_put_pages(spool, 1);
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 1ed4448ca41d3..854117513979b 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -73,8 +73,12 @@  static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
 	int idx;

 	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
-		if (page_counter_read(&h_cg->hugepage[idx]))
+		if (page_counter_read(
+			    hugetlb_cgroup_get_counter(h_cg, idx, true)) ||
+		    page_counter_read(
+			    hugetlb_cgroup_get_counter(h_cg, idx, false))) {
 			return true;
+		}
 	}
 	return false;
 }
@@ -85,18 +89,32 @@  static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
 	int idx;

 	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
-		struct page_counter *counter = &h_cgroup->hugepage[idx];
 		struct page_counter *parent = NULL;
+		struct page_counter *reserved_parent = NULL;
 		unsigned long limit;
 		int ret;

-		if (parent_h_cgroup)
-			parent = &parent_h_cgroup->hugepage[idx];
-		page_counter_init(counter, parent);
+		if (parent_h_cgroup) {
+			parent = hugetlb_cgroup_get_counter(parent_h_cgroup,
+							    idx, false);
+			reserved_parent = hugetlb_cgroup_get_counter(
+				parent_h_cgroup, idx, true);
+		}
+		page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
+							     false),
+				  parent);
+		page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
+							     true),
+				  reserved_parent);

 		limit = round_down(PAGE_COUNTER_MAX,
 				   1 << huge_page_order(&hstates[idx]));
-		ret = page_counter_set_max(counter, limit);
+
+		ret = page_counter_set_max(
+			hugetlb_cgroup_get_counter(h_cgroup, idx, false),
+			limit);
+		ret = page_counter_set_max(
+			hugetlb_cgroup_get_counter(h_cgroup, idx, true), limit);
 		VM_BUG_ON(ret);
 	}
 }
@@ -126,6 +144,26 @@  static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
 	kfree(h_cgroup);
 }

+static void hugetlb_cgroup_move_parent_reservation(int idx,
+						   struct hugetlb_cgroup *h_cg)
+{
+	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
+
+	/* Move the reservation counters. */
+	if (!parent_hugetlb_cgroup(h_cg)) {
+		parent = root_h_cgroup;
+		/* root has no limit */
+		page_counter_charge(
+			&root_h_cgroup->reserved_hugepage[idx],
+			page_counter_read(
+				hugetlb_cgroup_get_counter(h_cg, idx, true)));
+	}
+
+	/* Take the pages off the local counter */
+	page_counter_cancel(
+		hugetlb_cgroup_get_counter(h_cg, idx, true),
+		page_counter_read(hugetlb_cgroup_get_counter(h_cg, idx, true)));
+}

 /*
  * Should be called with hugetlb_lock held.
@@ -142,7 +180,7 @@  static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
 	struct hugetlb_cgroup *page_hcg;
 	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

-	page_hcg = hugetlb_cgroup_from_page(page);
+	page_hcg = hugetlb_cgroup_from_page(page, false);
 	/*
 	 * We can have pages in active list without any cgroup
 	 * ie, hugepage with less than 3 pages. We can safely
@@ -161,7 +199,7 @@  static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
 	/* Take the pages off the local counter */
 	page_counter_cancel(counter, nr_pages);

-	set_hugetlb_cgroup(page, parent);
+	set_hugetlb_cgroup(page, parent, false);
 out:
 	return;
 }
@@ -180,6 +218,7 @@  static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
 	do {
 		for_each_hstate(h) {
 			spin_lock(&hugetlb_lock);
+			hugetlb_cgroup_move_parent_reservation(idx, h_cg);
 			list_for_each_entry(page, &h->hugepage_activelist, lru)
 				hugetlb_cgroup_move_parent(idx, h_cg, page);

@@ -191,7 +230,7 @@  static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
 }

 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
-				 struct hugetlb_cgroup **ptr)
+				 struct hugetlb_cgroup **ptr, bool reserved)
 {
 	int ret = 0;
 	struct page_counter *counter;
@@ -214,8 +253,11 @@  int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 	}
 	rcu_read_unlock();

-	if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
+	if (!page_counter_try_charge(hugetlb_cgroup_get_counter(h_cg, idx,
+								reserved),
+				     nr_pages, &counter)) {
 		ret = -ENOMEM;
+	}
 	css_put(&h_cg->css);
 done:
 	*ptr = h_cg;
@@ -225,12 +267,12 @@  int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 /* Should be called with hugetlb_lock held */
 void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
 				  struct hugetlb_cgroup *h_cg,
-				  struct page *page)
+				  struct page *page, bool reserved)
 {
 	if (hugetlb_cgroup_disabled() || !h_cg)
 		return;

-	set_hugetlb_cgroup(page, h_cg);
+	set_hugetlb_cgroup(page, h_cg, reserved);
 	return;
 }

@@ -238,23 +280,26 @@  void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
  * Should be called with hugetlb_lock held
  */
 void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
-				  struct page *page)
+				  struct page *page, bool reserved)
 {
 	struct hugetlb_cgroup *h_cg;

 	if (hugetlb_cgroup_disabled())
 		return;
 	lockdep_assert_held(&hugetlb_lock);
-	h_cg = hugetlb_cgroup_from_page(page);
+	h_cg = hugetlb_cgroup_from_page(page, reserved);
 	if (unlikely(!h_cg))
 		return;
-	set_hugetlb_cgroup(page, NULL);
-	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
+	set_hugetlb_cgroup(page, NULL, reserved);
+
+	page_counter_uncharge(hugetlb_cgroup_get_counter(h_cg, idx, reserved),
+			      nr_pages);
+
 	return;
 }

 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
-				    struct hugetlb_cgroup *h_cg)
+				    struct hugetlb_cgroup *h_cg, bool reserved)
 {
 	if (hugetlb_cgroup_disabled() || !h_cg)
 		return;
@@ -262,8 +307,17 @@  void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
 	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
 		return;

-	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
-	return;
+	page_counter_uncharge(hugetlb_cgroup_get_counter(h_cg, idx, reserved),
+			      nr_pages);
+}
+
+void hugetlb_cgroup_uncharge_counter(struct page_counter *p,
+				     unsigned long nr_pages)
+{
+	if (hugetlb_cgroup_disabled() || !p)
+		return;
+
+	page_counter_uncharge(p, nr_pages);
 }

 static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
@@ -475,6 +529,7 @@  void __init hugetlb_cgroup_file_init(void)
 void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
 {
 	struct hugetlb_cgroup *h_cg;
+	struct hugetlb_cgroup *h_cg_reservation;
 	struct hstate *h = page_hstate(oldhpage);

 	if (hugetlb_cgroup_disabled())
@@ -482,11 +537,12 @@  void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)

 	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
 	spin_lock(&hugetlb_lock);
-	h_cg = hugetlb_cgroup_from_page(oldhpage);
-	set_hugetlb_cgroup(oldhpage, NULL);
+	h_cg = hugetlb_cgroup_from_page(oldhpage, false);
+	h_cg_reservation = hugetlb_cgroup_from_page(oldhpage, true);
+	set_hugetlb_cgroup(oldhpage, NULL, false);

 	/* move the h_cg details to new cgroup */
-	set_hugetlb_cgroup(newhpage, h_cg);
+	set_hugetlb_cgroup(newhpage, h_cg_reservation, true);
 	list_move(&newhpage->lru, &h->hugepage_activelist);
 	spin_unlock(&hugetlb_lock);
 	return;
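
As a closing illustration of the interface this patch adds: an untested
sketch of a caller charging the reservation counter (reserved == true)
and rolling back on failure. Everything except the hugetlb_cgroup_*
calls and alloc_huge_page() is made up for the example:

/* Untested illustration; not part of this series. */
static struct page *example_reserve_and_alloc(struct hstate *h,
					      struct vm_area_struct *vma,
					      unsigned long addr)
{
	int idx = hstate_index(h);
	struct hugetlb_cgroup *h_cg;
	struct page *page;

	/* charge the reservation (reserved == true) counter up front */
	if (hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h),
					 &h_cg, true))
		return NULL;

	page = alloc_huge_page(vma, addr, 0);
	if (IS_ERR(page)) {
		/* roll back the reservation charge on failure */
		hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h),
					       h_cg, true);
		return NULL;
	}

	/* hugetlb_cgroup_commit_charge() must run under hugetlb_lock */
	spin_lock(&hugetlb_lock);
	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg,
				     page, true);
	spin_unlock(&hugetlb_lock);
	return page;
}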