diff mbox series

[v9,3/8] hugetlb_cgroup: add reservation accounting for private mappings

Message ID 20191217231615.164161-3-almasrymina@google.com (mailing list archive)
State New
Headers show
Series [v9,1/8] hugetlb_cgroup: Add hugetlb_cgroup reservation counter | expand

Commit Message

Mina Almasry Dec. 17, 2019, 11:16 p.m. UTC
Normally the pointer to the cgroup to uncharge hangs off the struct
page, and gets queried when it's time to free the page. With
hugetlb_cgroup reservations, this is not possible, because a page can
be reserved by one task and actually faulted in by another task.

The best place to put the hugetlb_cgroup pointer to uncharge for
reservations is in the resv_map. But, because the resv_map has different
semantics for private and shared mappings, the code path to
charge/uncharge shared and private mappings is different. This patch
implements charging and uncharging for private mappings.

For private mappings, the counter to uncharge is in
resv_map->reservation_counter. On initializing the resv_map this is set
to NULL. On reservation of a region in a private mapping, the task's
hugetlb_cgroup is charged and the hugetlb_cgroup is placed in
resv_map->reservation_counter.

On hugetlb_vm_op_close, we uncharge resv_map->reservation_counter.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Hillf Danton <hdanton@sina.com>

---

Changes in V9:
- Updated for reparenting of hugetlb reservation accounting.

---
 include/linux/hugetlb.h        |  9 +++++++
 include/linux/hugetlb_cgroup.h | 27 +++++++++++++++++++
 mm/hugetlb.c                   | 47 +++++++++++++++++++++++++++++++++-
 mm/hugetlb_cgroup.c            | 28 --------------------
 4 files changed, 82 insertions(+), 29 deletions(-)

--
2.24.1.735.g03f4e72817-goog

Comments

Mike Kravetz Jan. 14, 2020, 12:55 a.m. UTC | #1
On 12/17/19 3:16 PM, Mina Almasry wrote:
> Normally the pointer to the cgroup to uncharge hangs off the struct
> page, and gets queried when it's time to free the page. With
> hugetlb_cgroup reservations, this is not possible. Because it's possible
> for a page to be reserved by one task and actually faulted in by another
> task.
> 
> The best place to put the hugetlb_cgroup pointer to uncharge for
> reservations is in the resv_map. But, because the resv_map has different
> semantics for private and shared mappings, the code patch to
> charge/uncharge shared and private mappings is different. This patch
> implements charging and uncharging for private mappings.
> 
> For private mappings, the counter to uncharge is in
> resv_map->reservation_counter. On initializing the resv_map this is set
> to NULL. On reservation of a region in private mapping, the tasks
> hugetlb_cgroup is charged and the hugetlb_cgroup is placed is
> resv_map->reservation_counter.
> 
> On hugetlb_vm_op_close, we uncharge resv_map->reservation_counter.
> 
> Signed-off-by: Mina Almasry <almasrymina@google.com>
> Acked-by: Hillf Danton <hdanton@sina.com>
> 
> ---
> 
> Changes in V9:
> - Updated for reparenting of hugetlb reservation accounting.
> 
> ---
>  include/linux/hugetlb.h        |  9 +++++++
>  include/linux/hugetlb_cgroup.h | 27 +++++++++++++++++++
>  mm/hugetlb.c                   | 47 +++++++++++++++++++++++++++++++++-
>  mm/hugetlb_cgroup.c            | 28 --------------------
>  4 files changed, 82 insertions(+), 29 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index dea6143aa0685..e6ab499ba2086 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -46,6 +46,15 @@ struct resv_map {
>  	long adds_in_progress;
>  	struct list_head region_cache;
>  	long region_cache_count;
> +#ifdef CONFIG_CGROUP_HUGETLB
> +	/*
> +	 * On private mappings, the counter to uncharge reservations is stored
> +	 * here. If these fields are 0, then the mapping is shared.

Will *reservation_counter ALWAYS be non-NULL for private mappings?

More on this below.

> +	 */
> +	struct page_counter *reservation_counter;
> +	unsigned long pages_per_hpage;
> +	struct cgroup_subsys_state *css;
> +#endif
>  };
>  extern struct resv_map *resv_map_alloc(void);
>  void resv_map_release(struct kref *ref);
> diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
> index eab8a70d5bcb5..8c320accefe87 100644
> --- a/include/linux/hugetlb_cgroup.h
> +++ b/include/linux/hugetlb_cgroup.h
> @@ -25,6 +25,33 @@ struct hugetlb_cgroup;
>  #define HUGETLB_CGROUP_MIN_ORDER	2
> 
>  #ifdef CONFIG_CGROUP_HUGETLB
> +enum hugetlb_memory_event {
> +	HUGETLB_MAX,
> +	HUGETLB_NR_MEMORY_EVENTS,
> +};
> +
> +struct hugetlb_cgroup {
> +	struct cgroup_subsys_state css;
> +
> +	/*
> +	 * the counter to account for hugepages from hugetlb.
> +	 */
> +	struct page_counter hugepage[HUGE_MAX_HSTATE];
> +
> +	/*
> +	 * the counter to account for hugepage reservations from hugetlb.
> +	 */
> +	struct page_counter reserved_hugepage[HUGE_MAX_HSTATE];
> +
> +	atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
> +	atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
> +
> +	/* Handle for "hugetlb.events" */
> +	struct cgroup_file events_file[HUGE_MAX_HSTATE];
> +
> +	/* Handle for "hugetlb.events.local" */
> +	struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
> +};
> 
>  static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
>  							      bool reserved)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e6e8240f1718c..7782977970301 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -665,6 +665,17 @@ struct resv_map *resv_map_alloc(void)
>  	INIT_LIST_HEAD(&resv_map->regions);
> 
>  	resv_map->adds_in_progress = 0;
> +#ifdef CONFIG_CGROUP_HUGETLB
> +	/*
> +	 * Initialize these to 0. On shared mappings, 0's here indicate these
> +	 * fields don't do cgroup accounting. On private mappings, these will be
> +	 * re-initialized to the proper values, to indicate that hugetlb cgroup
> +	 * reservations are to be un-charged from here.
> +	 */
> +	resv_map->reservation_counter = NULL;
> +	resv_map->pages_per_hpage = 0;
> +	resv_map->css = NULL;
> +#endif
> 
>  	INIT_LIST_HEAD(&resv_map->region_cache);
>  	list_add(&rg->link, &resv_map->region_cache);
> @@ -3145,7 +3156,20 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
> 
>  	reserve = (end - start) - region_count(resv, start, end);
> 
> -	kref_put(&resv->refs, resv_map_release);
> +#ifdef CONFIG_CGROUP_HUGETLB
> +	/*
> +	 * Since we check for HPAGE_RESV_OWNER above, this must a private
> +	 * mapping, and these values should be none-zero, and should point to
> +	 * the hugetlb_cgroup counter to uncharge for this reservation.
> +	 */
> +	WARN_ON(!resv->reservation_counter);
> +	WARN_ON(!resv->pages_per_hpage);
> +	WARN_ON(!resv->css);

I was once again wondering if these were always non-NULL for private mappings.
It seems that reservation_counter (h_cg) would be NULL in these cases from
these early checks in hugetlb_cgroup_charge_cgroup().

int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                 struct hugetlb_cgroup **ptr, bool reserved)
{
        int ret = 0;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = NULL;

        if (hugetlb_cgroup_disabled())
                goto done;
        /*
         * We don't charge any cgroup if the compound page have less
         * than 3 pages.
         */
        if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
                goto done;
	...

It seems like the following hugetlb_cgroup_uncharge_counter() guards
against reservation_counter being NULL (for some of the same reasons).

> +
> +	hugetlb_cgroup_uncharge_counter(resv->reservation_counter,
> +					(end - start) * resv->pages_per_hpage,
> +					resv->css);
> +#endif
> 
>  	if (reserve) {
>  		/*
> @@ -3155,6 +3179,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
>  		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
>  		hugetlb_acct_memory(h, -gbl_reserve);
>  	}
> +
> +	kref_put(&resv->refs, resv_map_release);
>  }
> 
>  static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
> @@ -4501,6 +4527,7 @@ int hugetlb_reserve_pages(struct inode *inode,
>  	struct hstate *h = hstate_inode(inode);
>  	struct hugepage_subpool *spool = subpool_inode(inode);
>  	struct resv_map *resv_map;
> +	struct hugetlb_cgroup *h_cg;
>  	long gbl_reserve;
> 
>  	/* This should never happen */
> @@ -4534,12 +4561,30 @@ int hugetlb_reserve_pages(struct inode *inode,
>  		chg = region_chg(resv_map, from, to);
> 
>  	} else {
> +		/* Private mapping. */
>  		resv_map = resv_map_alloc();
>  		if (!resv_map)
>  			return -ENOMEM;
> 
>  		chg = to - from;
> 
> +		if (hugetlb_cgroup_charge_cgroup(hstate_index(h),
> +						 chg * pages_per_huge_page(h),
> +						 &h_cg, true)) {
> +			kref_put(&resv_map->refs, resv_map_release);
> +			return -ENOMEM;
> +		}
> +

Shouldn't this code be in the #ifdef CONFIG_CGROUP_HUGETLB block?
Mina Almasry Jan. 14, 2020, 10:52 p.m. UTC | #2
On Mon, Jan 13, 2020 at 4:55 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index dea6143aa0685..e6ab499ba2086 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -46,6 +46,15 @@ struct resv_map {
> >       long adds_in_progress;
> >       struct list_head region_cache;
> >       long region_cache_count;
> > +#ifdef CONFIG_CGROUP_HUGETLB
> > +     /*
> > +      * On private mappings, the counter to uncharge reservations is stored
> > +      * here. If these fields are 0, then the mapping is shared.
>
> Will *reservation_counter ALWAYS be non-NULL for private mappings?
>
> More on this below.
>
> > +      */
> > +     struct page_counter *reservation_counter;
> > +     unsigned long pages_per_hpage;
> > +     struct cgroup_subsys_state *css;
> > +#endif
> >  };
> >  extern struct resv_map *resv_map_alloc(void);
> >  void resv_map_release(struct kref *ref);
> > diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
> > index eab8a70d5bcb5..8c320accefe87 100644
> > --- a/include/linux/hugetlb_cgroup.h
> > +++ b/include/linux/hugetlb_cgroup.h
> > @@ -25,6 +25,33 @@ struct hugetlb_cgroup;
> >  #define HUGETLB_CGROUP_MIN_ORDER     2
> >
> >  #ifdef CONFIG_CGROUP_HUGETLB
> > +enum hugetlb_memory_event {
> > +     HUGETLB_MAX,
> > +     HUGETLB_NR_MEMORY_EVENTS,
> > +};
> > +
> > +struct hugetlb_cgroup {
> > +     struct cgroup_subsys_state css;
> > +
> > +     /*
> > +      * the counter to account for hugepages from hugetlb.
> > +      */
> > +     struct page_counter hugepage[HUGE_MAX_HSTATE];
> > +
> > +     /*
> > +      * the counter to account for hugepage reservations from hugetlb.
> > +      */
> > +     struct page_counter reserved_hugepage[HUGE_MAX_HSTATE];
> > +
> > +     atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
> > +     atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
> > +
> > +     /* Handle for "hugetlb.events" */
> > +     struct cgroup_file events_file[HUGE_MAX_HSTATE];
> > +
> > +     /* Handle for "hugetlb.events.local" */
> > +     struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
> > +};
> >
> >  static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
> >                                                             bool reserved)
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index e6e8240f1718c..7782977970301 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -665,6 +665,17 @@ struct resv_map *resv_map_alloc(void)
> >       INIT_LIST_HEAD(&resv_map->regions);
> >
> >       resv_map->adds_in_progress = 0;
> > +#ifdef CONFIG_CGROUP_HUGETLB
> > +     /*
> > +      * Initialize these to 0. On shared mappings, 0's here indicate these
> > +      * fields don't do cgroup accounting. On private mappings, these will be
> > +      * re-initialized to the proper values, to indicate that hugetlb cgroup
> > +      * reservations are to be un-charged from here.
> > +      */
> > +     resv_map->reservation_counter = NULL;
> > +     resv_map->pages_per_hpage = 0;
> > +     resv_map->css = NULL;
> > +#endif
> >
> >       INIT_LIST_HEAD(&resv_map->region_cache);
> >       list_add(&rg->link, &resv_map->region_cache);
> > @@ -3145,7 +3156,20 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
> >
> >       reserve = (end - start) - region_count(resv, start, end);
> >
> > -     kref_put(&resv->refs, resv_map_release);
> > +#ifdef CONFIG_CGROUP_HUGETLB
> > +     /*
> > +      * Since we check for HPAGE_RESV_OWNER above, this must a private
> > +      * mapping, and these values should be none-zero, and should point to
> > +      * the hugetlb_cgroup counter to uncharge for this reservation.
> > +      */
> > +     WARN_ON(!resv->reservation_counter);
> > +     WARN_ON(!resv->pages_per_hpage);
> > +     WARN_ON(!resv->css);
>
> I was once again wondering if these were always non-NULL for private mappings.
> It seems that reservation_counter (h_gc) would be NULL in these cases from
> these early checks in hugetlb_cgroup_charge_cgroup().
>

You are right. I'm fixing in v10 the code and comments to account for
h_cg potentially being NULL, but I'm having trouble testing. Looking
at the code, I'm a bit confused by the checks. Seems to me
hugetlb_cgroup_disabled() is the same as #ifdef CONFIG_CGROUP_HUGETLB;
I can't find a way to enable the Kconfig but have that return false
unless I hack the code. Also seems to me checking huge_page_order is
just super defensive; I skimmed the hugepage sizes allowed code and I
can't find an arch that allows you to configure hugetlb page size to <
2^HUGETLB_CGROUP_MIN_ORDER pages. So in reality these will never fire,
IIUC.

> int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
>                                  struct hugetlb_cgroup **ptr, bool reserved)
> {
>         int ret = 0;
>         struct page_counter *counter;
>         struct hugetlb_cgroup *h_cg = NULL;
>
>         if (hugetlb_cgroup_disabled())
>                 goto done;
>         /*
>          * We don't charge any cgroup if the compound page have less
>          * than 3 pages.
>          */
>         if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
>                 goto done;
>         ...
>
> It seems like the following hugetlb_cgroup_uncharge_counter() guards
> against reservation_counter being NULL (for some of the same reasons).
>
> > +
> > +     hugetlb_cgroup_uncharge_counter(resv->reservation_counter,
> > +                                     (end - start) * resv->pages_per_hpage,
> > +                                     resv->css);
> > +#endif
> >
> >       if (reserve) {
> >               /*
> > @@ -3155,6 +3179,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
> >               gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
> >               hugetlb_acct_memory(h, -gbl_reserve);
> >       }
> > +
> > +     kref_put(&resv->refs, resv_map_release);
> >  }
> >
> >  static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
> > @@ -4501,6 +4527,7 @@ int hugetlb_reserve_pages(struct inode *inode,
> >       struct hstate *h = hstate_inode(inode);
> >       struct hugepage_subpool *spool = subpool_inode(inode);
> >       struct resv_map *resv_map;
> > +     struct hugetlb_cgroup *h_cg;
> >       long gbl_reserve;
> >
> >       /* This should never happen */
> > @@ -4534,12 +4561,30 @@ int hugetlb_reserve_pages(struct inode *inode,
> >               chg = region_chg(resv_map, from, to);
> >
> >       } else {
> > +             /* Private mapping. */
> >               resv_map = resv_map_alloc();
> >               if (!resv_map)
> >                       return -ENOMEM;
> >
> >               chg = to - from;
> >
> > +             if (hugetlb_cgroup_charge_cgroup(hstate_index(h),
> > +                                              chg * pages_per_huge_page(h),
> > +                                              &h_cg, true)) {
> > +                     kref_put(&resv_map->refs, resv_map_release);
> > +                     return -ENOMEM;
> > +             }
> > +
>
> Shouldn't this code be in the #ifdef CONFIG_CGROUP_HUGETLB block?

Not necessary AFAICT, hugetlb_cgroup_charge_cgroup stub returns 0 (no-op).
Mike Kravetz Jan. 17, 2020, 10:09 p.m. UTC | #3
On 1/14/20 2:52 PM, Mina Almasry wrote:
> On Mon, Jan 13, 2020 at 4:55 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>> +#ifdef CONFIG_CGROUP_HUGETLB
>>> +     /*
>>> +      * Since we check for HPAGE_RESV_OWNER above, this must a private
>>> +      * mapping, and these values should be none-zero, and should point to
>>> +      * the hugetlb_cgroup counter to uncharge for this reservation.
>>> +      */
>>> +     WARN_ON(!resv->reservation_counter);
>>> +     WARN_ON(!resv->pages_per_hpage);
>>> +     WARN_ON(!resv->css);
>>
>> I was once again wondering if these were always non-NULL for private mappings.
>> It seems that reservation_counter (h_gc) would be NULL in these cases from
>> these early checks in hugetlb_cgroup_charge_cgroup().
>>
> 
> You are right. I'm fixing in v10 the code and comments to account for
> h_cg potentially being NULL, but I'm having trouble testing. Looking
> at the code, I'm a bit confused by the checks. Seems to me
> hugetlb_cgroup_disabled() is the same as #ifdef CONFIG_CGROUP_HUGETLB;
> I can't find a way to enable the Kconfig but have that return false
> unless I hack the code.

What about the boot options?

cgroup_disable=
cgroup_no_v1=
Mina Almasry Jan. 22, 2020, 9:40 p.m. UTC | #4
On Fri, Jan 17, 2020 at 2:09 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 1/14/20 2:52 PM, Mina Almasry wrote:
> > On Mon, Jan 13, 2020 at 4:55 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >>> +#ifdef CONFIG_CGROUP_HUGETLB
> >>> +     /*
> >>> +      * Since we check for HPAGE_RESV_OWNER above, this must a private
> >>> +      * mapping, and these values should be none-zero, and should point to
> >>> +      * the hugetlb_cgroup counter to uncharge for this reservation.
> >>> +      */
> >>> +     WARN_ON(!resv->reservation_counter);
> >>> +     WARN_ON(!resv->pages_per_hpage);
> >>> +     WARN_ON(!resv->css);
> >>
> >> I was once again wondering if these were always non-NULL for private mappings.
> >> It seems that reservation_counter (h_gc) would be NULL in these cases from
> >> these early checks in hugetlb_cgroup_charge_cgroup().
> >>
> >
> > You are right. I'm fixing in v10 the code and comments to account for
> > h_cg potentially being NULL, but I'm having trouble testing. Looking
> > at the code, I'm a bit confused by the checks. Seems to me
> > hugetlb_cgroup_disabled() is the same as #ifdef CONFIG_CGROUP_HUGETLB;
> > I can't find a way to enable the Kconfig but have that return false
> > unless I hack the code.
>
> What about the boot options?
>
> cgroup_disable=
> cgroup_no_v1=

Thanks, cgroup_disable=hugetlb does it. I ran the libhugetlbfs
tests with patchset v10 and it passed, so it seems the latest version
of the patch should be fine. Of course my hugetlb cgroup tests fail
outright when hugetlb cgroups are disabled so those don't say anything
useful.
diff mbox series

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index dea6143aa0685..e6ab499ba2086 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -46,6 +46,15 @@  struct resv_map {
 	long adds_in_progress;
 	struct list_head region_cache;
 	long region_cache_count;
+#ifdef CONFIG_CGROUP_HUGETLB
+	/*
+	 * On private mappings, the counter to uncharge reservations is stored
+	 * here. If these fields are 0, then the mapping is shared.
+	 */
+	struct page_counter *reservation_counter;
+	unsigned long pages_per_hpage;
+	struct cgroup_subsys_state *css;
+#endif
 };
 extern struct resv_map *resv_map_alloc(void);
 void resv_map_release(struct kref *ref);
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index eab8a70d5bcb5..8c320accefe87 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -25,6 +25,33 @@  struct hugetlb_cgroup;
 #define HUGETLB_CGROUP_MIN_ORDER	2

 #ifdef CONFIG_CGROUP_HUGETLB
+enum hugetlb_memory_event {
+	HUGETLB_MAX,
+	HUGETLB_NR_MEMORY_EVENTS,
+};
+
+struct hugetlb_cgroup {
+	struct cgroup_subsys_state css;
+
+	/*
+	 * the counter to account for hugepages from hugetlb.
+	 */
+	struct page_counter hugepage[HUGE_MAX_HSTATE];
+
+	/*
+	 * the counter to account for hugepage reservations from hugetlb.
+	 */
+	struct page_counter reserved_hugepage[HUGE_MAX_HSTATE];
+
+	atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
+	atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
+
+	/* Handle for "hugetlb.events" */
+	struct cgroup_file events_file[HUGE_MAX_HSTATE];
+
+	/* Handle for "hugetlb.events.local" */
+	struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
+};

 static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
 							      bool reserved)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e6e8240f1718c..7782977970301 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -665,6 +665,17 @@  struct resv_map *resv_map_alloc(void)
 	INIT_LIST_HEAD(&resv_map->regions);

 	resv_map->adds_in_progress = 0;
+#ifdef CONFIG_CGROUP_HUGETLB
+	/*
+	 * Initialize these to 0. On shared mappings, 0's here indicate these
+	 * fields don't do cgroup accounting. On private mappings, these will be
+	 * re-initialized to the proper values, to indicate that hugetlb cgroup
+	 * reservations are to be un-charged from here.
+	 */
+	resv_map->reservation_counter = NULL;
+	resv_map->pages_per_hpage = 0;
+	resv_map->css = NULL;
+#endif

 	INIT_LIST_HEAD(&resv_map->region_cache);
 	list_add(&rg->link, &resv_map->region_cache);
@@ -3145,7 +3156,20 @@  static void hugetlb_vm_op_close(struct vm_area_struct *vma)

 	reserve = (end - start) - region_count(resv, start, end);

-	kref_put(&resv->refs, resv_map_release);
+#ifdef CONFIG_CGROUP_HUGETLB
+	/*
+	 * Since we check for HPAGE_RESV_OWNER above, this must be a private
+	 * mapping, and these values should be non-zero, and should point to
+	 * the hugetlb_cgroup counter to uncharge for this reservation.
+	 */
+	WARN_ON(!resv->reservation_counter);
+	WARN_ON(!resv->pages_per_hpage);
+	WARN_ON(!resv->css);
+
+	hugetlb_cgroup_uncharge_counter(resv->reservation_counter,
+					(end - start) * resv->pages_per_hpage,
+					resv->css);
+#endif

 	if (reserve) {
 		/*
@@ -3155,6 +3179,8 @@  static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
 		hugetlb_acct_memory(h, -gbl_reserve);
 	}
+
+	kref_put(&resv->refs, resv_map_release);
 }

 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
@@ -4501,6 +4527,7 @@  int hugetlb_reserve_pages(struct inode *inode,
 	struct hstate *h = hstate_inode(inode);
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	struct resv_map *resv_map;
+	struct hugetlb_cgroup *h_cg;
 	long gbl_reserve;

 	/* This should never happen */
@@ -4534,12 +4561,30 @@  int hugetlb_reserve_pages(struct inode *inode,
 		chg = region_chg(resv_map, from, to);

 	} else {
+		/* Private mapping. */
 		resv_map = resv_map_alloc();
 		if (!resv_map)
 			return -ENOMEM;

 		chg = to - from;

+		if (hugetlb_cgroup_charge_cgroup(hstate_index(h),
+						 chg * pages_per_huge_page(h),
+						 &h_cg, true)) {
+			kref_put(&resv_map->refs, resv_map_release);
+			return -ENOMEM;
+		}
+
+#ifdef CONFIG_CGROUP_HUGETLB
+		/*
+		 * Since this branch handles private mappings, we attach the
+		 * counter to uncharge for this reservation off resv_map.
+		 */
+		resv_map->reservation_counter =
+			&h_cg->reserved_hugepage[hstate_index(h)];
+		resv_map->pages_per_hpage = pages_per_huge_page(h);
+#endif
+
 		set_vma_resv_map(vma, resv_map);
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index b03270b0d5833..e066122ab9f49 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -23,34 +23,6 @@ 
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>

-enum hugetlb_memory_event {
-	HUGETLB_MAX,
-	HUGETLB_NR_MEMORY_EVENTS,
-};
-
-struct hugetlb_cgroup {
-	struct cgroup_subsys_state css;
-
-	/*
-	 * the counter to account for hugepages from hugetlb.
-	 */
-	struct page_counter hugepage[HUGE_MAX_HSTATE];
-
-	/*
-	 * the counter to account for hugepage reservations from hugetlb.
-	 */
-	struct page_counter reserved_hugepage[HUGE_MAX_HSTATE];
-
-	atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
-	atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
-
-	/* Handle for "hugetlb.events" */
-	struct cgroup_file events_file[HUGE_MAX_HSTATE];
-
-	/* Handle for "hugetlb.events.local" */
-	struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
-};
-
 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
 #define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)