diff mbox

mm, hugetlb_cgroup: suppress SIGBUS when hugetlb_cgroup charge fails

Message ID alpine.DEB.2.21.1805251316090.167008@chino.kir.corp.google.com (mailing list archive)
State New, archived
Headers show

Commit Message

David Rientjes May 25, 2018, 8:16 p.m. UTC
When charging to a hugetlb_cgroup fails, alloc_huge_page() returns
ERR_PTR(-ENOSPC) which will cause VM_FAULT_SIGBUS to be returned to the
page fault handler.

Instead, return the proper error code, ERR_PTR(-ENOMEM), so VM_FAULT_OOM
is handled correctly.  This is consistent with failing mem cgroup charges
in the non-hugetlb fault path.

At the same time, restructure the return paths of alloc_huge_page() so it
is consistent.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/hugetlb.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

Comments

Andrew Morton May 25, 2018, 8:44 p.m. UTC | #1
On Fri, 25 May 2018 13:16:45 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> When charging to a hugetlb_cgroup fails, alloc_huge_page() returns
> ERR_PTR(-ENOSPC) which will cause VM_FAULT_SIGBUS to be returned to the
> page fault handler.
> 
> Instead, return the proper error code, ERR_PTR(-ENOMEM), so VM_FAULT_OOM
> is handled correctly.  This is consistent with failing mem cgroup charges
> in the non-hugetlb fault path.
> 
> At the same time, restructure the return paths of alloc_huge_page() so it
> is consistent.

Patch doesn't appear to match the changelog?

> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2006,8 +2006,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  	 * code of zero indicates a reservation exists (no change).
>  	 */
>  	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
> -	if (map_chg < 0)
> -		return ERR_PTR(-ENOMEM);
> +	if (map_chg < 0) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}

This doesn't change the return value.

>  	/*
>  	 * Processes that did not create the mapping will have no
> @@ -2019,8 +2021,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  	if (map_chg || avoid_reserve) {
>  		gbl_chg = hugepage_subpool_get_pages(spool, 1);
>  		if (gbl_chg < 0) {
> -			vma_end_reservation(h, vma, addr);
> -			return ERR_PTR(-ENOSPC);
> +			ret = -ENOSPC;
> +			goto out_reservation;
>  		}

Nor does this.
 
>  		/*
> @@ -2049,8 +2051,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  	if (!page) {
>  		spin_unlock(&hugetlb_lock);
>  		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
> -		if (!page)
> +		if (!page) {
> +			ret = -ENOSPC;
>  			goto out_uncharge_cgroup;
> +		}

Nor does this.

>  		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
>  			SetPagePrivate(page);
>  			h->resv_huge_pages--;
> @@ -2087,8 +2091,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  out_subpool_put:
>  	if (map_chg || avoid_reserve)
>  		hugepage_subpool_put_pages(spool, 1);
> +out_reservation:
>  	vma_end_reservation(h, vma, addr);
> -	return ERR_PTR(-ENOSPC);
> +out:
> +	return ERR_PTR(ret);
>  }
>  

It would be nice if you could add a comment over alloc_huge_page()
explaining the return values (at least).  Why sometimes ENOMEM, other
times ENOSPC?
David Rientjes May 25, 2018, 8:59 p.m. UTC | #2
On Fri, 25 May 2018, Andrew Morton wrote:

> On Fri, 25 May 2018 13:16:45 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:
> 
> > When charging to a hugetlb_cgroup fails, alloc_huge_page() returns
> > ERR_PTR(-ENOSPC) which will cause VM_FAULT_SIGBUS to be returned to the
> > page fault handler.
> > 
> > Instead, return the proper error code, ERR_PTR(-ENOMEM), so VM_FAULT_OOM
> > is handled correctly.  This is consistent with failing mem cgroup charges
> > in the non-hugetlb fault path.
> > 
> > At the same time, restructure the return paths of alloc_huge_page() so it
> > is consistent.
> 
> Patch doesn't appear to match the changelog?
> 

In what way?

> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -2006,8 +2006,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> >  	 * code of zero indicates a reservation exists (no change).
> >  	 */
> >  	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
> > -	if (map_chg < 0)
> > -		return ERR_PTR(-ENOMEM);
> > +	if (map_chg < 0) {
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> 
> This doesn't change the return value.
> 

This, and the subsequent comments, are referring to the third paragraph of 
the changelog.

The functional part of the change is for the 
hugetlb_cgroup_charge_cgroup() return value that is now actually used.

> >  	/*
> >  	 * Processes that did not create the mapping will have no
> > @@ -2019,8 +2021,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> >  	if (map_chg || avoid_reserve) {
> >  		gbl_chg = hugepage_subpool_get_pages(spool, 1);
> >  		if (gbl_chg < 0) {
> > -			vma_end_reservation(h, vma, addr);
> > -			return ERR_PTR(-ENOSPC);
> > +			ret = -ENOSPC;
> > +			goto out_reservation;
> >  		}
> 
> Nor does this.
>  
> >  		/*
> > @@ -2049,8 +2051,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> >  	if (!page) {
> >  		spin_unlock(&hugetlb_lock);
> >  		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
> > -		if (!page)
> > +		if (!page) {
> > +			ret = -ENOSPC;
> >  			goto out_uncharge_cgroup;
> > +		}
> 
> Nor does this.
> 
> >  		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
> >  			SetPagePrivate(page);
> >  			h->resv_huge_pages--;
> > @@ -2087,8 +2091,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> >  out_subpool_put:
> >  	if (map_chg || avoid_reserve)
> >  		hugepage_subpool_put_pages(spool, 1);
> > +out_reservation:
> >  	vma_end_reservation(h, vma, addr);
> > -	return ERR_PTR(-ENOSPC);
> > +out:
> > +	return ERR_PTR(ret);
> >  }
> >  
> 
> It would be nice if you could add a comment over alloc_huge_page()
> explaining the return values (at least).  Why sometimes ENOMEM, other
> times ENOSPC?
> 

The ENOSPC is used to specifically induce a VM_FAULT_SIGBUS, which 
Documentation/vm/hugetlbfs_reserv.txt specifies is how faults are handled 
if no hugetlb pages are available.
Andrew Morton May 25, 2018, 9:09 p.m. UTC | #3
On Fri, 25 May 2018 13:59:40 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -2006,8 +2006,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> > >  	 * code of zero indicates a reservation exists (no change).
> > >  	 */
> > >  	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
> > > -	if (map_chg < 0)
> > > -		return ERR_PTR(-ENOMEM);
> > > +	if (map_chg < 0) {
> > > +		ret = -ENOMEM;
> > > +		goto out;
> > > +	}
> > 
> > This doesn't change the return value.
> > 
> 
> This, and the subsequent comments, are referring to the third paragraph of 
> the changelog.
> 
> The functional part of the change is for the 
> hugetlb_cgroup_charge_cgroup() return value that is now actually used.


Ah.  Missed that bit.

Altered changelog:

: When charging to a hugetlb_cgroup fails, alloc_huge_page() returns
: ERR_PTR(-ENOSPC) which will cause VM_FAULT_SIGBUS to be returned to the
: page fault handler.
: 
: This is because the return value from hugetlb_cgroup_charge_cgroup() is
: overwritten with ERR_PTR(-ENOSPC).
: 
: Instead, propagate the error code from hugetlb_cgroup_charge_cgroup()
: (ERR_PTR(-ENOMEM)), so VM_FAULT_OOM is handled correctly.  This is
: consistent with failing mem cgroup charges in the non-hugetlb fault path.
: 
: At the same time, restructure the return paths of alloc_huge_page() so it
: is consistent.

>
> > It would be nice if you could add a comment over alloc_huge_page()
> > explaining the return values (at least).  Why sometimes ENOMEM, other
> > times ENOSPC?
> > 
> 
> The ENOSPC is used to specifically induce a VM_FAULT_SIGBUS, which 
> Documentation/vm/hugetlbfs_reserv.txt specifies is how faults are handled 
> if no hugetlb pages are available.

That wasn't a code comment ;) Nobody will know to go looking in
hugetlbfs_reserv.txt - it isn't even referred to from mm/*.c.
David Rientjes May 25, 2018, 10:18 p.m. UTC | #4
On Fri, 25 May 2018, Andrew Morton wrote:

> > > > --- a/mm/hugetlb.c
> > > > +++ b/mm/hugetlb.c
> > > > @@ -2006,8 +2006,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> > > >  	 * code of zero indicates a reservation exists (no change).
> > > >  	 */
> > > >  	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
> > > > -	if (map_chg < 0)
> > > > -		return ERR_PTR(-ENOMEM);
> > > > +	if (map_chg < 0) {
> > > > +		ret = -ENOMEM;
> > > > +		goto out;
> > > > +	}
> > > 
> > > This doesn't change the return value.
> > > 
> > 
> > This, and the subsequent comments, are referring to the third paragraph of 
> > the changelog.
> > 
> > The functional part of the change is for the 
> > hugetlb_cgroup_charge_cgroup() return value that is now actually used.
> 
> 
> Ah.  Missed that bit.
> 

If you'd like this separated into two separate patches, one that fixes the 
actual issue with the hugetlb_cgroup_charge_cgroup() return value and the 
other to use a single exit path with ERR_PTR(ret), that might make it 
easier.  I think the latter is why the bug was introduced: it's too easy 
to force -ENOSPC unintentionally.

> Altered changelog:
> 
> : When charging to a hugetlb_cgroup fails, alloc_huge_page() returns
> : ERR_PTR(-ENOSPC) which will cause VM_FAULT_SIGBUS to be returned to the
> : page fault handler.
> : 
> : This is because the return value from hugetlb_cgroup_charge_cgroup() is
> : overwritten with ERR_PTR(-ENOSPC).
> : 
> : Instead, propagate the error code from hugetlb_cgroup_charge_cgroup()
> : (ERR_PTR(-ENOMEM)), so VM_FAULT_OOM is handled correctly.  This is
> : consistent with failing mem cgroup charges in the non-hugetlb fault path.
> : 
> : At the same time, restructure the return paths of alloc_huge_page() so it
> : is consistent.
> 

LGTM, thanks.

> >
> > > It would be nice if you could add a comment over alloc_huge_page()
> > > explaining the return values (at least).  Why sometimes ENOMEM, other
> > > times ENOSPC?
> > > 
> > 
> > The ENOSPC is used to specifically induce a VM_FAULT_SIGBUS, which 
> > Documentation/vm/hugetlbfs_reserv.txt specifies is how faults are handled 
> > if no hugetlb pages are available.
> 
> That wasn't a code comment ;) Nobody will know to go looking in
> hugetlbfs_reserv.txt - it isn't even referred to from mm/*.c.
> 

Let's see what Mike and Aneesh say, because they may object to using 
VM_FAULT_OOM because there's no way to guarantee that we'll come under the 
limit of hugetlb_cgroup as a result of the oom.  My assumption is that we 
use VM_FAULT_SIGBUS since oom killing will not guarantee that the 
allocation can succeed.  But now a process can get a SIGBUS if its hugetlb 
pages are not allocatable or it's under a limit imposed by hugetlb_cgroup 
that it's not aware of.  Faulting hugetlb pages is certainly risky 
business these days...

Perhaps the optimal solution for reaching hugetlb_cgroup limits is to 
induce an oom kill from within the hugetlb_cgroup itself?  Otherwise the 
unlucky process to fault their hugetlb pages last gets SIGBUS.
Michal Hocko May 28, 2018, 8:52 a.m. UTC | #5
On Fri 25-05-18 13:16:45, David Rientjes wrote:
> When charging to a hugetlb_cgroup fails, alloc_huge_page() returns
> ERR_PTR(-ENOSPC) which will cause VM_FAULT_SIGBUS to be returned to the
> page fault handler.
> 
> Instead, return the proper error code, ERR_PTR(-ENOMEM), so VM_FAULT_OOM
> is handled correctly.  This is consistent with failing mem cgroup charges
> in the non-hugetlb fault path.

Could you describe the actual problem you are trying to solve, please?
Also could you explain Why should be the charge and the reservation
failure any different? My memory is dim but the original hugetlb code
was aiming at being compatible with the reservation failures because in
essence the hugetlb simply subdivides the existing pool between cgroups.
I might misremember of course but the changelog should be much more
clear in that case.

> At the same time, restructure the return paths of alloc_huge_page() so it
> is consistent.

Please make an unrelated change in a separate commit.

> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  mm/hugetlb.c | 18 ++++++++++++------
>  1 file changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2006,8 +2006,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  	 * code of zero indicates a reservation exists (no change).
>  	 */
>  	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
> -	if (map_chg < 0)
> -		return ERR_PTR(-ENOMEM);
> +	if (map_chg < 0) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
>  
>  	/*
>  	 * Processes that did not create the mapping will have no
> @@ -2019,8 +2021,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  	if (map_chg || avoid_reserve) {
>  		gbl_chg = hugepage_subpool_get_pages(spool, 1);
>  		if (gbl_chg < 0) {
> -			vma_end_reservation(h, vma, addr);
> -			return ERR_PTR(-ENOSPC);
> +			ret = -ENOSPC;
> +			goto out_reservation;
>  		}
>  
>  		/*
> @@ -2049,8 +2051,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  	if (!page) {
>  		spin_unlock(&hugetlb_lock);
>  		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
> -		if (!page)
> +		if (!page) {
> +			ret = -ENOSPC;
>  			goto out_uncharge_cgroup;
> +		}
>  		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
>  			SetPagePrivate(page);
>  			h->resv_huge_pages--;
> @@ -2087,8 +2091,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>  out_subpool_put:
>  	if (map_chg || avoid_reserve)
>  		hugepage_subpool_put_pages(spool, 1);
> +out_reservation:
>  	vma_end_reservation(h, vma, addr);
> -	return ERR_PTR(-ENOSPC);
> +out:
> +	return ERR_PTR(ret);
>  }
>  
>  int alloc_bootmem_huge_page(struct hstate *h)
Michal Hocko May 28, 2018, 9:03 a.m. UTC | #6
On Fri 25-05-18 15:18:11, David Rientjes wrote:
[...]
> Let's see what Mike and Aneesh say, because they may object to using 
> VM_FAULT_OOM because there's no way to guarantee that we'll come under the 
> limit of hugetlb_cgroup as a result of the oom.  My assumption is that we 
> use VM_FAULT_SIGBUS since oom killing will not guarantee that the 
> allocation can succeed.

Yes. And the lack of hugetlb awareness in the oom killer is another
reason. There is absolutely no reason to kill a task when somebody
misconfigured the hugetlb pool.

> But now a process can get a SIGBUS if its hugetlb 
> pages are not allocatable or its under a limit imposed by hugetlb_cgroup 
> that it's not aware of.  Faulting hugetlb pages is certainly risky 
> business these days...

It's always been and I am afraid it will always be unless somebody
simply reimplements the current code to be NUMA aware for example (it is
just too easy to drain a per NODE reserves...).

> Perhaps the optimal solution for reaching hugetlb_cgroup limits is to 
> induce an oom kill from within the hugetlb_cgroup itself?  Otherwise the 
> unlucky process to fault their hugetlb pages last gets SIGBUS.

Hmm, so you expect that the killed task would simply return pages to the
pool? Wouldn't that require to have a hugetlb cgroup OOM killer that
would only care about hugetlb reservations of tasks? Is that worth all
the effort and the additional code?
Mike Kravetz May 29, 2018, 6:13 p.m. UTC | #7
On 05/25/2018 01:16 PM, David Rientjes wrote:
> When charging to a hugetlb_cgroup fails, alloc_huge_page() returns
> ERR_PTR(-ENOSPC) which will cause VM_FAULT_SIGBUS to be returned to the
> page fault handler.
> 
> Instead, return the proper error code, ERR_PTR(-ENOMEM), so VM_FAULT_OOM
> is handled correctly.  This is consistent with failing mem cgroup charges
> in the non-hugetlb fault path.

Apologies for the late reply.

I am not 100% sure we want to make this change.  When hugetlb cgroup support
was added by Aneesh, the intention was for the application to get SIGBUS.

commit 2bc64a204697
https://lwn.net/Articles/499255/

Since the code has always caused SIGBUS when exceeding cgroup limit, there
may be applications depending on this behavior.  I would be especially
concerned with HPC applications which were the original purpose for adding
the feature.

Perhaps, the original code should have returned ENOMEM to be consistent as
in your patch.  That does seem to be the more correct behavior.  But, do we
want to change behavior now (admittedly undocumented) and potentially break
some application?

I echo Michal's question about the reason for the change.  If there is a
real problem or issue to solve, that makes more of a case for making the
change.  If it is simply code/behavior cleanup for consistency then I would
suggest not making the change, but rather documenting this as another
hugetlbfs "special behavior".

As a quick test, I added a shell to a hugetlb cgroup with 2 huge page
limit and started a task using huge pages.

Current behavior
----------------
ssh_to_dbg # sudo ./test_mmap 4
mapping 4 huge pages
address 7f62bba00000 read (-)
address 7f62bbc00000 read (-)
Bus error
ssh_to_dbg #

Behavior with patch
-------------------
ssh_to_dbg # sudo ./test_mmap 4
mapping 4 huge pages
address 7f62bba00000 read (-)
address 7f62bbc00000 read (-)
Connection to dbg closed by remote host.
Connection to dbg closed.

OOM did kick in (lots of console/log output) and killed the shell
as well.
David Rientjes May 30, 2018, 8:51 p.m. UTC | #8
Hi Mike,

On Tue, 29 May 2018, Mike Kravetz wrote:

> > When charging to a hugetlb_cgroup fails, alloc_huge_page() returns
> > ERR_PTR(-ENOSPC) which will cause VM_FAULT_SIGBUS to be returned to the
> > page fault handler.
> > 
> > Instead, return the proper error code, ERR_PTR(-ENOMEM), so VM_FAULT_OOM
> > is handled correctly.  This is consistent with failing mem cgroup charges
> > in the non-hugetlb fault path.
> 
> Apologies for the late reply.
> 
> I am not %100 sure we want to make this change.  When hugetlb cgroup support
> was added by Aneesh, the intention was for the application to get SIGBUS.
> 
> commit 2bc64a204697
> https://lwn.net/Articles/499255/
> 
> Since the code has always caused SIGBUS when exceeding cgroup limit, there
> may be applications depending on this behavior.  I would be especially
> concerned with HPC applications which were the original purpose for adding
> the feature.
> 
> Perhaps, the original code should have returned ENOMEM to be consistent as
> in your patch.  That does seem to be the more correct behavior.  But, do we
> want to change behavior now (admittedly undocumented) and potentially break
> some application?
> 
> I echo Michal's question about the reason for the change.  If there is a
> real problem or issue to solve, that makes more of a case for making the
> change.  If it is simply code/behavior cleanup for consistency then I would
> suggest not making the change, but rather documenting this as another
> hugetlbfs "special behavior".
> 

Yes, I mentioned the backwards compatibility issue and I'm not sure there 
is a likely way around it.  But it's rather unfortunate that applications 
can become constrained in such a way that SIGBUS may be unavoidable if 
alloc_buddy_huge_page_with_mpol() cannot allocate from surplus and/or the 
hugetlb_cgroup limit is reached.  Not only are both racy, but applications 
prior to hugetlb_cgroup was introduced may have avoided SIGBUS by checking 
global hstate and are now limited to hugetlb_cgroup constraints 
unknowingly.  It's also not possible to avoid the SIGBUS by trying to 
terminate a lower priority process that has hugetlb reservations.

I'm not sure there is a path forward that can make this more 
deterministic.  We have customers who have reported receiving SIGBUS deep 
in their allocation stack using MAP_HUGETLB and were checking global 
hstate but were unaware of any hugetlb_cgroup restriction.

Andrew, please drop the patch.  I'd like to know if anybody has any ideas 
on how this can be more userspace friendly, however.
diff mbox

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2006,8 +2006,10 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 * code of zero indicates a reservation exists (no change).
 	 */
 	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
-	if (map_chg < 0)
-		return ERR_PTR(-ENOMEM);
+	if (map_chg < 0) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	/*
 	 * Processes that did not create the mapping will have no
@@ -2019,8 +2021,8 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (map_chg || avoid_reserve) {
 		gbl_chg = hugepage_subpool_get_pages(spool, 1);
 		if (gbl_chg < 0) {
-			vma_end_reservation(h, vma, addr);
-			return ERR_PTR(-ENOSPC);
+			ret = -ENOSPC;
+			goto out_reservation;
 		}
 
 		/*
@@ -2049,8 +2051,10 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
 		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
-		if (!page)
+		if (!page) {
+			ret = -ENOSPC;
 			goto out_uncharge_cgroup;
+		}
 		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
 			SetPagePrivate(page);
 			h->resv_huge_pages--;
@@ -2087,8 +2091,10 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 out_subpool_put:
 	if (map_chg || avoid_reserve)
 		hugepage_subpool_put_pages(spool, 1);
+out_reservation:
 	vma_end_reservation(h, vma, addr);
-	return ERR_PTR(-ENOSPC);
+out:
+	return ERR_PTR(ret);
 }
 
 int alloc_bootmem_huge_page(struct hstate *h)