diff mbox series

[v6,4/6] mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY

Message ID 1626077374-81682-5-git-send-email-feng.tang@intel.com (mailing list archive)
State New
Headers show
Series Introduce multi-preference mempolicy | expand

Commit Message

Feng Tang July 12, 2021, 8:09 a.m. UTC
From: Ben Widawsky <ben.widawsky@intel.com>

Implement the missing huge page allocation functionality while obeying
the preferred node semantics. This is similar to the implementation
for general page allocation, as it uses a fallback mechanism to try
multiple preferred nodes first, and then all other nodes.

[Thanks to 0day bot for catching the missing #ifdef CONFIG_NUMA issue]

Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com
Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Co-developed-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Feng Tang <feng.tang@intel.com>
---
 mm/hugetlb.c   | 25 +++++++++++++++++++++++++
 mm/mempolicy.c |  3 ++-
 2 files changed, 27 insertions(+), 1 deletion(-)

Comments

Mike Kravetz July 21, 2021, 8:49 p.m. UTC | #1
On 7/12/21 1:09 AM, Feng Tang wrote:
> From: Ben Widawsky <ben.widawsky@intel.com>
> 
> Implement the missing huge page allocation functionality while obeying
> the preferred node semantics. This is similar to the implementation
> for general page allocation, as it uses a fallback mechanism to try
> multiple preferred nodes first, and then all other nodes.
> 
> [Thanks to 0day bot for catching the missing #ifdef CONFIG_NUMA issue]
> 
> Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com
> Suggested-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> Co-developed-by: Feng Tang <feng.tang@intel.com>
> Signed-off-by: Feng Tang <feng.tang@intel.com>
> ---
>  mm/hugetlb.c   | 25 +++++++++++++++++++++++++
>  mm/mempolicy.c |  3 ++-
>  2 files changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 924553aa8f78..3e84508c1b8c 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1164,7 +1164,18 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
>  
>  	gfp_mask = htlb_alloc_mask(h);
>  	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
> +#ifdef CONFIG_NUMA
> +	if (mpol->mode == MPOL_PREFERRED_MANY) {
> +		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> +		if (page)
> +			goto check_reserve;
> +		/* Fallback to all nodes */
> +		nodemask = NULL;
> +	}
> +#endif
>  	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> +
> +check_reserve:
>  	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
>  		SetHPageRestoreReserve(page);
>  		h->resv_huge_pages--;
> @@ -2095,6 +2106,20 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
>  	nodemask_t *nodemask;
>  
>  	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
> +#ifdef CONFIG_NUMA
> +	if (mpol->mode == MPOL_PREFERRED_MANY) {
> +		gfp_t gfp = (gfp_mask | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;

I believe __GFP_NOWARN will be added later in alloc_buddy_huge_page, so
no need to add here?

> +
> +		page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
> +		if (page) {
> +			mpol_cond_put(mpol);
> +			return page;
> +		}
> +
> +		/* Fallback to all nodes */
> +		nodemask = NULL;
> +	}
> +#endif
>  	page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
>  	mpol_cond_put(mpol);
>  
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 9dce67fc9bb6..93f8789758a7 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -2054,7 +2054,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
>  					huge_page_shift(hstate_vma(vma)));
>  	} else {
>  		nid = policy_node(gfp_flags, *mpol, numa_node_id());
> -		if ((*mpol)->mode == MPOL_BIND)
> +		if ((*mpol)->mode == MPOL_BIND ||
> +		    (*mpol)->mode == MPOL_PREFERRED_MANY)
>  			*nodemask = &(*mpol)->nodes;
>  	}
>  	return nid;
> 

Other than the one nit above,

Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Feng Tang July 22, 2021, 8:11 a.m. UTC | #2
Mike,

On Wed, Jul 21, 2021 at 01:49:15PM -0700, Mike Kravetz wrote:
> On 7/12/21 1:09 AM, Feng Tang wrote:
> > From: Ben Widawsky <ben.widawsky@intel.com>
> > 
> > Implement the missing huge page allocation functionality while obeying
> > the preferred node semantics. This is similar to the implementation
> > for general page allocation, as it uses a fallback mechanism to try
> > multiple preferred nodes first, and then all other nodes.
> > 
> > [Thanks to 0day bot for catching the missing #ifdef CONFIG_NUMA issue]
> > 
> > Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com
> > Suggested-by: Michal Hocko <mhocko@suse.com>
> > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > Co-developed-by: Feng Tang <feng.tang@intel.com>
> > Signed-off-by: Feng Tang <feng.tang@intel.com>
> > ---
> >  mm/hugetlb.c   | 25 +++++++++++++++++++++++++
> >  mm/mempolicy.c |  3 ++-
> >  2 files changed, 27 insertions(+), 1 deletion(-)
> > 
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 924553aa8f78..3e84508c1b8c 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1164,7 +1164,18 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> >  
> >  	gfp_mask = htlb_alloc_mask(h);
> >  	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
> > +#ifdef CONFIG_NUMA
> > +	if (mpol->mode == MPOL_PREFERRED_MANY) {
> > +		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> > +		if (page)
> > +			goto check_reserve;
> > +		/* Fallback to all nodes */
> > +		nodemask = NULL;
> > +	}
> > +#endif
> >  	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> > +
> > +check_reserve:
> >  	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
> >  		SetHPageRestoreReserve(page);
> >  		h->resv_huge_pages--;
> > @@ -2095,6 +2106,20 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
> >  	nodemask_t *nodemask;
> >  
> >  	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
> > +#ifdef CONFIG_NUMA
> > +	if (mpol->mode == MPOL_PREFERRED_MANY) {
> > +		gfp_t gfp = (gfp_mask | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
> 
> I believe __GFP_NOWARN will be added later in alloc_buddy_huge_page, so
> no need to add here?

Thanks for the suggestion, will remove it. 

> > +
> > +		page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
> > +		if (page) {
> > +			mpol_cond_put(mpol);
> > +			return page;
> > +		}
> > +
> > +		/* Fallback to all nodes */
> > +		nodemask = NULL;
> > +	}
> > +#endif
> >  	page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
> >  	mpol_cond_put(mpol);
> >  
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index 9dce67fc9bb6..93f8789758a7 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -2054,7 +2054,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
> >  					huge_page_shift(hstate_vma(vma)));
> >  	} else {
> >  		nid = policy_node(gfp_flags, *mpol, numa_node_id());
> > -		if ((*mpol)->mode == MPOL_BIND)
> > +		if ((*mpol)->mode == MPOL_BIND ||
> > +		    (*mpol)->mode == MPOL_PREFERRED_MANY)
> >  			*nodemask = &(*mpol)->nodes;
> >  	}
> >  	return nid;
> > 
> 
> Other than the one nit above,
> 
> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>

Thanks!



Andrew,

I have to ask for your help again to fold this into the 4/6 patch, thanks!

- Feng

---------------------------8<--------------------------------------------

From de1cd29d8da96856a6d754a30a4c7585d87b8348 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Thu, 22 Jul 2021 16:00:49 +0800
Subject: [PATCH] mm/hugetlb: remove the unneeded __GFP_NOWARN flag setting

Setting __GFP_NOWARN here is redundant, as alloc_buddy_huge_page() will set it anyway.

Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Feng Tang <feng.tang@intel.com>
---
 mm/hugetlb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 528947d..a96e283 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2162,9 +2162,9 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
 	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 #ifdef CONFIG_NUMA
 	if (mpol->mode == MPOL_PREFERRED_MANY) {
-		gfp_t gfp = (gfp_mask | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
-
-		page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+		page = alloc_surplus_huge_page(h,
+					gfp_mask & ~__GFP_DIRECT_RECLAIM,
+					nid, nodemask, false);
 		if (page) {
 			mpol_cond_put(mpol);
 			return page;
Michal Hocko July 22, 2021, 9:42 a.m. UTC | #3
On Wed 21-07-21 13:49:15, Mike Kravetz wrote:
> On 7/12/21 1:09 AM, Feng Tang wrote:
[...]
> > +#ifdef CONFIG_NUMA
> > +	if (mpol->mode == MPOL_PREFERRED_MANY) {
> > +		gfp_t gfp = (gfp_mask | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
> 
> I believe __GFP_NOWARN will be added later in alloc_buddy_huge_page, so
> no need to add here?

The mask is manipulated here anyway and the __GFP_NOWARN is really
telling that there is no need to report the failure for _this_
allocation request. alloc_surplus_huge_page might alter that in whatever
way in the future. So I would keep NOWARN here for the code clarity
rather than rely on some implicit assumption down the path.
Mike Kravetz July 22, 2021, 4:21 p.m. UTC | #4
On 7/22/21 2:42 AM, Michal Hocko wrote:
> On Wed 21-07-21 13:49:15, Mike Kravetz wrote:
>> On 7/12/21 1:09 AM, Feng Tang wrote:
> [...]
>>> +#ifdef CONFIG_NUMA
>>> +	if (mpol->mode == MPOL_PREFERRED_MANY) {
>>> +		gfp_t gfp = (gfp_mask | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
>>
>> I believe __GFP_NOWARN will be added later in alloc_buddy_huge_page, so
>> no need to add here?
> 
> The mask is manipulated here anyway and the __GFP_NOWARN is really
> telling that there is no need to report the failure for _this_
> allocation request. alloc_surplus_huge_page might alter that in whatever
> way in the future. So I would keep NOWARN here for the code clarity
> rather than rely on some implicit assumption down the path.

Makes sense.  Better to leave the __GFP_NOWARN here for clarity.
diff mbox series

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 924553aa8f78..3e84508c1b8c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1164,7 +1164,18 @@  static struct page *dequeue_huge_page_vma(struct hstate *h,
 
 	gfp_mask = htlb_alloc_mask(h);
 	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+#ifdef CONFIG_NUMA
+	if (mpol->mode == MPOL_PREFERRED_MANY) {
+		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+		if (page)
+			goto check_reserve;
+		/* Fallback to all nodes */
+		nodemask = NULL;
+	}
+#endif
 	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
+check_reserve:
 	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
 		SetHPageRestoreReserve(page);
 		h->resv_huge_pages--;
@@ -2095,6 +2106,20 @@  struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
 	nodemask_t *nodemask;
 
 	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
+#ifdef CONFIG_NUMA
+	if (mpol->mode == MPOL_PREFERRED_MANY) {
+		gfp_t gfp = (gfp_mask | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
+
+		page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
+		if (page) {
+			mpol_cond_put(mpol);
+			return page;
+		}
+
+		/* Fallback to all nodes */
+		nodemask = NULL;
+	}
+#endif
 	page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
 	mpol_cond_put(mpol);
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9dce67fc9bb6..93f8789758a7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2054,7 +2054,8 @@  int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
 					huge_page_shift(hstate_vma(vma)));
 	} else {
 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
-		if ((*mpol)->mode == MPOL_BIND)
+		if ((*mpol)->mode == MPOL_BIND ||
+		    (*mpol)->mode == MPOL_PREFERRED_MANY)
 			*nodemask = &(*mpol)->nodes;
 	}
 	return nid;