diff mbox series

[v4,11/13] mm/mempolicy: huge-page allocation for many preferred

Message ID 1615952410-36895-12-git-send-email-feng.tang@intel.com (mailing list archive)
State New, archived
Headers show
Series Introduced multi-preference mempolicy | expand

Commit Message

Feng Tang March 17, 2021, 3:40 a.m. UTC
From: Ben Widawsky <ben.widawsky@intel.com>

Implement the missing huge page allocation functionality while obeying
the preferred node semantics.

This uses a fallback mechanism to try multiple preferred nodes first,
and then all other nodes. It cannot use the helper function that was
introduced because huge page allocation already has its own helpers, and
consolidating them would have taken more LOC and effort.

The weirdness is that MPOL_PREFERRED_MANY can't be used yet because it is
part of the UAPI we haven't yet exposed. Instead of making that define
global, it's simply changed with the UAPI patch.

[ feng: add NOWARN flag, and skip the direct reclaim to speed up allocation
  in some cases ]

Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com
Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Signed-off-by: Feng Tang <feng.tang@intel.com>
---
 mm/hugetlb.c   | 26 +++++++++++++++++++++++---
 mm/mempolicy.c |  3 ++-
 2 files changed, 25 insertions(+), 4 deletions(-)

Comments

kernel test robot March 17, 2021, 7:19 a.m. UTC | #1
Hi Feng,

I love your patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on linus/master v5.12-rc3]
[cannot apply to next-20210316]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Feng-Tang/Introduced-multi-preference-mempolicy/20210317-114204
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git a74e6a014c9d4d4161061f770c9b4f98372ac778
config: s390-randconfig-r022-20210317 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 8ef111222a3dd12a9175f69c3bff598c46e8bdf7)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install s390 cross compiling tool for clang build
        # apt-get install binutils-s390x-linux-gnu
        # https://github.com/0day-ci/linux/commit/3bfe0c833846b79ae8bbfff60906c9d7c244c3b8
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Feng-Tang/Introduced-multi-preference-mempolicy/20210317-114204
        git checkout 3bfe0c833846b79ae8bbfff60906c9d7c244c3b8
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=s390 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
                                                             ^
   include/uapi/linux/swab.h:119:21: note: expanded from macro '__swab32'
           ___constant_swab32(x) :                 \
                              ^
   include/uapi/linux/swab.h:19:12: note: expanded from macro '___constant_swab32'
           (((__u32)(x) & (__u32)0x000000ffUL) << 24) |            \
                     ^
   In file included from mm/hugetlb.c:19:
   In file included from include/linux/memblock.h:14:
   In file included from arch/s390/include/asm/dma.h:5:
   In file included from arch/s390/include/asm/io.h:80:
   include/asm-generic/io.h:490:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/big_endian.h:34:59: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
                                                             ^
   include/uapi/linux/swab.h:119:21: note: expanded from macro '__swab32'
           ___constant_swab32(x) :                 \
                              ^
   include/uapi/linux/swab.h:20:12: note: expanded from macro '___constant_swab32'
           (((__u32)(x) & (__u32)0x0000ff00UL) <<  8) |            \
                     ^
   In file included from mm/hugetlb.c:19:
   In file included from include/linux/memblock.h:14:
   In file included from arch/s390/include/asm/dma.h:5:
   In file included from arch/s390/include/asm/io.h:80:
   include/asm-generic/io.h:490:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/big_endian.h:34:59: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
                                                             ^
   include/uapi/linux/swab.h:119:21: note: expanded from macro '__swab32'
           ___constant_swab32(x) :                 \
                              ^
   include/uapi/linux/swab.h:21:12: note: expanded from macro '___constant_swab32'
           (((__u32)(x) & (__u32)0x00ff0000UL) >>  8) |            \
                     ^
   In file included from mm/hugetlb.c:19:
   In file included from include/linux/memblock.h:14:
   In file included from arch/s390/include/asm/dma.h:5:
   In file included from arch/s390/include/asm/io.h:80:
   include/asm-generic/io.h:490:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/big_endian.h:34:59: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
                                                             ^
   include/uapi/linux/swab.h:119:21: note: expanded from macro '__swab32'
           ___constant_swab32(x) :                 \
                              ^
   include/uapi/linux/swab.h:22:12: note: expanded from macro '___constant_swab32'
           (((__u32)(x) & (__u32)0xff000000UL) >> 24)))
                     ^
   In file included from mm/hugetlb.c:19:
   In file included from include/linux/memblock.h:14:
   In file included from arch/s390/include/asm/dma.h:5:
   In file included from arch/s390/include/asm/io.h:80:
   include/asm-generic/io.h:490:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/big_endian.h:34:59: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
                                                             ^
   include/uapi/linux/swab.h:120:12: note: expanded from macro '__swab32'
           __fswab32(x))
                     ^
   In file included from mm/hugetlb.c:19:
   In file included from include/linux/memblock.h:14:
   In file included from arch/s390/include/asm/dma.h:5:
   In file included from arch/s390/include/asm/io.h:80:
   include/asm-generic/io.h:501:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writeb(value, PCI_IOBASE + addr);
                               ~~~~~~~~~~ ^
   include/asm-generic/io.h:511:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
   include/asm-generic/io.h:521:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
   include/asm-generic/io.h:609:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           readsb(PCI_IOBASE + addr, buffer, count);
                  ~~~~~~~~~~ ^
   include/asm-generic/io.h:617:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           readsw(PCI_IOBASE + addr, buffer, count);
                  ~~~~~~~~~~ ^
   include/asm-generic/io.h:625:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           readsl(PCI_IOBASE + addr, buffer, count);
                  ~~~~~~~~~~ ^
   include/asm-generic/io.h:634:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           writesb(PCI_IOBASE + addr, buffer, count);
                   ~~~~~~~~~~ ^
   include/asm-generic/io.h:643:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           writesw(PCI_IOBASE + addr, buffer, count);
                   ~~~~~~~~~~ ^
   include/asm-generic/io.h:652:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           writesl(PCI_IOBASE + addr, buffer, count);
                   ~~~~~~~~~~ ^
>> mm/hugetlb.c:1129:12: error: no member named 'mode' in 'struct mempolicy'
           if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
               ~~~~  ^
   include/linux/compiler.h:56:47: note: expanded from macro 'if'
   #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                                                 ^~~~
   include/linux/compiler.h:58:52: note: expanded from macro '__trace_if_var'
   #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                      ^~~~
>> mm/hugetlb.c:1129:12: error: no member named 'mode' in 'struct mempolicy'
           if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
               ~~~~  ^
   include/linux/compiler.h:56:47: note: expanded from macro 'if'
   #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                                                 ^~~~
   include/linux/compiler.h:58:61: note: expanded from macro '__trace_if_var'
   #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                               ^~~~
>> mm/hugetlb.c:1129:12: error: no member named 'mode' in 'struct mempolicy'
           if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
               ~~~~  ^
   include/linux/compiler.h:56:47: note: expanded from macro 'if'
   #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                                                 ^~~~
   include/linux/compiler.h:58:86: note: expanded from macro '__trace_if_var'
   #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                                                        ^~~~
   include/linux/compiler.h:69:3: note: expanded from macro '__trace_if_value'
           (cond) ?                                        \
            ^~~~
   mm/hugetlb.c:1896:12: error: no member named 'mode' in 'struct mempolicy'
           if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
               ~~~~  ^
   include/linux/compiler.h:56:47: note: expanded from macro 'if'
   #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                                                 ^~~~
   include/linux/compiler.h:58:52: note: expanded from macro '__trace_if_var'
   #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                      ^~~~
   mm/hugetlb.c:1896:12: error: no member named 'mode' in 'struct mempolicy'
           if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
               ~~~~  ^
   include/linux/compiler.h:56:47: note: expanded from macro 'if'
   #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                                                 ^~~~
   include/linux/compiler.h:58:61: note: expanded from macro '__trace_if_var'
   #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                               ^~~~
   mm/hugetlb.c:1896:12: error: no member named 'mode' in 'struct mempolicy'
           if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
               ~~~~  ^
   include/linux/compiler.h:56:47: note: expanded from macro 'if'
   #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                                                 ^~~~
   include/linux/compiler.h:58:86: note: expanded from macro '__trace_if_var'
   #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                                                        ^~~~
   include/linux/compiler.h:69:3: note: expanded from macro '__trace_if_value'
           (cond) ?                                        \
            ^~~~
   20 warnings and 6 errors generated.


vim +1129 mm/hugetlb.c

  1102	
  1103	static struct page *dequeue_huge_page_vma(struct hstate *h,
  1104					struct vm_area_struct *vma,
  1105					unsigned long address, int avoid_reserve,
  1106					long chg)
  1107	{
  1108		struct page *page = NULL;
  1109		struct mempolicy *mpol;
  1110		gfp_t gfp_mask;
  1111		nodemask_t *nodemask;
  1112		int nid;
  1113	
  1114		/*
  1115		 * A child process with MAP_PRIVATE mappings created by their parent
  1116		 * have no page reserves. This check ensures that reservations are
  1117		 * not "stolen". The child may still get SIGKILLed
  1118		 */
  1119		if (!vma_has_reserves(vma, chg) &&
  1120				h->free_huge_pages - h->resv_huge_pages == 0)
  1121			goto err;
  1122	
  1123		/* If reserves cannot be used, ensure enough pages are in the pool */
  1124		if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
  1125			goto err;
  1126	
  1127		gfp_mask = htlb_alloc_mask(h);
  1128		nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
> 1129		if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
  1130			gfp_t gfp_mask1 = gfp_mask | __GFP_NOWARN;
  1131	
  1132			gfp_mask1 &= ~__GFP_DIRECT_RECLAIM;
  1133			page = dequeue_huge_page_nodemask(h,
  1134					gfp_mask1, nid, nodemask);
  1135			if (!page)
  1136				page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
  1137		} else {
  1138			page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
  1139		}
  1140		if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
  1141			SetHPageRestoreReserve(page);
  1142			h->resv_huge_pages--;
  1143		}
  1144	
  1145		mpol_cond_put(mpol);
  1146		return page;
  1147	
  1148	err:
  1149		return NULL;
  1150	}
  1151	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Michal Hocko April 14, 2021, 1:25 p.m. UTC | #2
Please use hugetlb prefix to make it explicit that this is hugetlb
related.

On Wed 17-03-21 11:40:08, Feng Tang wrote:
> From: Ben Widawsky <ben.widawsky@intel.com>
> 
> Implement the missing huge page allocation functionality while obeying
> the preferred node semantics.
> 
> This uses a fallback mechanism to try multiple preferred nodes first,
> and then all other nodes. It cannot use the helper function that was
> introduced because huge page allocation already has its own helpers and
> it was more LOC, and effort to try to consolidate that.
> 
> The weirdness is MPOL_PREFERRED_MANY can't be called yet because it is
> part of the UAPI we haven't yet exposed. Instead of make that define
> global, it's simply changed with the UAPI patch.
> 
> [ feng: add NOWARN flag, and skip the direct reclaim to speedup allocation
>   in some case ]
> 
> Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com
> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> Signed-off-by: Feng Tang <feng.tang@intel.com>
> ---
>  mm/hugetlb.c   | 26 +++++++++++++++++++++++---
>  mm/mempolicy.c |  3 ++-
>  2 files changed, 25 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 8fb42c6..9dfbfa3 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1105,7 +1105,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
>  				unsigned long address, int avoid_reserve,
>  				long chg)
>  {
> -	struct page *page;
> +	struct page *page = NULL;
>  	struct mempolicy *mpol;
>  	gfp_t gfp_mask;
>  	nodemask_t *nodemask;
> @@ -1126,7 +1126,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
>  
>  	gfp_mask = htlb_alloc_mask(h);
>  	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
> -	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> +	if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */

Please use MPOL_PREFERRED_MANY explicitly here.

> +		gfp_t gfp_mask1 = gfp_mask | __GFP_NOWARN;
> +
> +		gfp_mask1 &= ~__GFP_DIRECT_RECLAIM;
> +		page = dequeue_huge_page_nodemask(h,
> +				gfp_mask1, nid, nodemask);
> +		if (!page)
> +			page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
> +	} else {
> +		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> +	}
>  	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
>  		SetHPageRestoreReserve(page);
>  		h->resv_huge_pages--;

__GFP_DIRECT_RECLAIM handling is not needed here. dequeue_huge_page_nodemask
only uses the gfp mask to get zone and cpuset constraints. So the above
should have simply been
	if (mpol->mode == MPOL_PREFERRED_MANY) {
		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
		if (page)
			goto got_page;
		/* fallback to all nodes */
		nodemask = NULL;
	}
	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
got_page:
	if (page ...)

> @@ -1883,7 +1893,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
>  	nodemask_t *nodemask;
>  
>  	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
> -	page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
> +	if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
> +		gfp_t gfp_mask1 = gfp_mask | __GFP_NOWARN;
> +
> +		gfp_mask1 &= ~__GFP_DIRECT_RECLAIM;
> +		page = alloc_surplus_huge_page(h,
> +				gfp_mask1, nid, nodemask);
> +		if (!page)
> +			alloc_surplus_huge_page(h, gfp_mask, nid, NULL);
> +	} else {
> +		page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
> +	}

And here similar
	if (mpol->mode == MPOL_PREFERRED_MANY) {
		page = alloc_surplus_huge_page(h, (gfp_mask | __GFP_NOWARN) & ~(__GFP_DIRECT_RECLAIM), nodemask);
		if (page)
			goto got_page;
		/* fallback to all nodes */
		nodemask = NULL;
	}
	page = alloc_surplus_huge_page(h, gfp_mask, nodemask);
got_page:
>  	mpol_cond_put(mpol);

You can have a dedicated gfp mask here if you prefer, of course, but
calling out MPOL_PREFERRED_MANY explicitly will make the code easier to
read.

>  	return page;
Feng Tang April 15, 2021, 7:41 a.m. UTC | #3
Hi Michal,

Many thanks for reviewing the whole patchset! We will check them.

On Wed, Apr 14, 2021 at 03:25:34PM +0200, Michal Hocko wrote:
> Please use hugetlb prefix to make it explicit that this is hugetlb
> related.
> 
> On Wed 17-03-21 11:40:08, Feng Tang wrote:
> > From: Ben Widawsky <ben.widawsky@intel.com>
> > 
> > Implement the missing huge page allocation functionality while obeying
> > the preferred node semantics.
> > 
> > This uses a fallback mechanism to try multiple preferred nodes first,
> > and then all other nodes. It cannot use the helper function that was
> > introduced because huge page allocation already has its own helpers and
> > it was more LOC, and effort to try to consolidate that.
> > 
> > The weirdness is MPOL_PREFERRED_MANY can't be called yet because it is
> > part of the UAPI we haven't yet exposed. Instead of make that define
> > global, it's simply changed with the UAPI patch.
> > 
> > [ feng: add NOWARN flag, and skip the direct reclaim to speedup allocation
> >   in some case ]
> > 
> > Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com
> > Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
> > Signed-off-by: Feng Tang <feng.tang@intel.com>
> > ---
> >  mm/hugetlb.c   | 26 +++++++++++++++++++++++---
> >  mm/mempolicy.c |  3 ++-
> >  2 files changed, 25 insertions(+), 4 deletions(-)
> > 
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 8fb42c6..9dfbfa3 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1105,7 +1105,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> >  				unsigned long address, int avoid_reserve,
> >  				long chg)
> >  {
> > -	struct page *page;
> > +	struct page *page = NULL;
> >  	struct mempolicy *mpol;
> >  	gfp_t gfp_mask;
> >  	nodemask_t *nodemask;
> > @@ -1126,7 +1126,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> >  
> >  	gfp_mask = htlb_alloc_mask(h);
> >  	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
> > -	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> > +	if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
> 
> Please use MPOL_PREFERRED_MANY explicitly here.
> 
> > +		gfp_t gfp_mask1 = gfp_mask | __GFP_NOWARN;
> > +
> > +		gfp_mask1 &= ~__GFP_DIRECT_RECLAIM;
> > +		page = dequeue_huge_page_nodemask(h,
> > +				gfp_mask1, nid, nodemask);
> > +		if (!page)
> > +			page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
> > +	} else {
> > +		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> > +	}
> >  	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
> >  		SetHPageRestoreReserve(page);
> >  		h->resv_huge_pages--;
> 
> __GFP_DIRECT_RECLAIM handing is not needed here. dequeue_huge_page_nodemask 
> only uses gfp mask to get zone and cpusets constraines. So the above
> should have simply been
> 	if (mpol->mode == MPOL_PREFERRED_MANY) {
> 		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> 		if (page)
> 			goto got_page;
> 		/* fallback to all nodes */
> 		nodemask = NULL;
> 	}
> 	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
> got_page:
> 	if (page ...)

You are right, no need to change the gfp_mask here.


> > @@ -1883,7 +1893,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
> >  	nodemask_t *nodemask;
> >  
> >  	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
> > -	page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
> > +	if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
> > +		gfp_t gfp_mask1 = gfp_mask | __GFP_NOWARN;
> > +
> > +		gfp_mask1 &= ~__GFP_DIRECT_RECLAIM;
> > +		page = alloc_surplus_huge_page(h,
> > +				gfp_mask1, nid, nodemask);
> > +		if (!page)
> > +			alloc_surplus_huge_page(h, gfp_mask, nid, NULL);
> > +	} else {
> > +		page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
> > +	}
> 
> And here similar
> 	if (mpol->mode == MPOL_PREFERRED_MANY) {
> 		page = alloc_surplus_huge_page(h, (gfp_mask | __GFP_NOWARN) & ~(__GFP_DIRECT_RECLAIM), nodemask);
> 		if (page)
> 			goto got_page;
> 		/* fallback to all nodes */
> 		nodemask = NULL;
> 	}
> 	page = alloc_surplus_huge_page(h, gfp_mask, nodemask);
> got_page:
> >  	mpol_cond_put(mpol);
> 
> You can have a dedicated gfp mask here if you prefer of course but I
> calling out MPOL_PREFERRED_MANY explicitly will make the code easier to
> read.

Will follow. The "if (mpol->mode != MPOL_BIND && nodemask) {
/* AKA MPOL_PREFERRED_MANY */ " and "MPOL_MAX + 1" will be replaced
in the 12/13 patch.

Thanks,
Feng

> >  	return page;
> -- 
> Michal Hocko
> SUSE Labs
diff mbox series

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8fb42c6..9dfbfa3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1105,7 +1105,7 @@  static struct page *dequeue_huge_page_vma(struct hstate *h,
 				unsigned long address, int avoid_reserve,
 				long chg)
 {
-	struct page *page;
+	struct page *page = NULL;
 	struct mempolicy *mpol;
 	gfp_t gfp_mask;
 	nodemask_t *nodemask;
@@ -1126,7 +1126,17 @@  static struct page *dequeue_huge_page_vma(struct hstate *h,
 
 	gfp_mask = htlb_alloc_mask(h);
 	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+	if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
+		gfp_t gfp_mask1 = gfp_mask | __GFP_NOWARN;
+
+		gfp_mask1 &= ~__GFP_DIRECT_RECLAIM;
+		page = dequeue_huge_page_nodemask(h,
+				gfp_mask1, nid, nodemask);
+		if (!page)
+			page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
+	} else {
+		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+	}
 	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
 		SetHPageRestoreReserve(page);
 		h->resv_huge_pages--;
@@ -1883,7 +1893,17 @@  struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
 	nodemask_t *nodemask;
 
 	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
-	page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
+	if (mpol->mode != MPOL_BIND && nodemask) { /* AKA MPOL_PREFERRED_MANY */
+		gfp_t gfp_mask1 = gfp_mask | __GFP_NOWARN;
+
+		gfp_mask1 &= ~__GFP_DIRECT_RECLAIM;
+		page = alloc_surplus_huge_page(h,
+				gfp_mask1, nid, nodemask);
+		if (!page)
+			alloc_surplus_huge_page(h, gfp_mask, nid, NULL);
+	} else {
+		page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
+	}
 	mpol_cond_put(mpol);
 
 	return page;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8fe76a7..40d32cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2085,7 +2085,8 @@  int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
 					huge_page_shift(hstate_vma(vma)));
 	} else {
 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
-		if ((*mpol)->mode == MPOL_BIND)
+		if ((*mpol)->mode == MPOL_BIND ||
+		    (*mpol)->mode == MPOL_PREFERRED_MANY)
 			*nodemask = &(*mpol)->nodes;
 	}
 	return nid;