diff mbox series

[V4,3/4] mm/sparse-vmemmap: Generalise vmemmap_populate_hugepages()

Message ID 20220704112526.2492342-4-chenhuacai@loongson.cn (mailing list archive)
State Superseded
Headers show
Series mm/sparse-vmemmap: Generalise helpers and enable for LoongArch | expand

Commit Message

Huacai Chen July 4, 2022, 11:25 a.m. UTC
From: Feiyang Chen <chenfeiyang@loongson.cn>

Generalise vmemmap_populate_hugepages() so ARM64 & X86 & LoongArch can
share its implementation.

Signed-off-by: Feiyang Chen <chenfeiyang@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/arm64/mm/mmu.c      | 53 ++++++-----------------
 arch/loongarch/mm/init.c | 63 ++++++++-------------------
 arch/x86/mm/init_64.c    | 92 ++++++++++++++--------------------------
 include/linux/mm.h       |  6 +++
 mm/sparse-vmemmap.c      | 54 +++++++++++++++++++++++
 5 files changed, 124 insertions(+), 144 deletions(-)

Comments

Will Deacon July 5, 2022, 9:29 a.m. UTC | #1
On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> From: Feiyang Chen <chenfeiyang@loongson.cn>
> 
> Generalise vmemmap_populate_hugepages() so ARM64 & X86 & LoongArch can
> share its implementation.
> 
> Signed-off-by: Feiyang Chen <chenfeiyang@loongson.cn>
> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> ---
>  arch/arm64/mm/mmu.c      | 53 ++++++-----------------
>  arch/loongarch/mm/init.c | 63 ++++++++-------------------
>  arch/x86/mm/init_64.c    | 92 ++++++++++++++--------------------------
>  include/linux/mm.h       |  6 +++
>  mm/sparse-vmemmap.c      | 54 +++++++++++++++++++++++
>  5 files changed, 124 insertions(+), 144 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 626ec32873c6..b080a65c719d 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -1158,49 +1158,24 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>  	return vmemmap_populate_basepages(start, end, node, altmap);
>  }
>  #else	/* !ARM64_KERNEL_USES_PMD_MAPS */
> +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
> +			       unsigned long addr, unsigned long next)
> +{
> +	pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
> +}
> +
> +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr,
> +				unsigned long next)
> +{
> +	vmemmap_verify((pte_t *)pmdp, node, addr, next);
> +	return 1;
> +}
> +
>  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>  		struct vmem_altmap *altmap)
>  {
> -	unsigned long addr = start;
> -	unsigned long next;
> -	pgd_t *pgdp;
> -	p4d_t *p4dp;
> -	pud_t *pudp;
> -	pmd_t *pmdp;
> -
>  	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
> -	do {
> -		next = pmd_addr_end(addr, end);
> -
> -		pgdp = vmemmap_pgd_populate(addr, node);
> -		if (!pgdp)
> -			return -ENOMEM;
> -
> -		p4dp = vmemmap_p4d_populate(pgdp, addr, node);
> -		if (!p4dp)
> -			return -ENOMEM;
> -
> -		pudp = vmemmap_pud_populate(p4dp, addr, node);
> -		if (!pudp)
> -			return -ENOMEM;
> -
> -		pmdp = pmd_offset(pudp, addr);
> -		if (pmd_none(READ_ONCE(*pmdp))) {
> -			void *p = NULL;
> -
> -			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> -			if (!p) {
> -				if (vmemmap_populate_basepages(addr, next, node, altmap))
> -					return -ENOMEM;
> -				continue;
> -			}
> -
> -			pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
> -		} else
> -			vmemmap_verify((pte_t *)pmdp, node, addr, next);
> -	} while (addr = next, addr != end);
> -
> -	return 0;
> +	return vmemmap_populate_hugepages(start, end, node, altmap);
>  }
>  #endif	/* !ARM64_KERNEL_USES_PMD_MAPS */


I think the arm64 change is mostly ok (thanks!), but I have a question about
the core code you're introducing:

> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index 33e2a1ceee72..6f2e40bb695d 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -686,6 +686,60 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>  	return vmemmap_populate_range(start, end, node, altmap, NULL);
>  }
>  
> +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
> +				      unsigned long addr, unsigned long next)
> +{
> +}
> +
> +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
> +				       unsigned long next)
> +{
> +	return 0;
> +}
> +
> +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> +					 int node, struct vmem_altmap *altmap)
> +{
> +	unsigned long addr;
> +	unsigned long next;
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +
> +	for (addr = start; addr < end; addr = next) {
> +		next = pmd_addr_end(addr, end);
> +
> +		pgd = vmemmap_pgd_populate(addr, node);
> +		if (!pgd)
> +			return -ENOMEM;
> +
> +		p4d = vmemmap_p4d_populate(pgd, addr, node);
> +		if (!p4d)
> +			return -ENOMEM;
> +
> +		pud = vmemmap_pud_populate(p4d, addr, node);
> +		if (!pud)
> +			return -ENOMEM;
> +
> +		pmd = pmd_offset(pud, addr);
> +		if (pmd_none(READ_ONCE(*pmd))) {
> +			void *p;
> +
> +			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> +			if (p) {
> +				vmemmap_set_pmd(pmd, p, node, addr, next);
> +				continue;
> +			} else if (altmap)
> +				return -ENOMEM; /* no fallback */

Why do you return -ENOMEM if 'altmap' here? That seems to be different to
what we currently have on arm64 and it's not clear to me why we're happy
with an altmap for the pmd case, but not for the pte case.

Will
Huacai Chen July 5, 2022, 1:07 p.m. UTC | #2
Hi, Will,

On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
>
> On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> > From: Feiyang Chen <chenfeiyang@loongson.cn>
> >
> > Generalise vmemmap_populate_hugepages() so ARM64 & X86 & LoongArch can
> > share its implementation.
> >
> > Signed-off-by: Feiyang Chen <chenfeiyang@loongson.cn>
> > Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
> > ---
> >  arch/arm64/mm/mmu.c      | 53 ++++++-----------------
> >  arch/loongarch/mm/init.c | 63 ++++++++-------------------
> >  arch/x86/mm/init_64.c    | 92 ++++++++++++++--------------------------
> >  include/linux/mm.h       |  6 +++
> >  mm/sparse-vmemmap.c      | 54 +++++++++++++++++++++++
> >  5 files changed, 124 insertions(+), 144 deletions(-)
> >
> > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> > index 626ec32873c6..b080a65c719d 100644
> > --- a/arch/arm64/mm/mmu.c
> > +++ b/arch/arm64/mm/mmu.c
> > @@ -1158,49 +1158,24 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
> >       return vmemmap_populate_basepages(start, end, node, altmap);
> >  }
> >  #else        /* !ARM64_KERNEL_USES_PMD_MAPS */
> > +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
> > +                            unsigned long addr, unsigned long next)
> > +{
> > +     pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
> > +}
> > +
> > +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr,
> > +                             unsigned long next)
> > +{
> > +     vmemmap_verify((pte_t *)pmdp, node, addr, next);
> > +     return 1;
> > +}
> > +
> >  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
> >               struct vmem_altmap *altmap)
> >  {
> > -     unsigned long addr = start;
> > -     unsigned long next;
> > -     pgd_t *pgdp;
> > -     p4d_t *p4dp;
> > -     pud_t *pudp;
> > -     pmd_t *pmdp;
> > -
> >       WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
> > -     do {
> > -             next = pmd_addr_end(addr, end);
> > -
> > -             pgdp = vmemmap_pgd_populate(addr, node);
> > -             if (!pgdp)
> > -                     return -ENOMEM;
> > -
> > -             p4dp = vmemmap_p4d_populate(pgdp, addr, node);
> > -             if (!p4dp)
> > -                     return -ENOMEM;
> > -
> > -             pudp = vmemmap_pud_populate(p4dp, addr, node);
> > -             if (!pudp)
> > -                     return -ENOMEM;
> > -
> > -             pmdp = pmd_offset(pudp, addr);
> > -             if (pmd_none(READ_ONCE(*pmdp))) {
> > -                     void *p = NULL;
> > -
> > -                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > -                     if (!p) {
> > -                             if (vmemmap_populate_basepages(addr, next, node, altmap))
> > -                                     return -ENOMEM;
> > -                             continue;
> > -                     }
> > -
> > -                     pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
> > -             } else
> > -                     vmemmap_verify((pte_t *)pmdp, node, addr, next);
> > -     } while (addr = next, addr != end);
> > -
> > -     return 0;
> > +     return vmemmap_populate_hugepages(start, end, node, altmap);
> >  }
> >  #endif       /* !ARM64_KERNEL_USES_PMD_MAPS */
>
>
> I think the arm64 change is mostly ok (thanks!), but I have a question about
> the core code you're introducing:
>
> > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > index 33e2a1ceee72..6f2e40bb695d 100644
> > --- a/mm/sparse-vmemmap.c
> > +++ b/mm/sparse-vmemmap.c
> > @@ -686,6 +686,60 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> >       return vmemmap_populate_range(start, end, node, altmap, NULL);
> >  }
> >
> > +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
> > +                                   unsigned long addr, unsigned long next)
> > +{
> > +}
> > +
> > +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
> > +                                    unsigned long next)
> > +{
> > +     return 0;
> > +}
> > +
> > +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> > +                                      int node, struct vmem_altmap *altmap)
> > +{
> > +     unsigned long addr;
> > +     unsigned long next;
> > +     pgd_t *pgd;
> > +     p4d_t *p4d;
> > +     pud_t *pud;
> > +     pmd_t *pmd;
> > +
> > +     for (addr = start; addr < end; addr = next) {
> > +             next = pmd_addr_end(addr, end);
> > +
> > +             pgd = vmemmap_pgd_populate(addr, node);
> > +             if (!pgd)
> > +                     return -ENOMEM;
> > +
> > +             p4d = vmemmap_p4d_populate(pgd, addr, node);
> > +             if (!p4d)
> > +                     return -ENOMEM;
> > +
> > +             pud = vmemmap_pud_populate(p4d, addr, node);
> > +             if (!pud)
> > +                     return -ENOMEM;
> > +
> > +             pmd = pmd_offset(pud, addr);
> > +             if (pmd_none(READ_ONCE(*pmd))) {
> > +                     void *p;
> > +
> > +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > +                     if (p) {
> > +                             vmemmap_set_pmd(pmd, p, node, addr, next);
> > +                             continue;
> > +                     } else if (altmap)
> > +                             return -ENOMEM; /* no fallback */
>
> Why do you return -ENOMEM if 'altmap' here? That seems to be different to
> what we currently have on arm64 and it's not clear to me why we're happy
> with an altmap for the pmd case, but not for the pte case.
The generic version is the same as X86. It seems that ARM64 always
falls back whether or not there is an altmap, but X86 only falls back
in the no-altmap case. I don't know the reason for X86's behaviour;
can Dan Williams give some explanation?

Huacai
>
> Will
Will Deacon July 6, 2022, 4:17 p.m. UTC | #3
On Tue, Jul 05, 2022 at 09:07:59PM +0800, Huacai Chen wrote:
> On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
> > On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> > > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > > index 33e2a1ceee72..6f2e40bb695d 100644
> > > --- a/mm/sparse-vmemmap.c
> > > +++ b/mm/sparse-vmemmap.c
> > > @@ -686,6 +686,60 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> > >       return vmemmap_populate_range(start, end, node, altmap, NULL);
> > >  }
> > >
> > > +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
> > > +                                   unsigned long addr, unsigned long next)
> > > +{
> > > +}
> > > +
> > > +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
> > > +                                    unsigned long next)
> > > +{
> > > +     return 0;
> > > +}
> > > +
> > > +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> > > +                                      int node, struct vmem_altmap *altmap)
> > > +{
> > > +     unsigned long addr;
> > > +     unsigned long next;
> > > +     pgd_t *pgd;
> > > +     p4d_t *p4d;
> > > +     pud_t *pud;
> > > +     pmd_t *pmd;
> > > +
> > > +     for (addr = start; addr < end; addr = next) {
> > > +             next = pmd_addr_end(addr, end);
> > > +
> > > +             pgd = vmemmap_pgd_populate(addr, node);
> > > +             if (!pgd)
> > > +                     return -ENOMEM;
> > > +
> > > +             p4d = vmemmap_p4d_populate(pgd, addr, node);
> > > +             if (!p4d)
> > > +                     return -ENOMEM;
> > > +
> > > +             pud = vmemmap_pud_populate(p4d, addr, node);
> > > +             if (!pud)
> > > +                     return -ENOMEM;
> > > +
> > > +             pmd = pmd_offset(pud, addr);
> > > +             if (pmd_none(READ_ONCE(*pmd))) {
> > > +                     void *p;
> > > +
> > > +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > > +                     if (p) {
> > > +                             vmemmap_set_pmd(pmd, p, node, addr, next);
> > > +                             continue;
> > > +                     } else if (altmap)
> > > +                             return -ENOMEM; /* no fallback */
> >
> > Why do you return -ENOMEM if 'altmap' here? That seems to be different to
> > what we currently have on arm64 and it's not clear to me why we're happy
> > with an altmap for the pmd case, but not for the pte case.
> The generic version is the same as X86. It seems that ARM64 always
> fallback whether there is an altmap, but X86 only fallback in the no
> altmap case. I don't know the reason of X86, can Dan Williams give
> some explaination?

Right, I think we need to understand the new behaviour here before we adopt
it on arm64.

Will
Huacai Chen July 8, 2022, 9:47 a.m. UTC | #4
+Dan Williams
+Sudarshan Rajagopalan

On Thu, Jul 7, 2022 at 12:17 AM Will Deacon <will@kernel.org> wrote:
>
> On Tue, Jul 05, 2022 at 09:07:59PM +0800, Huacai Chen wrote:
> > On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
> > > On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> > > > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > > > index 33e2a1ceee72..6f2e40bb695d 100644
> > > > --- a/mm/sparse-vmemmap.c
> > > > +++ b/mm/sparse-vmemmap.c
> > > > @@ -686,6 +686,60 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> > > >       return vmemmap_populate_range(start, end, node, altmap, NULL);
> > > >  }
> > > >
> > > > +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
> > > > +                                   unsigned long addr, unsigned long next)
> > > > +{
> > > > +}
> > > > +
> > > > +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
> > > > +                                    unsigned long next)
> > > > +{
> > > > +     return 0;
> > > > +}
> > > > +
> > > > +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> > > > +                                      int node, struct vmem_altmap *altmap)
> > > > +{
> > > > +     unsigned long addr;
> > > > +     unsigned long next;
> > > > +     pgd_t *pgd;
> > > > +     p4d_t *p4d;
> > > > +     pud_t *pud;
> > > > +     pmd_t *pmd;
> > > > +
> > > > +     for (addr = start; addr < end; addr = next) {
> > > > +             next = pmd_addr_end(addr, end);
> > > > +
> > > > +             pgd = vmemmap_pgd_populate(addr, node);
> > > > +             if (!pgd)
> > > > +                     return -ENOMEM;
> > > > +
> > > > +             p4d = vmemmap_p4d_populate(pgd, addr, node);
> > > > +             if (!p4d)
> > > > +                     return -ENOMEM;
> > > > +
> > > > +             pud = vmemmap_pud_populate(p4d, addr, node);
> > > > +             if (!pud)
> > > > +                     return -ENOMEM;
> > > > +
> > > > +             pmd = pmd_offset(pud, addr);
> > > > +             if (pmd_none(READ_ONCE(*pmd))) {
> > > > +                     void *p;
> > > > +
> > > > +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > > > +                     if (p) {
> > > > +                             vmemmap_set_pmd(pmd, p, node, addr, next);
> > > > +                             continue;
> > > > +                     } else if (altmap)
> > > > +                             return -ENOMEM; /* no fallback */
> > >
> > > Why do you return -ENOMEM if 'altmap' here? That seems to be different to
> > > what we currently have on arm64 and it's not clear to me why we're happy
> > > with an altmap for the pmd case, but not for the pte case.
> > The generic version is the same as X86. It seems that ARM64 always
> > fallback whether there is an altmap, but X86 only fallback in the no
> > altmap case. I don't know the reason of X86, can Dan Williams give
> > some explaination?
>
> Right, I think we need to understand the new behaviour here before we adopt
> it on arm64.
Hi, Dan,
Could you please tell us the reason? Thanks.

And Sudarshan,
You are the author who added the fallback mechanism to ARM64. Do you
know why ARM64 is different from X86 (which only falls back in the
no-altmap case)?

Huacai

>
> Will
Huacai Chen July 14, 2022, 12:34 p.m. UTC | #5
Oh, Sudarshan Rajagopalan's email address has changed; let's update it.

Huacai

On Fri, Jul 8, 2022 at 5:47 PM Huacai Chen <chenhuacai@kernel.org> wrote:
>
> +Dan Williams
> +Sudarshan Rajagopalan
>
> On Thu, Jul 7, 2022 at 12:17 AM Will Deacon <will@kernel.org> wrote:
> >
> > On Tue, Jul 05, 2022 at 09:07:59PM +0800, Huacai Chen wrote:
> > > On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
> > > > On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> > > > > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > > > > index 33e2a1ceee72..6f2e40bb695d 100644
> > > > > --- a/mm/sparse-vmemmap.c
> > > > > +++ b/mm/sparse-vmemmap.c
> > > > > @@ -686,6 +686,60 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> > > > >       return vmemmap_populate_range(start, end, node, altmap, NULL);
> > > > >  }
> > > > >
> > > > > +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
> > > > > +                                   unsigned long addr, unsigned long next)
> > > > > +{
> > > > > +}
> > > > > +
> > > > > +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
> > > > > +                                    unsigned long next)
> > > > > +{
> > > > > +     return 0;
> > > > > +}
> > > > > +
> > > > > +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> > > > > +                                      int node, struct vmem_altmap *altmap)
> > > > > +{
> > > > > +     unsigned long addr;
> > > > > +     unsigned long next;
> > > > > +     pgd_t *pgd;
> > > > > +     p4d_t *p4d;
> > > > > +     pud_t *pud;
> > > > > +     pmd_t *pmd;
> > > > > +
> > > > > +     for (addr = start; addr < end; addr = next) {
> > > > > +             next = pmd_addr_end(addr, end);
> > > > > +
> > > > > +             pgd = vmemmap_pgd_populate(addr, node);
> > > > > +             if (!pgd)
> > > > > +                     return -ENOMEM;
> > > > > +
> > > > > +             p4d = vmemmap_p4d_populate(pgd, addr, node);
> > > > > +             if (!p4d)
> > > > > +                     return -ENOMEM;
> > > > > +
> > > > > +             pud = vmemmap_pud_populate(p4d, addr, node);
> > > > > +             if (!pud)
> > > > > +                     return -ENOMEM;
> > > > > +
> > > > > +             pmd = pmd_offset(pud, addr);
> > > > > +             if (pmd_none(READ_ONCE(*pmd))) {
> > > > > +                     void *p;
> > > > > +
> > > > > +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > > > > +                     if (p) {
> > > > > +                             vmemmap_set_pmd(pmd, p, node, addr, next);
> > > > > +                             continue;
> > > > > +                     } else if (altmap)
> > > > > +                             return -ENOMEM; /* no fallback */
> > > >
> > > > Why do you return -ENOMEM if 'altmap' here? That seems to be different to
> > > > what we currently have on arm64 and it's not clear to me why we're happy
> > > > with an altmap for the pmd case, but not for the pte case.
> > > The generic version is the same as X86. It seems that ARM64 always
> > > fallback whether there is an altmap, but X86 only fallback in the no
> > > altmap case. I don't know the reason of X86, can Dan Williams give
> > > some explaination?
> >
> > Right, I think we need to understand the new behaviour here before we adopt
> > it on arm64.
> Hi, Dan,
> Could you please tell us the reason? Thanks.
>
> And Sudarshan,
> You are the author of adding a fallback mechanism to ARM64,  do you
> know why ARM64 is different from X86 (only fallback in no altmap
> case)?
>
> Huacai
>
> >
> > Will
David Hildenbrand July 20, 2022, 9:34 a.m. UTC | #6
On 14.07.22 14:34, Huacai Chen wrote:
> Oh, Sudarshan Rajagopalan's Email has changed, Let's update.
> 
> Huacai
> 
> On Fri, Jul 8, 2022 at 5:47 PM Huacai Chen <chenhuacai@kernel.org> wrote:
>>
>> +Dan Williams
>> +Sudarshan Rajagopalan
>>
>> On Thu, Jul 7, 2022 at 12:17 AM Will Deacon <will@kernel.org> wrote:
>>>
>>> On Tue, Jul 05, 2022 at 09:07:59PM +0800, Huacai Chen wrote:
>>>> On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
>>>>> On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
>>>>>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
>>>>>> index 33e2a1ceee72..6f2e40bb695d 100644
>>>>>> --- a/mm/sparse-vmemmap.c
>>>>>> +++ b/mm/sparse-vmemmap.c
>>>>>> @@ -686,6 +686,60 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>>>>>>       return vmemmap_populate_range(start, end, node, altmap, NULL);
>>>>>>  }
>>>>>>
>>>>>> +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
>>>>>> +                                   unsigned long addr, unsigned long next)
>>>>>> +{
>>>>>> +}
>>>>>> +
>>>>>> +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
>>>>>> +                                    unsigned long next)
>>>>>> +{
>>>>>> +     return 0;
>>>>>> +}
>>>>>> +
>>>>>> +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
>>>>>> +                                      int node, struct vmem_altmap *altmap)
>>>>>> +{
>>>>>> +     unsigned long addr;
>>>>>> +     unsigned long next;
>>>>>> +     pgd_t *pgd;
>>>>>> +     p4d_t *p4d;
>>>>>> +     pud_t *pud;
>>>>>> +     pmd_t *pmd;
>>>>>> +
>>>>>> +     for (addr = start; addr < end; addr = next) {
>>>>>> +             next = pmd_addr_end(addr, end);
>>>>>> +
>>>>>> +             pgd = vmemmap_pgd_populate(addr, node);
>>>>>> +             if (!pgd)
>>>>>> +                     return -ENOMEM;
>>>>>> +
>>>>>> +             p4d = vmemmap_p4d_populate(pgd, addr, node);
>>>>>> +             if (!p4d)
>>>>>> +                     return -ENOMEM;
>>>>>> +
>>>>>> +             pud = vmemmap_pud_populate(p4d, addr, node);
>>>>>> +             if (!pud)
>>>>>> +                     return -ENOMEM;
>>>>>> +
>>>>>> +             pmd = pmd_offset(pud, addr);
>>>>>> +             if (pmd_none(READ_ONCE(*pmd))) {
>>>>>> +                     void *p;
>>>>>> +
>>>>>> +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
>>>>>> +                     if (p) {
>>>>>> +                             vmemmap_set_pmd(pmd, p, node, addr, next);
>>>>>> +                             continue;
>>>>>> +                     } else if (altmap)
>>>>>> +                             return -ENOMEM; /* no fallback */
>>>>>
>>>>> Why do you return -ENOMEM if 'altmap' here? That seems to be different to
>>>>> what we currently have on arm64 and it's not clear to me why we're happy
>>>>> with an altmap for the pmd case, but not for the pte case.
>>>> The generic version is the same as X86. It seems that ARM64 always
>>>> fallback whether there is an altmap, but X86 only fallback in the no
>>>> altmap case. I don't know the reason of X86, can Dan Williams give
>>>> some explaination?
>>>
>>> Right, I think we need to understand the new behaviour here before we adopt
>>> it on arm64.
>> Hi, Dan,
>> Could you please tell us the reason? Thanks.
>>
>> And Sudarshan,
>> You are the author of adding a fallback mechanism to ARM64,  do you
>> know why ARM64 is different from X86 (only fallback in no altmap
>> case)?

I think that's a purely theoretical issue: I assume that in any case we
care about, the altmap should be reasonably sized and aligned such that
this will always succeed.

To me it even sounds like the best idea to *consistently* fail if there
is no more space in the altmap, even if we'd have to fallback to PTE
(again, highly unlikely that this is relevant in practice). Could
indicate an altmap-size configuration issue.
Huacai Chen July 21, 2022, 2:08 a.m. UTC | #7
Hi, Will,

On Wed, Jul 20, 2022 at 5:34 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 14.07.22 14:34, Huacai Chen wrote:
> > Oh, Sudarshan Rajagopalan's Email has changed, Let's update.
> >
> > Huacai
> >
> > On Fri, Jul 8, 2022 at 5:47 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> >>
> >> +Dan Williams
> >> +Sudarshan Rajagopalan
> >>
> >> On Thu, Jul 7, 2022 at 12:17 AM Will Deacon <will@kernel.org> wrote:
> >>>
> >>> On Tue, Jul 05, 2022 at 09:07:59PM +0800, Huacai Chen wrote:
> >>>> On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
> >>>>> On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> >>>>>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> >>>>>> index 33e2a1ceee72..6f2e40bb695d 100644
> >>>>>> --- a/mm/sparse-vmemmap.c
> >>>>>> +++ b/mm/sparse-vmemmap.c
> >>>>>> @@ -686,6 +686,60 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> >>>>>>       return vmemmap_populate_range(start, end, node, altmap, NULL);
> >>>>>>  }
> >>>>>>
> >>>>>> +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
> >>>>>> +                                   unsigned long addr, unsigned long next)
> >>>>>> +{
> >>>>>> +}
> >>>>>> +
> >>>>>> +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
> >>>>>> +                                    unsigned long next)
> >>>>>> +{
> >>>>>> +     return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> >>>>>> +                                      int node, struct vmem_altmap *altmap)
> >>>>>> +{
> >>>>>> +     unsigned long addr;
> >>>>>> +     unsigned long next;
> >>>>>> +     pgd_t *pgd;
> >>>>>> +     p4d_t *p4d;
> >>>>>> +     pud_t *pud;
> >>>>>> +     pmd_t *pmd;
> >>>>>> +
> >>>>>> +     for (addr = start; addr < end; addr = next) {
> >>>>>> +             next = pmd_addr_end(addr, end);
> >>>>>> +
> >>>>>> +             pgd = vmemmap_pgd_populate(addr, node);
> >>>>>> +             if (!pgd)
> >>>>>> +                     return -ENOMEM;
> >>>>>> +
> >>>>>> +             p4d = vmemmap_p4d_populate(pgd, addr, node);
> >>>>>> +             if (!p4d)
> >>>>>> +                     return -ENOMEM;
> >>>>>> +
> >>>>>> +             pud = vmemmap_pud_populate(p4d, addr, node);
> >>>>>> +             if (!pud)
> >>>>>> +                     return -ENOMEM;
> >>>>>> +
> >>>>>> +             pmd = pmd_offset(pud, addr);
> >>>>>> +             if (pmd_none(READ_ONCE(*pmd))) {
> >>>>>> +                     void *p;
> >>>>>> +
> >>>>>> +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> >>>>>> +                     if (p) {
> >>>>>> +                             vmemmap_set_pmd(pmd, p, node, addr, next);
> >>>>>> +                             continue;
> >>>>>> +                     } else if (altmap)
> >>>>>> +                             return -ENOMEM; /* no fallback */
> >>>>>
> >>>>> Why do you return -ENOMEM if 'altmap' here? That seems to be different to
> >>>>> what we currently have on arm64 and it's not clear to me why we're happy
> >>>>> with an altmap for the pmd case, but not for the pte case.
> >>>> The generic version is the same as X86. It seems that ARM64 always
> >>>> fallback whether there is an altmap, but X86 only fallback in the no
> >>>> altmap case. I don't know the reason of X86, can Dan Williams give
> >>>> some explaination?
> >>>
> >>> Right, I think we need to understand the new behaviour here before we adopt
> >>> it on arm64.
> >> Hi, Dan,
> >> Could you please tell us the reason? Thanks.
> >>
> >> And Sudarshan,
> >> You are the author of adding a fallback mechanism to ARM64,  do you
> >> know why ARM64 is different from X86 (only fallback in no altmap
> >> case)?
>
> I think that's a purely theoretical issue: I assume that in any case we
> care about, the altmap should be reasonably sized and aligned such that
> this will always succeed.
>
> To me it even sounds like the best idea to *consistently* fail if there
> is no more space in the altmap, even if we'd have to fallback to PTE
> (again, highly unlikely that this is relevant in practice). Could
> indicate an altmap-size configuration issue.

Does David's explanation make things clear? Moreover, I think Dan's
deliberate "no fallback" comment implies that this design was carefully
considered. So I think having the generic version use the X86 logic is
fine.

Huacai
>
> --
> Thanks,
>
> David / dhildenb
>
>
Will Deacon July 21, 2022, 9:55 a.m. UTC | #8
On Thu, Jul 21, 2022 at 10:08:10AM +0800, Huacai Chen wrote:
> On Wed, Jul 20, 2022 at 5:34 PM David Hildenbrand <david@redhat.com> wrote:
> > On 14.07.22 14:34, Huacai Chen wrote:
> > > On Fri, Jul 8, 2022 at 5:47 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> > >> On Thu, Jul 7, 2022 at 12:17 AM Will Deacon <will@kernel.org> wrote:
> > >>> On Tue, Jul 05, 2022 at 09:07:59PM +0800, Huacai Chen wrote:
> > >>>> On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
> > >>>>> On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> > >>>>>> +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> > >>>>>> +                                      int node, struct vmem_altmap *altmap)
> > >>>>>> +{
> > >>>>>> +     unsigned long addr;
> > >>>>>> +     unsigned long next;
> > >>>>>> +     pgd_t *pgd;
> > >>>>>> +     p4d_t *p4d;
> > >>>>>> +     pud_t *pud;
> > >>>>>> +     pmd_t *pmd;
> > >>>>>> +
> > >>>>>> +     for (addr = start; addr < end; addr = next) {
> > >>>>>> +             next = pmd_addr_end(addr, end);
> > >>>>>> +
> > >>>>>> +             pgd = vmemmap_pgd_populate(addr, node);
> > >>>>>> +             if (!pgd)
> > >>>>>> +                     return -ENOMEM;
> > >>>>>> +
> > >>>>>> +             p4d = vmemmap_p4d_populate(pgd, addr, node);
> > >>>>>> +             if (!p4d)
> > >>>>>> +                     return -ENOMEM;
> > >>>>>> +
> > >>>>>> +             pud = vmemmap_pud_populate(p4d, addr, node);
> > >>>>>> +             if (!pud)
> > >>>>>> +                     return -ENOMEM;
> > >>>>>> +
> > >>>>>> +             pmd = pmd_offset(pud, addr);
> > >>>>>> +             if (pmd_none(READ_ONCE(*pmd))) {
> > >>>>>> +                     void *p;
> > >>>>>> +
> > >>>>>> +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > >>>>>> +                     if (p) {
> > >>>>>> +                             vmemmap_set_pmd(pmd, p, node, addr, next);
> > >>>>>> +                             continue;
> > >>>>>> +                     } else if (altmap)
> > >>>>>> +                             return -ENOMEM; /* no fallback */
> > >>>>>
> > >>>>> Why do you return -ENOMEM if 'altmap' here? That seems to be different to
> > >>>>> what we currently have on arm64 and it's not clear to me why we're happy
> > >>>>> with an altmap for the pmd case, but not for the pte case.
> > >>>> The generic version is the same as X86. It seems that ARM64 always
> > >>>> fallback whether there is an altmap, but X86 only fallback in the no
> > >>>> altmap case. I don't know the reason of X86, can Dan Williams give
> > >>>> some explanation?
> > >>>
> > >>> Right, I think we need to understand the new behaviour here before we adopt
> > >>> it on arm64.
> > >> Hi, Dan,
> > >> Could you please tell us the reason? Thanks.
> > >>
> > >> And Sudarshan,
> > >> You are the author of adding a fallback mechanism to ARM64,  do you
> > >> know why ARM64 is different from X86 (only fallback in no altmap
> > >> case)?
> >
> > I think that's a purely theoretical issue: I assume that in any case we
> > care about, the altmap should be reasonably sized and aligned such that
> > this will always succeed.
> >
> > To me it even sounds like the best idea to *consistently* fail if there
> > is no more space in the altmap, even if we'd have to fallback to PTE
> > (again, highly unlikely that this is relevant in practice). Could
> > indicate an altmap-size configuration issue.
> 
> Does David's explanation make things clear? Moreover, I think Dan's
> dedicated comments "no fallback" implies that his design is carefully
> considered. So I think the generic version using the X86 logic is just
> OK.

I think the comment isn't worth the metaphorical paper that it's written
on! If you can bulk it up a bit based on David's reasoning, then that would
help. But yes, I'm happy with the code now, thanks both.

Will
Huacai Chen July 21, 2022, 1:01 p.m. UTC | #9
Hi, Will,

On Thu, Jul 21, 2022 at 5:55 PM Will Deacon <will@kernel.org> wrote:
>
> On Thu, Jul 21, 2022 at 10:08:10AM +0800, Huacai Chen wrote:
> > On Wed, Jul 20, 2022 at 5:34 PM David Hildenbrand <david@redhat.com> wrote:
> > > On 14.07.22 14:34, Huacai Chen wrote:
> > > > On Fri, Jul 8, 2022 at 5:47 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> > > >> On Thu, Jul 7, 2022 at 12:17 AM Will Deacon <will@kernel.org> wrote:
> > > >>> On Tue, Jul 05, 2022 at 09:07:59PM +0800, Huacai Chen wrote:
> > > >>>> On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
> > > >>>>> On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> > > >>>>>> +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> > > >>>>>> +                                      int node, struct vmem_altmap *altmap)
> > > >>>>>> +{
> > > >>>>>> +     unsigned long addr;
> > > >>>>>> +     unsigned long next;
> > > >>>>>> +     pgd_t *pgd;
> > > >>>>>> +     p4d_t *p4d;
> > > >>>>>> +     pud_t *pud;
> > > >>>>>> +     pmd_t *pmd;
> > > >>>>>> +
> > > >>>>>> +     for (addr = start; addr < end; addr = next) {
> > > >>>>>> +             next = pmd_addr_end(addr, end);
> > > >>>>>> +
> > > >>>>>> +             pgd = vmemmap_pgd_populate(addr, node);
> > > >>>>>> +             if (!pgd)
> > > >>>>>> +                     return -ENOMEM;
> > > >>>>>> +
> > > >>>>>> +             p4d = vmemmap_p4d_populate(pgd, addr, node);
> > > >>>>>> +             if (!p4d)
> > > >>>>>> +                     return -ENOMEM;
> > > >>>>>> +
> > > >>>>>> +             pud = vmemmap_pud_populate(p4d, addr, node);
> > > >>>>>> +             if (!pud)
> > > >>>>>> +                     return -ENOMEM;
> > > >>>>>> +
> > > >>>>>> +             pmd = pmd_offset(pud, addr);
> > > >>>>>> +             if (pmd_none(READ_ONCE(*pmd))) {
> > > >>>>>> +                     void *p;
> > > >>>>>> +
> > > >>>>>> +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > > >>>>>> +                     if (p) {
> > > >>>>>> +                             vmemmap_set_pmd(pmd, p, node, addr, next);
> > > >>>>>> +                             continue;
> > > >>>>>> +                     } else if (altmap)
> > > >>>>>> +                             return -ENOMEM; /* no fallback */
> > > >>>>>
> > > >>>>> Why do you return -ENOMEM if 'altmap' here? That seems to be different to
> > > >>>>> what we currently have on arm64 and it's not clear to me why we're happy
> > > >>>>> with an altmap for the pmd case, but not for the pte case.
> > > >>>> The generic version is the same as X86. It seems that ARM64 always
> > > >>>> fallback whether there is an altmap, but X86 only fallback in the no
> > > >>>> altmap case. I don't know the reason of X86, can Dan Williams give
> > > >>>> some explanation?
> > > >>>
> > > >>> Right, I think we need to understand the new behaviour here before we adopt
> > > >>> it on arm64.
> > > >> Hi, Dan,
> > > >> Could you please tell us the reason? Thanks.
> > > >>
> > > >> And Sudarshan,
> > > >> You are the author of adding a fallback mechanism to ARM64,  do you
> > > >> know why ARM64 is different from X86 (only fallback in no altmap
> > > >> case)?
> > >
> > > I think that's a purely theoretical issue: I assume that in any case we
> > > care about, the altmap should be reasonably sized and aligned such that
> > > this will always succeed.
> > >
> > > To me it even sounds like the best idea to *consistently* fail if there
> > > is no more space in the altmap, even if we'd have to fallback to PTE
> > > (again, highly unlikely that this is relevant in practice). Could
> > > indicate an altmap-size configuration issue.
> >
> > Does David's explanation make things clear? Moreover, I think Dan's
> > dedicated comments "no fallback" implies that his design is carefully
> > considered. So I think the generic version using the X86 logic is just
> > OK.
>
> I think the comment isn't worth the metaphorical paper that it's written
> on! If you can bulk it up a bit based on David's reasoning, then that would
> help. But yes, I'm happy with the code now, thanks both.
OK, I will add a detailed comment here.

Huacai
>
> Will
Dan Williams July 21, 2022, 9:54 p.m. UTC | #10
Huacai Chen wrote:
> Hi, Will,
> 
> On Thu, Jul 21, 2022 at 5:55 PM Will Deacon <will@kernel.org> wrote:
> >
> > On Thu, Jul 21, 2022 at 10:08:10AM +0800, Huacai Chen wrote:
> > > On Wed, Jul 20, 2022 at 5:34 PM David Hildenbrand <david@redhat.com> wrote:
> > > > On 14.07.22 14:34, Huacai Chen wrote:
> > > > > On Fri, Jul 8, 2022 at 5:47 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> > > > >> On Thu, Jul 7, 2022 at 12:17 AM Will Deacon <will@kernel.org> wrote:
> > > > >>> On Tue, Jul 05, 2022 at 09:07:59PM +0800, Huacai Chen wrote:
> > > > >>>> On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
> > > > >>>>> On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> > > > >>>>>> +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> > > > >>>>>> +                                      int node, struct vmem_altmap *altmap)
> > > > >>>>>> +{
> > > > >>>>>> +     unsigned long addr;
> > > > >>>>>> +     unsigned long next;
> > > > >>>>>> +     pgd_t *pgd;
> > > > >>>>>> +     p4d_t *p4d;
> > > > >>>>>> +     pud_t *pud;
> > > > >>>>>> +     pmd_t *pmd;
> > > > >>>>>> +
> > > > >>>>>> +     for (addr = start; addr < end; addr = next) {
> > > > >>>>>> +             next = pmd_addr_end(addr, end);
> > > > >>>>>> +
> > > > >>>>>> +             pgd = vmemmap_pgd_populate(addr, node);
> > > > >>>>>> +             if (!pgd)
> > > > >>>>>> +                     return -ENOMEM;
> > > > >>>>>> +
> > > > >>>>>> +             p4d = vmemmap_p4d_populate(pgd, addr, node);
> > > > >>>>>> +             if (!p4d)
> > > > >>>>>> +                     return -ENOMEM;
> > > > >>>>>> +
> > > > >>>>>> +             pud = vmemmap_pud_populate(p4d, addr, node);
> > > > >>>>>> +             if (!pud)
> > > > >>>>>> +                     return -ENOMEM;
> > > > >>>>>> +
> > > > >>>>>> +             pmd = pmd_offset(pud, addr);
> > > > >>>>>> +             if (pmd_none(READ_ONCE(*pmd))) {
> > > > >>>>>> +                     void *p;
> > > > >>>>>> +
> > > > >>>>>> +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > > > >>>>>> +                     if (p) {
> > > > >>>>>> +                             vmemmap_set_pmd(pmd, p, node, addr, next);
> > > > >>>>>> +                             continue;
> > > > >>>>>> +                     } else if (altmap)
> > > > >>>>>> +                             return -ENOMEM; /* no fallback */
> > > > >>>>>
> > > > >>>>> Why do you return -ENOMEM if 'altmap' here? That seems to be different to
> > > > >>>>> what we currently have on arm64 and it's not clear to me why we're happy
> > > > >>>>> with an altmap for the pmd case, but not for the pte case.
> > > > >>>> The generic version is the same as X86. It seems that ARM64 always
> > > > >>>> fallback whether there is an altmap, but X86 only fallback in the no
> > > > >>>> altmap case. I don't know the reason of X86, can Dan Williams give
> > > > >>>> some explanation?
> > > > >>>
> > > > >>> Right, I think we need to understand the new behaviour here before we adopt
> > > > >>> it on arm64.
> > > > >> Hi, Dan,
> > > > >> Could you please tell us the reason? Thanks.
> > > > >>
> > > > >> And Sudarshan,
> > > > >> You are the author of adding a fallback mechanism to ARM64,  do you
> > > > >> know why ARM64 is different from X86 (only fallback in no altmap
> > > > >> case)?
> > > >
> > > > I think that's a purely theoretical issue: I assume that in any case we
> > > > care about, the altmap should be reasonably sized and aligned such that
> > > > this will always succeed.
> > > >
> > > > To me it even sounds like the best idea to *consistently* fail if there
> > > > is no more space in the altmap, even if we'd have to fallback to PTE
> > > > (again, highly unlikely that this is relevant in practice). Could
> > > > indicate an altmap-size configuration issue.
> > >
> > > Does David's explanation make things clear? Moreover, I think Dan's
> > > dedicated comments "no fallback" implies that his design is carefully
> > > considered. So I think the generic version using the X86 logic is just
> > > OK.
> >
> > I think the comment isn't worth the metaphorical paper that it's written
> > on! If you can bulk it up a bit based on David's reasoning, then that would
> > help. But yes, I'm happy with the code now, thanks both.
> OK, I will add a detailed comment here.

Apologies for coming late to the party here, original ping came while I
was on vacation and I only just now noticed the direct questions. All
resolved now or is a question still pending?
Huacai Chen July 22, 2022, 2:31 a.m. UTC | #11
Hi, Dan,

On Fri, Jul 22, 2022 at 5:54 AM Dan Williams <dan.j.williams@intel.com> wrote:
>
> Huacai Chen wrote:
> > Hi, Will,
> >
> > On Thu, Jul 21, 2022 at 5:55 PM Will Deacon <will@kernel.org> wrote:
> > >
> > > On Thu, Jul 21, 2022 at 10:08:10AM +0800, Huacai Chen wrote:
> > > > On Wed, Jul 20, 2022 at 5:34 PM David Hildenbrand <david@redhat.com> wrote:
> > > > > On 14.07.22 14:34, Huacai Chen wrote:
> > > > > > On Fri, Jul 8, 2022 at 5:47 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> > > > > >> On Thu, Jul 7, 2022 at 12:17 AM Will Deacon <will@kernel.org> wrote:
> > > > > >>> On Tue, Jul 05, 2022 at 09:07:59PM +0800, Huacai Chen wrote:
> > > > > >>>> On Tue, Jul 5, 2022 at 5:29 PM Will Deacon <will@kernel.org> wrote:
> > > > > >>>>> On Mon, Jul 04, 2022 at 07:25:25PM +0800, Huacai Chen wrote:
> > > > > >>>>>> +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
> > > > > >>>>>> +                                      int node, struct vmem_altmap *altmap)
> > > > > >>>>>> +{
> > > > > >>>>>> +     unsigned long addr;
> > > > > >>>>>> +     unsigned long next;
> > > > > >>>>>> +     pgd_t *pgd;
> > > > > >>>>>> +     p4d_t *p4d;
> > > > > >>>>>> +     pud_t *pud;
> > > > > >>>>>> +     pmd_t *pmd;
> > > > > >>>>>> +
> > > > > >>>>>> +     for (addr = start; addr < end; addr = next) {
> > > > > >>>>>> +             next = pmd_addr_end(addr, end);
> > > > > >>>>>> +
> > > > > >>>>>> +             pgd = vmemmap_pgd_populate(addr, node);
> > > > > >>>>>> +             if (!pgd)
> > > > > >>>>>> +                     return -ENOMEM;
> > > > > >>>>>> +
> > > > > >>>>>> +             p4d = vmemmap_p4d_populate(pgd, addr, node);
> > > > > >>>>>> +             if (!p4d)
> > > > > >>>>>> +                     return -ENOMEM;
> > > > > >>>>>> +
> > > > > >>>>>> +             pud = vmemmap_pud_populate(p4d, addr, node);
> > > > > >>>>>> +             if (!pud)
> > > > > >>>>>> +                     return -ENOMEM;
> > > > > >>>>>> +
> > > > > >>>>>> +             pmd = pmd_offset(pud, addr);
> > > > > >>>>>> +             if (pmd_none(READ_ONCE(*pmd))) {
> > > > > >>>>>> +                     void *p;
> > > > > >>>>>> +
> > > > > >>>>>> +                     p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > > > > >>>>>> +                     if (p) {
> > > > > >>>>>> +                             vmemmap_set_pmd(pmd, p, node, addr, next);
> > > > > >>>>>> +                             continue;
> > > > > >>>>>> +                     } else if (altmap)
> > > > > >>>>>> +                             return -ENOMEM; /* no fallback */
> > > > > >>>>>
> > > > > >>>>> Why do you return -ENOMEM if 'altmap' here? That seems to be different to
> > > > > >>>>> what we currently have on arm64 and it's not clear to me why we're happy
> > > > > >>>>> with an altmap for the pmd case, but not for the pte case.
> > > > > >>>> The generic version is the same as X86. It seems that ARM64 always
> > > > > >>>> fallback whether there is an altmap, but X86 only fallback in the no
> > > > > >>>> altmap case. I don't know the reason of X86, can Dan Williams give
> > > > > >>>> some explanation?
> > > > > >>>
> > > > > >>> Right, I think we need to understand the new behaviour here before we adopt
> > > > > >>> it on arm64.
> > > > > >> Hi, Dan,
> > > > > >> Could you please tell us the reason? Thanks.
> > > > > >>
> > > > > >> And Sudarshan,
> > > > > >> You are the author of adding a fallback mechanism to ARM64,  do you
> > > > > >> know why ARM64 is different from X86 (only fallback in no altmap
> > > > > >> case)?
> > > > >
> > > > > I think that's a purely theoretical issue: I assume that in any case we
> > > > > care about, the altmap should be reasonably sized and aligned such that
> > > > > this will always succeed.
> > > > >
> > > > > To me it even sounds like the best idea to *consistently* fail if there
> > > > > is no more space in the altmap, even if we'd have to fallback to PTE
> > > > > (again, highly unlikely that this is relevant in practice). Could
> > > > > indicate an altmap-size configuration issue.
> > > >
> > > > Does David's explanation make things clear? Moreover, I think Dan's
> > > > dedicated comments "no fallback" implies that his design is carefully
> > > > considered. So I think the generic version using the X86 logic is just
> > > > OK.
> > >
> > > I think the comment isn't worth the metaphorical paper that it's written
> > > on! If you can bulk it up a bit based on David's reasoning, then that would
> > > help. But yes, I'm happy with the code now, thanks both.
> > OK, I will add a detailed comment here.
>
> Apologies for coming late to the party here, original ping came while I
> was on vacation and I only just now noticed the direct questions. All
> resolved now or is a question still pending?
I updated the patch and added a detailed comment based on David's
explanation [1]. If that description is correct, I think there are no
more questions. Thank you.
[1] https://lore.kernel.org/loongarch/20220721130419.1904711-4-chenhuacai@loongson.cn/T/#u

Huacai
>
diff mbox series

Patch

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 626ec32873c6..b080a65c719d 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1158,49 +1158,24 @@  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 	return vmemmap_populate_basepages(start, end, node, altmap);
 }
 #else	/* !ARM64_KERNEL_USES_PMD_MAPS */
+void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+			       unsigned long addr, unsigned long next)
+{
+	pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
+}
+
+int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr,
+				unsigned long next)
+{
+	vmemmap_verify((pte_t *)pmdp, node, addr, next);
+	return 1;
+}
+
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {
-	unsigned long addr = start;
-	unsigned long next;
-	pgd_t *pgdp;
-	p4d_t *p4dp;
-	pud_t *pudp;
-	pmd_t *pmdp;
-
 	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
-	do {
-		next = pmd_addr_end(addr, end);
-
-		pgdp = vmemmap_pgd_populate(addr, node);
-		if (!pgdp)
-			return -ENOMEM;
-
-		p4dp = vmemmap_p4d_populate(pgdp, addr, node);
-		if (!p4dp)
-			return -ENOMEM;
-
-		pudp = vmemmap_pud_populate(p4dp, addr, node);
-		if (!pudp)
-			return -ENOMEM;
-
-		pmdp = pmd_offset(pudp, addr);
-		if (pmd_none(READ_ONCE(*pmdp))) {
-			void *p = NULL;
-
-			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
-			if (!p) {
-				if (vmemmap_populate_basepages(addr, next, node, altmap))
-					return -ENOMEM;
-				continue;
-			}
-
-			pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
-		} else
-			vmemmap_verify((pte_t *)pmdp, node, addr, next);
-	} while (addr = next, addr != end);
-
-	return 0;
+	return vmemmap_populate_hugepages(start, end, node, altmap);
 }
 #endif	/* !ARM64_KERNEL_USES_PMD_MAPS */
 
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index 35128229fe46..3190b3cd52d1 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -158,52 +158,25 @@  void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
-					 int node, struct vmem_altmap *altmap)
+void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+			       unsigned long addr, unsigned long next)
 {
-	unsigned long addr = start;
-	unsigned long next;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	for (addr = start; addr < end; addr = next) {
-		next = pmd_addr_end(addr, end);
-
-		pgd = vmemmap_pgd_populate(addr, node);
-		if (!pgd)
-			return -ENOMEM;
-		p4d = vmemmap_p4d_populate(pgd, addr, node);
-		if (!p4d)
-			return -ENOMEM;
-		pud = vmemmap_pud_populate(p4d, addr, node);
-		if (!pud)
-			return -ENOMEM;
-
-		pmd = pmd_offset(pud, addr);
-		if (pmd_none(*pmd)) {
-			void *p = NULL;
-
-			p = vmemmap_alloc_block_buf(PMD_SIZE, node, NULL);
-			if (p) {
-				pmd_t entry;
-
-				entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL);
-				pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL;
-				set_pmd_at(&init_mm, addr, pmd, entry);
-
-				continue;
-			}
-		} else if (pmd_val(*pmd) & _PAGE_HUGE) {
-			vmemmap_verify((pte_t *)pmd, node, addr, next);
-			continue;
-		}
-		if (vmemmap_populate_basepages(addr, next, node, NULL))
-			return -ENOMEM;
-	}
-
-	return 0;
+	pmd_t entry;
+
+	entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL);
+	pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL;
+	set_pmd_at(&init_mm, addr, pmd, entry);
+}
+
+int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
+				unsigned long next)
+{
+	int huge = pmd_val(*pmd) & _PAGE_HUGE;
+
+	if (huge)
+		vmemmap_verify((pte_t *)pmd, node, addr, next);
+
+	return huge;
 }
 
 #if CONFIG_PGTABLE_LEVELS == 2
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 39c5246964a9..4911093ee2f3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1532,72 +1532,44 @@  static long __meminitdata addr_start, addr_end;
 static void __meminitdata *p_start, *p_end;
 static int __meminitdata node_start;
 
-static int __meminit vmemmap_populate_hugepages(unsigned long start,
-		unsigned long end, int node, struct vmem_altmap *altmap)
+void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+			       unsigned long addr, unsigned long next)
 {
-	unsigned long addr;
-	unsigned long next;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	for (addr = start; addr < end; addr = next) {
-		next = pmd_addr_end(addr, end);
-
-		pgd = vmemmap_pgd_populate(addr, node);
-		if (!pgd)
-			return -ENOMEM;
-
-		p4d = vmemmap_p4d_populate(pgd, addr, node);
-		if (!p4d)
-			return -ENOMEM;
-
-		pud = vmemmap_pud_populate(p4d, addr, node);
-		if (!pud)
-			return -ENOMEM;
-
-		pmd = pmd_offset(pud, addr);
-		if (pmd_none(*pmd)) {
-			void *p;
-
-			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
-			if (p) {
-				pte_t entry;
-
-				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
-						PAGE_KERNEL_LARGE);
-				set_pmd(pmd, __pmd(pte_val(entry)));
+	pte_t entry;
+
+	entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+			PAGE_KERNEL_LARGE);
+	set_pmd(pmd, __pmd(pte_val(entry)));
+
+	/* check to see if we have contiguous blocks */
+	if (p_end != p || node_start != node) {
+		if (p_start)
+			pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+				addr_start, addr_end-1, p_start, p_end-1, node_start);
+		addr_start = addr;
+		node_start = node;
+		p_start = p;
+	}
 
-				/* check to see if we have contiguous blocks */
-				if (p_end != p || node_start != node) {
-					if (p_start)
-						pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
-						       addr_start, addr_end-1, p_start, p_end-1, node_start);
-					addr_start = addr;
-					node_start = node;
-					p_start = p;
-				}
+	addr_end = addr + PMD_SIZE;
+	p_end = p + PMD_SIZE;
 
-				addr_end = addr + PMD_SIZE;
-				p_end = p + PMD_SIZE;
+	if (!IS_ALIGNED(addr, PMD_SIZE) ||
+		!IS_ALIGNED(next, PMD_SIZE))
+		vmemmap_use_new_sub_pmd(addr, next);
+}
 
-				if (!IS_ALIGNED(addr, PMD_SIZE) ||
-				    !IS_ALIGNED(next, PMD_SIZE))
-					vmemmap_use_new_sub_pmd(addr, next);
+int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
+				unsigned long next)
+{
+	int large = pmd_large(*pmd);
 
-				continue;
-			} else if (altmap)
-				return -ENOMEM; /* no fallback */
-		} else if (pmd_large(*pmd)) {
-			vmemmap_verify((pte_t *)pmd, node, addr, next);
-			vmemmap_use_sub_pmd(addr, next);
-			continue;
-		}
-		if (vmemmap_populate_basepages(addr, next, node, NULL))
-			return -ENOMEM;
+	if (pmd_large(*pmd)) {
+		vmemmap_verify((pte_t *)pmd, node, addr, next);
+		vmemmap_use_sub_pmd(addr, next);
 	}
-	return 0;
+
+	return large;
 }
 
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6ed6bc0a65f..894d59e1ffd6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3216,8 +3216,14 @@  struct vmem_altmap;
 void *vmemmap_alloc_block_buf(unsigned long size, int node,
 			      struct vmem_altmap *altmap);
 void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
+void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+		     unsigned long addr, unsigned long next);
+int vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
+		      unsigned long next);
 int vmemmap_populate_basepages(unsigned long start, unsigned long end,
 			       int node, struct vmem_altmap *altmap);
+int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
+			       int node, struct vmem_altmap *altmap);
 int vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap);
 void vmemmap_populate_print_last(void);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 33e2a1ceee72..6f2e40bb695d 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -686,6 +686,60 @@  int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
 	return vmemmap_populate_range(start, end, node, altmap, NULL);
 }
 
+void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+				      unsigned long addr, unsigned long next)
+{
+}
+
+int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr,
+				       unsigned long next)
+{
+	return 0;
+}
+
+int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
+					 int node, struct vmem_altmap *altmap)
+{
+	unsigned long addr;
+	unsigned long next;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	for (addr = start; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+
+		pgd = vmemmap_pgd_populate(addr, node);
+		if (!pgd)
+			return -ENOMEM;
+
+		p4d = vmemmap_p4d_populate(pgd, addr, node);
+		if (!p4d)
+			return -ENOMEM;
+
+		pud = vmemmap_pud_populate(p4d, addr, node);
+		if (!pud)
+			return -ENOMEM;
+
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(READ_ONCE(*pmd))) {
+			void *p;
+
+			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+			if (p) {
+				vmemmap_set_pmd(pmd, p, node, addr, next);
+				continue;
+			} else if (altmap)
+				return -ENOMEM; /* no fallback */
+		} else if (vmemmap_check_pmd(pmd, node, addr, next))
+			continue;
+		if (vmemmap_populate_basepages(addr, next, node, altmap))
+			return -ENOMEM;
+	}
+	return 0;
+}
+
 /*
  * For compound pages bigger than section size (e.g. x86 1G compound
  * pages with 2M subsection size) fill the rest of sections as tail