diff mbox series

[v7,2/3] riscv: mm: support Svnapot in hugetlb page

Message ID 20221119112242.3593646-3-panqinglin2020@iscas.ac.cn (mailing list archive)
State Superseded
Headers show
Series riscv, mm: detect svnapot cpu support at runtime | expand

Checks

Context Check Description
conchuod/patch_count success Link
conchuod/cover_letter success Series has a cover letter
conchuod/tree_selection success Guessed tree name to be for-next
conchuod/fixes_present success Fixes tag not required for -next series
conchuod/verify_signedoff success Signed-off-by tag matches author and committer
conchuod/kdoc success Errors and warnings before: 0 this patch: 0
conchuod/module_param success Was 0 now: 0
conchuod/build_rv32_defconfig fail Build failed
conchuod/build_warn_rv64 success Errors and warnings before: 0 this patch: 0
conchuod/dtb_warn_rv64 success Errors and warnings before: 0 this patch: 0
conchuod/header_inline success No static functions without inline keyword in header files
conchuod/checkpatch warning CHECK: Unbalanced braces around else statement WARNING: else is not generally useful after a break or return
conchuod/source_inline success Was 0 now: 0
conchuod/build_rv64_nommu_k210_defconfig success Build OK
conchuod/verify_fixes success No Fixes tag
conchuod/build_rv64_nommu_virt_defconfig success Build OK

Commit Message

Qinglin Pan Nov. 19, 2022, 11:22 a.m. UTC
From: Qinglin Pan <panqinglin2020@iscas.ac.cn>

Svnapot can be used to support 64KB hugetlb page, so it can become a new
option when using hugetlbfs. Add a basic implementation of hugetlb page,
and support 64KB as a size in it by using Svnapot.

For test, boot kernel with command line contains "default_hugepagesz=64K
hugepagesz=64K hugepages=20" and run a simple test like this:

tools/testing/selftests/vm/map_hugetlb 1 16

And it should be passed.

Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>

Comments

Conor Dooley Nov. 19, 2022, 6:28 p.m. UTC | #1
Hey Qinglin,
Couple minor comments here.

On Sat, Nov 19, 2022 at 07:22:41PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> Svnapot can be used to support 64KB hugetlb page, so it can become a new
> option when using hugetlbfs. Add a basic implementation of hugetlb page,
> and support 64KB as a size in it by using Svnapot.
> 
> For test, boot kernel with command line contains "default_hugepagesz=64K
> hugepagesz=64K hugepages=20" and run a simple test like this:
> 
> tools/testing/selftests/vm/map_hugetlb 1 16
> 
> And it should be passed.
> 
> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 31f9a5a160a0..c0307b942ed2 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -43,7 +43,7 @@ config RISCV
>  	select ARCH_USE_QUEUED_RWLOCKS
>  	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>  	select ARCH_WANT_FRAME_POINTERS
> -	select ARCH_WANT_GENERAL_HUGETLB
> +	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
>  	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
>  	select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
>  	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
> diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
> index a5c2ca1d1cd8..854f5db2f45d 100644
> --- a/arch/riscv/include/asm/hugetlb.h
> +++ b/arch/riscv/include/asm/hugetlb.h
> @@ -2,7 +2,39 @@
>  #ifndef _ASM_RISCV_HUGETLB_H
>  #define _ASM_RISCV_HUGETLB_H
>  
> -#include <asm-generic/hugetlb.h>
>  #include <asm/page.h>
>  
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +#define __HAVE_ARCH_HUGE_PTE_CLEAR
> +void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
> +		    pte_t *ptep, unsigned long sz);
> +
> +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
> +void set_huge_pte_at(struct mm_struct *mm,
> +		     unsigned long addr, pte_t *ptep, pte_t pte);
> +
> +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> +			      unsigned long addr, pte_t *ptep);
> +
> +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> +			    unsigned long addr, pte_t *ptep);
> +
> +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
> +			     unsigned long addr, pte_t *ptep);
> +
> +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> +			       unsigned long addr, pte_t *ptep,
> +			       pte_t pte, int dirty);
> +
> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
> +#define arch_make_huge_pte arch_make_huge_pte
> +
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
> +
> +#include <asm-generic/hugetlb.h>
> +
>  #endif /* _ASM_RISCV_HUGETLB_H */
> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
> index 932dadfdca54..130a91b30588 100644
> --- a/arch/riscv/mm/hugetlbpage.c
> +++ b/arch/riscv/mm/hugetlbpage.c
> @@ -2,6 +2,273 @@
>  #include <linux/hugetlb.h>
>  #include <linux/err.h>
>  
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +pte_t *huge_pte_alloc(struct mm_struct *mm,
> +		      struct vm_area_struct *vma,
> +		      unsigned long addr,
> +		      unsigned long sz)
> +{
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte = NULL;
> +	unsigned long order;
> +
> +	pgd = pgd_offset(mm, addr);
> +	p4d = p4d_alloc(mm, pgd, addr);
> +	if (!p4d)
> +		return NULL;
> +
> +	pud = pud_alloc(mm, p4d, addr);
> +	if (!pud)
> +		return NULL;

Please put a line of whitespace after if statements, for loops etc, just
makes the code a little nicer to read...

> +	if (sz == PUD_SIZE) {
> +		pte = (pte_t *)pud;
> +		goto out;
> +	}
> +
> +	if (sz == PMD_SIZE) {
> +		if (want_pmd_share(vma, addr) && pud_none(*pud))
> +			pte = huge_pmd_share(mm, vma, addr, pud);
> +		else
> +			pte = (pte_t *)pmd_alloc(mm, pud, addr);
> +		goto out;
> +	}

..so here too..

> +	pmd = pmd_alloc(mm, pud, addr);
> +	if (!pmd)
> +		return NULL;
> +
> +	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++) {
> +		if (napot_cont_size(order) == sz) {
> +			pte = pte_alloc_map(mm, pmd, (addr & ~napot_cont_mask(order)));
> +			break;
> +		}
> +	}

...and here.

> +out:
> +	WARN_ON_ONCE(pte && pte_present(*pte) && !pte_huge(*pte));
> +	return pte;
> +}
> +
> +pte_t *huge_pte_offset(struct mm_struct *mm,
> +		       unsigned long addr,
> +		       unsigned long sz)
> +{
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte = NULL;
> +	unsigned long order;
> +
> +	pgd = pgd_offset(mm, addr);
> +	if (!pgd_present(*pgd))
> +		return NULL;
> +	p4d = p4d_offset(pgd, addr);
> +	if (!p4d_present(*p4d))
> +		return NULL;
> +
> +	pud = pud_offset(p4d, addr);
> +	if (sz == PUD_SIZE)
> +		/* must be pud huge, non-present or none */
> +		return (pte_t *)pud;
> +	if (!pud_present(*pud))
> +		return NULL;
> +
> +	pmd = pmd_offset(pud, addr);
> +	if (sz == PMD_SIZE)
> +		/* must be pmd huge, non-present or none */
> +		return (pte_t *)pmd;
> +	if (!pmd_present(*pmd))
> +		return NULL;
> +
> +	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++) {
> +		if (napot_cont_size(order) == sz) {
> +			pte = pte_offset_kernel(pmd, (addr & ~napot_cont_mask(order)));
> +			break;
> +		}
> +	}
> +	return pte;
> +}
> +
> +static pte_t get_clear_contig(struct mm_struct *mm,
> +			      unsigned long addr,
> +			      pte_t *ptep,
> +			      unsigned long pte_num)
> +{
> +	pte_t orig_pte = ptep_get(ptep);
> +	unsigned long i;
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
> +		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
> +
> +		if (pte_dirty(pte))
> +			orig_pte = pte_mkdirty(orig_pte);
> +
> +		if (pte_young(pte))
> +			orig_pte = pte_mkyoung(orig_pte);
> +	}
> +	return orig_pte;
> +}
> +
> +static pte_t get_clear_contig_flush(struct mm_struct *mm,
> +				    unsigned long addr,
> +				    pte_t *ptep,
> +				    unsigned long pte_num)
> +{
> +	pte_t orig_pte = get_clear_contig(mm, addr, ptep, pte_num);
> +	bool valid = !pte_none(orig_pte);
> +	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
> +
> +	if (valid)
> +		flush_tlb_range(&vma, addr, addr + (PAGE_SIZE * pte_num));
> +
> +	return orig_pte;
> +}
> +
> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
> +{
> +	unsigned long order;
> +
> +	for_each_napot_order(order) {
> +		if (shift == napot_cont_shift(order)) {
> +			entry = pte_mknapot(entry, order);
> +			break;
> +		}
> +	}
> +	if (order == NAPOT_ORDER_MAX)
> +		entry = pte_mkhuge(entry);
> +
> +	return entry;
> +}
> +
> +void set_huge_pte_at(struct mm_struct *mm,
> +		     unsigned long addr,
> +		     pte_t *ptep,
> +		     pte_t pte)
> +{
> +	int i;
> +	int pte_num;
> +
> +	if (!pte_napot(pte)) {
> +		set_pte_at(mm, addr, ptep, pte);
> +		return;
> +	}
> +
> +	pte_num = napot_pte_num(napot_cont_order(pte));
> +	for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
> +		set_pte_at(mm, addr, ptep, pte);
> +}
> +
> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       pte_t *ptep,
> +			       pte_t pte,
> +			       int dirty)
> +{
> +	pte_t orig_pte;
> +	int i;
> +	int pte_num;
> +	struct mm_struct *mm = vma->vm_mm;
> +
> +	if (!pte_napot(pte))
> +		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
> +
> +	pte_num = napot_pte_num(napot_cont_order(pte));
> +	ptep = huge_pte_offset(mm, addr,
> +			       napot_cont_size(napot_cont_order(pte)));
> +	orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
> +
> +	if (pte_dirty(orig_pte))
> +		pte = pte_mkdirty(pte);
> +
> +	if (pte_young(orig_pte))
> +		pte = pte_mkyoung(pte);
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		set_pte_at(mm, addr, ptep, pte);
> +
> +	return true;
> +}
> +
> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> +			      unsigned long addr,
> +			      pte_t *ptep)
> +{
> +	int pte_num;
> +	pte_t orig_pte = ptep_get(ptep);
> +
> +	if (!pte_napot(orig_pte))
> +		return ptep_get_and_clear(mm, addr, ptep);
> +
> +	pte_num = napot_pte_num(napot_cont_order(orig_pte));
> +
> +	return get_clear_contig(mm, addr, ptep, pte_num);
> +}
> +
> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
> +			     unsigned long addr,
> +			     pte_t *ptep)
> +{
> +	int i;
> +	int pte_num;
> +	pte_t pte = ptep_get(ptep);
> +
> +	if (!pte_napot(pte)) {
> +		ptep_set_wrprotect(mm, addr, ptep);
> +		return;
> +	}
> +
> +	pte_num = napot_pte_num(napot_cont_order(pte));
> +	ptep = huge_pte_offset(mm, addr, napot_cont_size(4UL));
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		ptep_set_wrprotect(mm, addr, ptep);
> +}
> +
> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> +			    unsigned long addr,
> +			    pte_t *ptep)
> +{
> +	int pte_num;
> +	pte_t pte = ptep_get(ptep);
> +
> +	if (!pte_napot(pte))
> +		return ptep_clear_flush(vma, addr, ptep);
> +
> +	pte_num = napot_pte_num(napot_cont_order(pte));
> +
> +	return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
> +}
> +
> +void huge_pte_clear(struct mm_struct *mm,
> +		    unsigned long addr,
> +		    pte_t *ptep,
> +		    unsigned long sz)
> +{
> +	int i, pte_num = 1;
> +	pte_t pte = READ_ONCE(*ptep);
> +
> +	if (pte_napot(pte))
> +		pte_num = napot_pte_num(napot_cont_order(pte));
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		pte_clear(mm, addr, ptep);
> +}
> +
> +static __init int napot_hugetlbpages_init(void)
> +{
> +	if (has_svnapot()) {
> +		unsigned long order;
> +
> +		for_each_napot_order(order)
> +			hugetlb_add_hstate(order);
> +	}
> +	return 0;
> +}
> +arch_initcall(napot_hugetlbpages_init);
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
> +
>  int pud_huge(pud_t pud)
>  {
>  	return pud_leaf(pud);
> @@ -18,8 +285,17 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
>  		return true;
>  	else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
>  		return true;
> -	else
> +	else if (!has_svnapot())
> +		return false;
> +	else {

Here, could you please add {} around the other branches of this if
statement please?

Thanks,
Conor.


> +		unsigned long order;
> +
> +		for_each_napot_order(order) {
> +			if (size == napot_cont_size(order))
> +				return true;
> +		}
>  		return false;
> +	}
>  }
>  
>  #ifdef CONFIG_CONTIG_ALLOC
> -- 
> 2.37.4
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Qinglin Pan Nov. 20, 2022, 2:48 a.m. UTC | #2
Hi Conor,

Thanks for your comments. Will fix them in next version :)

Thanks,
Qinglin

On 11/20/22 2:28 AM, Conor Dooley wrote:
> Hey Qinglin,
> Couple minor comments here.
> 
> On Sat, Nov 19, 2022 at 07:22:41PM +0800, panqinglin2020@iscas.ac.cn wrote:
>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> Svnapot can be used to support 64KB hugetlb page, so it can become a new
>> option when using hugetlbfs. Add a basic implementation of hugetlb page,
>> and support 64KB as a size in it by using Svnapot.
>>
>> For test, boot kernel with command line contains "default_hugepagesz=64K
>> hugepagesz=64K hugepages=20" and run a simple test like this:
>>
>> tools/testing/selftests/vm/map_hugetlb 1 16
>>
>> And it should be passed.
>>
>> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 31f9a5a160a0..c0307b942ed2 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -43,7 +43,7 @@ config RISCV
>>   	select ARCH_USE_QUEUED_RWLOCKS
>>   	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>>   	select ARCH_WANT_FRAME_POINTERS
>> -	select ARCH_WANT_GENERAL_HUGETLB
>> +	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
>>   	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
>>   	select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
>>   	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
>> diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
>> index a5c2ca1d1cd8..854f5db2f45d 100644
>> --- a/arch/riscv/include/asm/hugetlb.h
>> +++ b/arch/riscv/include/asm/hugetlb.h
>> @@ -2,7 +2,39 @@
>>   #ifndef _ASM_RISCV_HUGETLB_H
>>   #define _ASM_RISCV_HUGETLB_H
>>   
>> -#include <asm-generic/hugetlb.h>
>>   #include <asm/page.h>
>>   
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +#define __HAVE_ARCH_HUGE_PTE_CLEAR
>> +void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
>> +		    pte_t *ptep, unsigned long sz);
>> +
>> +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
>> +void set_huge_pte_at(struct mm_struct *mm,
>> +		     unsigned long addr, pte_t *ptep, pte_t pte);
>> +
>> +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
>> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>> +			      unsigned long addr, pte_t *ptep);
>> +
>> +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
>> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
>> +			    unsigned long addr, pte_t *ptep);
>> +
>> +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
>> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
>> +			     unsigned long addr, pte_t *ptep);
>> +
>> +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
>> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>> +			       unsigned long addr, pte_t *ptep,
>> +			       pte_t pte, int dirty);
>> +
>> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
>> +#define arch_make_huge_pte arch_make_huge_pte
>> +
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>> +
>> +#include <asm-generic/hugetlb.h>
>> +
>>   #endif /* _ASM_RISCV_HUGETLB_H */
>> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
>> index 932dadfdca54..130a91b30588 100644
>> --- a/arch/riscv/mm/hugetlbpage.c
>> +++ b/arch/riscv/mm/hugetlbpage.c
>> @@ -2,6 +2,273 @@
>>   #include <linux/hugetlb.h>
>>   #include <linux/err.h>
>>   
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +pte_t *huge_pte_alloc(struct mm_struct *mm,
>> +		      struct vm_area_struct *vma,
>> +		      unsigned long addr,
>> +		      unsigned long sz)
>> +{
>> +	pgd_t *pgd;
>> +	p4d_t *p4d;
>> +	pud_t *pud;
>> +	pmd_t *pmd;
>> +	pte_t *pte = NULL;
>> +	unsigned long order;
>> +
>> +	pgd = pgd_offset(mm, addr);
>> +	p4d = p4d_alloc(mm, pgd, addr);
>> +	if (!p4d)
>> +		return NULL;
>> +
>> +	pud = pud_alloc(mm, p4d, addr);
>> +	if (!pud)
>> +		return NULL;
> 
> Please put a line of whitespace after if statements, for loops etc, just
> makes the code a little nicer to read...
> 
>> +	if (sz == PUD_SIZE) {
>> +		pte = (pte_t *)pud;
>> +		goto out;
>> +	}
>> +
>> +	if (sz == PMD_SIZE) {
>> +		if (want_pmd_share(vma, addr) && pud_none(*pud))
>> +			pte = huge_pmd_share(mm, vma, addr, pud);
>> +		else
>> +			pte = (pte_t *)pmd_alloc(mm, pud, addr);
>> +		goto out;
>> +	}
> 
> ..so here too..
> 
>> +	pmd = pmd_alloc(mm, pud, addr);
>> +	if (!pmd)
>> +		return NULL;
>> +
>> +	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++) {
>> +		if (napot_cont_size(order) == sz) {
>> +			pte = pte_alloc_map(mm, pmd, (addr & ~napot_cont_mask(order)));
>> +			break;
>> +		}
>> +	}
> 
> ...and here.
> 
>> +out:
>> +	WARN_ON_ONCE(pte && pte_present(*pte) && !pte_huge(*pte));
>> +	return pte;
>> +}
>> +
>> +pte_t *huge_pte_offset(struct mm_struct *mm,
>> +		       unsigned long addr,
>> +		       unsigned long sz)
>> +{
>> +	pgd_t *pgd;
>> +	p4d_t *p4d;
>> +	pud_t *pud;
>> +	pmd_t *pmd;
>> +	pte_t *pte = NULL;
>> +	unsigned long order;
>> +
>> +	pgd = pgd_offset(mm, addr);
>> +	if (!pgd_present(*pgd))
>> +		return NULL;
>> +	p4d = p4d_offset(pgd, addr);
>> +	if (!p4d_present(*p4d))
>> +		return NULL;
>> +
>> +	pud = pud_offset(p4d, addr);
>> +	if (sz == PUD_SIZE)
>> +		/* must be pud huge, non-present or none */
>> +		return (pte_t *)pud;
>> +	if (!pud_present(*pud))
>> +		return NULL;
>> +
>> +	pmd = pmd_offset(pud, addr);
>> +	if (sz == PMD_SIZE)
>> +		/* must be pmd huge, non-present or none */
>> +		return (pte_t *)pmd;
>> +	if (!pmd_present(*pmd))
>> +		return NULL;
>> +
>> +	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++) {
>> +		if (napot_cont_size(order) == sz) {
>> +			pte = pte_offset_kernel(pmd, (addr & ~napot_cont_mask(order)));
>> +			break;
>> +		}
>> +	}
>> +	return pte;
>> +}
>> +
>> +static pte_t get_clear_contig(struct mm_struct *mm,
>> +			      unsigned long addr,
>> +			      pte_t *ptep,
>> +			      unsigned long pte_num)
>> +{
>> +	pte_t orig_pte = ptep_get(ptep);
>> +	unsigned long i;
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
>> +		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
>> +
>> +		if (pte_dirty(pte))
>> +			orig_pte = pte_mkdirty(orig_pte);
>> +
>> +		if (pte_young(pte))
>> +			orig_pte = pte_mkyoung(orig_pte);
>> +	}
>> +	return orig_pte;
>> +}
>> +
>> +static pte_t get_clear_contig_flush(struct mm_struct *mm,
>> +				    unsigned long addr,
>> +				    pte_t *ptep,
>> +				    unsigned long pte_num)
>> +{
>> +	pte_t orig_pte = get_clear_contig(mm, addr, ptep, pte_num);
>> +	bool valid = !pte_none(orig_pte);
>> +	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
>> +
>> +	if (valid)
>> +		flush_tlb_range(&vma, addr, addr + (PAGE_SIZE * pte_num));
>> +
>> +	return orig_pte;
>> +}
>> +
>> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
>> +{
>> +	unsigned long order;
>> +
>> +	for_each_napot_order(order) {
>> +		if (shift == napot_cont_shift(order)) {
>> +			entry = pte_mknapot(entry, order);
>> +			break;
>> +		}
>> +	}
>> +	if (order == NAPOT_ORDER_MAX)
>> +		entry = pte_mkhuge(entry);
>> +
>> +	return entry;
>> +}
>> +
>> +void set_huge_pte_at(struct mm_struct *mm,
>> +		     unsigned long addr,
>> +		     pte_t *ptep,
>> +		     pte_t pte)
>> +{
>> +	int i;
>> +	int pte_num;
>> +
>> +	if (!pte_napot(pte)) {
>> +		set_pte_at(mm, addr, ptep, pte);
>> +		return;
>> +	}
>> +
>> +	pte_num = napot_pte_num(napot_cont_order(pte));
>> +	for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
>> +		set_pte_at(mm, addr, ptep, pte);
>> +}
>> +
>> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>> +			       unsigned long addr,
>> +			       pte_t *ptep,
>> +			       pte_t pte,
>> +			       int dirty)
>> +{
>> +	pte_t orig_pte;
>> +	int i;
>> +	int pte_num;
>> +	struct mm_struct *mm = vma->vm_mm;
>> +
>> +	if (!pte_napot(pte))
>> +		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
>> +
>> +	pte_num = napot_pte_num(napot_cont_order(pte));
>> +	ptep = huge_pte_offset(mm, addr,
>> +			       napot_cont_size(napot_cont_order(pte)));
>> +	orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
>> +
>> +	if (pte_dirty(orig_pte))
>> +		pte = pte_mkdirty(pte);
>> +
>> +	if (pte_young(orig_pte))
>> +		pte = pte_mkyoung(pte);
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		set_pte_at(mm, addr, ptep, pte);
>> +
>> +	return true;
>> +}
>> +
>> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>> +			      unsigned long addr,
>> +			      pte_t *ptep)
>> +{
>> +	int pte_num;
>> +	pte_t orig_pte = ptep_get(ptep);
>> +
>> +	if (!pte_napot(orig_pte))
>> +		return ptep_get_and_clear(mm, addr, ptep);
>> +
>> +	pte_num = napot_pte_num(napot_cont_order(orig_pte));
>> +
>> +	return get_clear_contig(mm, addr, ptep, pte_num);
>> +}
>> +
>> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
>> +			     unsigned long addr,
>> +			     pte_t *ptep)
>> +{
>> +	int i;
>> +	int pte_num;
>> +	pte_t pte = ptep_get(ptep);
>> +
>> +	if (!pte_napot(pte)) {
>> +		ptep_set_wrprotect(mm, addr, ptep);
>> +		return;
>> +	}
>> +
>> +	pte_num = napot_pte_num(napot_cont_order(pte));
>> +	ptep = huge_pte_offset(mm, addr, napot_cont_size(4UL));
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		ptep_set_wrprotect(mm, addr, ptep);
>> +}
>> +
>> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
>> +			    unsigned long addr,
>> +			    pte_t *ptep)
>> +{
>> +	int pte_num;
>> +	pte_t pte = ptep_get(ptep);
>> +
>> +	if (!pte_napot(pte))
>> +		return ptep_clear_flush(vma, addr, ptep);
>> +
>> +	pte_num = napot_pte_num(napot_cont_order(pte));
>> +
>> +	return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
>> +}
>> +
>> +void huge_pte_clear(struct mm_struct *mm,
>> +		    unsigned long addr,
>> +		    pte_t *ptep,
>> +		    unsigned long sz)
>> +{
>> +	int i, pte_num = 1;
>> +	pte_t pte = READ_ONCE(*ptep);
>> +
>> +	if (pte_napot(pte))
>> +		pte_num = napot_pte_num(napot_cont_order(pte));
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		pte_clear(mm, addr, ptep);
>> +}
>> +
>> +static __init int napot_hugetlbpages_init(void)
>> +{
>> +	if (has_svnapot()) {
>> +		unsigned long order;
>> +
>> +		for_each_napot_order(order)
>> +			hugetlb_add_hstate(order);
>> +	}
>> +	return 0;
>> +}
>> +arch_initcall(napot_hugetlbpages_init);
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>> +
>>   int pud_huge(pud_t pud)
>>   {
>>   	return pud_leaf(pud);
>> @@ -18,8 +285,17 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
>>   		return true;
>>   	else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
>>   		return true;
>> -	else
>> +	else if (!has_svnapot())
>> +		return false;
>> +	else {
> 
> Here, could you please add {} around the other branches of this if
> statement please?
> 
> Thanks,
> Conor.
> 
> 
>> +		unsigned long order;
>> +
>> +		for_each_napot_order(order) {
>> +			if (size == napot_cont_size(order))
>> +				return true;
>> +		}
>>   		return false;
>> +	}
>>   }
>>   
>>   #ifdef CONFIG_CONTIG_ALLOC
>> -- 
>> 2.37.4
>>
>>
>> _______________________________________________
>> linux-riscv mailing list
>> linux-riscv@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-riscv
Alexandre Ghiti Nov. 25, 2022, 2:23 p.m. UTC | #3
On 11/19/22 12:22, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>
> Svnapot can be used to support 64KB hugetlb page, so it can become a new
> option when using hugetlbfs. Add a basic implementation of hugetlb page,
> and support 64KB as a size in it by using Svnapot.
>
> For test, boot kernel with command line contains "default_hugepagesz=64K
> hugepagesz=64K hugepages=20" and run a simple test like this:
>
> tools/testing/selftests/vm/map_hugetlb 1 16
>
> And it should be passed.
>
> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 31f9a5a160a0..c0307b942ed2 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -43,7 +43,7 @@ config RISCV
>   	select ARCH_USE_QUEUED_RWLOCKS
>   	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>   	select ARCH_WANT_FRAME_POINTERS
> -	select ARCH_WANT_GENERAL_HUGETLB
> +	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
>   	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
>   	select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
>   	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
> diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
> index a5c2ca1d1cd8..854f5db2f45d 100644
> --- a/arch/riscv/include/asm/hugetlb.h
> +++ b/arch/riscv/include/asm/hugetlb.h
> @@ -2,7 +2,39 @@
>   #ifndef _ASM_RISCV_HUGETLB_H
>   #define _ASM_RISCV_HUGETLB_H
>   
> -#include <asm-generic/hugetlb.h>
>   #include <asm/page.h>
>   
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +#define __HAVE_ARCH_HUGE_PTE_CLEAR
> +void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
> +		    pte_t *ptep, unsigned long sz);
> +
> +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
> +void set_huge_pte_at(struct mm_struct *mm,
> +		     unsigned long addr, pte_t *ptep, pte_t pte);
> +
> +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> +			      unsigned long addr, pte_t *ptep);
> +
> +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> +			    unsigned long addr, pte_t *ptep);
> +
> +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
> +			     unsigned long addr, pte_t *ptep);
> +
> +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> +			       unsigned long addr, pte_t *ptep,
> +			       pte_t pte, int dirty);
> +
> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
> +#define arch_make_huge_pte arch_make_huge_pte
> +
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
> +
> +#include <asm-generic/hugetlb.h>
> +
>   #endif /* _ASM_RISCV_HUGETLB_H */
> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
> index 932dadfdca54..130a91b30588 100644
> --- a/arch/riscv/mm/hugetlbpage.c
> +++ b/arch/riscv/mm/hugetlbpage.c
> @@ -2,6 +2,273 @@
>   #include <linux/hugetlb.h>
>   #include <linux/err.h>
>   
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +pte_t *huge_pte_alloc(struct mm_struct *mm,
> +		      struct vm_area_struct *vma,
> +		      unsigned long addr,
> +		      unsigned long sz)
> +{
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte = NULL;
> +	unsigned long order;
> +
> +	pgd = pgd_offset(mm, addr);
> +	p4d = p4d_alloc(mm, pgd, addr);
> +	if (!p4d)
> +		return NULL;
> +
> +	pud = pud_alloc(mm, p4d, addr);
> +	if (!pud)
> +		return NULL;
> +	if (sz == PUD_SIZE) {
> +		pte = (pte_t *)pud;
> +		goto out;
> +	}
> +
> +	if (sz == PMD_SIZE) {
> +		if (want_pmd_share(vma, addr) && pud_none(*pud))
> +			pte = huge_pmd_share(mm, vma, addr, pud);
> +		else
> +			pte = (pte_t *)pmd_alloc(mm, pud, addr);
> +		goto out;
> +	}
> +	pmd = pmd_alloc(mm, pud, addr);
> +	if (!pmd)
> +		return NULL;
> +
> +	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++) {
> +		if (napot_cont_size(order) == sz) {
> +			pte = pte_alloc_map(mm, pmd, (addr & ~napot_cont_mask(order)));
> +			break;
> +		}
> +	}
> +out:
> +	WARN_ON_ONCE(pte && pte_present(*pte) && !pte_huge(*pte));
> +	return pte;
> +}
> +
> +pte_t *huge_pte_offset(struct mm_struct *mm,
> +		       unsigned long addr,
> +		       unsigned long sz)
> +{
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte = NULL;
> +	unsigned long order;
> +
> +	pgd = pgd_offset(mm, addr);
> +	if (!pgd_present(*pgd))
> +		return NULL;
> +	p4d = p4d_offset(pgd, addr);
> +	if (!p4d_present(*p4d))
> +		return NULL;
> +
> +	pud = pud_offset(p4d, addr);
> +	if (sz == PUD_SIZE)
> +		/* must be pud huge, non-present or none */
> +		return (pte_t *)pud;
> +	if (!pud_present(*pud))
> +		return NULL;
> +
> +	pmd = pmd_offset(pud, addr);
> +	if (sz == PMD_SIZE)
> +		/* must be pmd huge, non-present or none */
> +		return (pte_t *)pmd;
> +	if (!pmd_present(*pmd))
> +		return NULL;
> +
> +	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++) {
> +		if (napot_cont_size(order) == sz) {
> +			pte = pte_offset_kernel(pmd, (addr & ~napot_cont_mask(order)));
> +			break;
> +		}
> +	}
> +	return pte;
> +}


It's really too bad we can't use generic code  for that + an arch 
specific callback here, have you tried to do so?


> +
> +static pte_t get_clear_contig(struct mm_struct *mm,
> +			      unsigned long addr,
> +			      pte_t *ptep,
> +			      unsigned long pte_num)
> +{
> +	pte_t orig_pte = ptep_get(ptep);
> +	unsigned long i;
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
> +		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
> +
> +		if (pte_dirty(pte))
> +			orig_pte = pte_mkdirty(orig_pte);
> +
> +		if (pte_young(pte))
> +			orig_pte = pte_mkyoung(orig_pte);
> +	}
> +	return orig_pte;
> +}


Same here: can't we share code with arm64?


> +
> +static pte_t get_clear_contig_flush(struct mm_struct *mm,
> +				    unsigned long addr,
> +				    pte_t *ptep,
> +				    unsigned long pte_num)
> +{
> +	pte_t orig_pte = get_clear_contig(mm, addr, ptep, pte_num);
> +	bool valid = !pte_none(orig_pte);
> +	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
> +
> +	if (valid)
> +		flush_tlb_range(&vma, addr, addr + (PAGE_SIZE * pte_num));
> +
> +	return orig_pte;
> +}


Here again: arm64 does something very similar .


> +
> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
> +{
> +	unsigned long order;
> +
> +	for_each_napot_order(order) {
> +		if (shift == napot_cont_shift(order)) {
> +			entry = pte_mknapot(entry, order);
> +			break;
> +		}
> +	}
> +	if (order == NAPOT_ORDER_MAX)
> +		entry = pte_mkhuge(entry);
> +
> +	return entry;
> +}


Isn't it possible to do the following instead?

if (shift >= napot_cont_shift(NAPOT_ORDER_MAX))

     entry = pte_mkhuge(entry);

else

     entry = pte_mknapot(entry, order);


> +
> +void set_huge_pte_at(struct mm_struct *mm,
> +		     unsigned long addr,
> +		     pte_t *ptep,
> +		     pte_t pte)
> +{
> +	int i;
> +	int pte_num;
> +
> +	if (!pte_napot(pte)) {
> +		set_pte_at(mm, addr, ptep, pte);
> +		return;
> +	}
> +
> +	pte_num = napot_pte_num(napot_cont_order(pte));
> +	for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
> +		set_pte_at(mm, addr, ptep, pte);
> +}
> +
> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       pte_t *ptep,
> +			       pte_t pte,
> +			       int dirty)
> +{
> +	pte_t orig_pte;
> +	int i;
> +	int pte_num;
> +	struct mm_struct *mm = vma->vm_mm;
> +
> +	if (!pte_napot(pte))
> +		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
> +
> +	pte_num = napot_pte_num(napot_cont_order(pte));
> +	ptep = huge_pte_offset(mm, addr,
> +			       napot_cont_size(napot_cont_order(pte)));
> +	orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
> +
> +	if (pte_dirty(orig_pte))
> +		pte = pte_mkdirty(pte);
> +
> +	if (pte_young(orig_pte))
> +		pte = pte_mkyoung(pte);
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		set_pte_at(mm, addr, ptep, pte);
> +
> +	return true;
> +}
> +
> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> +			      unsigned long addr,
> +			      pte_t *ptep)
> +{
> +	int pte_num;
> +	pte_t orig_pte = ptep_get(ptep);
> +
> +	if (!pte_napot(orig_pte))
> +		return ptep_get_and_clear(mm, addr, ptep);
> +
> +	pte_num = napot_pte_num(napot_cont_order(orig_pte));
> +
> +	return get_clear_contig(mm, addr, ptep, pte_num);
> +}
> +
> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
> +			     unsigned long addr,
> +			     pte_t *ptep)
> +{
> +	int i;
> +	int pte_num;
> +	pte_t pte = ptep_get(ptep);
> +
> +	if (!pte_napot(pte)) {
> +		ptep_set_wrprotect(mm, addr, ptep);
> +		return;
> +	}
> +
> +	pte_num = napot_pte_num(napot_cont_order(pte));
> +	ptep = huge_pte_offset(mm, addr, napot_cont_size(4UL));
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		ptep_set_wrprotect(mm, addr, ptep);
> +}
> +
> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> +			    unsigned long addr,
> +			    pte_t *ptep)
> +{
> +	int pte_num;
> +	pte_t pte = ptep_get(ptep);
> +
> +	if (!pte_napot(pte))
> +		return ptep_clear_flush(vma, addr, ptep);
> +
> +	pte_num = napot_pte_num(napot_cont_order(pte));
> +
> +	return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
> +}
> +
> +void huge_pte_clear(struct mm_struct *mm,
> +		    unsigned long addr,
> +		    pte_t *ptep,
> +		    unsigned long sz)
> +{
> +	int i, pte_num = 1;
> +	pte_t pte = READ_ONCE(*ptep);
> +
> +	if (pte_napot(pte))
> +		pte_num = napot_pte_num(napot_cont_order(pte));
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		pte_clear(mm, addr, ptep);
> +}
> +
> +static __init int napot_hugetlbpages_init(void)
> +{
> +	if (has_svnapot()) {
> +		unsigned long order;
> +
> +		for_each_napot_order(order)
> +			hugetlb_add_hstate(order);
> +	}
> +	return 0;
> +}
> +arch_initcall(napot_hugetlbpages_init);
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
> +
>   int pud_huge(pud_t pud)
>   {
>   	return pud_leaf(pud);
> @@ -18,8 +285,17 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
>   		return true;
>   	else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
>   		return true;
> -	else
> +	else if (!has_svnapot())
> +		return false;
> +	else {
> +		unsigned long order;
> +
> +		for_each_napot_order(order) {
> +			if (size == napot_cont_size(order))
> +				return true;
> +		}
>   		return false;
> +	}
>   }
>   
>   #ifdef CONFIG_CONTIG_ALLOC


IMO for most of the functions above, we don't need a duplicate of 
arm64/generic functions, otherwise we should really argue why we need 
them. I'm sorry my comment intervenes this late, but I really think it's 
worth giving it a try merging with existing functions, it is quite hard 
to review this way.
Qinglin Pan Nov. 26, 2022, 3:04 a.m. UTC | #4
Hi Alex,

On 11/25/22 10:23 PM, Alexandre Ghiti wrote:
> On 11/19/22 12:22, panqinglin2020@iscas.ac.cn wrote:
>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> Svnapot can be used to support 64KB hugetlb page, so it can become a new
>> option when using hugetlbfs. Add a basic implementation of hugetlb page,
>> and support 64KB as a size in it by using Svnapot.
>>
>> For test, boot kernel with command line contains "default_hugepagesz=64K
>> hugepagesz=64K hugepages=20" and run a simple test like this:
>>
>> tools/testing/selftests/vm/map_hugetlb 1 16
>>
>> And it should be passed.
>>
>> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 31f9a5a160a0..c0307b942ed2 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -43,7 +43,7 @@ config RISCV
>>       select ARCH_USE_QUEUED_RWLOCKS
>>       select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>>       select ARCH_WANT_FRAME_POINTERS
>> -    select ARCH_WANT_GENERAL_HUGETLB
>> +    select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
>>       select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
>>       select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
>>       select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
>> diff --git a/arch/riscv/include/asm/hugetlb.h 
>> b/arch/riscv/include/asm/hugetlb.h
>> index a5c2ca1d1cd8..854f5db2f45d 100644
>> --- a/arch/riscv/include/asm/hugetlb.h
>> +++ b/arch/riscv/include/asm/hugetlb.h
>> @@ -2,7 +2,39 @@
>>   #ifndef _ASM_RISCV_HUGETLB_H
>>   #define _ASM_RISCV_HUGETLB_H
>> -#include <asm-generic/hugetlb.h>
>>   #include <asm/page.h>
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +#define __HAVE_ARCH_HUGE_PTE_CLEAR
>> +void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
>> +            pte_t *ptep, unsigned long sz);
>> +
>> +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
>> +void set_huge_pte_at(struct mm_struct *mm,
>> +             unsigned long addr, pte_t *ptep, pte_t pte);
>> +
>> +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
>> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>> +                  unsigned long addr, pte_t *ptep);
>> +
>> +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
>> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
>> +                unsigned long addr, pte_t *ptep);
>> +
>> +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
>> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
>> +                 unsigned long addr, pte_t *ptep);
>> +
>> +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
>> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>> +                   unsigned long addr, pte_t *ptep,
>> +                   pte_t pte, int dirty);
>> +
>> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t 
>> flags);
>> +#define arch_make_huge_pte arch_make_huge_pte
>> +
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>> +
>> +#include <asm-generic/hugetlb.h>
>> +
>>   #endif /* _ASM_RISCV_HUGETLB_H */
>> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
>> index 932dadfdca54..130a91b30588 100644
>> --- a/arch/riscv/mm/hugetlbpage.c
>> +++ b/arch/riscv/mm/hugetlbpage.c
>> @@ -2,6 +2,273 @@
>>   #include <linux/hugetlb.h>
>>   #include <linux/err.h>
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +pte_t *huge_pte_alloc(struct mm_struct *mm,
>> +              struct vm_area_struct *vma,
>> +              unsigned long addr,
>> +              unsigned long sz)
>> +{
>> +    pgd_t *pgd;
>> +    p4d_t *p4d;
>> +    pud_t *pud;
>> +    pmd_t *pmd;
>> +    pte_t *pte = NULL;
>> +    unsigned long order;
>> +
>> +    pgd = pgd_offset(mm, addr);
>> +    p4d = p4d_alloc(mm, pgd, addr);
>> +    if (!p4d)
>> +        return NULL;
>> +
>> +    pud = pud_alloc(mm, p4d, addr);
>> +    if (!pud)
>> +        return NULL;
>> +    if (sz == PUD_SIZE) {
>> +        pte = (pte_t *)pud;
>> +        goto out;
>> +    }
>> +
>> +    if (sz == PMD_SIZE) {
>> +        if (want_pmd_share(vma, addr) && pud_none(*pud))
>> +            pte = huge_pmd_share(mm, vma, addr, pud);
>> +        else
>> +            pte = (pte_t *)pmd_alloc(mm, pud, addr);
>> +        goto out;
>> +    }
>> +    pmd = pmd_alloc(mm, pud, addr);
>> +    if (!pmd)
>> +        return NULL;
>> +
>> +    for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; 
>> order++) {
>> +        if (napot_cont_size(order) == sz) {
>> +            pte = pte_alloc_map(mm, pmd, (addr & 
>> ~napot_cont_mask(order)));
>> +            break;
>> +        }
>> +    }
>> +out:
>> +    WARN_ON_ONCE(pte && pte_present(*pte) && !pte_huge(*pte));
>> +    return pte;
>> +}
>> +
>> +pte_t *huge_pte_offset(struct mm_struct *mm,
>> +               unsigned long addr,
>> +               unsigned long sz)
>> +{
>> +    pgd_t *pgd;
>> +    p4d_t *p4d;
>> +    pud_t *pud;
>> +    pmd_t *pmd;
>> +    pte_t *pte = NULL;
>> +    unsigned long order;
>> +
>> +    pgd = pgd_offset(mm, addr);
>> +    if (!pgd_present(*pgd))
>> +        return NULL;
>> +    p4d = p4d_offset(pgd, addr);
>> +    if (!p4d_present(*p4d))
>> +        return NULL;
>> +
>> +    pud = pud_offset(p4d, addr);
>> +    if (sz == PUD_SIZE)
>> +        /* must be pud huge, non-present or none */
>> +        return (pte_t *)pud;
>> +    if (!pud_present(*pud))
>> +        return NULL;
>> +
>> +    pmd = pmd_offset(pud, addr);
>> +    if (sz == PMD_SIZE)
>> +        /* must be pmd huge, non-present or none */
>> +        return (pte_t *)pmd;
>> +    if (!pmd_present(*pmd))
>> +        return NULL;
>> +
>> +    for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; 
>> order++) {
>> +        if (napot_cont_size(order) == sz) {
>> +            pte = pte_offset_kernel(pmd, (addr & 
>> ~napot_cont_mask(order)));
>> +            break;
>> +        }
>> +    }
>> +    return pte;
>> +}
> 
> 
> It's really too bad we can't use generic code  for that + an arch 
> specific callback here, have you tried to do so?
> 
> 
>> +
>> +static pte_t get_clear_contig(struct mm_struct *mm,
>> +                  unsigned long addr,
>> +                  pte_t *ptep,
>> +                  unsigned long pte_num)
>> +{
>> +    pte_t orig_pte = ptep_get(ptep);
>> +    unsigned long i;
>> +
>> +    for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
>> +        pte_t pte = ptep_get_and_clear(mm, addr, ptep);
>> +
>> +        if (pte_dirty(pte))
>> +            orig_pte = pte_mkdirty(orig_pte);
>> +
>> +        if (pte_young(pte))
>> +            orig_pte = pte_mkyoung(orig_pte);
>> +    }
>> +    return orig_pte;
>> +}
> 
> 
> Same here: can't we share code with arm64?
> 
> 
>> +
>> +static pte_t get_clear_contig_flush(struct mm_struct *mm,
>> +                    unsigned long addr,
>> +                    pte_t *ptep,
>> +                    unsigned long pte_num)
>> +{
>> +    pte_t orig_pte = get_clear_contig(mm, addr, ptep, pte_num);
>> +    bool valid = !pte_none(orig_pte);
>> +    struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
>> +
>> +    if (valid)
>> +        flush_tlb_range(&vma, addr, addr + (PAGE_SIZE * pte_num));
>> +
>> +    return orig_pte;
>> +}
> 
> 
> Here again: arm64 does something very similar .
> 
> 
>> +
>> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t 
>> flags)
>> +{
>> +    unsigned long order;
>> +
>> +    for_each_napot_order(order) {
>> +        if (shift == napot_cont_shift(order)) {
>> +            entry = pte_mknapot(entry, order);
>> +            break;
>> +        }
>> +    }
>> +    if (order == NAPOT_ORDER_MAX)
>> +        entry = pte_mkhuge(entry);
>> +
>> +    return entry;
>> +}
> 
> 
> Isn't it possible to do the following instead?
> 
> if (shift >= napot_cont_shift(NAPOT_ORDER_MAX))
> 
>      entry = pte_mkhuge(entry);
> 
> else
> 
>      entry = pte_mknapot(entry, order);
> 
> 
>> +
>> +void set_huge_pte_at(struct mm_struct *mm,
>> +             unsigned long addr,
>> +             pte_t *ptep,
>> +             pte_t pte)
>> +{
>> +    int i;
>> +    int pte_num;
>> +
>> +    if (!pte_napot(pte)) {
>> +        set_pte_at(mm, addr, ptep, pte);
>> +        return;
>> +    }
>> +
>> +    pte_num = napot_pte_num(napot_cont_order(pte));
>> +    for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
>> +        set_pte_at(mm, addr, ptep, pte);
>> +}
>> +
>> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>> +                   unsigned long addr,
>> +                   pte_t *ptep,
>> +                   pte_t pte,
>> +                   int dirty)
>> +{
>> +    pte_t orig_pte;
>> +    int i;
>> +    int pte_num;
>> +    struct mm_struct *mm = vma->vm_mm;
>> +
>> +    if (!pte_napot(pte))
>> +        return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
>> +
>> +    pte_num = napot_pte_num(napot_cont_order(pte));
>> +    ptep = huge_pte_offset(mm, addr,
>> +                   napot_cont_size(napot_cont_order(pte)));
>> +    orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
>> +
>> +    if (pte_dirty(orig_pte))
>> +        pte = pte_mkdirty(pte);
>> +
>> +    if (pte_young(orig_pte))
>> +        pte = pte_mkyoung(pte);
>> +
>> +    for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +        set_pte_at(mm, addr, ptep, pte);
>> +
>> +    return true;
>> +}
>> +
>> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>> +                  unsigned long addr,
>> +                  pte_t *ptep)
>> +{
>> +    int pte_num;
>> +    pte_t orig_pte = ptep_get(ptep);
>> +
>> +    if (!pte_napot(orig_pte))
>> +        return ptep_get_and_clear(mm, addr, ptep);
>> +
>> +    pte_num = napot_pte_num(napot_cont_order(orig_pte));
>> +
>> +    return get_clear_contig(mm, addr, ptep, pte_num);
>> +}
>> +
>> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
>> +                 unsigned long addr,
>> +                 pte_t *ptep)
>> +{
>> +    int i;
>> +    int pte_num;
>> +    pte_t pte = ptep_get(ptep);
>> +
>> +    if (!pte_napot(pte)) {
>> +        ptep_set_wrprotect(mm, addr, ptep);
>> +        return;
>> +    }
>> +
>> +    pte_num = napot_pte_num(napot_cont_order(pte));
>> +    ptep = huge_pte_offset(mm, addr, napot_cont_size(4UL));
>> +
>> +    for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +        ptep_set_wrprotect(mm, addr, ptep);
>> +}
>> +
>> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
>> +                unsigned long addr,
>> +                pte_t *ptep)
>> +{
>> +    int pte_num;
>> +    pte_t pte = ptep_get(ptep);
>> +
>> +    if (!pte_napot(pte))
>> +        return ptep_clear_flush(vma, addr, ptep);
>> +
>> +    pte_num = napot_pte_num(napot_cont_order(pte));
>> +
>> +    return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
>> +}
>> +
>> +void huge_pte_clear(struct mm_struct *mm,
>> +            unsigned long addr,
>> +            pte_t *ptep,
>> +            unsigned long sz)
>> +{
>> +    int i, pte_num = 1;
>> +    pte_t pte = READ_ONCE(*ptep);
>> +
>> +    if (pte_napot(pte))
>> +        pte_num = napot_pte_num(napot_cont_order(pte));
>> +
>> +    for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +        pte_clear(mm, addr, ptep);
>> +}
>> +
>> +static __init int napot_hugetlbpages_init(void)
>> +{
>> +    if (has_svnapot()) {
>> +        unsigned long order;
>> +
>> +        for_each_napot_order(order)
>> +            hugetlb_add_hstate(order);
>> +    }
>> +    return 0;
>> +}
>> +arch_initcall(napot_hugetlbpages_init);
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>> +
>>   int pud_huge(pud_t pud)
>>   {
>>       return pud_leaf(pud);
>> @@ -18,8 +285,17 @@ bool __init arch_hugetlb_valid_size(unsigned long 
>> size)
>>           return true;
>>       else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
>>           return true;
>> -    else
>> +    else if (!has_svnapot())
>> +        return false;
>> +    else {
>> +        unsigned long order;
>> +
>> +        for_each_napot_order(order) {
>> +            if (size == napot_cont_size(order))
>> +                return true;
>> +        }
>>           return false;
>> +    }
>>   }
>>   #ifdef CONFIG_CONTIG_ALLOC
> 
> 
> IMO for most of the functions above, we don't need a duplicate of 
> arm64/generic functions, otherwise we should really argue why we need 
> them. I'm sorry my comment intervenes this late, but I really think it's 
> worth giving it a try merging with existing functions, it is quite hard 
> to review this way.

Many functions above are really developed with reference to that of 
arm64. I agree with you and I will give it a try to add more arch 
callbacks in a separate patchset in the future. I think we can merge
these functions now. And we can refactor them and that of arm64 after
we have added new arch callbacks for them in a new patchset. What do
you think of this plan?

Thanks,
Qinglin
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 31f9a5a160a0..c0307b942ed2 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -43,7 +43,7 @@  config RISCV
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_FRAME_POINTERS
-	select ARCH_WANT_GENERAL_HUGETLB
+	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
 	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
 	select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index a5c2ca1d1cd8..854f5db2f45d 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -2,7 +2,39 @@ 
 #ifndef _ASM_RISCV_HUGETLB_H
 #define _ASM_RISCV_HUGETLB_H
 
-#include <asm-generic/hugetlb.h>
 #include <asm/page.h>
 
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+#define __HAVE_ARCH_HUGE_PTE_CLEAR
+void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
+		    pte_t *ptep, unsigned long sz);
+
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
+void set_huge_pte_at(struct mm_struct *mm,
+		     unsigned long addr, pte_t *ptep, pte_t pte);
+
+#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+			      unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
+pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
+			    unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
+void huge_ptep_set_wrprotect(struct mm_struct *mm,
+			     unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+			       unsigned long addr, pte_t *ptep,
+			       pte_t pte, int dirty);
+
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
+#define arch_make_huge_pte arch_make_huge_pte
+
+#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
+
+#include <asm-generic/hugetlb.h>
+
 #endif /* _ASM_RISCV_HUGETLB_H */
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 932dadfdca54..130a91b30588 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -2,6 +2,273 @@ 
 #include <linux/hugetlb.h>
 #include <linux/err.h>
 
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+		      struct vm_area_struct *vma,
+		      unsigned long addr,
+		      unsigned long sz)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+	unsigned long order;
+
+	pgd = pgd_offset(mm, addr);
+	p4d = p4d_alloc(mm, pgd, addr);
+	if (!p4d)
+		return NULL;
+
+	pud = pud_alloc(mm, p4d, addr);
+	if (!pud)
+		return NULL;
+	if (sz == PUD_SIZE) {
+		pte = (pte_t *)pud;
+		goto out;
+	}
+
+	if (sz == PMD_SIZE) {
+		if (want_pmd_share(vma, addr) && pud_none(*pud))
+			pte = huge_pmd_share(mm, vma, addr, pud);
+		else
+			pte = (pte_t *)pmd_alloc(mm, pud, addr);
+		goto out;
+	}
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return NULL;
+
+	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++) {
+		if (napot_cont_size(order) == sz) {
+			pte = pte_alloc_map(mm, pmd, (addr & ~napot_cont_mask(order)));
+			break;
+		}
+	}
+out:
+	WARN_ON_ONCE(pte && pte_present(*pte) && !pte_huge(*pte));
+	return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr,
+		       unsigned long sz)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+	unsigned long order;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		return NULL;
+	p4d = p4d_offset(pgd, addr);
+	if (!p4d_present(*p4d))
+		return NULL;
+
+	pud = pud_offset(p4d, addr);
+	if (sz == PUD_SIZE)
+		/* must be pud huge, non-present or none */
+		return (pte_t *)pud;
+	if (!pud_present(*pud))
+		return NULL;
+
+	pmd = pmd_offset(pud, addr);
+	if (sz == PMD_SIZE)
+		/* must be pmd huge, non-present or none */
+		return (pte_t *)pmd;
+	if (!pmd_present(*pmd))
+		return NULL;
+
+	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++) {
+		if (napot_cont_size(order) == sz) {
+			pte = pte_offset_kernel(pmd, (addr & ~napot_cont_mask(order)));
+			break;
+		}
+	}
+	return pte;
+}
+
+static pte_t get_clear_contig(struct mm_struct *mm,
+			      unsigned long addr,
+			      pte_t *ptep,
+			      unsigned long pte_num)
+{
+	pte_t orig_pte = ptep_get(ptep);
+	unsigned long i;
+
+	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
+		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
+
+		if (pte_dirty(pte))
+			orig_pte = pte_mkdirty(orig_pte);
+
+		if (pte_young(pte))
+			orig_pte = pte_mkyoung(orig_pte);
+	}
+	return orig_pte;
+}
+
+static pte_t get_clear_contig_flush(struct mm_struct *mm,
+				    unsigned long addr,
+				    pte_t *ptep,
+				    unsigned long pte_num)
+{
+	pte_t orig_pte = get_clear_contig(mm, addr, ptep, pte_num);
+	bool valid = !pte_none(orig_pte);
+	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+
+	if (valid)
+		flush_tlb_range(&vma, addr, addr + (PAGE_SIZE * pte_num));
+
+	return orig_pte;
+}
+
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
+{
+	unsigned long order;
+
+	for_each_napot_order(order) {
+		if (shift == napot_cont_shift(order)) {
+			entry = pte_mknapot(entry, order);
+			break;
+		}
+	}
+	if (order == NAPOT_ORDER_MAX)
+		entry = pte_mkhuge(entry);
+
+	return entry;
+}
+
+void set_huge_pte_at(struct mm_struct *mm,
+		     unsigned long addr,
+		     pte_t *ptep,
+		     pte_t pte)
+{
+	int i;
+	int pte_num;
+
+	if (!pte_napot(pte)) {
+		set_pte_at(mm, addr, ptep, pte);
+		return;
+	}
+
+	pte_num = napot_pte_num(napot_cont_order(pte));
+	for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
+		set_pte_at(mm, addr, ptep, pte);
+}
+
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+			       unsigned long addr,
+			       pte_t *ptep,
+			       pte_t pte,
+			       int dirty)
+{
+	pte_t orig_pte;
+	int i;
+	int pte_num;
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (!pte_napot(pte))
+		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+
+	pte_num = napot_pte_num(napot_cont_order(pte));
+	ptep = huge_pte_offset(mm, addr,
+			       napot_cont_size(napot_cont_order(pte)));
+	orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
+
+	if (pte_dirty(orig_pte))
+		pte = pte_mkdirty(pte);
+
+	if (pte_young(orig_pte))
+		pte = pte_mkyoung(pte);
+
+	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+		set_pte_at(mm, addr, ptep, pte);
+
+	return true;
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+			      unsigned long addr,
+			      pte_t *ptep)
+{
+	int pte_num;
+	pte_t orig_pte = ptep_get(ptep);
+
+	if (!pte_napot(orig_pte))
+		return ptep_get_and_clear(mm, addr, ptep);
+
+	pte_num = napot_pte_num(napot_cont_order(orig_pte));
+
+	return get_clear_contig(mm, addr, ptep, pte_num);
+}
+
+void huge_ptep_set_wrprotect(struct mm_struct *mm,
+			     unsigned long addr,
+			     pte_t *ptep)
+{
+	int i;
+	int pte_num;
+	pte_t pte = ptep_get(ptep);
+
+	if (!pte_napot(pte)) {
+		ptep_set_wrprotect(mm, addr, ptep);
+		return;
+	}
+
+	pte_num = napot_pte_num(napot_cont_order(pte));
+	ptep = huge_pte_offset(mm, addr, napot_cont_size(4UL));
+
+	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+		ptep_set_wrprotect(mm, addr, ptep);
+}
+
+pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
+			    unsigned long addr,
+			    pte_t *ptep)
+{
+	int pte_num;
+	pte_t pte = ptep_get(ptep);
+
+	if (!pte_napot(pte))
+		return ptep_clear_flush(vma, addr, ptep);
+
+	pte_num = napot_pte_num(napot_cont_order(pte));
+
+	return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
+}
+
+void huge_pte_clear(struct mm_struct *mm,
+		    unsigned long addr,
+		    pte_t *ptep,
+		    unsigned long sz)
+{
+	int i, pte_num = 1;
+	pte_t pte = READ_ONCE(*ptep);
+
+	if (pte_napot(pte))
+		pte_num = napot_pte_num(napot_cont_order(pte));
+
+	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+		pte_clear(mm, addr, ptep);
+}
+
+static __init int napot_hugetlbpages_init(void)
+{
+	if (has_svnapot()) {
+		unsigned long order;
+
+		for_each_napot_order(order)
+			hugetlb_add_hstate(order);
+	}
+	return 0;
+}
+arch_initcall(napot_hugetlbpages_init);
+#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
+
 int pud_huge(pud_t pud)
 {
 	return pud_leaf(pud);
@@ -18,8 +285,17 @@  bool __init arch_hugetlb_valid_size(unsigned long size)
 		return true;
 	else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
 		return true;
-	else
+	else if (!has_svnapot())
+		return false;
+	else {
+		unsigned long order;
+
+		for_each_napot_order(order) {
+			if (size == napot_cont_size(order))
+				return true;
+		}
 		return false;
+	}
 }
 
 #ifdef CONFIG_CONTIG_ALLOC