diff mbox series

[v9,1/3] riscv: mm: modify pte format for Svnapot

Message ID 20221204141137.691790-2-panqinglin2020@iscas.ac.cn (mailing list archive)
State Superseded
Delegated to: Palmer Dabbelt
Headers show
Series riscv, mm: detect svnapot cpu support at runtime | expand

Checks

Context Check Description
conchuod/patch_count success Link
conchuod/cover_letter success Series has a cover letter
conchuod/tree_selection success Guessed tree name to be for-next
conchuod/fixes_present success Fixes tag not required for -next series
conchuod/verify_signedoff success Signed-off-by tag matches author and committer
conchuod/kdoc success Errors and warnings before: 0 this patch: 0
conchuod/module_param success Was 0 now: 0
conchuod/alphanumeric_selects fail Out of order selects before the patch: 57 and now 59
conchuod/build_rv32_defconfig success Build OK
conchuod/build_warn_rv64 success Errors and warnings before: 0 this patch: 0
conchuod/dtb_warn_rv64 success Errors and warnings before: 0 this patch: 0
conchuod/header_inline success No static functions without inline keyword in header files
conchuod/checkpatch success total: 0 errors, 0 warnings, 0 checks, 214 lines checked
conchuod/source_inline success Was 0 now: 0
conchuod/build_rv64_nommu_k210_defconfig success Build OK
conchuod/verify_fixes success No Fixes tag
conchuod/build_rv64_nommu_virt_defconfig success Build OK

Commit Message

Qinglin Pan Dec. 4, 2022, 2:11 p.m. UTC
From: Qinglin Pan <panqinglin2020@iscas.ac.cn>

Add one alternative to enable/disable svnapot support, enable this static
key when "svnapot" is in the "riscv,isa" field of fdt and SVNAPOT compile
option is set. It will influence the behavior of has_svnapot. All code
dependent on svnapot should make sure that has_svnapot return true firstly.

Modify PTE definition for Svnapot, and creates some functions in pgtable.h
to mark a PTE as napot and check if it is a Svnapot PTE. Until now, only
64KB napot size is supported in spec, so some macros has only 64KB version.

Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>

Comments

Conor Dooley Dec. 6, 2022, 7:56 p.m. UTC | #1
Palmer,

On Sun, Dec 04, 2022 at 10:11:35PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> +config RISCV_ISA_SVNAPOT
> +	bool "SVNAPOT extension support"
> +	depends on 64BIT && MMU
> +	select RISCV_ALTERNATIVE
> +	default y
> +	help
> +	  Allow kernel to detect SVNAPOT ISA-extension dynamically in boot time
> +	  and enable its usage.
> +
> +	  SVNAPOT extension helps to mark contiguous PTEs as a range
> +	  of contiguous virtual-to-physical translations, with a naturally
> +	  aligned power-of-2 (NAPOT) granularity larger than the base 4KB page
> +	  size.

RMK's perl script that I yoinked complains about this patch,
specifically that it expects the order:
depends
default
select

The patch I sent out for the select order the other day also switches
everything to this ordering:
https://lore.kernel.org/all/20221202131034.66674-1-conor@kernel.org/

Is that sane? Certainly for the really long lists that order works well,
but idk about the short ones. Unless you feel strongly otherwise, maybe
we should enforce that order for uniformity?

Thanks,
Conor.
Conor Dooley Dec. 7, 2022, 6:46 p.m. UTC | #2
Hey!

Couple small remarks and questions for you.

On Sun, Dec 04, 2022 at 10:11:35PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> Add one alternative to enable/disable svnapot support, enable this static
> key when "svnapot" is in the "riscv,isa" field of fdt and SVNAPOT compile
> option is set. It will influence the behavior of has_svnapot. All code
> dependent on svnapot should make sure that has_svnapot return true firstly.
> 
> Modify PTE definition for Svnapot, and creates some functions in pgtable.h
> to mark a PTE as napot and check if it is a Svnapot PTE. Until now, only
> 64KB napot size is supported in spec, so some macros has only 64KB version.
> 
> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index ef8d66de5f38..1d8477c0af7c 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -395,6 +395,20 @@ config RISCV_ISA_C
>  
>  	  If you don't know what to do here, say Y.
>  
> +config RISCV_ISA_SVNAPOT
> +	bool "SVNAPOT extension support"
> +	depends on 64BIT && MMU
> +	select RISCV_ALTERNATIVE
> +	default y
> +	help
> +	  Allow kernel to detect SVNAPOT ISA-extension dynamically in boot time
> +	  and enable its usage.
> +
> +	  SVNAPOT extension helps to mark contiguous PTEs as a range
> +	  of contiguous virtual-to-physical translations, with a naturally
> +	  aligned power-of-2 (NAPOT) granularity larger than the base 4KB page
> +	  size.
> +
>  config RISCV_ISA_SVPBMT
>  	bool "SVPBMT extension support"
>  	depends on 64BIT && MMU
> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> index 4180312d2a70..beadb1126ed9 100644
> --- a/arch/riscv/include/asm/errata_list.h
> +++ b/arch/riscv/include/asm/errata_list.h
> @@ -22,9 +22,10 @@
>  #define	ERRATA_THEAD_NUMBER 3
>  #endif
>  
> -#define	CPUFEATURE_SVPBMT 0
> -#define	CPUFEATURE_ZICBOM 1
> -#define	CPUFEATURE_NUMBER 2
> +#define	CPUFEATURE_SVPBMT	0
> +#define	CPUFEATURE_ZICBOM	1
> +#define	CPUFEATURE_SVNAPOT	2
> +#define	CPUFEATURE_NUMBER	3
>  
>  #ifdef __ASSEMBLY__
>  
> @@ -156,6 +157,14 @@ asm volatile(ALTERNATIVE(						\
>  	: "=r" (__ovl) :						\
>  	: "memory")
>  
> +#define ALT_SVNAPOT(_val)						\
> +asm(ALTERNATIVE(							\
> +	"li %0, 0",							\
> +	"li %0, 1",							\
> +		0, CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)	\
> +	: "=r" (_val) :							\
> +	: "memory")
> +
>  #endif /* __ASSEMBLY__ */
>  
>  #endif
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index b22525290073..4cbc1f45ab26 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -54,6 +54,7 @@ extern unsigned long elf_hwcap;
>   */
>  enum riscv_isa_ext_id {
>  	RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
> +	RISCV_ISA_EXT_SVNAPOT,
>  	RISCV_ISA_EXT_SVPBMT,
>  	RISCV_ISA_EXT_ZICBOM,
>  	RISCV_ISA_EXT_ZIHINTPAUSE,
> @@ -87,7 +88,6 @@ static __always_inline int riscv_isa_ext2key(int num)
>  {
>  	switch (num) {
>  	case RISCV_ISA_EXT_f:
> -		return RISCV_ISA_EXT_KEY_FPU;
>  	case RISCV_ISA_EXT_d:
>  		return RISCV_ISA_EXT_KEY_FPU;
>  	case RISCV_ISA_EXT_ZIHINTPAUSE:
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index ac70b0fd9a9a..349fad5e35de 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -16,11 +16,6 @@
>  #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
>  #define PAGE_MASK	(~(PAGE_SIZE - 1))
>  
> -#ifdef CONFIG_64BIT
> -#define HUGE_MAX_HSTATE		2
> -#else
> -#define HUGE_MAX_HSTATE		1
> -#endif
>  #define HPAGE_SHIFT		PMD_SHIFT
>  #define HPAGE_SIZE		(_AC(1, UL) << HPAGE_SHIFT)
>  #define HPAGE_MASK              (~(HPAGE_SIZE - 1))
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index dc42375c2357..9611833907ec 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -74,6 +74,40 @@ typedef struct {
>   */
>  #define _PAGE_PFN_MASK  GENMASK(53, 10)
>  
> +/*
> + * [63] Svnapot definitions:
> + * 0 Svnapot disabled
> + * 1 Svnapot enabled
> + */
> +#define _PAGE_NAPOT_SHIFT	63
> +#define _PAGE_NAPOT		BIT(_PAGE_NAPOT_SHIFT)
> +/*
> + * Only 64KB (order 4) napot ptes supported.
> + */
> +#define NAPOT_CONT_ORDER_BASE 4
> +enum napot_cont_order {
> +	NAPOT_CONT64KB_ORDER = NAPOT_CONT_ORDER_BASE,
> +	NAPOT_ORDER_MAX,
> +};
> +
> +#define for_each_napot_order(order)						\
> +	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++)
> +#define for_each_napot_order_rev(order)						\

Would it be terrible to do s/rev/reverse to make things more obvious?

> +	for (order = NAPOT_ORDER_MAX - 1;					\
> +	     order >= NAPOT_CONT_ORDER_BASE; order--)
> +#define napot_cont_order(val)	(__builtin_ctzl((val.pte >> _PAGE_PFN_SHIFT) << 1))
> +
> +#define napot_cont_shift(order)	((order) + PAGE_SHIFT)
> +#define napot_cont_size(order)	BIT(napot_cont_shift(order))
> +#define napot_cont_mask(order)	(~(napot_cont_size(order) - 1UL))
> +#define napot_pte_num(order)	BIT(order)
> +
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +#define HUGE_MAX_HSTATE		(2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
> +#else
> +#define HUGE_MAX_HSTATE		2
> +#endif

This is coming from a place of *complete* ignorance:
If CONFIG_RISCV_ISA_SVNAPOT is enabled in the kernel but there is no
support for it on a platform is it okay to change the value of
HUGE_MAX_HSTATE? Apologies if I've missed something obvious.

> +
>  /*
>   * [62:61] Svpbmt Memory Type definitions:
>   *
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index c61ae83aadee..99957b1270f2 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -6,10 +6,12 @@
>  #ifndef _ASM_RISCV_PGTABLE_H
>  #define _ASM_RISCV_PGTABLE_H
>  
> +#include <linux/jump_label.h>
>  #include <linux/mmzone.h>
>  #include <linux/sizes.h>
>  
>  #include <asm/pgtable-bits.h>
> +#include <asm/hwcap.h>
>  
>  #ifndef CONFIG_MMU
>  #define KERNEL_LINK_ADDR	PAGE_OFFSET
> @@ -264,10 +266,49 @@ static inline pte_t pud_pte(pud_t pud)
>  	return __pte(pud_val(pud));
>  }
>  
> +static __always_inline bool has_svnapot(void)
> +{
> +	unsigned int _val;
> +
> +	ALT_SVNAPOT(_val);
> +
> +	return _val;
> +}
> +
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +
> +static inline unsigned long pte_napot(pte_t pte)
> +{
> +	return pte_val(pte) & _PAGE_NAPOT;
> +}
> +
> +static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> +{
> +	int pos = order - 1 + _PAGE_PFN_SHIFT;
> +	unsigned long napot_bit = BIT(pos);
> +	unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
> +
> +	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
> +}
> +
> +#else
> +
> +static inline unsigned long pte_napot(pte_t pte)
> +{
> +	return 0;
> +}
> +
> +#endif /* CONFIG_RISCV_ISA_SVNAPOT */
> +
>  /* Yields the page frame number (PFN) of a page table entry */
>  static inline unsigned long pte_pfn(pte_t pte)
>  {
> -	return __page_val_to_pfn(pte_val(pte));
> +	unsigned long res  = __page_val_to_pfn(pte_val(pte));

nit: extra space before the =

> +
> +	if (has_svnapot() && pte_napot(pte))
> +		res = res & (res - 1UL);
> +
> +	return res;
>  }
>  
>  #define pte_page(x)     pfn_to_page(pte_pfn(x))
> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
> index bf9dd6764bad..88495f5fcafd 100644
> --- a/arch/riscv/kernel/cpu.c
> +++ b/arch/riscv/kernel/cpu.c
> @@ -165,6 +165,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
>  	__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
>  	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
>  	__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
> +	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
>  	__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
>  	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>  	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index 694267d1fe81..eeed66c3d497 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -205,6 +205,7 @@ void __init riscv_fill_hwcap(void)
>  				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
>  				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
>  				SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL);
> +				SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT);
>  			}
>  #undef SET_ISA_EXT_MAP
>  		}
> @@ -252,6 +253,17 @@ void __init riscv_fill_hwcap(void)
>  }
>  
>  #ifdef CONFIG_RISCV_ALTERNATIVE
> +static bool __init_or_module cpufeature_probe_svnapot(unsigned int stage)
> +{
> +	if (!IS_ENABLED(CONFIG_RISCV_ISA_SVNAPOT))
> +		return false;
> +
> +	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
> +		return false;
> +
> +	return riscv_isa_extension_available(NULL, SVNAPOT);
> +}
> +
>  static bool __init_or_module cpufeature_probe_svpbmt(unsigned int stage)
>  {
>  	if (!IS_ENABLED(CONFIG_RISCV_ISA_SVPBMT))
> @@ -289,6 +301,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
>  {
>  	u32 cpu_req_feature = 0;
>  
> +	if (cpufeature_probe_svnapot(stage))
> +		cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);
> +
>  	if (cpufeature_probe_svpbmt(stage))
>  		cpu_req_feature |= BIT(CPUFEATURE_SVPBMT);
>  

There's a bunch of stuff in this patch that may go away, depending on
sequencing just FYI. See [1] for more. I wouldn't rebase on top of that,
but just so that you're aware.

1 - https://patchwork.kernel.org/project/linux-riscv/cover/20221204174632.3677-1-jszhang@kernel.org/
Qinglin Pan Dec. 8, 2022, 4:51 a.m. UTC | #3
Hey!

On 2022/12/8 02:46, Conor Dooley wrote:
> Hey!
> 
> Couple small remarks and questions for you.
> 
> On Sun, Dec 04, 2022 at 10:11:35PM +0800, panqinglin2020@iscas.ac.cn wrote:
>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> Add one alternative to enable/disable svnapot support, enable this static
>> key when "svnapot" is in the "riscv,isa" field of fdt and SVNAPOT compile
>> option is set. It will influence the behavior of has_svnapot. All code
>> dependent on svnapot should make sure that has_svnapot return true firstly.
>>
>> Modify PTE definition for Svnapot, and creates some functions in pgtable.h
>> to mark a PTE as napot and check if it is a Svnapot PTE. Until now, only
>> 64KB napot size is supported in spec, so some macros has only 64KB version.
>>
>> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index ef8d66de5f38..1d8477c0af7c 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -395,6 +395,20 @@ config RISCV_ISA_C
>>   
>>   	  If you don't know what to do here, say Y.
>>   
>> +config RISCV_ISA_SVNAPOT
>> +	bool "SVNAPOT extension support"
>> +	depends on 64BIT && MMU
>> +	select RISCV_ALTERNATIVE
>> +	default y
>> +	help
>> +	  Allow kernel to detect SVNAPOT ISA-extension dynamically in boot time
>> +	  and enable its usage.
>> +
>> +	  SVNAPOT extension helps to mark contiguous PTEs as a range
>> +	  of contiguous virtual-to-physical translations, with a naturally
>> +	  aligned power-of-2 (NAPOT) granularity larger than the base 4KB page
>> +	  size.
>> +
>>   config RISCV_ISA_SVPBMT
>>   	bool "SVPBMT extension support"
>>   	depends on 64BIT && MMU
>> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
>> index 4180312d2a70..beadb1126ed9 100644
>> --- a/arch/riscv/include/asm/errata_list.h
>> +++ b/arch/riscv/include/asm/errata_list.h
>> @@ -22,9 +22,10 @@
>>   #define	ERRATA_THEAD_NUMBER 3
>>   #endif
>>   
>> -#define	CPUFEATURE_SVPBMT 0
>> -#define	CPUFEATURE_ZICBOM 1
>> -#define	CPUFEATURE_NUMBER 2
>> +#define	CPUFEATURE_SVPBMT	0
>> +#define	CPUFEATURE_ZICBOM	1
>> +#define	CPUFEATURE_SVNAPOT	2
>> +#define	CPUFEATURE_NUMBER	3
>>   
>>   #ifdef __ASSEMBLY__
>>   
>> @@ -156,6 +157,14 @@ asm volatile(ALTERNATIVE(						\
>>   	: "=r" (__ovl) :						\
>>   	: "memory")
>>   
>> +#define ALT_SVNAPOT(_val)						\
>> +asm(ALTERNATIVE(							\
>> +	"li %0, 0",							\
>> +	"li %0, 1",							\
>> +		0, CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)	\
>> +	: "=r" (_val) :							\
>> +	: "memory")
>> +
>>   #endif /* __ASSEMBLY__ */
>>   
>>   #endif
>> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
>> index b22525290073..4cbc1f45ab26 100644
>> --- a/arch/riscv/include/asm/hwcap.h
>> +++ b/arch/riscv/include/asm/hwcap.h
>> @@ -54,6 +54,7 @@ extern unsigned long elf_hwcap;
>>    */
>>   enum riscv_isa_ext_id {
>>   	RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
>> +	RISCV_ISA_EXT_SVNAPOT,
>>   	RISCV_ISA_EXT_SVPBMT,
>>   	RISCV_ISA_EXT_ZICBOM,
>>   	RISCV_ISA_EXT_ZIHINTPAUSE,
>> @@ -87,7 +88,6 @@ static __always_inline int riscv_isa_ext2key(int num)
>>   {
>>   	switch (num) {
>>   	case RISCV_ISA_EXT_f:
>> -		return RISCV_ISA_EXT_KEY_FPU;
>>   	case RISCV_ISA_EXT_d:
>>   		return RISCV_ISA_EXT_KEY_FPU;
>>   	case RISCV_ISA_EXT_ZIHINTPAUSE:
>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
>> index ac70b0fd9a9a..349fad5e35de 100644
>> --- a/arch/riscv/include/asm/page.h
>> +++ b/arch/riscv/include/asm/page.h
>> @@ -16,11 +16,6 @@
>>   #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
>>   #define PAGE_MASK	(~(PAGE_SIZE - 1))
>>   
>> -#ifdef CONFIG_64BIT
>> -#define HUGE_MAX_HSTATE		2
>> -#else
>> -#define HUGE_MAX_HSTATE		1
>> -#endif
>>   #define HPAGE_SHIFT		PMD_SHIFT
>>   #define HPAGE_SIZE		(_AC(1, UL) << HPAGE_SHIFT)
>>   #define HPAGE_MASK              (~(HPAGE_SIZE - 1))
>> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
>> index dc42375c2357..9611833907ec 100644
>> --- a/arch/riscv/include/asm/pgtable-64.h
>> +++ b/arch/riscv/include/asm/pgtable-64.h
>> @@ -74,6 +74,40 @@ typedef struct {
>>    */
>>   #define _PAGE_PFN_MASK  GENMASK(53, 10)
>>   
>> +/*
>> + * [63] Svnapot definitions:
>> + * 0 Svnapot disabled
>> + * 1 Svnapot enabled
>> + */
>> +#define _PAGE_NAPOT_SHIFT	63
>> +#define _PAGE_NAPOT		BIT(_PAGE_NAPOT_SHIFT)
>> +/*
>> + * Only 64KB (order 4) napot ptes supported.
>> + */
>> +#define NAPOT_CONT_ORDER_BASE 4
>> +enum napot_cont_order {
>> +	NAPOT_CONT64KB_ORDER = NAPOT_CONT_ORDER_BASE,
>> +	NAPOT_ORDER_MAX,
>> +};
>> +
>> +#define for_each_napot_order(order)						\
>> +	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++)
>> +#define for_each_napot_order_rev(order)						\
> 
> Would it be terrible to do s/rev/reverse to make things more obvious?

Maybe we should just keep it in the same style as existing macros like
for_each_mem_range/for_each_mem_range_rev :)

> 
>> +	for (order = NAPOT_ORDER_MAX - 1;					\
>> +	     order >= NAPOT_CONT_ORDER_BASE; order--)
>> +#define napot_cont_order(val)	(__builtin_ctzl((val.pte >> _PAGE_PFN_SHIFT) << 1))
>> +
>> +#define napot_cont_shift(order)	((order) + PAGE_SHIFT)
>> +#define napot_cont_size(order)	BIT(napot_cont_shift(order))
>> +#define napot_cont_mask(order)	(~(napot_cont_size(order) - 1UL))
>> +#define napot_pte_num(order)	BIT(order)
>> +
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +#define HUGE_MAX_HSTATE		(2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
>> +#else
>> +#define HUGE_MAX_HSTATE		2
>> +#endif
> 
> This is coming from a place of *complete* ignorance:
> If CONFIG_RISCV_ISA_SVNAPOT is enabled in the kernel but there is no
> support for it on a platform is it okay to change the value of
> HUGE_MAX_HSTATE? Apologies if I've missed something obvious.

:(
You are right. If CONFIG_RISCV_ISA_SVNAPOT is enabled, HUGE_MAX_HSTATE
will always be 3 whether has_svnapot() is true or false. I will try to
fix this.

> 
>> +
>>   /*
>>    * [62:61] Svpbmt Memory Type definitions:
>>    *
>> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
>> index c61ae83aadee..99957b1270f2 100644
>> --- a/arch/riscv/include/asm/pgtable.h
>> +++ b/arch/riscv/include/asm/pgtable.h
>> @@ -6,10 +6,12 @@
>>   #ifndef _ASM_RISCV_PGTABLE_H
>>   #define _ASM_RISCV_PGTABLE_H
>>   
>> +#include <linux/jump_label.h>
>>   #include <linux/mmzone.h>
>>   #include <linux/sizes.h>
>>   
>>   #include <asm/pgtable-bits.h>
>> +#include <asm/hwcap.h>
>>   
>>   #ifndef CONFIG_MMU
>>   #define KERNEL_LINK_ADDR	PAGE_OFFSET
>> @@ -264,10 +266,49 @@ static inline pte_t pud_pte(pud_t pud)
>>   	return __pte(pud_val(pud));
>>   }
>>   
>> +static __always_inline bool has_svnapot(void)
>> +{
>> +	unsigned int _val;
>> +
>> +	ALT_SVNAPOT(_val);
>> +
>> +	return _val;
>> +}
>> +
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +
>> +static inline unsigned long pte_napot(pte_t pte)
>> +{
>> +	return pte_val(pte) & _PAGE_NAPOT;
>> +}
>> +
>> +static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
>> +{
>> +	int pos = order - 1 + _PAGE_PFN_SHIFT;
>> +	unsigned long napot_bit = BIT(pos);
>> +	unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
>> +
>> +	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
>> +}
>> +
>> +#else
>> +
>> +static inline unsigned long pte_napot(pte_t pte)
>> +{
>> +	return 0;
>> +}
>> +
>> +#endif /* CONFIG_RISCV_ISA_SVNAPOT */
>> +
>>   /* Yields the page frame number (PFN) of a page table entry */
>>   static inline unsigned long pte_pfn(pte_t pte)
>>   {
>> -	return __page_val_to_pfn(pte_val(pte));
>> +	unsigned long res  = __page_val_to_pfn(pte_val(pte));
> 
> nit: extra space before the =

Nice catch! Will remove this extra space :)

> 
>> +
>> +	if (has_svnapot() && pte_napot(pte))
>> +		res = res & (res - 1UL);
>> +
>> +	return res;
>>   }
>>   
>>   #define pte_page(x)     pfn_to_page(pte_pfn(x))
>> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
>> index bf9dd6764bad..88495f5fcafd 100644
>> --- a/arch/riscv/kernel/cpu.c
>> +++ b/arch/riscv/kernel/cpu.c
>> @@ -165,6 +165,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
>>   	__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
>>   	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
>>   	__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
>> +	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
>>   	__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
>>   	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>>   	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
>> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
>> index 694267d1fe81..eeed66c3d497 100644
>> --- a/arch/riscv/kernel/cpufeature.c
>> +++ b/arch/riscv/kernel/cpufeature.c
>> @@ -205,6 +205,7 @@ void __init riscv_fill_hwcap(void)
>>   				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
>>   				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
>>   				SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL);
>> +				SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT);
>>   			}
>>   #undef SET_ISA_EXT_MAP
>>   		}
>> @@ -252,6 +253,17 @@ void __init riscv_fill_hwcap(void)
>>   }
>>   
>>   #ifdef CONFIG_RISCV_ALTERNATIVE
>> +static bool __init_or_module cpufeature_probe_svnapot(unsigned int stage)
>> +{
>> +	if (!IS_ENABLED(CONFIG_RISCV_ISA_SVNAPOT))
>> +		return false;
>> +
>> +	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
>> +		return false;
>> +
>> +	return riscv_isa_extension_available(NULL, SVNAPOT);
>> +}
>> +
>>   static bool __init_or_module cpufeature_probe_svpbmt(unsigned int stage)
>>   {
>>   	if (!IS_ENABLED(CONFIG_RISCV_ISA_SVPBMT))
>> @@ -289,6 +301,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
>>   {
>>   	u32 cpu_req_feature = 0;
>>   
>> +	if (cpufeature_probe_svnapot(stage))
>> +		cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);
>> +
>>   	if (cpufeature_probe_svpbmt(stage))
>>   		cpu_req_feature |= BIT(CPUFEATURE_SVPBMT);
>>   
> 
> There's a bunch of stuff in this patch that may go away, depending on
> sequencing just FYI. See [1] for more. I wouldn't rebase on top of that,
> but just so that you're aware.

Thanks for the information! If this patchset will be merged firstly, 
Jisheng can easily replace implementation of has_svnapot with his 
interface. Otherwise I will do this replacement :)

Thanks,
Qinglin

> 
> 1 - https://patchwork.kernel.org/project/linux-riscv/cover/20221204174632.3677-1-jszhang@kernel.org/
Conor Dooley Dec. 8, 2022, 4:59 a.m. UTC | #4
On 08/12/2022 04:51, Qinglin Pan wrote:
> On 2022/12/8 02:46, Conor Dooley wrote:
>> On Sun, Dec 04, 2022 at 10:11:35PM +0800, panqinglin2020@iscas.ac.cn wrote:
>>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>

>>> +#define for_each_napot_order(order)                                         \
>>> +    for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++)
>>> +#define for_each_napot_order_rev(order)                                             \
>>
>> Would it be terrible to do s/rev/reverse to make things more obvious?
> 
> Maybe we should just keep it in the same style as existing macros like
> for_each_mem_range/for_each_mem_range_rev :)

Yah, makes sense. Sorry for the noise then!
Andrew Jones Dec. 8, 2022, 2:55 p.m. UTC | #5
On Sun, Dec 04, 2022 at 10:11:35PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> Add one alternative to enable/disable svnapot support, enable this static
> key when "svnapot" is in the "riscv,isa" field of fdt and SVNAPOT compile
> option is set. It will influence the behavior of has_svnapot. All code
> dependent on svnapot should make sure that has_svnapot return true firstly.
> 
> Modify PTE definition for Svnapot, and creates some functions in pgtable.h
> to mark a PTE as napot and check if it is a Svnapot PTE. Until now, only
> 64KB napot size is supported in spec, so some macros has only 64KB version.
> 
> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index ef8d66de5f38..1d8477c0af7c 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -395,6 +395,20 @@ config RISCV_ISA_C
>  
>  	  If you don't know what to do here, say Y.
>  
> +config RISCV_ISA_SVNAPOT
> +	bool "SVNAPOT extension support"
> +	depends on 64BIT && MMU
> +	select RISCV_ALTERNATIVE
> +	default y
> +	help
> +	  Allow kernel to detect SVNAPOT ISA-extension dynamically in boot time
> +	  and enable its usage.
> +
> +	  SVNAPOT extension helps to mark contiguous PTEs as a range
> +	  of contiguous virtual-to-physical translations, with a naturally
> +	  aligned power-of-2 (NAPOT) granularity larger than the base 4KB page
> +	  size.

Maybe we want to usual, "If you don't know what to do here, say Y."

> +
>  config RISCV_ISA_SVPBMT
>  	bool "SVPBMT extension support"
>  	depends on 64BIT && MMU
> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> index 4180312d2a70..beadb1126ed9 100644
> --- a/arch/riscv/include/asm/errata_list.h
> +++ b/arch/riscv/include/asm/errata_list.h
> @@ -22,9 +22,10 @@
>  #define	ERRATA_THEAD_NUMBER 3
>  #endif
>  
> -#define	CPUFEATURE_SVPBMT 0
> -#define	CPUFEATURE_ZICBOM 1
> -#define	CPUFEATURE_NUMBER 2
> +#define	CPUFEATURE_SVPBMT	0
> +#define	CPUFEATURE_ZICBOM	1
> +#define	CPUFEATURE_SVNAPOT	2
> +#define	CPUFEATURE_NUMBER	3
>  
>  #ifdef __ASSEMBLY__
>  
> @@ -156,6 +157,14 @@ asm volatile(ALTERNATIVE(						\
>  	: "=r" (__ovl) :						\
>  	: "memory")
>  
> +#define ALT_SVNAPOT(_val)						\
> +asm(ALTERNATIVE(							\
> +	"li %0, 0",							\
> +	"li %0, 1",							\
> +		0, CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)	\
> +	: "=r" (_val) :							\
> +	: "memory")
> +
>  #endif /* __ASSEMBLY__ */
>  
>  #endif
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index b22525290073..4cbc1f45ab26 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -54,6 +54,7 @@ extern unsigned long elf_hwcap;
>   */
>  enum riscv_isa_ext_id {
>  	RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
> +	RISCV_ISA_EXT_SVNAPOT,
>  	RISCV_ISA_EXT_SVPBMT,
>  	RISCV_ISA_EXT_ZICBOM,
>  	RISCV_ISA_EXT_ZIHINTPAUSE,
> @@ -87,7 +88,6 @@ static __always_inline int riscv_isa_ext2key(int num)
>  {
>  	switch (num) {
>  	case RISCV_ISA_EXT_f:
> -		return RISCV_ISA_EXT_KEY_FPU;
>  	case RISCV_ISA_EXT_d:
>  		return RISCV_ISA_EXT_KEY_FPU;
>  	case RISCV_ISA_EXT_ZIHINTPAUSE:
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index ac70b0fd9a9a..349fad5e35de 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -16,11 +16,6 @@
>  #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
>  #define PAGE_MASK	(~(PAGE_SIZE - 1))
>  
> -#ifdef CONFIG_64BIT
> -#define HUGE_MAX_HSTATE		2
> -#else
> -#define HUGE_MAX_HSTATE		1
> -#endif
>  #define HPAGE_SHIFT		PMD_SHIFT
>  #define HPAGE_SIZE		(_AC(1, UL) << HPAGE_SHIFT)
>  #define HPAGE_MASK              (~(HPAGE_SIZE - 1))
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index dc42375c2357..9611833907ec 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -74,6 +74,40 @@ typedef struct {
>   */
>  #define _PAGE_PFN_MASK  GENMASK(53, 10)
>  
> +/*
> + * [63] Svnapot definitions:
> + * 0 Svnapot disabled
> + * 1 Svnapot enabled
> + */
> +#define _PAGE_NAPOT_SHIFT	63
> +#define _PAGE_NAPOT		BIT(_PAGE_NAPOT_SHIFT)
> +/*
> + * Only 64KB (order 4) napot ptes supported.
> + */
> +#define NAPOT_CONT_ORDER_BASE 4
> +enum napot_cont_order {
> +	NAPOT_CONT64KB_ORDER = NAPOT_CONT_ORDER_BASE,
> +	NAPOT_ORDER_MAX,
> +};
> +
> +#define for_each_napot_order(order)						\
> +	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++)
> +#define for_each_napot_order_rev(order)						\
> +	for (order = NAPOT_ORDER_MAX - 1;					\
> +	     order >= NAPOT_CONT_ORDER_BASE; order--)
> +#define napot_cont_order(val)	(__builtin_ctzl((val.pte >> _PAGE_PFN_SHIFT) << 1))
> +
> +#define napot_cont_shift(order)	((order) + PAGE_SHIFT)
> +#define napot_cont_size(order)	BIT(napot_cont_shift(order))
> +#define napot_cont_mask(order)	(~(napot_cont_size(order) - 1UL))
> +#define napot_pte_num(order)	BIT(order)
> +
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +#define HUGE_MAX_HSTATE		(2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
> +#else
> +#define HUGE_MAX_HSTATE		2
> +#endif
> +
>  /*
>   * [62:61] Svpbmt Memory Type definitions:
>   *
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index c61ae83aadee..99957b1270f2 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -6,10 +6,12 @@
>  #ifndef _ASM_RISCV_PGTABLE_H
>  #define _ASM_RISCV_PGTABLE_H
>  
> +#include <linux/jump_label.h>
>  #include <linux/mmzone.h>
>  #include <linux/sizes.h>
>  
>  #include <asm/pgtable-bits.h>
> +#include <asm/hwcap.h>
>  
>  #ifndef CONFIG_MMU
>  #define KERNEL_LINK_ADDR	PAGE_OFFSET
> @@ -264,10 +266,49 @@ static inline pte_t pud_pte(pud_t pud)
>  	return __pte(pud_val(pud));
>  }
>  
> +static __always_inline bool has_svnapot(void)
> +{
> +	unsigned int _val;
> +
> +	ALT_SVNAPOT(_val);

I was actually hoping to see this based on Jisheng's series and then use
riscv_has_extension_likely() or riscv_has_extension_unlikely(). I guess
we can proceed independently and then sort it out later though as you
said in a reply to Conor.

> +
> +	return _val;
> +}
> +
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +
> +static inline unsigned long pte_napot(pte_t pte)
> +{
> +	return pte_val(pte) & _PAGE_NAPOT;
> +}
> +
> +static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> +{
> +	int pos = order - 1 + _PAGE_PFN_SHIFT;
> +	unsigned long napot_bit = BIT(pos);
> +	unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
> +
> +	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
> +}
> +
> +#else
> +
> +static inline unsigned long pte_napot(pte_t pte)
> +{
> +	return 0;
> +}
> +
> +#endif /* CONFIG_RISCV_ISA_SVNAPOT */
> +
>  /* Yields the page frame number (PFN) of a page table entry */
>  static inline unsigned long pte_pfn(pte_t pte)
>  {
> -	return __page_val_to_pfn(pte_val(pte));
> +	unsigned long res  = __page_val_to_pfn(pte_val(pte));
> +
> +	if (has_svnapot() && pte_napot(pte))
> +		res = res & (res - 1UL);
> +
> +	return res;
>  }
>  
>  #define pte_page(x)     pfn_to_page(pte_pfn(x))
> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
> index bf9dd6764bad..88495f5fcafd 100644
> --- a/arch/riscv/kernel/cpu.c
> +++ b/arch/riscv/kernel/cpu.c
> @@ -165,6 +165,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
>  	__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
>  	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
>  	__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
> +	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
>  	__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
>  	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>  	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index 694267d1fe81..eeed66c3d497 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -205,6 +205,7 @@ void __init riscv_fill_hwcap(void)
>  				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
>  				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
>  				SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL);
> +				SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT);
>  			}
>  #undef SET_ISA_EXT_MAP
>  		}
> @@ -252,6 +253,17 @@ void __init riscv_fill_hwcap(void)
>  }
>  
>  #ifdef CONFIG_RISCV_ALTERNATIVE
> +static bool __init_or_module cpufeature_probe_svnapot(unsigned int stage)
> +{
> +	if (!IS_ENABLED(CONFIG_RISCV_ISA_SVNAPOT))
> +		return false;
> +
> +	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
> +		return false;
> +
> +	return riscv_isa_extension_available(NULL, SVNAPOT);
> +}
> +
>  static bool __init_or_module cpufeature_probe_svpbmt(unsigned int stage)
>  {
>  	if (!IS_ENABLED(CONFIG_RISCV_ISA_SVPBMT))
> @@ -289,6 +301,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
>  {
>  	u32 cpu_req_feature = 0;
>  
> +	if (cpufeature_probe_svnapot(stage))
> +		cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);
> +
>  	if (cpufeature_probe_svpbmt(stage))
>  		cpu_req_feature |= BIT(CPUFEATURE_SVPBMT);
>  
> -- 
> 2.37.4
>

Other than the issue with HUGE_MAX_HSTATE that Conor pointed out this
looks good to me.

Reviewed-by: Andrew Jones <ajones@ventanamicro.com>

Thanks,
drew
Qinglin Pan Dec. 8, 2022, 5:34 p.m. UTC | #6
Hi all,

On 2022/12/8 12:51, Qinglin Pan wrote:
> Hey!
> 
> On 2022/12/8 02:46, Conor Dooley wrote:
>> Hey!
>>
>> Couple small remarks and questions for you.
>>
>> On Sun, Dec 04, 2022 at 10:11:35PM +0800, panqinglin2020@iscas.ac.cn 
>> wrote:
>>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>>
>>> Add one alternative to enable/disable svnapot support, enable this 
>>> static
>>> key when "svnapot" is in the "riscv,isa" field of fdt and SVNAPOT 
>>> compile
>>> option is set. It will influence the behavior of has_svnapot. All code
>>> dependent on svnapot should make sure that has_svnapot return true 
>>> firstly.
>>>
>>> Modify PTE definition for Svnapot, and creates some functions in 
>>> pgtable.h
>>> to mark a PTE as napot and check if it is a Svnapot PTE. Until now, only
>>> 64KB napot size is supported in spec, so some macros has only 64KB 
>>> version.
>>>
>>> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>>
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index ef8d66de5f38..1d8477c0af7c 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -395,6 +395,20 @@ config RISCV_ISA_C
>>>         If you don't know what to do here, say Y.
>>> +config RISCV_ISA_SVNAPOT
>>> +    bool "SVNAPOT extension support"
>>> +    depends on 64BIT && MMU
>>> +    select RISCV_ALTERNATIVE
>>> +    default y
>>> +    help
>>> +      Allow kernel to detect SVNAPOT ISA-extension dynamically in 
>>> boot time
>>> +      and enable its usage.
>>> +
>>> +      SVNAPOT extension helps to mark contiguous PTEs as a range
>>> +      of contiguous virtual-to-physical translations, with a naturally
>>> +      aligned power-of-2 (NAPOT) granularity larger than the base 
>>> 4KB page
>>> +      size.
>>> +
>>>   config RISCV_ISA_SVPBMT
>>>       bool "SVPBMT extension support"
>>>       depends on 64BIT && MMU
>>> diff --git a/arch/riscv/include/asm/errata_list.h 
>>> b/arch/riscv/include/asm/errata_list.h
>>> index 4180312d2a70..beadb1126ed9 100644
>>> --- a/arch/riscv/include/asm/errata_list.h
>>> +++ b/arch/riscv/include/asm/errata_list.h
>>> @@ -22,9 +22,10 @@
>>>   #define    ERRATA_THEAD_NUMBER 3
>>>   #endif
>>> -#define    CPUFEATURE_SVPBMT 0
>>> -#define    CPUFEATURE_ZICBOM 1
>>> -#define    CPUFEATURE_NUMBER 2
>>> +#define    CPUFEATURE_SVPBMT    0
>>> +#define    CPUFEATURE_ZICBOM    1
>>> +#define    CPUFEATURE_SVNAPOT    2
>>> +#define    CPUFEATURE_NUMBER    3
>>>   #ifdef __ASSEMBLY__
>>> @@ -156,6 +157,14 @@ asm volatile(ALTERNATIVE(                        \
>>>       : "=r" (__ovl) :                        \
>>>       : "memory")
>>> +#define ALT_SVNAPOT(_val)                        \
>>> +asm(ALTERNATIVE(                            \
>>> +    "li %0, 0",                            \
>>> +    "li %0, 1",                            \
>>> +        0, CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)    \
>>> +    : "=r" (_val) :                            \
>>> +    : "memory")
>>> +
>>>   #endif /* __ASSEMBLY__ */
>>>   #endif
>>> diff --git a/arch/riscv/include/asm/hwcap.h 
>>> b/arch/riscv/include/asm/hwcap.h
>>> index b22525290073..4cbc1f45ab26 100644
>>> --- a/arch/riscv/include/asm/hwcap.h
>>> +++ b/arch/riscv/include/asm/hwcap.h
>>> @@ -54,6 +54,7 @@ extern unsigned long elf_hwcap;
>>>    */
>>>   enum riscv_isa_ext_id {
>>>       RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
>>> +    RISCV_ISA_EXT_SVNAPOT,
>>>       RISCV_ISA_EXT_SVPBMT,
>>>       RISCV_ISA_EXT_ZICBOM,
>>>       RISCV_ISA_EXT_ZIHINTPAUSE,
>>> @@ -87,7 +88,6 @@ static __always_inline int riscv_isa_ext2key(int num)
>>>   {
>>>       switch (num) {
>>>       case RISCV_ISA_EXT_f:
>>> -        return RISCV_ISA_EXT_KEY_FPU;
>>>       case RISCV_ISA_EXT_d:
>>>           return RISCV_ISA_EXT_KEY_FPU;
>>>       case RISCV_ISA_EXT_ZIHINTPAUSE:
>>> diff --git a/arch/riscv/include/asm/page.h 
>>> b/arch/riscv/include/asm/page.h
>>> index ac70b0fd9a9a..349fad5e35de 100644
>>> --- a/arch/riscv/include/asm/page.h
>>> +++ b/arch/riscv/include/asm/page.h
>>> @@ -16,11 +16,6 @@
>>>   #define PAGE_SIZE    (_AC(1, UL) << PAGE_SHIFT)
>>>   #define PAGE_MASK    (~(PAGE_SIZE - 1))
>>> -#ifdef CONFIG_64BIT
>>> -#define HUGE_MAX_HSTATE        2
>>> -#else
>>> -#define HUGE_MAX_HSTATE        1
>>> -#endif
>>>   #define HPAGE_SHIFT        PMD_SHIFT
>>>   #define HPAGE_SIZE        (_AC(1, UL) << HPAGE_SHIFT)
>>>   #define HPAGE_MASK              (~(HPAGE_SIZE - 1))
>>> diff --git a/arch/riscv/include/asm/pgtable-64.h 
>>> b/arch/riscv/include/asm/pgtable-64.h
>>> index dc42375c2357..9611833907ec 100644
>>> --- a/arch/riscv/include/asm/pgtable-64.h
>>> +++ b/arch/riscv/include/asm/pgtable-64.h
>>> @@ -74,6 +74,40 @@ typedef struct {
>>>    */
>>>   #define _PAGE_PFN_MASK  GENMASK(53, 10)
>>> +/*
>>> + * [63] Svnapot definitions:
>>> + * 0 Svnapot disabled
>>> + * 1 Svnapot enabled
>>> + */
>>> +#define _PAGE_NAPOT_SHIFT    63
>>> +#define _PAGE_NAPOT        BIT(_PAGE_NAPOT_SHIFT)
>>> +/*
>>> + * Only 64KB (order 4) napot ptes supported.
>>> + */
>>> +#define NAPOT_CONT_ORDER_BASE 4
>>> +enum napot_cont_order {
>>> +    NAPOT_CONT64KB_ORDER = NAPOT_CONT_ORDER_BASE,
>>> +    NAPOT_ORDER_MAX,
>>> +};
>>> +
>>> +#define for_each_napot_order(order)                        \
>>> +    for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; 
>>> order++)
>>> +#define for_each_napot_order_rev(order)                        \
>>
>> Would it be terrible to do s/rev/reverse to make things more obvious?
> 
> Maybe we should just keep it in the same style as existing macros like
> for_each_mem_range/for_each_mem_range_rev :)
> 
>>
>>> +    for (order = NAPOT_ORDER_MAX - 1;                    \
>>> +         order >= NAPOT_CONT_ORDER_BASE; order--)
>>> +#define napot_cont_order(val)    (__builtin_ctzl((val.pte >> 
>>> _PAGE_PFN_SHIFT) << 1))
>>> +
>>> +#define napot_cont_shift(order)    ((order) + PAGE_SHIFT)
>>> +#define napot_cont_size(order)    BIT(napot_cont_shift(order))
>>> +#define napot_cont_mask(order)    (~(napot_cont_size(order) - 1UL))
>>> +#define napot_pte_num(order)    BIT(order)
>>> +
>>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>>> +#define HUGE_MAX_HSTATE        (2 + (NAPOT_ORDER_MAX - 
>>> NAPOT_CONT_ORDER_BASE))
>>> +#else
>>> +#define HUGE_MAX_HSTATE        2
>>> +#endif
>>
>> This is coming from a place of *complete* ignorance:
>> If CONFIG_RISCV_ISA_SVNAPOT is enabled in the kernel but there is no
>> support for it on a platform is it okay to change the value of
>> HUGE_MAX_HSTATE? Apologies if I've missed something obvious.
> 
> :(
> You are right. If CONFIG_RISCV_ISA_SVNAPOT is enabled, HUGE_MAX_HSTATE
> will always be 3 whether has_svnapot() is true or false. I will try to
> fix this.

I am really expecting that HUGE_MAX_HSTATE can change according to the
platform situation when CONFIG_RISCV_ISA_SVNAPOT is enabled. But it
seems impossible to achive this with a minor patch. Because this value
is used as some static allocated arrays' length (for example, hstates in
mm/hugetlb.c:52) in arch-independent code :(

This immuture HUGE_MAX_HSTATE value will cause no functional error but
it really makes some objects used only partially when 
CONFIG_RISCV_ISA_SVNAPOT is enabled but the platform has no svnapot
support :( I am not sure whether this patch can be accepted with this
feature? Or any other helpful idea to fix this problem?

Please let me know if I ignore any information about it.

Regards,
Qinglin.

> 
>>
>>> +
>>>   /*
>>>    * [62:61] Svpbmt Memory Type definitions:
>>>    *
>>> diff --git a/arch/riscv/include/asm/pgtable.h 
>>> b/arch/riscv/include/asm/pgtable.h
>>> index c61ae83aadee..99957b1270f2 100644
>>> --- a/arch/riscv/include/asm/pgtable.h
>>> +++ b/arch/riscv/include/asm/pgtable.h
>>> @@ -6,10 +6,12 @@
>>>   #ifndef _ASM_RISCV_PGTABLE_H
>>>   #define _ASM_RISCV_PGTABLE_H
>>> +#include <linux/jump_label.h>
>>>   #include <linux/mmzone.h>
>>>   #include <linux/sizes.h>
>>>   #include <asm/pgtable-bits.h>
>>> +#include <asm/hwcap.h>
>>>   #ifndef CONFIG_MMU
>>>   #define KERNEL_LINK_ADDR    PAGE_OFFSET
>>> @@ -264,10 +266,49 @@ static inline pte_t pud_pte(pud_t pud)
>>>       return __pte(pud_val(pud));
>>>   }
>>> +static __always_inline bool has_svnapot(void)
>>> +{
>>> +    unsigned int _val;
>>> +
>>> +    ALT_SVNAPOT(_val);
>>> +
>>> +    return _val;
>>> +}
>>> +
>>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>>> +
>>> +static inline unsigned long pte_napot(pte_t pte)
>>> +{
>>> +    return pte_val(pte) & _PAGE_NAPOT;
>>> +}
>>> +
>>> +static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
>>> +{
>>> +    int pos = order - 1 + _PAGE_PFN_SHIFT;
>>> +    unsigned long napot_bit = BIT(pos);
>>> +    unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
>>> +
>>> +    return __pte((pte_val(pte) & napot_mask) | napot_bit | 
>>> _PAGE_NAPOT);
>>> +}
>>> +
>>> +#else
>>> +
>>> +static inline unsigned long pte_napot(pte_t pte)
>>> +{
>>> +    return 0;
>>> +}
>>> +
>>> +#endif /* CONFIG_RISCV_ISA_SVNAPOT */
>>> +
>>>   /* Yields the page frame number (PFN) of a page table entry */
>>>   static inline unsigned long pte_pfn(pte_t pte)
>>>   {
>>> -    return __page_val_to_pfn(pte_val(pte));
>>> +    unsigned long res  = __page_val_to_pfn(pte_val(pte));
>>
>> nit: extra space before the =
> 
> Nice catch! Will remove this extra space :)
> 
>>
>>> +
>>> +    if (has_svnapot() && pte_napot(pte))
>>> +        res = res & (res - 1UL);
>>> +
>>> +    return res;
>>>   }
>>>   #define pte_page(x)     pfn_to_page(pte_pfn(x))
>>> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
>>> index bf9dd6764bad..88495f5fcafd 100644
>>> --- a/arch/riscv/kernel/cpu.c
>>> +++ b/arch/riscv/kernel/cpu.c
>>> @@ -165,6 +165,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
>>>       __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
>>>       __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
>>>       __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
>>> +    __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
>>>       __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
>>>       __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>>>       __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
>>> diff --git a/arch/riscv/kernel/cpufeature.c 
>>> b/arch/riscv/kernel/cpufeature.c
>>> index 694267d1fe81..eeed66c3d497 100644
>>> --- a/arch/riscv/kernel/cpufeature.c
>>> +++ b/arch/riscv/kernel/cpufeature.c
>>> @@ -205,6 +205,7 @@ void __init riscv_fill_hwcap(void)
>>>                   SET_ISA_EXT_MAP("zihintpause", 
>>> RISCV_ISA_EXT_ZIHINTPAUSE);
>>>                   SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
>>>                   SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL);
>>> +                SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT);
>>>               }
>>>   #undef SET_ISA_EXT_MAP
>>>           }
>>> @@ -252,6 +253,17 @@ void __init riscv_fill_hwcap(void)
>>>   }
>>>   #ifdef CONFIG_RISCV_ALTERNATIVE
>>> +static bool __init_or_module cpufeature_probe_svnapot(unsigned int 
>>> stage)
>>> +{
>>> +    if (!IS_ENABLED(CONFIG_RISCV_ISA_SVNAPOT))
>>> +        return false;
>>> +
>>> +    if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
>>> +        return false;
>>> +
>>> +    return riscv_isa_extension_available(NULL, SVNAPOT);
>>> +}
>>> +
>>>   static bool __init_or_module cpufeature_probe_svpbmt(unsigned int 
>>> stage)
>>>   {
>>>       if (!IS_ENABLED(CONFIG_RISCV_ISA_SVPBMT))
>>> @@ -289,6 +301,9 @@ static u32 __init_or_module 
>>> cpufeature_probe(unsigned int stage)
>>>   {
>>>       u32 cpu_req_feature = 0;
>>> +    if (cpufeature_probe_svnapot(stage))
>>> +        cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);
>>> +
>>>       if (cpufeature_probe_svpbmt(stage))
>>>           cpu_req_feature |= BIT(CPUFEATURE_SVPBMT);
>>
>> There's a bunch of stuff in this patch that may go away, depending on
>> sequencing just FYI. See [1] for more. I wouldn't rebase on top of that,
>> but just so that you're aware.
> 
> Thanks for the information! If this patchset will be merged firstly, 
> Jisheng can easily replace implementation of has_svnapot with his 
> interface. Otherwise I will do this replacement :)
> 
> Thanks,
> Qinglin
> 
>>
>> 1 - 
>> https://patchwork.kernel.org/project/linux-riscv/cover/20221204174632.3677-1-jszhang@kernel.org/
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Andrew Jones Dec. 9, 2022, 7:42 a.m. UTC | #7
On Fri, Dec 09, 2022 at 01:34:42AM +0800, Qinglin Pan wrote:
...
> > > > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > > > +#define HUGE_MAX_HSTATE        (2 + (NAPOT_ORDER_MAX -
> > > > NAPOT_CONT_ORDER_BASE))
> > > > +#else
> > > > +#define HUGE_MAX_HSTATE        2
> > > > +#endif
> > > 
> > > This is coming from a place of *complete* ignorance:
> > > If CONFIG_RISCV_ISA_SVNAPOT is enabled in the kernel but there is no
> > > support for it on a platform is it okay to change the value of
> > > HUGE_MAX_HSTATE? Apologies if I've missed something obvious.
> > 
> > :(
> > You are right. If CONFIG_RISCV_ISA_SVNAPOT is enabled, HUGE_MAX_HSTATE
> > will always be 3 whether has_svnapot() is true or false. I will try to
> > fix this.
> 
> I am really expecting that HUGE_MAX_HSTATE can change according to the
> platform situation when CONFIG_RISCV_ISA_SVNAPOT is enabled. But it
> seems impossible to achive this with a minor patch. Because this value
> is used as some static allocated arrays' length (for example, hstates in
> mm/hugetlb.c:52) in arch-independent code :(
> 
> This immuture HUGE_MAX_HSTATE value will cause no functional error but
> it really makes some objects used only partially when
> CONFIG_RISCV_ISA_SVNAPOT is enabled but the platform has no svnapot
> support :( I am not sure whether this patch can be accepted with this
> feature? Or any other helpful idea to fix this problem?
> 
> Please let me know if I ignore any information about it.

I agree that the only problem appears to a waste of memory, particularly
if cgroups are enabled. I don't see any easy solution either. Maybe a
warning in the Kconfig text would be sufficient?

Thanks,
drew
Conor Dooley Dec. 9, 2022, 2:33 p.m. UTC | #8
On 9 December 2022 08:42:29 GMT+01:00, Andrew Jones <ajones@ventanamicro.com> wrote:
>On Fri, Dec 09, 2022 at 01:34:42AM +0800, Qinglin Pan wrote:
>...
>> > > > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> > > > +#define HUGE_MAX_HSTATE        (2 + (NAPOT_ORDER_MAX -
>> > > > NAPOT_CONT_ORDER_BASE))
>> > > > +#else
>> > > > +#define HUGE_MAX_HSTATE        2
>> > > > +#endif
>> > > 
>> > > This is coming from a place of *complete* ignorance:
>> > > If CONFIG_RISCV_ISA_SVNAPOT is enabled in the kernel but there is no
>> > > support for it on a platform is it okay to change the value of
>> > > HUGE_MAX_HSTATE? Apologies if I've missed something obvious.
>> > 
>> > :(
>> > You are right. If CONFIG_RISCV_ISA_SVNAPOT is enabled, HUGE_MAX_HSTATE
>> > will always be 3 whether has_svnapot() is true or false. I will try to
>> > fix this.
>> 
>> I am really expecting that HUGE_MAX_HSTATE can change according to the
>> platform situation when CONFIG_RISCV_ISA_SVNAPOT is enabled. But it
>> seems impossible to achive this with a minor patch. Because this value
>> is used as some static allocated arrays' length (for example, hstates in
>> mm/hugetlb.c:52) in arch-independent code :(
>> 
>> This immuture HUGE_MAX_HSTATE value will cause no functional error but
>> it really makes some objects used only partially when
>> CONFIG_RISCV_ISA_SVNAPOT is enabled but the platform has no svnapot
>> support :( I am not sure whether this patch can be accepted with this
>> feature? Or any other helpful idea to fix this problem?
>> 
>> Please let me know if I ignore any information about it.
>
>I agree that the only problem appears to a waste of memory, particularly

Caveat about ignorance still applies, how much of a waste of memory are we talking?

>if cgroups are enabled. I don't see any easy solution either. Maybe a
>warning in the Kconfig text would be sufficient?

I think that entirely depends on how wasteful it is.
If it's minor, sure?

Thanks,
Conor.
Qinglin Pan Dec. 9, 2022, 3:36 p.m. UTC | #9
Hi all,

On 2022/12/9 22:33, Conor Dooley wrote:
> 
> 
> On 9 December 2022 08:42:29 GMT+01:00, Andrew Jones <ajones@ventanamicro.com> wrote:
>> On Fri, Dec 09, 2022 at 01:34:42AM +0800, Qinglin Pan wrote:
>> ...
>>>>>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>>>>>> +#define HUGE_MAX_HSTATE        (2 + (NAPOT_ORDER_MAX -
>>>>>> NAPOT_CONT_ORDER_BASE))
>>>>>> +#else
>>>>>> +#define HUGE_MAX_HSTATE        2
>>>>>> +#endif
>>>>>
>>>>> This is coming from a place of *complete* ignorance:
>>>>> If CONFIG_RISCV_ISA_SVNAPOT is enabled in the kernel but there is no
>>>>> support for it on a platform is it okay to change the value of
>>>>> HUGE_MAX_HSTATE? Apologies if I've missed something obvious.
>>>>
>>>> :(
>>>> You are right. If CONFIG_RISCV_ISA_SVNAPOT is enabled, HUGE_MAX_HSTATE
>>>> will always be 3 whether has_svnapot() is true or false. I will try to
>>>> fix this.
>>>
>>> I am really expecting that HUGE_MAX_HSTATE can change according to the
>>> platform situation when CONFIG_RISCV_ISA_SVNAPOT is enabled. But it
>>> seems impossible to achive this with a minor patch. Because this value
>>> is used as some static allocated arrays' length (for example, hstates in
>>> mm/hugetlb.c:52) in arch-independent code :(
>>>
>>> This immuture HUGE_MAX_HSTATE value will cause no functional error but
>>> it really makes some objects used only partially when
>>> CONFIG_RISCV_ISA_SVNAPOT is enabled but the platform has no svnapot
>>> support :( I am not sure whether this patch can be accepted with this
>>> feature? Or any other helpful idea to fix this problem?
>>>
>>> Please let me know if I ignore any information about it.
>>
>> I agree that the only problem appears to a waste of memory, particularly
> 
> Caveat about ignorance still applies, how much of a waste of memory are we talking?
> 
>> if cgroups are enabled. I don't see any easy solution either. Maybe a
>> warning in the Kconfig text would be sufficient?
> 
> I think that entirely depends on how wasteful it is.
> If it's minor, sure?
> 

I am also not sure how minor this waste will be. But I believe it will
be acceptable. Because before this patchset, we also use only 1/2 of
HUGE_MAX_HSTATE value (which is 2) when CONFIG_64BIT is enabled and
CONFIG_CONTIG_ALLOC is disabled. As HUGE_MAX_HSTATE is a 'max' value,
it is normal to use it up only sometimes but not always.

I prefer to do as Andrew said. To add an notification about this waste
in KConfig text, and still use the immuture HUGE_MAX_HSTATE value.

@Andrew @Conor what do you think of this explanation and this solution?

Please let me know if I ignore anything. Thanks a lot!

Regards,
Qinglin.

> Thanks,
> Conor.
Andrew Jones Dec. 9, 2022, 5:04 p.m. UTC | #10
On Fri, Dec 09, 2022 at 11:36:53PM +0800, Qinglin Pan wrote:
> Hi all,
> 
> On 2022/12/9 22:33, Conor Dooley wrote:
> > 
> > 
> > On 9 December 2022 08:42:29 GMT+01:00, Andrew Jones <ajones@ventanamicro.com> wrote:
> > > On Fri, Dec 09, 2022 at 01:34:42AM +0800, Qinglin Pan wrote:
> > > ...
> > > > > > > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > > > > > > +#define HUGE_MAX_HSTATE        (2 + (NAPOT_ORDER_MAX -
> > > > > > > NAPOT_CONT_ORDER_BASE))
> > > > > > > +#else
> > > > > > > +#define HUGE_MAX_HSTATE        2
> > > > > > > +#endif
> > > > > > 
> > > > > > This is coming from a place of *complete* ignorance:
> > > > > > If CONFIG_RISCV_ISA_SVNAPOT is enabled in the kernel but there is no
> > > > > > support for it on a platform is it okay to change the value of
> > > > > > HUGE_MAX_HSTATE? Apologies if I've missed something obvious.
> > > > > 
> > > > > :(
> > > > > You are right. If CONFIG_RISCV_ISA_SVNAPOT is enabled, HUGE_MAX_HSTATE
> > > > > will always be 3 whether has_svnapot() is true or false. I will try to
> > > > > fix this.
> > > > 
> > > > I am really expecting that HUGE_MAX_HSTATE can change according to the
> > > > platform situation when CONFIG_RISCV_ISA_SVNAPOT is enabled. But it
> > > > seems impossible to achive this with a minor patch. Because this value
> > > > is used as some static allocated arrays' length (for example, hstates in
> > > > mm/hugetlb.c:52) in arch-independent code :(
> > > > 
> > > > This immuture HUGE_MAX_HSTATE value will cause no functional error but
> > > > it really makes some objects used only partially when
> > > > CONFIG_RISCV_ISA_SVNAPOT is enabled but the platform has no svnapot
> > > > support :( I am not sure whether this patch can be accepted with this
> > > > feature? Or any other helpful idea to fix this problem?
> > > > 
> > > > Please let me know if I ignore any information about it.
> > > 
> > > I agree that the only problem appears to a waste of memory, particularly
> > 
> > Caveat about ignorance still applies, how much of a waste of memory are we talking?
> > 
> > > if cgroups are enabled. I don't see any easy solution either. Maybe a
> > > warning in the Kconfig text would be sufficient?
> > 
> > I think that entirely depends on how wasteful it is.
> > If it's minor, sure?
> > 
> 
> I am also not sure how minor this waste will be. But I believe it will
> be acceptable. Because before this patchset, we also use only 1/2 of
> HUGE_MAX_HSTATE value (which is 2) when CONFIG_64BIT is enabled and
> CONFIG_CONTIG_ALLOC is disabled. As HUGE_MAX_HSTATE is a 'max' value,
> it is normal to use it up only sometimes but not always.
> 
> I prefer to do as Andrew said. To add an notification about this waste
> in KConfig text, and still use the immuture HUGE_MAX_HSTATE value.
> 
> @Andrew @Conor what do you think of this explanation and this solution?

To get a better estimate of the waste it should be possible to write a
quick kernel module which outputs sizeof on a few structs. But, from a
quick read of the structs, I think we're probably fine. Certainly it looks
like without CGROUP_HUGETLB and with a small MAX_NUMNODES it's no big
deal. With CGROUP_HUGETLB it's definitely more, but probably OK.
MAX_NUMNODES probably won't ever be huge, so, while there are five arrays
of MAX_NUMNODES elements in hstates, it's unlikely to matter much,
especially since the types of the array elements are small.

It'd be nice to get actual numbers, but based on that analysis I think I'm
OK with the comment in Kconfig.

Thanks,
drew

> 
> Please let me know if I ignore anything. Thanks a lot!
> 
> Regards,
> Qinglin.
> 
> > Thanks,
> > Conor.
>
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index ef8d66de5f38..1d8477c0af7c 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -395,6 +395,20 @@  config RISCV_ISA_C
 
 	  If you don't know what to do here, say Y.
 
+config RISCV_ISA_SVNAPOT
+	bool "SVNAPOT extension support"
+	depends on 64BIT && MMU
+	select RISCV_ALTERNATIVE
+	default y
+	help
+	  Allow kernel to detect SVNAPOT ISA-extension dynamically in boot time
+	  and enable its usage.
+
+	  SVNAPOT extension helps to mark contiguous PTEs as a range
+	  of contiguous virtual-to-physical translations, with a naturally
+	  aligned power-of-2 (NAPOT) granularity larger than the base 4KB page
+	  size.
+
 config RISCV_ISA_SVPBMT
 	bool "SVPBMT extension support"
 	depends on 64BIT && MMU
diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
index 4180312d2a70..beadb1126ed9 100644
--- a/arch/riscv/include/asm/errata_list.h
+++ b/arch/riscv/include/asm/errata_list.h
@@ -22,9 +22,10 @@ 
 #define	ERRATA_THEAD_NUMBER 3
 #endif
 
-#define	CPUFEATURE_SVPBMT 0
-#define	CPUFEATURE_ZICBOM 1
-#define	CPUFEATURE_NUMBER 2
+#define	CPUFEATURE_SVPBMT	0
+#define	CPUFEATURE_ZICBOM	1
+#define	CPUFEATURE_SVNAPOT	2
+#define	CPUFEATURE_NUMBER	3
 
 #ifdef __ASSEMBLY__
 
@@ -156,6 +157,14 @@  asm volatile(ALTERNATIVE(						\
 	: "=r" (__ovl) :						\
 	: "memory")
 
+#define ALT_SVNAPOT(_val)						\
+asm(ALTERNATIVE(							\
+	"li %0, 0",							\
+	"li %0, 1",							\
+		0, CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)	\
+	: "=r" (_val) :							\
+	: "memory")
+
 #endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index b22525290073..4cbc1f45ab26 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -54,6 +54,7 @@  extern unsigned long elf_hwcap;
  */
 enum riscv_isa_ext_id {
 	RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
+	RISCV_ISA_EXT_SVNAPOT,
 	RISCV_ISA_EXT_SVPBMT,
 	RISCV_ISA_EXT_ZICBOM,
 	RISCV_ISA_EXT_ZIHINTPAUSE,
@@ -87,7 +88,6 @@  static __always_inline int riscv_isa_ext2key(int num)
 {
 	switch (num) {
 	case RISCV_ISA_EXT_f:
-		return RISCV_ISA_EXT_KEY_FPU;
 	case RISCV_ISA_EXT_d:
 		return RISCV_ISA_EXT_KEY_FPU;
 	case RISCV_ISA_EXT_ZIHINTPAUSE:
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index ac70b0fd9a9a..349fad5e35de 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -16,11 +16,6 @@ 
 #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
 #define PAGE_MASK	(~(PAGE_SIZE - 1))
 
-#ifdef CONFIG_64BIT
-#define HUGE_MAX_HSTATE		2
-#else
-#define HUGE_MAX_HSTATE		1
-#endif
 #define HPAGE_SHIFT		PMD_SHIFT
 #define HPAGE_SIZE		(_AC(1, UL) << HPAGE_SHIFT)
 #define HPAGE_MASK              (~(HPAGE_SIZE - 1))
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index dc42375c2357..9611833907ec 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -74,6 +74,40 @@  typedef struct {
  */
 #define _PAGE_PFN_MASK  GENMASK(53, 10)
 
+/*
+ * [63] Svnapot definitions:
+ * 0 Svnapot disabled
+ * 1 Svnapot enabled
+ */
+#define _PAGE_NAPOT_SHIFT	63
+#define _PAGE_NAPOT		BIT(_PAGE_NAPOT_SHIFT)
+/*
+ * Only 64KB (order 4) napot ptes supported.
+ */
+#define NAPOT_CONT_ORDER_BASE 4
+enum napot_cont_order {
+	NAPOT_CONT64KB_ORDER = NAPOT_CONT_ORDER_BASE,
+	NAPOT_ORDER_MAX,
+};
+
+#define for_each_napot_order(order)						\
+	for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++)
+#define for_each_napot_order_rev(order)						\
+	for (order = NAPOT_ORDER_MAX - 1;					\
+	     order >= NAPOT_CONT_ORDER_BASE; order--)
+#define napot_cont_order(val)	(__builtin_ctzl((val.pte >> _PAGE_PFN_SHIFT) << 1))
+
+#define napot_cont_shift(order)	((order) + PAGE_SHIFT)
+#define napot_cont_size(order)	BIT(napot_cont_shift(order))
+#define napot_cont_mask(order)	(~(napot_cont_size(order) - 1UL))
+#define napot_pte_num(order)	BIT(order)
+
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+#define HUGE_MAX_HSTATE		(2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
+#else
+#define HUGE_MAX_HSTATE		2
+#endif
+
 /*
  * [62:61] Svpbmt Memory Type definitions:
  *
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index c61ae83aadee..99957b1270f2 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -6,10 +6,12 @@ 
 #ifndef _ASM_RISCV_PGTABLE_H
 #define _ASM_RISCV_PGTABLE_H
 
+#include <linux/jump_label.h>
 #include <linux/mmzone.h>
 #include <linux/sizes.h>
 
 #include <asm/pgtable-bits.h>
+#include <asm/hwcap.h>
 
 #ifndef CONFIG_MMU
 #define KERNEL_LINK_ADDR	PAGE_OFFSET
@@ -264,10 +266,49 @@  static inline pte_t pud_pte(pud_t pud)
 	return __pte(pud_val(pud));
 }
 
+static __always_inline bool has_svnapot(void)
+{
+	unsigned int _val;
+
+	ALT_SVNAPOT(_val);
+
+	return _val;
+}
+
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+
+static inline unsigned long pte_napot(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_NAPOT;
+}
+
+static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
+{
+	int pos = order - 1 + _PAGE_PFN_SHIFT;
+	unsigned long napot_bit = BIT(pos);
+	unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
+
+	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
+}
+
+#else
+
+static inline unsigned long pte_napot(pte_t pte)
+{
+	return 0;
+}
+
+#endif /* CONFIG_RISCV_ISA_SVNAPOT */
+
 /* Yields the page frame number (PFN) of a page table entry */
 static inline unsigned long pte_pfn(pte_t pte)
 {
-	return __page_val_to_pfn(pte_val(pte));
+	unsigned long res  = __page_val_to_pfn(pte_val(pte));
+
+	if (has_svnapot() && pte_napot(pte))
+		res = res & (res - 1UL);
+
+	return res;
 }
 
 #define pte_page(x)     pfn_to_page(pte_pfn(x))
diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index bf9dd6764bad..88495f5fcafd 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -165,6 +165,7 @@  static struct riscv_isa_ext_data isa_ext_arr[] = {
 	__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
 	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
 	__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
+	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
 	__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
 	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
 	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 694267d1fe81..eeed66c3d497 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -205,6 +205,7 @@  void __init riscv_fill_hwcap(void)
 				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
 				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
 				SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL);
+				SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT);
 			}
 #undef SET_ISA_EXT_MAP
 		}
@@ -252,6 +253,17 @@  void __init riscv_fill_hwcap(void)
 }
 
 #ifdef CONFIG_RISCV_ALTERNATIVE
+static bool __init_or_module cpufeature_probe_svnapot(unsigned int stage)
+{
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_SVNAPOT))
+		return false;
+
+	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
+		return false;
+
+	return riscv_isa_extension_available(NULL, SVNAPOT);
+}
+
 static bool __init_or_module cpufeature_probe_svpbmt(unsigned int stage)
 {
 	if (!IS_ENABLED(CONFIG_RISCV_ISA_SVPBMT))
@@ -289,6 +301,9 @@  static u32 __init_or_module cpufeature_probe(unsigned int stage)
 {
 	u32 cpu_req_feature = 0;
 
+	if (cpufeature_probe_svnapot(stage))
+		cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);
+
 	if (cpufeature_probe_svpbmt(stage))
 		cpu_req_feature |= BIT(CPUFEATURE_SVPBMT);