
[2/5] arm64: mm: introduce 52-bit userspace support

Message ID 20180829124543.25314-3-steve.capper@arm.com (mailing list archive)
State New, archived
Series 52-bit userspace VAs

Commit Message

Steve Capper Aug. 29, 2018, 12:45 p.m. UTC
On arm64 there is optional support for a 52-bit virtual address space.
To exploit this, one must be running with a 64KB page size on hardware
that supports the feature.

For an arm64 kernel supporting a 48-bit VA with a 64KB page size,
a few changes are needed to support a 52-bit userspace:
 * TCR_EL1.T0SZ needs to be 12 instead of 16,
 * pgd_offset needs to work with a different PTRS_PER_PGD,
 * PGD_SIZE needs to be increased,
 * TASK_SIZE needs to reflect the new size.

This patch implements the above when the support for 52-bit VAs is
detected at early boot time.

On arm64, userspace address translation is controlled by TTBR0_EL1. As
well as userspace, TTBR0_EL1 controls:
 * The identity mapping,
 * EFI runtime code.

It is possible to run a kernel with an identity mapping that has a
larger VA size than userspace (and for this case __cpu_set_tcr_t0sz()
would set TCR_EL1.T0SZ as appropriate). However, when the conditions for
52-bit userspace are met, it is possible to keep TCR_EL1.T0SZ fixed at
12. Thus in this patch, the TCR_EL1.T0SZ size-changing logic is
disabled.
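
For reference, T0SZ simply encodes how many of the 64 address bits are left
unused by TTBR0, i.e. 64 minus the VA width. A minimal userspace sketch of
that arithmetic (illustrative only, not kernel code):

	#include <stdio.h>

	/* T0SZ encodes the unused top bits of the TTBR0 range: 64 - VA width. */
	static int t0sz(int va_bits)
	{
		return 64 - va_bits;
	}

	int main(void)
	{
		printf("48-bit VA -> T0SZ = %d\n", t0sz(48));	/* 16 */
		printf("52-bit VA -> T0SZ = %d\n", t0sz(52));	/* 12 */
		return 0;
	}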

A future patch enables CONFIG_ARM64_TRY_52BIT_VA (which essentially
activates this patch), as we also need to change the mmap logic to maintain
compatibility with userspace that expects at most 48 bits of VA.

Signed-off-by: Steve Capper <steve.capper@arm.com>
---
 arch/arm64/include/asm/assembler.h   |  7 +++----
 arch/arm64/include/asm/cpucaps.h     |  3 ++-
 arch/arm64/include/asm/mmu_context.h |  3 +++
 arch/arm64/include/asm/pgalloc.h     |  4 ++++
 arch/arm64/include/asm/pgtable.h     | 16 +++++++++++++---
 arch/arm64/include/asm/processor.h   | 10 +++++-----
 arch/arm64/kernel/cpufeature.c       | 16 ++++++++++++++++
 arch/arm64/kernel/entry.S            |  6 +++++-
 arch/arm64/kernel/head.S             | 13 +++++++++++++
 arch/arm64/mm/fault.c                |  2 +-
 arch/arm64/mm/mmu.c                  |  1 +
 arch/arm64/mm/proc.S                 | 10 +++++++++-
 12 files changed, 75 insertions(+), 16 deletions(-)

Comments

Catalin Marinas Sept. 21, 2018, 5:40 p.m. UTC | #1
On Wed, Aug 29, 2018 at 01:45:40PM +0100, Steve Capper wrote:
> On arm64 there is optional support for a 52-bit virtual address space.
> To exploit this one has to be running with a 64KB page size and be
> running on hardware that supports this.
> 
> For an arm64 kernel supporting a 48 bit VA with a 64KB page size,
> a few changes are needed to support a 52-bit userspace:
>  * TCR_EL1.T0SZ needs to be 12 instead of 16,
>  * pgd_offset needs to work with a different PTRS_PER_PGD,
>  * PGD_SIZE needs to be increased,
>  * TASK_SIZE needs to reflect the new size.

I will comment on the general approach here. We indeed need TASK_SIZE
and T0SZ to be variable based on the supported feature. However, I think
PGD_SIZE can be fixed to cover 52-bit VA all the time, we just need to
ensure that we allocate a bigger pgd when this option is enabled.

When the 52-bit VA is not present, everything still works as normal as
the pgd entries for a 48-bit VA are written at the beginning of the pgd.
There is no need to change pgd_offset/index etc.
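
To illustrate why (assuming a 64KB granule, where PGDIR_SHIFT is 42): indexing
into a pgd sized for 52 bits is harmless for 48-bit addresses, since their pgd
index can never exceed 63. A standalone sketch, not kernel code:

	#include <stdio.h>

	#define PGDIR_SHIFT	42				/* 64KB granule, 3 levels */
	#define PTRS_PER_PGD_52	(1UL << (52 - PGDIR_SHIFT))	/* 1024 entries */

	static unsigned long pgd_index(unsigned long addr, unsigned long ptrs)
	{
		return (addr >> PGDIR_SHIFT) & (ptrs - 1);
	}

	int main(void)
	{
		unsigned long addr48 = (1UL << 48) - 1;	/* highest 48-bit address */

		/* Masking with the 52-bit table size leaves 48-bit indices unchanged. */
		printf("index = %lu (at most 63 for 48-bit VAs)\n",
		       pgd_index(addr48, PTRS_PER_PGD_52));
		return 0;
	}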

> A future patch enables CONFIG_ARM64_TRY_52BIT_VA (which essentially
> activates this patch) as we need to also change the mmap logic to maintain
> compatibility with a userspace that expects at most 48-bits of VA.

And I would just drop the "TRY" part in the above config (but mention it
in the help text that it is used only if available).

> diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
> index 2e05bcd944c8..7f50804a677e 100644
> --- a/arch/arm64/include/asm/pgalloc.h
> +++ b/arch/arm64/include/asm/pgalloc.h
> @@ -27,7 +27,11 @@
>  #define check_pgt_cache()		do { } while (0)
>  
>  #define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
> +#ifdef CONFIG_ARM64_TRY_52BIT_VA
> +#define PGD_SIZE	((1 << (52 - PGDIR_SHIFT)) * sizeof(pgd_t))
> +#else
>  #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
> +#endif

OK, so you've already made the PGD_SIZE static here (unless changed in
subsequent patches). 

> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 1bdeca8918a6..8449e266cd46 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -577,11 +577,21 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
>  #define pgd_ERROR(pgd)		__pgd_error(__FILE__, __LINE__, pgd_val(pgd))
>  
>  /* to find an entry in a page-table-directory */
> -#define pgd_index(addr)		(((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
> +#define pgd_index(addr, ptrs)		(((addr) >> PGDIR_SHIFT) & ((ptrs) - 1))
> +#define _pgd_offset_raw(pgd, addr, ptrs) ((pgd) + pgd_index(addr, ptrs))
> +#define pgd_offset_raw(pgd, addr)	(_pgd_offset_raw(pgd, addr, PTRS_PER_PGD))
>  
> -#define pgd_offset_raw(pgd, addr)	((pgd) + pgd_index(addr))
> +static inline pgd_t *pgd_offset(const struct mm_struct *mm, unsigned long addr)
> +{
> +	pgd_t *ret;
> +
> +	if (IS_ENABLED(CONFIG_ARM64_TRY_52BIT_VA) && (addr < TASK_SIZE))
> +		ret = _pgd_offset_raw(mm->pgd, addr, 1ULL << (vabits_user - PGDIR_SHIFT));
> +	else
> +		ret = pgd_offset_raw(mm->pgd, addr);
>  
> -#define pgd_offset(mm, addr)	(pgd_offset_raw((mm)->pgd, (addr)))
> +	return ret;
> +}
>  
>  /* to find an entry in a kernel page-table-directory */
>  #define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)

We can decouple pgd_offset_k() from pgd_offset() and there wouldn't be a
need to check the addr < TASK_SIZE. Do we have any case where
pgd_offset() is used on a kernel address?
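
One possible shape of such a decoupling (a sketch only, reusing the helpers
introduced by this patch; not what was ultimately merged):

	/* Sketch: kernel lookups keep the fixed PTRS_PER_PGD-sized index... */
	#define pgd_offset_k(addr)	pgd_offset_raw(init_mm.pgd, (addr))

	/* ...while user lookups index by the runtime-detected vabits_user. */
	static inline pgd_t *pgd_offset(const struct mm_struct *mm, unsigned long addr)
	{
		return _pgd_offset_raw(mm->pgd, addr,
				       1ULL << (vabits_user - PGDIR_SHIFT));
	}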

> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> index 46c9d9ff028c..ba63b8a8dac1 100644
> --- a/arch/arm64/include/asm/processor.h
> +++ b/arch/arm64/include/asm/processor.h
> @@ -19,13 +19,13 @@
>  #ifndef __ASM_PROCESSOR_H
>  #define __ASM_PROCESSOR_H
>  
> -#define TASK_SIZE_64		(UL(1) << VA_BITS)
> -
> -#define KERNEL_DS	UL(-1)
> -#define USER_DS		(TASK_SIZE_64 - 1)
> -
> +#define KERNEL_DS		UL(-1)
> +#define _USER_DS(vabits)	((UL(1) << (vabits)) - 1)
>  #ifndef __ASSEMBLY__
>  
> +extern u64 vabits_user;
> +#define TASK_SIZE_64		(UL(1) << vabits_user)
> +#define USER_DS			_USER_DS(vabits_user)
>  #define DEFAULT_MAP_WINDOW_64	(UL(1) << VA_BITS)

Can we keep USER_DS to TASK_SIZE_64 (static)? I don't see what we gain
by making it dynamic, all we want is that it doesn't overlap with the
KERNEL_DS (similarly we don't have a different USER_DS for 32-bit tasks
here).

> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index e238b7932096..4807fee515b6 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -1006,6 +1006,14 @@ static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap,
>  
>  #endif
>  
> +#ifdef CONFIG_ARM64_TRY_52BIT_VA
> +extern u64 vabits_user;
> +static bool has_52bit_vas(const struct arm64_cpu_capabilities *entry, int __unused)

Nitpick: just s/vas/va/ to match the config option naming.

> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 09dbea221a27..72b4b0b069b9 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -189,7 +189,11 @@ alternative_cb_end
>  	/* Save the task's original addr_limit and set USER_DS */
>  	ldr	x20, [tsk, #TSK_TI_ADDR_LIMIT]
>  	str	x20, [sp, #S_ORIG_ADDR_LIMIT]
> -	mov	x20, #USER_DS
> +alternative_if	ARM64_HAS_52BIT_VA
> +	mov	x20, #_USER_DS(52)
> +alternative_else
> +	mov	x20, #_USER_DS(VA_BITS)
> +alternative_endif

Why is this needed? Can't we just use 52 all the time?

> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
> index b0853069702f..6b7d32990b25 100644
> --- a/arch/arm64/kernel/head.S
> +++ b/arch/arm64/kernel/head.S
> @@ -316,6 +316,19 @@ __create_page_tables:
>  	adrp	x0, idmap_pg_dir
>  	adrp	x3, __idmap_text_start		// __pa(__idmap_text_start)
>  
> +#ifdef CONFIG_ARM64_TRY_52BIT_VA
> +	mrs_s	x6, SYS_ID_AA64MMFR2_EL1
> +	and	x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
> +	mov	x5, #52
> +	cbnz	x6, 1f
> +#endif
> +	mov	x5, #VA_BITS
> +1:
> +	adr_l	x6, vabits_user
> +	str	x5, [x6]
> +	dmb	sy
> +	dc	ivac, x6		// Invalidate potentially stale cache line

I wonder whether we could get away with not setting vabits_user until
has_52bit_va(), especially since it's meant for user space. For idmap,
this matches the PA and while we support 52-bit PA, we've never
supported a 52-bit idmap. Even if we do, I don't think setting
vabits_user early is necessary.

> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> index 03646e6a2ef4..4834bd434143 100644
> --- a/arch/arm64/mm/proc.S
> +++ b/arch/arm64/mm/proc.S
> @@ -441,7 +441,15 @@ ENTRY(__cpu_setup)
>  	ldr	x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
>  			TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
>  			TCR_TBI0 | TCR_A1
> -	tcr_set_idmap_t0sz	x10, x9
> +
> +#ifdef CONFIG_ARM64_TRY_52BIT_VA
> +	ldr_l 		x9, vabits_user
> +	sub		x9, xzr, x9
> +	add		x9, x9, #64
> +#else
> +	ldr_l		x9, idmap_t0sz
> +#endif
> +	tcr_set_t0sz	x10, x9

This is probably sufficient for 52-bit idmap, together with a statically
allocated idmap_pg_dir (which we do anyway, using a full page IIUC).
Steve Capper Sept. 27, 2018, 1:50 p.m. UTC | #2
Hi Catalin,

On Fri, Sep 21, 2018 at 06:40:31PM +0100, Catalin Marinas wrote:
> On Wed, Aug 29, 2018 at 01:45:40PM +0100, Steve Capper wrote:
> > On arm64 there is optional support for a 52-bit virtual address space.
> > To exploit this one has to be running with a 64KB page size and be
> > running on hardware that supports this.
> > 
> > For an arm64 kernel supporting a 48 bit VA with a 64KB page size,
> > a few changes are needed to support a 52-bit userspace:
> >  * TCR_EL1.T0SZ needs to be 12 instead of 16,
> >  * pgd_offset needs to work with a different PTRS_PER_PGD,
> >  * PGD_SIZE needs to be increased,
> >  * TASK_SIZE needs to reflect the new size.
> 
> I will comment on the general approach here. We indeed need TASK_SIZE
> and T0SZ to be variable based on the supported feature. However, I think
> PGD_SIZE can be fixed to cover 52-bit VA all the time, we just need to
> ensure that we allocate a bigger pgd when this option is enabled.
> 
> When the 52-bit VA is not present, everything still works as normal as
> the pgd entries for a 48-bit VA are written at the beginning of the pgd.
> There is no need to change pgd_offset/index etc.
> 

I will tighten up the wording above to indicate which parts are constant
and which are variable depending on 52-bit support.

> > A future patch enables CONFIG_ARM64_TRY_52BIT_VA (which essentially
> > activates this patch) as we need to also change the mmap logic to maintain
> > compatibility with a userspace that expects at most 48-bits of VA.
> 
> And I would just drop the "TRY" part in the above config (but mention it
> in the help text that it is used only if available).
> 

Sure :-).

> > diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
> > index 2e05bcd944c8..7f50804a677e 100644
> > --- a/arch/arm64/include/asm/pgalloc.h
> > +++ b/arch/arm64/include/asm/pgalloc.h
> > @@ -27,7 +27,11 @@
> >  #define check_pgt_cache()		do { } while (0)
> >  
> >  #define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
> > +#ifdef CONFIG_ARM64_TRY_52BIT_VA
> > +#define PGD_SIZE	((1 << (52 - PGDIR_SHIFT)) * sizeof(pgd_t))
> > +#else
> >  #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
> > +#endif
> 
> OK, so you've already made the PGD_SIZE static here (unless changed in
> subsequent patches). 
> 

Yes, PGD_SIZE should be static; the problem is that PTRS_PER_PGD needs to
reflect a 48-bit VA for the kernel (as kernel addresses are "negative"), so
it can't be used to compute PGD_SIZE.
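
To put numbers on that (assuming a 64KB granule, so PGDIR_SHIFT is 42, and an
8-byte pgd_t): the kernel's PTRS_PER_PGD stays sized for 48 bits, while a
52-bit user pgd needs many more entries, which is why PGD_SIZE is derived from
the constant 52. Illustrative arithmetic only:

	#include <stdio.h>

	#define PGDIR_SHIFT	42	/* 64KB granule, three levels of lookup */

	int main(void)
	{
		unsigned long ptrs_48 = 1UL << (48 - PGDIR_SHIFT);	/*   64 entries */
		unsigned long ptrs_52 = 1UL << (52 - PGDIR_SHIFT);	/* 1024 entries */

		printf("48-bit pgd: %4lu entries, %5lu bytes\n", ptrs_48, ptrs_48 * 8);
		printf("52-bit pgd: %4lu entries, %5lu bytes\n", ptrs_52, ptrs_52 * 8);
		return 0;
	}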

> > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > index 1bdeca8918a6..8449e266cd46 100644
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -577,11 +577,21 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
> >  #define pgd_ERROR(pgd)		__pgd_error(__FILE__, __LINE__, pgd_val(pgd))
> >  
> >  /* to find an entry in a page-table-directory */
> > -#define pgd_index(addr)		(((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
> > +#define pgd_index(addr, ptrs)		(((addr) >> PGDIR_SHIFT) & ((ptrs) - 1))
> > +#define _pgd_offset_raw(pgd, addr, ptrs) ((pgd) + pgd_index(addr, ptrs))
> > +#define pgd_offset_raw(pgd, addr)	(_pgd_offset_raw(pgd, addr, PTRS_PER_PGD))
> >  
> > -#define pgd_offset_raw(pgd, addr)	((pgd) + pgd_index(addr))
> > +static inline pgd_t *pgd_offset(const struct mm_struct *mm, unsigned long addr)
> > +{
> > +	pgd_t *ret;
> > +
> > +	if (IS_ENABLED(CONFIG_ARM64_TRY_52BIT_VA) && (addr < TASK_SIZE))
> > +		ret = _pgd_offset_raw(mm->pgd, addr, 1ULL << (vabits_user - PGDIR_SHIFT));
> > +	else
> > +		ret = pgd_offset_raw(mm->pgd, addr);
> >  
> > -#define pgd_offset(mm, addr)	(pgd_offset_raw((mm)->pgd, (addr)))
> > +	return ret;
> > +}
> >  
> >  /* to find an entry in a kernel page-table-directory */
> >  #define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)
> 
> We can decouple pgd_offset_k() from pgd_offset() and there wouldn't be a
> need to check the addr < TASK_SIZE. Do we have any case where
> pgd_offset() is used on a kernel address?
>

Unfortunately there are a few cases where pgd_offset is used instead of
pgd_offset_k, I'll see if I can fix these in a separate patch and that
would then simplify this patch.

> > diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> > index 46c9d9ff028c..ba63b8a8dac1 100644
> > --- a/arch/arm64/include/asm/processor.h
> > +++ b/arch/arm64/include/asm/processor.h
> > @@ -19,13 +19,13 @@
> >  #ifndef __ASM_PROCESSOR_H
> >  #define __ASM_PROCESSOR_H
> >  
> > -#define TASK_SIZE_64		(UL(1) << VA_BITS)
> > -
> > -#define KERNEL_DS	UL(-1)
> > -#define USER_DS		(TASK_SIZE_64 - 1)
> > -
> > +#define KERNEL_DS		UL(-1)
> > +#define _USER_DS(vabits)	((UL(1) << (vabits)) - 1)
> >  #ifndef __ASSEMBLY__
> >  
> > +extern u64 vabits_user;
> > +#define TASK_SIZE_64		(UL(1) << vabits_user)
> > +#define USER_DS			_USER_DS(vabits_user)
> >  #define DEFAULT_MAP_WINDOW_64	(UL(1) << VA_BITS)
> 
> Can we keep USER_DS to TASK_SIZE_64 (static)? I don't see what we gain
> by making it dynamic, all we want is that it doesn't overlap with the
> KERNEL_DS (similarly we don't have a different USER_DS for 32-bit tasks
> here).

Sure thing, I was probably being overly paranoid before.

> 
> > diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> > index e238b7932096..4807fee515b6 100644
> > --- a/arch/arm64/kernel/cpufeature.c
> > +++ b/arch/arm64/kernel/cpufeature.c
> > @@ -1006,6 +1006,14 @@ static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap,
> >  
> >  #endif
> >  
> > +#ifdef CONFIG_ARM64_TRY_52BIT_VA
> > +extern u64 vabits_user;
> > +static bool has_52bit_vas(const struct arm64_cpu_capabilities *entry, int __unused)
> 
> Nitpick: just s/vas/va/ to match the config option naming.
> 

Thanks :-)

> > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > index 09dbea221a27..72b4b0b069b9 100644
> > --- a/arch/arm64/kernel/entry.S
> > +++ b/arch/arm64/kernel/entry.S
> > @@ -189,7 +189,11 @@ alternative_cb_end
> >  	/* Save the task's original addr_limit and set USER_DS */
> >  	ldr	x20, [tsk, #TSK_TI_ADDR_LIMIT]
> >  	str	x20, [sp, #S_ORIG_ADDR_LIMIT]
> > -	mov	x20, #USER_DS
> > +alternative_if	ARM64_HAS_52BIT_VA
> > +	mov	x20, #_USER_DS(52)
> > +alternative_else
> > +	mov	x20, #_USER_DS(VA_BITS)
> > +alternative_endif
> 
> Why is this needed? Can't we just use 52 all the time?
>

This was me trying to keep the limit as low as possible; I can relax
this to a 52-bit constant and it will still be disjoint from the
kernel space.
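
A quick standalone check of that disjointness argument (illustrative only; this
is a simplified limit comparison, not the kernel's access_ok()):

	#include <stdio.h>
	#include <stdint.h>

	#define USER_DS_52	((1UL << 52) - 1)

	/* Simplified access_ok()-style check against a fixed 52-bit limit. */
	static int within_user_limit(uint64_t addr)
	{
		return addr <= USER_DS_52;
	}

	int main(void)
	{
		printf("user addr   0x0000ffffffffffff -> %d\n",
		       within_user_limit(0x0000ffffffffffffUL));	/* 1: allowed */
		printf("kernel addr 0xffff000000000000 -> %d\n",
		       within_user_limit(0xffff000000000000UL));	/* 0: rejected */
		return 0;
	}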

> > diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
> > index b0853069702f..6b7d32990b25 100644
> > --- a/arch/arm64/kernel/head.S
> > +++ b/arch/arm64/kernel/head.S
> > @@ -316,6 +316,19 @@ __create_page_tables:
> >  	adrp	x0, idmap_pg_dir
> >  	adrp	x3, __idmap_text_start		// __pa(__idmap_text_start)
> >  
> > +#ifdef CONFIG_ARM64_TRY_52BIT_VA
> > +	mrs_s	x6, SYS_ID_AA64MMFR2_EL1
> > +	and	x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
> > +	mov	x5, #52
> > +	cbnz	x6, 1f
> > +#endif
> > +	mov	x5, #VA_BITS
> > +1:
> > +	adr_l	x6, vabits_user
> > +	str	x5, [x6]
> > +	dmb	sy
> > +	dc	ivac, x6		// Invalidate potentially stale cache line
> 
> I wonder whether we could get away with not setting vabits_user until
> has_52bit_va(), especially since it's meant for user space. For idmap,
> this matches the PA and while we support 52-bit PA, we've never
> supported a 52-bit idmap. Even if we do, I don't think setting
> vabits_user early is necessary.

I think it makes sense to have 52-bit support detected earlier, as this
will allow T0SZ to be set correctly earlier. I'll see if I can simplify
things.

> 
> > diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> > index 03646e6a2ef4..4834bd434143 100644
> > --- a/arch/arm64/mm/proc.S
> > +++ b/arch/arm64/mm/proc.S
> > @@ -441,7 +441,15 @@ ENTRY(__cpu_setup)
> >  	ldr	x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
> >  			TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
> >  			TCR_TBI0 | TCR_A1
> > -	tcr_set_idmap_t0sz	x10, x9
> > +
> > +#ifdef CONFIG_ARM64_TRY_52BIT_VA
> > +	ldr_l 		x9, vabits_user
> > +	sub		x9, xzr, x9
> > +	add		x9, x9, #64
> > +#else
> > +	ldr_l		x9, idmap_t0sz
> > +#endif
> > +	tcr_set_t0sz	x10, x9
> 
> This is probably sufficient for 52-bit idmap, together with a statically
> allocated idmap_pg_dir (which we do anyway, using a full page IIUC).

That was my thinking too. Also, the EFI map is allocated by the kernel
(thus no "straggler" entries mapped high), so that works with T0SZ==12 as well.

Cheers,
Steve Capper Sept. 27, 2018, 2:48 p.m. UTC | #3
On Thu, Sep 27, 2018 at 02:50:32PM +0100, Steve Capper wrote:
> Hi Catalin,
> 
> On Fri, Sep 21, 2018 at 06:40:31PM +0100, Catalin Marinas wrote:
[...] 
> > > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > > index 1bdeca8918a6..8449e266cd46 100644
> > > --- a/arch/arm64/include/asm/pgtable.h
> > > +++ b/arch/arm64/include/asm/pgtable.h
> > > @@ -577,11 +577,21 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
> > >  #define pgd_ERROR(pgd)		__pgd_error(__FILE__, __LINE__, pgd_val(pgd))
> > >  
> > >  /* to find an entry in a page-table-directory */
> > > -#define pgd_index(addr)		(((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
> > > +#define pgd_index(addr, ptrs)		(((addr) >> PGDIR_SHIFT) & ((ptrs) - 1))
> > > +#define _pgd_offset_raw(pgd, addr, ptrs) ((pgd) + pgd_index(addr, ptrs))
> > > +#define pgd_offset_raw(pgd, addr)	(_pgd_offset_raw(pgd, addr, PTRS_PER_PGD))
> > >  
> > > -#define pgd_offset_raw(pgd, addr)	((pgd) + pgd_index(addr))
> > > +static inline pgd_t *pgd_offset(const struct mm_struct *mm, unsigned long addr)
> > > +{
> > > +	pgd_t *ret;
> > > +
> > > +	if (IS_ENABLED(CONFIG_ARM64_TRY_52BIT_VA) && (addr < TASK_SIZE))
> > > +		ret = _pgd_offset_raw(mm->pgd, addr, 1ULL << (vabits_user - PGDIR_SHIFT));
> > > +	else
> > > +		ret = pgd_offset_raw(mm->pgd, addr);
> > >  
> > > -#define pgd_offset(mm, addr)	(pgd_offset_raw((mm)->pgd, (addr)))
> > > +	return ret;
> > > +}
> > >  
> > >  /* to find an entry in a kernel page-table-directory */
> > >  #define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)
> > 
> > We can decouple pgd_offset_k() from pgd_offset() and there wouldn't be a
> > need to check the addr < TASK_SIZE. Do we have any case where
> > pgd_offset() is used on a kernel address?
> >
> 
> Unfortunately there are a few cases where pgd_offset is used instead of
> pgd_offset_k, I'll see if I can fix these in a separate patch and that
> would then simplify this patch.
> 

So it turns out that __change_memory_common calls apply_to_page_range,
which then calls pgd_offset...

Is it worth changing __change_memory_common, or would it be better to
introduce a check in pgd_offset (can also check the mm parameter)?

Cheers,
Catalin Marinas Oct. 1, 2018, 10:28 a.m. UTC | #4
On Thu, Sep 27, 2018 at 02:48:43PM +0000, Steve Capper wrote:
> On Thu, Sep 27, 2018 at 02:50:32PM +0100, Steve Capper wrote:
> > On Fri, Sep 21, 2018 at 06:40:31PM +0100, Catalin Marinas wrote:
> > > > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > > > index 1bdeca8918a6..8449e266cd46 100644
> > > > --- a/arch/arm64/include/asm/pgtable.h
> > > > +++ b/arch/arm64/include/asm/pgtable.h
> > > > @@ -577,11 +577,21 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
> > > >  #define pgd_ERROR(pgd)		__pgd_error(__FILE__, __LINE__, pgd_val(pgd))
> > > >  
> > > >  /* to find an entry in a page-table-directory */
> > > > -#define pgd_index(addr)		(((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
> > > > +#define pgd_index(addr, ptrs)		(((addr) >> PGDIR_SHIFT) & ((ptrs) - 1))
> > > > +#define _pgd_offset_raw(pgd, addr, ptrs) ((pgd) + pgd_index(addr, ptrs))
> > > > +#define pgd_offset_raw(pgd, addr)	(_pgd_offset_raw(pgd, addr, PTRS_PER_PGD))
> > > >  
> > > > -#define pgd_offset_raw(pgd, addr)	((pgd) + pgd_index(addr))
> > > > +static inline pgd_t *pgd_offset(const struct mm_struct *mm, unsigned long addr)
> > > > +{
> > > > +	pgd_t *ret;
> > > > +
> > > > +	if (IS_ENABLED(CONFIG_ARM64_TRY_52BIT_VA) && (addr < TASK_SIZE))
> > > > +		ret = _pgd_offset_raw(mm->pgd, addr, 1ULL << (vabits_user - PGDIR_SHIFT));
> > > > +	else
> > > > +		ret = pgd_offset_raw(mm->pgd, addr);
> > > >  
> > > > -#define pgd_offset(mm, addr)	(pgd_offset_raw((mm)->pgd, (addr)))
> > > > +	return ret;
> > > > +}
> > > >  
> > > >  /* to find an entry in a kernel page-table-directory */
> > > >  #define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)
> > > 
> > > We can decouple pgd_offset_k() from pgd_offset() and there wouldn't be a
> > > need to check the addr < TASK_SIZE. Do we have any case where
> > > pgd_offset() is used on a kernel address?
> > 
> > Unfortunately there are a few cases where pgd_offset is used instead of
> > pgd_offset_k, I'll see if I can fix these in a separate patch and that
> > would then simplify this patch.
> 
> So it turns out that __change_memory_common, calls apply_to_page_range
> which then calls pgd_offset...

Ah, James Morse had a plan to change apply_to_page_range() for other
reasons but I'm not sure whether that would have helped.

> Is it worth changing __change_memory_common, or would it be better to
> introduce a check in pgd_offset (can also check the mm parameter)?

The problem here doesn't seem to be __change_memory_common() but rather
apply_to_page_range(). Since there are other users of this API (e.g.
alloc_vm_area()), I think we should just change pgd_offset(). Would a
check on the (mm == &init_mm) be sufficient? Otherwise, we could do a
test on bit 55 of the address.
Steve Capper Oct. 1, 2018, 10:50 a.m. UTC | #5
On Mon, Oct 01, 2018 at 11:28:17AM +0100, Catalin Marinas wrote:
> On Thu, Sep 27, 2018 at 02:48:43PM +0000, Steve Capper wrote:
> > On Thu, Sep 27, 2018 at 02:50:32PM +0100, Steve Capper wrote:
> > > On Fri, Sep 21, 2018 at 06:40:31PM +0100, Catalin Marinas wrote:
> > > > > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > > > > index 1bdeca8918a6..8449e266cd46 100644
> > > > > --- a/arch/arm64/include/asm/pgtable.h
> > > > > +++ b/arch/arm64/include/asm/pgtable.h
> > > > > @@ -577,11 +577,21 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
> > > > >  #define pgd_ERROR(pgd)		__pgd_error(__FILE__, __LINE__, pgd_val(pgd))
> > > > >  
> > > > >  /* to find an entry in a page-table-directory */
> > > > > -#define pgd_index(addr)		(((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
> > > > > +#define pgd_index(addr, ptrs)		(((addr) >> PGDIR_SHIFT) & ((ptrs) - 1))
> > > > > +#define _pgd_offset_raw(pgd, addr, ptrs) ((pgd) + pgd_index(addr, ptrs))
> > > > > +#define pgd_offset_raw(pgd, addr)	(_pgd_offset_raw(pgd, addr, PTRS_PER_PGD))
> > > > >  
> > > > > -#define pgd_offset_raw(pgd, addr)	((pgd) + pgd_index(addr))
> > > > > +static inline pgd_t *pgd_offset(const struct mm_struct *mm, unsigned long addr)
> > > > > +{
> > > > > +	pgd_t *ret;
> > > > > +
> > > > > +	if (IS_ENABLED(CONFIG_ARM64_TRY_52BIT_VA) && (addr < TASK_SIZE))
> > > > > +		ret = _pgd_offset_raw(mm->pgd, addr, 1ULL << (vabits_user - PGDIR_SHIFT));
> > > > > +	else
> > > > > +		ret = pgd_offset_raw(mm->pgd, addr);
> > > > >  
> > > > > -#define pgd_offset(mm, addr)	(pgd_offset_raw((mm)->pgd, (addr)))
> > > > > +	return ret;
> > > > > +}
> > > > >  
> > > > >  /* to find an entry in a kernel page-table-directory */
> > > > >  #define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)
> > > > 
> > > > We can decouple pgd_offset_k() from pgd_offset() and there wouldn't be a
> > > > need to check the addr < TASK_SIZE. Do we have any case where
> > > > pgd_offset() is used on a kernel address?
> > > 
> > > Unfortunately there are a few cases where pgd_offset is used instead of
> > > pgd_offset_k, I'll see if I can fix these in a separate patch and that
> > > would then simplify this patch.
> > 
> > So it turns out that __change_memory_common, calls apply_to_page_range
> > which then calls pgd_offset...
> 
> Ah, James Morse had a plan to change apply_to_page_range() for other
> reasons but I'm not sure whether that would have helped.
> 
> > Is it worth changing __change_memory_common, or would it be better to
> > introduce a check in pgd_offset (can also check the mm parameter)?
> 
> The problem here doesn't seem to be __change_memory_common() but rather
> apply_to_page_range(). Since there are other users of this API (e.g.
> alloc_vm_area()), I think we should just change pgd_offset(). Would a
> check on the (mm == &init_mm) be sufficient? Otherwise, we could do a
> test on bit 55 of the address.
>

Ahh okay, yes I can change pgd_offset to check for init_mm.
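
Something along these lines, perhaps (a sketch only; the config name follows
the earlier suggestion to drop the "TRY" and the final patch may differ):

	static inline pgd_t *pgd_offset(const struct mm_struct *mm, unsigned long addr)
	{
		/* Kernel lookups (init_mm) keep the fixed PTRS_PER_PGD-sized index. */
		if (IS_ENABLED(CONFIG_ARM64_52BIT_VA) && mm != &init_mm)
			return _pgd_offset_raw(mm->pgd, addr,
					       1ULL << (vabits_user - PGDIR_SHIFT));

		return pgd_offset_raw(mm->pgd, addr);
	}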

Cheers,

Patch

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 0bcc98dbba56..8c8ed20beca9 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -343,11 +343,10 @@  alternative_endif
 	.endm
 
 /*
- * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map
+ * tcr_set_t0sz - update TCR.T0SZ so that we can load the ID map
  */
-	.macro	tcr_set_idmap_t0sz, valreg, tmpreg
-	ldr_l	\tmpreg, idmap_t0sz
-	bfi	\valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH
+	.macro	tcr_set_t0sz, valreg, t0sz
+	bfi	\valreg, \t0sz, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH
 	.endm
 
 /*
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index ae1f70450fb2..0cf286307b79 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -51,7 +51,8 @@ 
 #define ARM64_SSBD				30
 #define ARM64_MISMATCHED_CACHE_TYPE		31
 #define ARM64_HAS_STAGE2_FWB			32
+#define ARM64_HAS_52BIT_VA			33
 
-#define ARM64_NCAPS				33
+#define ARM64_NCAPS				34
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 39ec0b8a689e..70598750a21e 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -72,6 +72,9 @@  extern u64 idmap_ptrs_per_pgd;
 
 static inline bool __cpu_uses_extended_idmap(void)
 {
+	if (IS_ENABLED(CONFIG_ARM64_TRY_52BIT_VA))
+		return false;
+
 	return unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS));
 }
 
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 2e05bcd944c8..7f50804a677e 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -27,7 +27,11 @@ 
 #define check_pgt_cache()		do { } while (0)
 
 #define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
+#ifdef CONFIG_ARM64_TRY_52BIT_VA
+#define PGD_SIZE	((1 << (52 - PGDIR_SHIFT)) * sizeof(pgd_t))
+#else
 #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
+#endif
 
 #if CONFIG_PGTABLE_LEVELS > 2
 
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 1bdeca8918a6..8449e266cd46 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -577,11 +577,21 @@  static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
 #define pgd_ERROR(pgd)		__pgd_error(__FILE__, __LINE__, pgd_val(pgd))
 
 /* to find an entry in a page-table-directory */
-#define pgd_index(addr)		(((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+#define pgd_index(addr, ptrs)		(((addr) >> PGDIR_SHIFT) & ((ptrs) - 1))
+#define _pgd_offset_raw(pgd, addr, ptrs) ((pgd) + pgd_index(addr, ptrs))
+#define pgd_offset_raw(pgd, addr)	(_pgd_offset_raw(pgd, addr, PTRS_PER_PGD))
 
-#define pgd_offset_raw(pgd, addr)	((pgd) + pgd_index(addr))
+static inline pgd_t *pgd_offset(const struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *ret;
+
+	if (IS_ENABLED(CONFIG_ARM64_TRY_52BIT_VA) && (addr < TASK_SIZE))
+		ret = _pgd_offset_raw(mm->pgd, addr, 1ULL << (vabits_user - PGDIR_SHIFT));
+	else
+		ret = pgd_offset_raw(mm->pgd, addr);
 
-#define pgd_offset(mm, addr)	(pgd_offset_raw((mm)->pgd, (addr)))
+	return ret;
+}
 
 /* to find an entry in a kernel page-table-directory */
 #define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 46c9d9ff028c..ba63b8a8dac1 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -19,13 +19,13 @@ 
 #ifndef __ASM_PROCESSOR_H
 #define __ASM_PROCESSOR_H
 
-#define TASK_SIZE_64		(UL(1) << VA_BITS)
-
-#define KERNEL_DS	UL(-1)
-#define USER_DS		(TASK_SIZE_64 - 1)
-
+#define KERNEL_DS		UL(-1)
+#define _USER_DS(vabits)	((UL(1) << (vabits)) - 1)
 #ifndef __ASSEMBLY__
 
+extern u64 vabits_user;
+#define TASK_SIZE_64		(UL(1) << vabits_user)
+#define USER_DS			_USER_DS(vabits_user)
 #define DEFAULT_MAP_WINDOW_64	(UL(1) << VA_BITS)
 
 /*
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e238b7932096..4807fee515b6 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1006,6 +1006,14 @@  static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap,
 
 #endif
 
+#ifdef CONFIG_ARM64_TRY_52BIT_VA
+extern u64 vabits_user;
+static bool has_52bit_vas(const struct arm64_cpu_capabilities *entry, int __unused)
+{
+	return vabits_user == 52;
+}
+#endif
+
 #ifdef CONFIG_ARM64_VHE
 static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused)
 {
@@ -1222,6 +1230,14 @@  static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_hw_dbm,
 	},
 #endif
+#ifdef CONFIG_ARM64_TRY_52BIT_VA
+	{
+		.desc = "52-bit Virtual Addresses",
+		.capability = ARM64_HAS_52BIT_VA,
+		.type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
+		.matches = has_52bit_vas,
+	},
+#endif /* CONFIG_ARM64_TRY_52BIT_VA */
 	{},
 };
 
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 09dbea221a27..72b4b0b069b9 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -189,7 +189,11 @@  alternative_cb_end
 	/* Save the task's original addr_limit and set USER_DS */
 	ldr	x20, [tsk, #TSK_TI_ADDR_LIMIT]
 	str	x20, [sp, #S_ORIG_ADDR_LIMIT]
-	mov	x20, #USER_DS
+alternative_if	ARM64_HAS_52BIT_VA
+	mov	x20, #_USER_DS(52)
+alternative_else
+	mov	x20, #_USER_DS(VA_BITS)
+alternative_endif
 	str	x20, [tsk, #TSK_TI_ADDR_LIMIT]
 	/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
 	.endif /* \el == 0 */
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index b0853069702f..6b7d32990b25 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -316,6 +316,19 @@  __create_page_tables:
 	adrp	x0, idmap_pg_dir
 	adrp	x3, __idmap_text_start		// __pa(__idmap_text_start)
 
+#ifdef CONFIG_ARM64_TRY_52BIT_VA
+	mrs_s	x6, SYS_ID_AA64MMFR2_EL1
+	and	x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
+	mov	x5, #52
+	cbnz	x6, 1f
+#endif
+	mov	x5, #VA_BITS
+1:
+	adr_l	x6, vabits_user
+	str	x5, [x6]
+	dmb	sy
+	dc	ivac, x6		// Invalidate potentially stale cache line
+
 	/*
 	 * VA_BITS may be too small to allow for an ID mapping to be created
 	 * that covers system RAM if that is located sufficiently high in the
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 50b30ff30de4..4a9b86152b9a 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -153,7 +153,7 @@  void show_pte(unsigned long addr)
 
 	pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgdp = %p\n",
 		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
-		 VA_BITS, mm->pgd);
+		 (addr >= TASK_SIZE) ? VA_BITS : (int) vabits_user, mm->pgd);
 	pgdp = pgd_offset(mm, addr);
 	pgd = READ_ONCE(*pgdp);
 	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 65f86271f02b..f5e37705b399 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -52,6 +52,7 @@ 
 
 u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
 u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
+u64 vabits_user __ro_after_init;
 
 u64 kimage_voffset __ro_after_init;
 EXPORT_SYMBOL(kimage_voffset);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 03646e6a2ef4..4834bd434143 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -441,7 +441,15 @@  ENTRY(__cpu_setup)
 	ldr	x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
 			TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
 			TCR_TBI0 | TCR_A1
-	tcr_set_idmap_t0sz	x10, x9
+
+#ifdef CONFIG_ARM64_TRY_52BIT_VA
+	ldr_l 		x9, vabits_user
+	sub		x9, xzr, x9
+	add		x9, x9, #64
+#else
+	ldr_l		x9, idmap_t0sz
+#endif
+	tcr_set_t0sz	x10, x9
 
 	/*
 	 * Set the IPS bits in TCR_EL1.