[v11,10/40] arm64/sme: Basic enumeration support

Message ID: 20220207152109.197566-11-broonie@kernel.org
State: New
Series: arm64/sme: Initial support for the Scalable Matrix Extension

Commit Message

Mark Brown Feb. 7, 2022, 3:20 p.m. UTC
Add basic cpufeature support for discovering the presence of the
Scalable Matrix Extension.

Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/arm64/elf_hwcaps.rst  | 33 +++++++++++++++++
 arch/arm64/include/asm/cpu.h        |  1 +
 arch/arm64/include/asm/cpufeature.h | 12 ++++++
 arch/arm64/include/asm/fpsimd.h     |  2 +
 arch/arm64/include/asm/hwcap.h      |  8 ++++
 arch/arm64/include/uapi/asm/hwcap.h |  8 ++++
 arch/arm64/kernel/cpufeature.c      | 57 +++++++++++++++++++++++++++++
 arch/arm64/kernel/cpuinfo.c         |  9 +++++
 arch/arm64/kernel/fpsimd.c          | 30 +++++++++++++++
 arch/arm64/tools/cpucaps            |  2 +
 10 files changed, 162 insertions(+)

Comments

Catalin Marinas Feb. 21, 2022, 2:32 p.m. UTC | #1
On Mon, Feb 07, 2022 at 03:20:39PM +0000, Mark Brown wrote:
> diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
> index b72ff17d600a..5626cf208000 100644
> --- a/Documentation/arm64/elf_hwcaps.rst
> +++ b/Documentation/arm64/elf_hwcaps.rst
> @@ -259,6 +259,39 @@ HWCAP2_RPRES
>  
>      Functionality implied by ID_AA64ISAR2_EL1.RPRES == 0b0001.
>  
> +HWCAP2_SME
> +
> +    Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
> +    by Documentation/arm64/sme.rst.
> +
> +HWCAP2_SME_I16I64
> +
> +    Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
> +
> +HWCAP2_SME_F64F64
> +
> +    Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
> +
> +HWCAP2_SME_I8I32
> +
> +    Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
> +
> +HWCAP2_SME_F16F32
> +
> +    Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
> +
> +HWCAP2_SME_B16F32
> +
> +    Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
> +
> +HWCAP2_SME_F32F32
> +
> +    Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
> +
> +HWCAP2_SME_FA64
> +
> +    Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.

More of a question for the libc people: should we drop the fine-grained
HWCAPs corresponding to the new ID_AA64SMFR0_EL1 register (only keep
HWCAP2_SME) and get user space to use the MRS emulation? Would any
ifunc resolver be affected?
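
(For illustration, the coarse check looks the same from an ifunc
resolver or from ordinary code - the bit value is the one proposed by
this patch, and have_sme() is a made-up helper:)

#include <stdbool.h>
#include <sys/auxv.h>

#ifndef HWCAP2_SME
#define HWCAP2_SME	(1 << 22)	/* bit proposed by this patch */
#endif

/* Runtime dispatch on the coarse bit; the fine-grained HWCAP2_SME_*
 * bits would be tested in exactly the same way. */
static bool have_sme(void)
{
	return getauxval(AT_HWCAP2) & HWCAP2_SME;
}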

> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 64a748c2b351..2634e32bbfb9 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -251,6 +251,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
>  };
>  
>  static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
> +	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SME_SHIFT, 4, 0),

Shouldn't this field also be visible (if SME is enabled)?
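
(A sketch of the visibility-gated form, mirroring the ID_AA64SMFR0_EL1
entries this patch adds further down - shown for illustration, not
necessarily the final fix:)

ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
	       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SME_SHIFT, 4, 0),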

> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index 5280e098cfb5..576490be3c2b 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -987,6 +987,32 @@ void fpsimd_release_task(struct task_struct *dead_task)
>  
>  #endif /* CONFIG_ARM64_SVE */
>  
> +#ifdef CONFIG_ARM64_SME
> +
> +void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
> +{
> +	/* Set priority for all PEs to architecturally defined minimum */
> +	write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK,
> +		       SYS_SMPRI_EL1);
> +
> +	/* Allow SME in kernel */
> +	write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1);
> +	isb();
> +}
> +
> +/*
> + * This must be called after sme_kernel_enable(), we rely on the
> + * feature table being sorted to ensure this.
> + */
> +void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
> +{
> +	/* Allow use of FA64 */
> +	write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK,
> +		       SYS_SMCR_EL1);
> +}
> +
> > +#endif /* CONFIG_ARM64_SME */

I think instead of worrying about the order, we could check the
sanitised register value in sme_kernel_enable() and set the FA64 bit.
Also, to me 'fa64_kernel_enable' somehow implies that the kernel cares
about FA64 for itself, but AFAICT we never run the kernel in streaming
mode.

Mark Brown Feb. 21, 2022, 3:01 p.m. UTC | #2
On Mon, Feb 21, 2022 at 02:32:38PM +0000, Catalin Marinas wrote:
> On Mon, Feb 07, 2022 at 03:20:39PM +0000, Mark Brown wrote:

> >  static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
> > +	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SME_SHIFT, 4, 0),

> Shouldn't this field also be visible (if SME is enabled)?

Yes.

> > +/*
> > + * This must be called after sme_kernel_enable(), we rely on the
> > + * feature table being sorted to ensure this.
> > + */
> > +void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
> > +{
> > +	/* Allow use of FA64 */
> > +	write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK,
> > +		       SYS_SMCR_EL1);
> > +}
> > +
> > +#endif /* CONFIG_ARM64_SME */

> I think instead of worrying about the order, we could check the
> sanitised register value in sme_kernel_enable() and set the FA64 bit.

There's going to be an ordering/clarity issue whichever way round we do
it - the FA64 feature bit is in a different feature register to the main
SME feature bitfield, and it's not as abundantly clear as might be ideal
that it will have been sanitised by the time we're getting callbacks for
the main SME feature; there are an awful lot of sharp edges in this code.
Having things this way round felt more idiomatic to me.

> Also, to me 'fa64_kernel_enable' somehow implies that the kernel cares
> about FA64 for itself, but AFAICT we never run the kernel in streaming
> mode.

We do run the kernel in streaming mode - entering the kernel through a
syscall or preemption will not change the streaming mode state, and we
need to be in streaming mode in order to save or restore the register
state for streaming mode.  In particular we need FA64 enabled at EL1 in
order to context switch FFR when in streaming mode; without it we'll
generate an exception when we execute the rdffr or wrffr instructions.
We don't do any real floating point work in streaming mode, but we
absolutely need to run in streaming mode, and we only exit streaming
mode when restoring a context where it is disabled, when using floating
point in the kernel or when idling the CPU.
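
A hedged sketch of what that implies for the save path (the
save_z_and_p_regs() and save_ffr_state() helpers are hypothetical
stand-ins for the real save code, while system_supports_fa64() is the
helper this patch adds):

#include <stdbool.h>

bool system_supports_fa64(void);	/* added by this patch */
void save_z_and_p_regs(void);		/* hypothetical stand-in */
void save_ffr_state(void);		/* hypothetical: executes rdffr */

/*
 * Illustrative only: FFR is architecturally present in streaming mode
 * only when FA64 is implemented, so the context switch has to skip it
 * otherwise or rdffr/wrffr will trap.
 */
static void task_fpsimd_save_sketch(bool streaming_mode)
{
	bool save_ffr = !streaming_mode || system_supports_fa64();

	save_z_and_p_regs();
	if (save_ffr)
		save_ffr_state();
}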

Szabolcs Nagy Feb. 21, 2022, 4:07 p.m. UTC | #3
The 02/21/2022 14:32, Catalin Marinas wrote:
> On Mon, Feb 07, 2022 at 03:20:39PM +0000, Mark Brown wrote:
> > [...]
> 
> More of a question for the libc people: should we drop the fine-grained
> HWCAP corresponding to the new ID_AA64SMFR0_EL1 register (only keep
> HWCAP2_SME) and get the user space to use the MRS emulation? Would any
> ifunc resolver be affected?
> 

good question.

within glibc HWCAP2_SME is enough (to decide if we need to
deal with additional register state and the lazy ZA save
scheme) but i guess user code that actually uses sme would
need the details (including in ifunc resolvers in principle).

since we have mrs, there is no strict need for hwcaps.
if ifunc resolvers using this info are not widespread then
the mrs emulation overhead is acceptable, but i suspect
hwcaps are nicer to use.
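
(for reference, a sketch of the mrs route - the kernel traps ID
register reads from EL0 and returns the sanitised value; S3_0_C0_C4_5
is the encoding of ID_AA64SMFR0_EL1 for assemblers that don't know the
name yet, and the FA64 bit position follows this patch:)

#include <stdint.h>

/* read ID_AA64SMFR0_EL1 from user space via the kernel's MRS emulation */
static inline uint64_t read_id_aa64smfr0(void)
{
	uint64_t val;

	asm("mrs %0, S3_0_C0_C4_5" : "=r" (val));	/* ID_AA64SMFR0_EL1 */
	return val;
}

#define SMFR0_FA64(v)	(((v) >> 63) & 0x1)	/* FA64 is bit 63 */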

do we have a plan after hwcap2 bits run out? :)

Catalin Marinas Feb. 21, 2022, 7:04 p.m. UTC | #4
On Mon, Feb 21, 2022 at 04:07:06PM +0000, Szabolcs Nagy wrote:
> The 02/21/2022 14:32, Catalin Marinas wrote:
> > On Mon, Feb 07, 2022 at 03:20:39PM +0000, Mark Brown wrote:
> > > [...]
> > 
> > More of a question for the libc people: should we drop the fine-grained
> > HWCAP corresponding to the new ID_AA64SMFR0_EL1 register (only keep
> > HWCAP2_SME) and get the user space to use the MRS emulation? Would any
> > ifunc resolver be affected?
> 
> good question.
> 
> within glibc HWCAP2_SME is enough (to decide if we need to
> deal with additional register state and the lazy ZA save
> scheme) but i guess user code that actually uses sme would
> need the details (including in ifunc resolvers in principle).
> 
> since we have mrs, there is no strict need for hwcaps.
> if ifunc resolvers using this info are not widespread then
> the mrs emulation overhead is acceptable, but i suspect
> hwcaps are nicer to use.

I presume the ifunc resolvers only run once, so the overhead won't be
noticed. Anyway, happy to keep the new HWCAP2 if they are useful.

> do we have a plan after hwcap2 bits run out? :)

HWCAP3, or we free up the top 32 bits in both the HWCAP and HWCAP2
ranges. We did not extend into those upper bits because of the ILP32
discussions at the time.

Catalin Marinas Feb. 21, 2022, 7:24 p.m. UTC | #5
On Mon, Feb 21, 2022 at 03:01:03PM +0000, Mark Brown wrote:
> On Mon, Feb 21, 2022 at 02:32:38PM +0000, Catalin Marinas wrote:
> > On Mon, Feb 07, 2022 at 03:20:39PM +0000, Mark Brown wrote:
> > > [...]
> 
> > I think instead of worrying about the order, we could check the
> > sanitised register value in sme_kernel_enable() and set the FA64 bit.
> 
> There's going to be an ordering/clarity issue whichever way round we do
> it - the FA64 feature bit is in a different feature register to the main
> SME feature bitfield, and it's not as abundantly clear as might be ideal
> that it will have been sanitised by the time we're getting callbacks for
> the main SME feature; there are an awful lot of sharp edges in this code.
> Having things this way round felt more idiomatic to me.

You may want to add a comment in the cpu_feature[] array that it should
be placed after SME.

> > Also, to me 'fa64_kernel_enable' somehow implies that the kernel cares
> > about FA64 for itself, but AFAICT we never run the kernel in streaming
> > mode.
> 
> We do run the kernel in streaming mode - entering the kernel through a
> syscall or preemption will not change the streaming mode state, and we
> need to be in streaming mode in order to save or restore the register
> state for streaming mode.  In particular we need FA64 enabled at EL1 in
> order to context switch FFR when in streaming mode; without it we'll
> generate an exception when we execute the rdffr or wrffr instructions.
> We don't do any real floating point work in streaming mode, but we
> absolutely need to run in streaming mode, and we only exit streaming
> mode when restoring a context where it is disabled, when using floating
> point in the kernel or when idling the CPU.

So, IIUC, for Linux it is mandatory that FEAT_SME_FA64 is supported,
otherwise we won't be able to enable SME. Does the architecture say
this feature is optional? Which A64 instructions are not available if
FA64 is disabled? I hope it's only the SVE ones, but I thought we could
still do loads/stores of the state even with FA64 disabled.

Anyway, if we can't even context switch without FA64 while in streaming
mode, I think we should move the check into the main SME .matches
function and enable it in sme_kernel_enable(); no need for an additional
feature.

I think we should also update booting.rst to require that FA64 is
enabled at EL2 and EL3.

Mark Brown Feb. 21, 2022, 11:10 p.m. UTC | #6
On Mon, Feb 21, 2022 at 07:24:59PM +0000, Catalin Marinas wrote:
> On Mon, Feb 21, 2022 at 03:01:03PM +0000, Mark Brown wrote:

> > [...]

> You may want to add a comment in the cpu_feature[] array that it should
> be placed after SME.

Sure.

> > [...]

> So, IIUC, for Linux it is mandatory that FEAT_SME_FA64 is supported,
> otherwise we won't be able to enable SME. Does the architecture say

The feature is not mandatory and we do not require it for Linux.  It is
expected that many implementations will choose not to support FA64.

The only impact it has on the kernel is that if it's present then we
need to enable it for each EL and then context switch FFR in streaming
mode; the code is there to do that conditionally already.  We'd also
have to take it into account if we were to run streaming mode algorithms
in the kernel, but if we ever do so that's just an additional feature
check when choosing to run such code.

> this feature is optional? Which A64 instructions are not available if
> FA64 is disabled? I hope it's only the SVE ones, but I thought we could
> still do loads/stores of the state even with FA64 disabled.

There's a rather large subset, mostly FPSIMD and some SVE instructions
(including those for accessing FFR, which is why we don't need to
context switch it in streaming mode without FA64); you can see the full
list in appendix F1 of the SME specification.

This is actually a bit awkward for not disabling streaming mode when we
do a syscall, since the disabled instructions include the FPSIMD 'mov
vector, vector' instruction which we currently use to zero the high bits
of the Z registers.  That issue goes away if the optimisations I've got
for relaxed flushing of the non-shared SVE state that we discussed in
relation to syscall-abi get merged, though it'd still be there if we add
a sysctl to force flushing.  This is a solvable problem though, even if
we have to use a less efficient sequence to flush in streaming mode.
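
An illustrative sketch of the two sequences (not the series' code; the
second form assumes the SVE MOV/DUP (immediate) alias remains legal in
streaming mode, and needs SVE enabled in the compiler):

/* Current approach: the FPSIMD alias zeroes z0 bits [VL-1:128] but is
 * UNDEFINED in streaming mode when FA64 is not enabled. */
static inline void flush_z0_fpsimd(void)
{
	asm volatile("mov v0.16b, v0.16b" ::: "v0");
}

/* Assumed streaming-compatible alternative using an SVE move. */
static inline void flush_z0_streaming_safe(void)
{
	asm volatile("mov z0.s, #0" ::: "v0");	/* z0 aliases v0 */
}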

> Anyway, if we can't even context switch without FA64 while in streaming
> mode, I think we should move the check into the main SME .matches
> function and enable it in sme_kernel_enable(); no need for an additional
> feature.

Given that it's optional and we need to check for it at runtime in order
to context switch, it seems sensible to use the cpufeature
infrastructure for the detection.

> I think we should also update booting.rst to require that FA64 is
> enabled at EL2 and EL3.

That's there already since d198c77b7fab13d4 ("arm64: Document boot
requirements for FEAT_SME_FA64").

Catalin Marinas Feb. 22, 2022, 12:09 p.m. UTC | #7
On Mon, Feb 21, 2022 at 11:10:34PM +0000, Mark Brown wrote:
> On Mon, Feb 21, 2022 at 07:24:59PM +0000, Catalin Marinas wrote:
> > On Mon, Feb 21, 2022 at 03:01:03PM +0000, Mark Brown wrote:
> > > [...]
> 
> > So, IIUC, for Linux it is mandatory that FEAT_SME_FA64 is supported,
> > otherwise we won't be able to enable SME. Does the architecture say
> 
> The feature is not mandatory and we do not require it for Linux.  It is
> expected that many implementations will choose not to support FA64.
>
> The only impact it has on the kernel is that if it's present then we
> need to enable it for each EL and then context switch FFR in streaming
> mode; the code is there to do that conditionally already.

OK, I get it. So FFR is only present if FA64 is supported.

> This is actually a bit awkward for not disabling streaming mode when we
> do a syscall, since the disabled instructions include the FPSIMD 'mov
> vector, vector' instruction which we currently use to zero the high bits
> of the Z registers.  That issue goes away if the optimisations I've got
> for relaxed flushing of the non-shared SVE state that we discussed in
> relation to syscall-abi get merged, though it'd still be there if we add
> a sysctl to force flushing.  This is a solvable problem though, even if
> we have to use a less efficient sequence to flush in streaming mode.

I guess the simplest is to just disable streaming mode on syscall. The C
library would mark the syscall wrappers as not streaming compatible, so
whoever is calling them might disable SM anyway.
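
In ACLE terms this would look roughly like the sketch below; the
__arm_streaming keyword spelling follows the later SME C language
extensions and is an assumption here:

/* Syscall wrappers stay non-streaming by default, so a call from
 * streaming code makes the compiler stop/restart streaming mode. */
long write_wrapper(int fd, const void *buf, unsigned long n);

void stream_worker(int fd) __arm_streaming
{
	/* compiler emits SMSTOP SM / SMSTART SM around this call */
	write_wrapper(fd, "x", 1);
}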

So I think your original proposal in the ABI doc is fine (I just need
the libc people to confirm ;)).

Patch

diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index b72ff17d600a..5626cf208000 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -259,6 +259,39 @@  HWCAP2_RPRES
 
     Functionality implied by ID_AA64ISAR2_EL1.RPRES == 0b0001.
 
+HWCAP2_SME
+
+    Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
+    by Documentation/arm64/sme.rst.
+
+HWCAP2_SME_I16I64
+
+    Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
+
+HWCAP2_SME_F64F64
+
+    Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
+
+HWCAP2_SME_I8I32
+
+    Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
+
+HWCAP2_SME_F16F32
+
+    Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
+
+HWCAP2_SME_B16F32
+
+    Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
+
+HWCAP2_SME_F32F32
+
+    Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
+
+HWCAP2_SME_FA64
+
+    Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index a58e366f0b07..d08062bcb9c1 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -58,6 +58,7 @@  struct cpuinfo_arm64 {
 	u64		reg_id_aa64pfr0;
 	u64		reg_id_aa64pfr1;
 	u64		reg_id_aa64zfr0;
+	u64		reg_id_aa64smfr0;
 
 	struct cpuinfo_32bit	aarch32;
 
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 2728abd9cae4..f93b1442143f 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -728,6 +728,18 @@  static __always_inline bool system_supports_sve(void)
 		cpus_have_const_cap(ARM64_SVE);
 }
 
+static __always_inline bool system_supports_sme(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_SME) &&
+		cpus_have_const_cap(ARM64_SME);
+}
+
+static __always_inline bool system_supports_fa64(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_SME) &&
+		cpus_have_const_cap(ARM64_SME_FA64);
+}
+
 static __always_inline bool system_supports_cnp(void)
 {
 	return IS_ENABLED(CONFIG_ARM64_CNP) &&
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index c90f7f99a768..6b7eb6f2cecd 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -74,6 +74,8 @@  extern void sve_set_vq(unsigned long vq_minus_1);
 
 struct arm64_cpu_capabilities;
 extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused);
+extern void sme_kernel_enable(const struct arm64_cpu_capabilities *__unused);
+extern void fa64_kernel_enable(const struct arm64_cpu_capabilities *__unused);
 
 extern u64 read_zcr_features(void);
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index f68fbb207473..76d9999527c5 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -108,6 +108,14 @@ 
 #define KERNEL_HWCAP_ECV		__khwcap2_feature(ECV)
 #define KERNEL_HWCAP_AFP		__khwcap2_feature(AFP)
 #define KERNEL_HWCAP_RPRES		__khwcap2_feature(RPRES)
+#define KERNEL_HWCAP_SME		__khwcap2_feature(SME)
+#define KERNEL_HWCAP_SME_I16I64		__khwcap2_feature(SME_I16I64)
+#define KERNEL_HWCAP_SME_F64F64		__khwcap2_feature(SME_F64F64)
+#define KERNEL_HWCAP_SME_I8I32		__khwcap2_feature(SME_I8I32)
+#define KERNEL_HWCAP_SME_F16F32		__khwcap2_feature(SME_F16F32)
+#define KERNEL_HWCAP_SME_B16F32		__khwcap2_feature(SME_B16F32)
+#define KERNEL_HWCAP_SME_F32F32		__khwcap2_feature(SME_F32F32)
+#define KERNEL_HWCAP_SME_FA64		__khwcap2_feature(SME_FA64)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index f03731847d9d..60de5626f8fb 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -78,5 +78,13 @@ 
 #define HWCAP2_ECV		(1 << 19)
 #define HWCAP2_AFP		(1 << 20)
 #define HWCAP2_RPRES		(1 << 21)
+#define HWCAP2_SME		(1 << 22)
+#define HWCAP2_SME_I16I64	(1 << 23)
+#define HWCAP2_SME_F64F64	(1 << 24)
+#define HWCAP2_SME_I8I32	(1 << 25)
+#define HWCAP2_SME_F16F32	(1 << 26)
+#define HWCAP2_SME_B16F32	(1 << 27)
+#define HWCAP2_SME_F32F32	(1 << 28)
+#define HWCAP2_SME_FA64		(1 << 29)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 64a748c2b351..2634e32bbfb9 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -251,6 +251,7 @@  static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SME_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE),
@@ -283,6 +284,24 @@  static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
 	ARM64_FTR_END,
 };
 
+static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_FA64_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I16I64_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F64F64_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I8I32_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F16F32_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_B16F32_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F32F32_SHIFT, 1, 0),
+	ARM64_FTR_END,
+};
+
 static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0),
@@ -634,6 +653,7 @@  static const struct __ftr_reg_entry {
 	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1,
 			       &id_aa64pfr1_override),
 	ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0),
+	ARM64_FTR_REG(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0),
 
 	/* Op1 = 0, CRn = 0, CRm = 5 */
 	ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0),
@@ -947,6 +967,7 @@  void __init init_cpu_features(struct cpuinfo_arm64 *info)
 	init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
 	init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
 	init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0);
+	init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0);
 
 	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
 		init_32bit_cpu_features(&info->aarch32);
@@ -2403,6 +2424,32 @@  static const struct arm64_cpu_capabilities arm64_features[] = {
 		.matches = has_cpuid_feature,
 		.min_field_value = 1,
 	},
+#ifdef CONFIG_ARM64_SME
+	{
+		.desc = "Scalable Matrix Extension",
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.capability = ARM64_SME,
+		.sys_reg = SYS_ID_AA64PFR1_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64PFR1_SME_SHIFT,
+		.field_width = 4,
+		.min_field_value = ID_AA64PFR1_SME,
+		.matches = has_cpuid_feature,
+		.cpu_enable = sme_kernel_enable,
+	},
+	{
+		.desc = "FA64",
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.capability = ARM64_SME_FA64,
+		.sys_reg = SYS_ID_AA64SMFR0_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
+		.field_width = 1,
+		.min_field_value = ID_AA64SMFR0_FA64,
+		.matches = has_cpuid_feature,
+		.cpu_enable = fa64_kernel_enable,
+	},
+#endif /* CONFIG_ARM64_SME */
 	{},
 };
 
@@ -2527,6 +2574,16 @@  static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV),
 	HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_AFP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP),
 	HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_RPRES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES),
+#ifdef CONFIG_ARM64_SME
+	HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SME_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_SME, CAP_HWCAP, KERNEL_HWCAP_SME),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_FA64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_FA64, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I16I64_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_I16I64, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F64F64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_F64F64, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I8I32_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_I8I32, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_F16F32, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_B16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_B16F32, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F32F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_F32F32, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
+#endif /* CONFIG_ARM64_SME */
 	{},
 };
 
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 591c18a889a5..33ec182e872e 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -97,6 +97,14 @@  static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_ECV]		= "ecv",
 	[KERNEL_HWCAP_AFP]		= "afp",
 	[KERNEL_HWCAP_RPRES]		= "rpres",
+	[KERNEL_HWCAP_SME]		= "sme",
+	[KERNEL_HWCAP_SME_I16I64]	= "smei16i64",
+	[KERNEL_HWCAP_SME_F64F64]	= "smef64f64",
+	[KERNEL_HWCAP_SME_I8I32]	= "smei8i32",
+	[KERNEL_HWCAP_SME_F16F32]	= "smef16f32",
+	[KERNEL_HWCAP_SME_B16F32]	= "smeb16f32",
+	[KERNEL_HWCAP_SME_F32F32]	= "smef32f32",
+	[KERNEL_HWCAP_SME_FA64]		= "smefa64",
 };
 
 #ifdef CONFIG_COMPAT
@@ -400,6 +408,7 @@  static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 	info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1);
 	info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1);
 	info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1);
+	info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1);
 
 	if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
 		info->reg_gmid = read_cpuid(GMID_EL1);
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 5280e098cfb5..576490be3c2b 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -987,6 +987,32 @@  void fpsimd_release_task(struct task_struct *dead_task)
 
 #endif /* CONFIG_ARM64_SVE */
 
+#ifdef CONFIG_ARM64_SME
+
+void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+{
+	/* Set priority for all PEs to architecturally defined minimum */
+	write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK,
+		       SYS_SMPRI_EL1);
+
+	/* Allow SME in kernel */
+	write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1);
+	isb();
+}
+
+/*
+ * This must be called after sme_kernel_enable(), we rely on the
+ * feature table being sorted to ensure this.
+ */
+void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+{
+	/* Allow use of FA64 */
+	write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK,
+		       SYS_SMCR_EL1);
+}
+
+#endif /* CONFIG_ARM64_SME */
+
 /*
  * Trapped SVE access
  *
@@ -1532,6 +1558,10 @@  static int __init fpsimd_init(void)
 	if (!cpu_have_named_feature(ASIMD))
 		pr_notice("Advanced SIMD is not implemented\n");
 
+
+	if (cpu_have_named_feature(SME) && !cpu_have_named_feature(SVE))
+		pr_notice("SME is implemented but not SVE\n");
+
 	return sve_sysctl_init();
 }
 core_initcall(fpsimd_init);
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 9c65b1e25a96..0a602525bda6 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -41,6 +41,8 @@  KVM_PROTECTED_MODE
 MISMATCHED_CACHE_TYPE
 MTE
 MTE_ASYMM
+SME
+SME_FA64
 SPECTRE_V2
 SPECTRE_V3A
 SPECTRE_V4