[v6,13/37] arm64/sme: Basic enumeration support

Message ID 20211115152835.3212149-14-broonie@kernel.org (mailing list archive)
State New
Series arm64/sme: Initial support for the Scalable Matrix Extension

Commit Message

Mark Brown Nov. 15, 2021, 3:28 p.m. UTC
Add basic cpufeature support for discovering the presence of the
Scalable Matrix Extension and reporting hwcaps for the detected
features.

Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/arm64/elf_hwcaps.rst  | 33 +++++++++++++++++
 arch/arm64/include/asm/cpu.h        |  1 +
 arch/arm64/include/asm/cpufeature.h | 12 +++++++
 arch/arm64/include/asm/fpsimd.h     |  2 ++
 arch/arm64/include/asm/hwcap.h      |  8 +++++
 arch/arm64/include/uapi/asm/hwcap.h |  8 +++++
 arch/arm64/kernel/cpufeature.c      | 55 +++++++++++++++++++++++++++++
 arch/arm64/kernel/cpuinfo.c         |  9 +++++
 arch/arm64/kernel/fpsimd.c          | 30 ++++++++++++++++
 arch/arm64/tools/cpucaps            |  2 ++
 10 files changed, 160 insertions(+)
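
Userspace consumes the new hwcaps via getauxval(AT_HWCAP2); a minimal
detection check would look something like the sketch below (fallback
defines shown for toolchains whose headers predate this series):

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP2_SME
#define HWCAP2_SME		(1 << 20)	/* values proposed by this patch */
#endif
#ifndef HWCAP2_SME_FA64
#define HWCAP2_SME_FA64		(1 << 27)
#endif

int main(void)
{
	unsigned long hwcap2 = getauxval(AT_HWCAP2);

	printf("SME:  %s\n", (hwcap2 & HWCAP2_SME) ? "yes" : "no");
	printf("FA64: %s\n", (hwcap2 & HWCAP2_SME_FA64) ? "yes" : "no");
	return 0;
}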

Comments

Catalin Marinas Dec. 9, 2021, 6:41 p.m. UTC | #1
On Mon, Nov 15, 2021 at 03:28:11PM +0000, Mark Brown wrote:
> diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
> index 7b23b16f21ce..6f8ca04b6566 100644
> --- a/arch/arm64/include/uapi/asm/hwcap.h
> +++ b/arch/arm64/include/uapi/asm/hwcap.h
> @@ -76,5 +76,13 @@
>  #define HWCAP2_BTI		(1 << 17)
>  #define HWCAP2_MTE		(1 << 18)
>  #define HWCAP2_ECV		(1 << 19)
> +#define HWCAP2_SME		(1 << 20)
> +#define HWCAP2_SME_I16I64	(1 << 21)
> +#define HWCAP2_SME_F64F64	(1 << 22)
> +#define HWCAP2_SME_I8I32	(1 << 23)
> +#define HWCAP2_SME_F16F32	(1 << 24)
> +#define HWCAP2_SME_B16F32	(1 << 25)
> +#define HWCAP2_SME_F32F32	(1 << 26)
> +#define HWCAP2_SME_FA64		(1 << 27)

At this pace we'll need HWCAP3 pretty soon (since we only allocated
32 bits in each). I wonder whether we could instead not bother at all and
just provide user-space emulation for ID_AA64SMFR0_EL1.
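
As a rough sketch of that approach (untested; it assumes the kernel's
MRS emulation of the ID register space is extended to expose the
ID_AA64SMFR0_EL1 fields — S3_0_C0_C4_5 is that register's sysreg
encoding, and the FA64 bit position matches the patch below):

#include <stdint.h>
#include <stdio.h>

static inline uint64_t read_id_aa64smfr0(void)
{
	uint64_t val;

	/* An EL0 MRS of the ID space traps to the kernel, which returns
	 * the sanitised system-wide value; fields the kernel does not
	 * expose read as zero. */
	asm volatile("mrs %0, S3_0_C0_C4_5" : "=r" (val));
	return val;
}

int main(void)
{
	uint64_t smfr0 = read_id_aa64smfr0();

	/* ID_AA64SMFR0_EL1.FA64 is bit 63 */
	printf("FA64: %s\n", (smfr0 >> 63) & 1 ? "yes" : "no");
	return 0;
}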

>  #endif /* _UAPI__ASM_HWCAP_H */
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 81824c7ea74f..3cf60819c354 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -246,6 +246,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
>  };
>  
>  static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
> +	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SME_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE),
> @@ -278,6 +279,24 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
>  	ARM64_FTR_END,
>  };
>  
> +static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = {
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_FA64_SHIFT, 1, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I16I64_SHIFT, 4, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F64F64_SHIFT, 1, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I8I32_SHIFT, 4, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F16F32_SHIFT, 1, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_B16F32_SHIFT, 1, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F32F32_SHIFT, 1, 0),
> +	ARM64_FTR_END,
> +};
> +
>  static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0),
> @@ -628,6 +647,7 @@ static const struct __ftr_reg_entry {
>  	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1,
>  			       &id_aa64pfr1_override),
>  	ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0),
> +	ARM64_FTR_REG(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0),
>  
>  	/* Op1 = 0, CRn = 0, CRm = 5 */
>  	ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0),
> @@ -939,6 +959,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
>  	init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
>  	init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
>  	init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0);
> +	init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0);
>  
>  	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
>  		init_32bit_cpu_features(&info->aarch32);
> @@ -2370,6 +2391,30 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>  		.matches = has_cpuid_feature,
>  		.min_field_value = 1,
>  	},
> +#ifdef CONFIG_ARM64_SME
> +	{
> +		.desc = "Scalable Matrix Extension",
> +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +		.capability = ARM64_SME,
> +		.sys_reg = SYS_ID_AA64PFR1_EL1,
> +		.sign = FTR_UNSIGNED,
> +		.field_pos = ID_AA64PFR1_SME_SHIFT,
> +		.min_field_value = ID_AA64PFR1_SME,
> +		.matches = has_cpuid_feature,
> +		.cpu_enable = sme_kernel_enable,
> +	},
> +	{
> +		.desc = "FA64",
> +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +		.capability = ARM64_SME_FA64,
> +		.sys_reg = SYS_ID_AA64SMFR0_EL1,
> +		.sign = FTR_UNSIGNED,
> +		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
> +		.min_field_value = ID_AA64SMFR0_FA64,
> +		.matches = has_feature_flag,
> +		.cpu_enable = fa64_kernel_enable,
> +	},

I'll comment here rather than on the patch introducing has_feature_flag():
an alternative would be to add a .field_width option and in
feature_matches() use cpuid_feature_extract_field_width() directly. All
the arm64_ftr_bits entries already have a width, so just generalise it
for arm64_cpu_capabilities.
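
For illustration, the generalised matcher could be as small as this (a
sketch assuming a new field_width member in struct
arm64_cpu_capabilities; not what the posted patch does):

static bool
feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry)
{
	/* Extract field_width bits at field_pos instead of assuming
	 * the traditional 4-bit ID register field */
	int val = cpuid_feature_extract_field_width(reg, entry->field_pos,
						    entry->field_width,
						    entry->sign);

	return val >= entry->min_field_value;
}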
Mark Brown Dec. 9, 2021, 7:28 p.m. UTC | #2
On Thu, Dec 09, 2021 at 06:41:26PM +0000, Catalin Marinas wrote:
> On Mon, Nov 15, 2021 at 03:28:11PM +0000, Mark Brown wrote:

> > +#define HWCAP2_SME		(1 << 20)
> > +#define HWCAP2_SME_I16I64	(1 << 21)
> > +#define HWCAP2_SME_F64F64	(1 << 22)
> > +#define HWCAP2_SME_I8I32	(1 << 23)
> > +#define HWCAP2_SME_F16F32	(1 << 24)
> > +#define HWCAP2_SME_B16F32	(1 << 25)
> > +#define HWCAP2_SME_F32F32	(1 << 26)
> > +#define HWCAP2_SME_FA64		(1 << 27)

> At this pace we'll need HWCAP3 pretty soon (since we only allocated
> 32 bits in each). I wonder whether we could instead not bother at all and
> just provide user-space emulation for ID_AA64SMFR0_EL1.

I think so if people are willing to go along with just having userspace
check the ID register (IIRC access to it already does the right thing
but I need to confirm).  We'll also need to think about how we handle
any new SVE features; that's got a similar thing going on and accounts
for most of the existing usage of HWCAP2.

> > +	{
> > +		.desc = "FA64",
> > +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> > +		.capability = ARM64_SME_FA64,
> > +		.sys_reg = SYS_ID_AA64SMFR0_EL1,
> > +		.sign = FTR_UNSIGNED,
> > +		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
> > +		.min_field_value = ID_AA64SMFR0_FA64,
> > +		.matches = has_feature_flag,
> > +		.cpu_enable = fa64_kernel_enable,
> > +	},

> I'll comment here rather than on the patch introducing has_feature_flag():
> an alternative would be to add a .field_width option and in
> feature_matches() use cpuid_feature_extract_field_width() directly. All
> the arm64_ftr_bits entries already have a width, so just generalise it
> for arm64_cpu_capabilities.

Sure, if people are happy with that - it's a more invasive change since
we don't currently set the widths; I wasn't clear if that was a case of
not needing it right now or a design decision.
Catalin Marinas Dec. 10, 2021, 10:41 a.m. UTC | #3
On Thu, Dec 09, 2021 at 07:28:16PM +0000, Mark Brown wrote:
> On Thu, Dec 09, 2021 at 06:41:26PM +0000, Catalin Marinas wrote:
> > On Mon, Nov 15, 2021 at 03:28:11PM +0000, Mark Brown wrote:
> > > +#define HWCAP2_SME		(1 << 20)
> > > +#define HWCAP2_SME_I16I64	(1 << 21)
> > > +#define HWCAP2_SME_F64F64	(1 << 22)
> > > +#define HWCAP2_SME_I8I32	(1 << 23)
> > > +#define HWCAP2_SME_F16F32	(1 << 24)
> > > +#define HWCAP2_SME_B16F32	(1 << 25)
> > > +#define HWCAP2_SME_F32F32	(1 << 26)
> > > +#define HWCAP2_SME_FA64		(1 << 27)
> 
> > At this pace we'll need HWCAP3 pretty soon (since we only allocated
> > 32 bits in each). I wonder whether we could instead not bother at all and
> > just provide user-space emulation for ID_AA64SMFR0_EL1.
> 
> I think so if people are willing to go along with just having userspace
> check the ID register (IIRC access to it already does the right thing
> but I need to confirm).  We'll also need to think about how we handle
> any new SVE features; that's got a similar thing going on and accounts
> for most of the existing usage of HWCAP2.

It would be good to get feedback from the libc people. IIRC the ifunc
resolver currently relies on the HWCAP bits. Could this be adapted to
use the MRS instruction? We'd still keep the main HWCAP2_SME but without
the finer-grained bits.

The other option is to start going into the upper 32 bits of the
elf_hwcap. We tried to avoid this some time back when we were still
having doubts about merging ILP32.
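
To make the ifunc question concrete, a resolver might mix the coarse
hwcap with an emulated MRS read along these lines (hypothetical matmul
variants; glibc on arm64 passes AT_HWCAP as the resolver's first
argument, and the sketch assumes the kernel exposes ID_AA64SMFR0_EL1
through its MRS emulation as discussed above):

#include <stdint.h>

void matmul_generic(double *c, const double *a, const double *b, int n);
void matmul_sme_f64f64(double *c, const double *a, const double *b, int n);

typedef void matmul_fn(double *, const double *, const double *, int);

static matmul_fn *resolve_matmul(uint64_t hwcap)
{
	uint64_t smfr0;

	/* hwcap is AT_HWCAP; the SME bits live in AT_HWCAP2, so probe
	 * the emulated ID_AA64SMFR0_EL1 (S3_0_C0_C4_5) instead.
	 * Unexposed fields read as zero on kernels with ID register
	 * emulation. */
	asm volatile("mrs %0, S3_0_C0_C4_5" : "=r" (smfr0));

	/* ID_AA64SMFR0_EL1.F64F64 is bit 48 */
	if ((smfr0 >> 48) & 1)
		return matmul_sme_f64f64;
	return matmul_generic;
}

void matmul(double *c, const double *a, const double *b, int n)
	__attribute__((ifunc("resolve_matmul")));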

> > > +	{
> > > +		.desc = "FA64",
> > > +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> > > +		.capability = ARM64_SME_FA64,
> > > +		.sys_reg = SYS_ID_AA64SMFR0_EL1,
> > > +		.sign = FTR_UNSIGNED,
> > > +		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
> > > +		.min_field_value = ID_AA64SMFR0_FA64,
> > > +		.matches = has_feature_flag,
> > > +		.cpu_enable = fa64_kernel_enable,
> > > +	},
> 
> > I'll comment here rather than on the patch introducing has_feature_flag():
> > an alternative would be to add a .field_width option and in
> > feature_matches() use cpuid_feature_extract_field_width() directly. All
> > the arm64_ftr_bits entries already have a width, so just generalise it
> > for arm64_cpu_capabilities.
> 
> Sure, if people are happy with that - it's a more invasive change since
> > we don't currently set the widths; I wasn't clear if that was a case of
> not needing it right now or a design decision.

We didn't have a field_width since they were all 4 bits until SVE. If
you don't want to touch all the entries in the array, we can say that a
0 value (i.e. not explicitly initialised) means a default of 4 and update it
during init_cpu_features().
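
A sketch of that zero-means-default variant (purely illustrative; it
assumes the capability arrays can be written to at init time):

static void __init init_feature_field_widths(struct arm64_cpu_capabilities *caps)
{
	/* A width of 0 means "not explicitly initialised": fall back
	 * to the historical 4-bit ID register field */
	for (; caps->matches; caps++)
		if (!caps->field_width)
			caps->field_width = 4;
}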
Mark Brown Dec. 10, 2021, 1:59 p.m. UTC | #4
On Fri, Dec 10, 2021 at 10:41:18AM +0000, Catalin Marinas wrote:
> On Thu, Dec 09, 2021 at 07:28:16PM +0000, Mark Brown wrote:

> > > > +#define HWCAP2_SME_B16F32	(1 << 25)
> > > > +#define HWCAP2_SME_F32F32	(1 << 26)
> > > > +#define HWCAP2_SME_FA64		(1 << 27)
> > 
> > > At this pace we'll need HWCAP3 pretty soon (since we only allocated
> > > 32 bits in each). I wonder whether we could instead not bother at all and
> > > just provide user-space emulation for ID_AA64SMFR0_EL1.

> > I think so if people are willing to go along with just having userspace
> > check the ID register (IIRC access to it already does the right thing
> > but I need to confirm).  We'll also need to think about how we handle
> > any new SVE features; that's got a similar thing going on and accounts
> > for most of the existing usage of HWCAP2.

> It would be good to get feedback from the libc people. IIRC the ifunc
> resolver currently relies on the HWCAP bits. Could this be adapted to
> use the MRS instruction? We'd still keep the main HWCAP2_SME but without
> the finer-grained bits.

> The other option is to start going into the upper 32 bits of the
> elf_hwcap. We tried to avoid this some time back when we were still
> having doubts about merging ILP32.

I'll leave things as they are just now to give more time for feedback.

> > > I'll comment here rather than on the patch introducing has_feature_flag():
> > > an alternative would be to add a .field_width option and in
> > > feature_matches() use cpuid_feature_extract_field_width() directly. All

> > Sure, if people are happy with that - it's a more invasive change since
> > we don't currently set the widths; I wasn't clear if that was a case of
> > not needing it right now or a design decision.

> We didn't have a field_width since they were all 4 bits until SVE. If
> you don't want to touch all the entries in the array, we can say that a
> 0 value (i.e. not explicitly initialised) means a default of 4 and update it
> during init_cpu_features().

It's annoying to have to update everything, but it feels more idiomatic
and less likely to cause surprises later on.
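
For reference, with explicit widths the FA64 entry above could use the
generic has_cpuid_feature() and drop has_feature_flag() entirely; a
sketch (assuming the .field_width member discussed above, which is not
part of this v6 posting):

	{
		.desc = "FA64",
		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
		.capability = ARM64_SME_FA64,
		.sys_reg = SYS_ID_AA64SMFR0_EL1,
		.sign = FTR_UNSIGNED,
		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
		.field_width = 1,
		.min_field_value = ID_AA64SMFR0_FA64,
		.matches = has_cpuid_feature,
		.cpu_enable = fa64_kernel_enable,
	},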

Patch

diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index af106af8e1c0..155f726816f9 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -251,6 +251,39 @@  HWCAP2_ECV
 
     Functionality implied by ID_AA64MMFR0_EL1.ECV == 0b0001.
 
+HWCAP2_SME
+
+    Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
+    by Documentation/arm64/sme.rst.
+
+HWCAP2_SME_I16I64
+
+    Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
+
+HWCAP2_SME_F64F64
+
+    Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
+
+HWCAP2_SME_I8I32
+
+    Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
+
+HWCAP2_SME_F16F32
+
+    Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
+
+HWCAP2_SME_B16F32
+
+    Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
+
+HWCAP2_SME_F32F32
+
+    Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
+
+HWCAP2_SME_FA64
+
+    Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index 0f6d16faa540..667b66fe1a53 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -57,6 +57,7 @@  struct cpuinfo_arm64 {
 	u64		reg_id_aa64pfr0;
 	u64		reg_id_aa64pfr1;
 	u64		reg_id_aa64zfr0;
+	u64		reg_id_aa64smfr0;
 
 	struct cpuinfo_32bit	aarch32;
 
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index ef6be92b1921..079e9f15c5eb 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -727,6 +727,18 @@  static __always_inline bool system_supports_sve(void)
 		cpus_have_const_cap(ARM64_SVE);
 }
 
+static __always_inline bool system_supports_sme(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_SME) &&
+		cpus_have_const_cap(ARM64_SME);
+}
+
+static __always_inline bool system_supports_fa64(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_SME) &&
+		cpus_have_const_cap(ARM64_SME_FA64);
+}
+
 static __always_inline bool system_supports_cnp(void)
 {
 	return IS_ENABLED(CONFIG_ARM64_CNP) &&
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index cb24385e3632..d5f2825a2412 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -74,6 +74,8 @@  extern void sve_set_vq(unsigned long vq_minus_1);
 
 struct arm64_cpu_capabilities;
 extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused);
+extern void sme_kernel_enable(const struct arm64_cpu_capabilities *__unused);
+extern void fa64_kernel_enable(const struct arm64_cpu_capabilities *__unused);
 
 extern u64 read_zcr_features(void);
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index b100e0055eab..dc008749e746 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -106,6 +106,14 @@ 
 #define KERNEL_HWCAP_BTI		__khwcap2_feature(BTI)
 #define KERNEL_HWCAP_MTE		__khwcap2_feature(MTE)
 #define KERNEL_HWCAP_ECV		__khwcap2_feature(ECV)
+#define KERNEL_HWCAP_SME		__khwcap2_feature(SME)
+#define KERNEL_HWCAP_SME_I16I64		__khwcap2_feature(SME_I16I64)
+#define KERNEL_HWCAP_SME_F64F64		__khwcap2_feature(SME_F64F64)
+#define KERNEL_HWCAP_SME_I8I32		__khwcap2_feature(SME_I8I32)
+#define KERNEL_HWCAP_SME_F16F32		__khwcap2_feature(SME_F16F32)
+#define KERNEL_HWCAP_SME_B16F32		__khwcap2_feature(SME_B16F32)
+#define KERNEL_HWCAP_SME_F32F32		__khwcap2_feature(SME_F32F32)
+#define KERNEL_HWCAP_SME_FA64		__khwcap2_feature(SME_FA64)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 7b23b16f21ce..6f8ca04b6566 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -76,5 +76,13 @@ 
 #define HWCAP2_BTI		(1 << 17)
 #define HWCAP2_MTE		(1 << 18)
 #define HWCAP2_ECV		(1 << 19)
+#define HWCAP2_SME		(1 << 20)
+#define HWCAP2_SME_I16I64	(1 << 21)
+#define HWCAP2_SME_F64F64	(1 << 22)
+#define HWCAP2_SME_I8I32	(1 << 23)
+#define HWCAP2_SME_F16F32	(1 << 24)
+#define HWCAP2_SME_B16F32	(1 << 25)
+#define HWCAP2_SME_F32F32	(1 << 26)
+#define HWCAP2_SME_FA64		(1 << 27)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 81824c7ea74f..3cf60819c354 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -246,6 +246,7 @@  static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SME_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE),
@@ -278,6 +279,24 @@  static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
 	ARM64_FTR_END,
 };
 
+static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_FA64_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I16I64_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F64F64_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I8I32_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F16F32_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_B16F32_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F32F32_SHIFT, 1, 0),
+	ARM64_FTR_END,
+};
+
 static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0),
@@ -628,6 +647,7 @@  static const struct __ftr_reg_entry {
 	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1,
 			       &id_aa64pfr1_override),
 	ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0),
+	ARM64_FTR_REG(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0),
 
 	/* Op1 = 0, CRn = 0, CRm = 5 */
 	ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0),
@@ -939,6 +959,7 @@  void __init init_cpu_features(struct cpuinfo_arm64 *info)
 	init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
 	init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
 	init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0);
+	init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0);
 
 	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
 		init_32bit_cpu_features(&info->aarch32);
@@ -2370,6 +2391,30 @@  static const struct arm64_cpu_capabilities arm64_features[] = {
 		.matches = has_cpuid_feature,
 		.min_field_value = 1,
 	},
+#ifdef CONFIG_ARM64_SME
+	{
+		.desc = "Scalable Matrix Extension",
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.capability = ARM64_SME,
+		.sys_reg = SYS_ID_AA64PFR1_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64PFR1_SME_SHIFT,
+		.min_field_value = ID_AA64PFR1_SME,
+		.matches = has_cpuid_feature,
+		.cpu_enable = sme_kernel_enable,
+	},
+	{
+		.desc = "FA64",
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.capability = ARM64_SME_FA64,
+		.sys_reg = SYS_ID_AA64SMFR0_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
+		.min_field_value = ID_AA64SMFR0_FA64,
+		.matches = has_feature_flag,
+		.cpu_enable = fa64_kernel_enable,
+	},
+#endif /* CONFIG_ARM64_SME */
 	{},
 };
 
@@ -2502,6 +2547,16 @@  static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_MTE_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_MTE, CAP_HWCAP, KERNEL_HWCAP_MTE),
 #endif /* CONFIG_ARM64_MTE */
 	HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV),
+#ifdef CONFIG_ARM64_SME
+	HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SME_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SME, CAP_HWCAP, KERNEL_HWCAP_SME),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_FA64_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I16I64_SHIFT, FTR_UNSIGNED, ID_AA64SMFR0_I16I64, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F64F64_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I8I32_SHIFT, FTR_UNSIGNED, ID_AA64SMFR0_I8I32, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F16F32_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_B16F32_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F32F32_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
+#endif /* CONFIG_ARM64_SME */
 	{},
 };
 
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 6e27b759056a..5b0e9d2643a8 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -95,6 +95,14 @@  static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_BTI]		= "bti",
 	[KERNEL_HWCAP_MTE]		= "mte",
 	[KERNEL_HWCAP_ECV]		= "ecv",
+	[KERNEL_HWCAP_SME]		= "sme",
+	[KERNEL_HWCAP_SME_I16I64]	= "smei16i64",
+	[KERNEL_HWCAP_SME_F64F64]	= "smef64f64",
+	[KERNEL_HWCAP_SME_I8I32]	= "smei8i32",
+	[KERNEL_HWCAP_SME_F16F32]	= "smef16f32",
+	[KERNEL_HWCAP_SME_B16F32]	= "smeb16f32",
+	[KERNEL_HWCAP_SME_F32F32]	= "smef32f32",
+	[KERNEL_HWCAP_SME_FA64]		= "smefa64",
 };
 
 #ifdef CONFIG_COMPAT
@@ -397,6 +405,7 @@  static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 	info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1);
 	info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1);
 	info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1);
+	info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1);
 
 	if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
 		info->reg_gmid = read_cpuid(GMID_EL1);
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 4a98cc3b1df1..8dde4593ca73 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -983,6 +983,32 @@  void fpsimd_release_task(struct task_struct *dead_task)
 
 #endif /* CONFIG_ARM64_SVE */
 
+#ifdef CONFIG_ARM64_SME
+
+void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+{
+	/* Set priority for all PEs to architecturally defined minimum */
+	write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK,
+		       SYS_SMPRI_EL1);
+
+	/* Allow SME in kernel */
+	write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1);
+	isb();
+}
+
+/*
+ * This must be called after sme_kernel_enable(); we rely on the
+ * feature table being sorted to ensure this.
+ */
+void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+{
+	/* Allow use of FA64 */
+	write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK,
+		       SYS_SMCR_EL1);
+}
+
+#endif /* CONFIG_ARM64_SME */
+
 /*
  * Trapped SVE access
  *
@@ -1525,6 +1551,10 @@  static int __init fpsimd_init(void)
 	if (!cpu_have_named_feature(ASIMD))
 		pr_notice("Advanced SIMD is not implemented\n");
 
+
+	if (cpu_have_named_feature(SME) && !cpu_have_named_feature(SVE))
+		pr_notice("SME is implemented but not SVE\n");
+
 	return sve_sysctl_init();
 }
 core_initcall(fpsimd_init);
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 870c39537dd0..58dfc8547c64 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -41,6 +41,8 @@  KVM_PROTECTED_MODE
 MISMATCHED_CACHE_TYPE
 MTE
 MTE_ASYMM
+SME
+SME_FA64
 SPECTRE_V2
 SPECTRE_V3A
 SPECTRE_V4