diff mbox series

[RESEND,v9,10/13] iommu/arm-smmu-v3: Check for SVA features

Message ID 20200817171558.325917-11-jean-philippe@linaro.org (mailing list archive)
State New, archived
Headers show
Series iommu: Shared Virtual Addressing for SMMUv3 (PT sharing part) | expand

Commit Message

Jean-Philippe Brucker Aug. 17, 2020, 5:15 p.m. UTC
Aggregate all sanity-checks for sharing CPU page tables with the SMMU
under a single ARM_SMMU_FEAT_SVA bit. For PCIe SVA, users also need to
check FEAT_ATS and FEAT_PRI. For platform SVA, they will have to check
FEAT_STALLS.

Introduce ARM_SMMU_FEAT_BTM (Broadcast TLB Maintenance), but don't
enable it at the moment. Since the entire VMID space is shared with the
CPU, enabling DVM (by clearing SMMU_CR2.PTM) could result in
over-invalidation and affect performance of stage-2 mappings.

Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   | 10 +++++
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   | 43 +++++++++++++++++++
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  3 ++
 3 files changed, 56 insertions(+)

Comments

Eric Auger Sept. 8, 2020, 9:38 a.m. UTC | #1
Hi Jean,
On 8/17/20 7:15 PM, Jean-Philippe Brucker wrote:
> Aggregate all sanity-checks for sharing CPU page tables with the SMMU
> under a single ARM_SMMU_FEAT_SVA bit. For PCIe SVA, users also need to
> check FEAT_ATS and FEAT_PRI. For platform SVA, they will have to check
> FEAT_STALLS.
> 
> Introduce ARM_SMMU_FEAT_BTM (Broadcast TLB Maintenance), but don't
> enable it at the moment. Since the entire VMID space is shared with the
> CPU, enabling DVM (by clearing SMMU_CR2.PTM) could result in
> over-invalidation and affect performance of stage-2 mappings.
In which series do you plan to enable it?
> 
> Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   | 10 +++++
>  .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   | 43 +++++++++++++++++++
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  3 ++
>  3 files changed, 56 insertions(+)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index 90c08f156b43..7b14b48a26c7 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -602,6 +602,8 @@ struct arm_smmu_device {
>  #define ARM_SMMU_FEAT_STALL_FORCE	(1 << 13)
>  #define ARM_SMMU_FEAT_VAX		(1 << 14)
>  #define ARM_SMMU_FEAT_RANGE_INV		(1 << 15)
> +#define ARM_SMMU_FEAT_BTM		(1 << 16)
> +#define ARM_SMMU_FEAT_SVA		(1 << 17)
>  	u32				features;
>  
>  #define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
> @@ -683,4 +685,12 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
>  void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid);
>  bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd);
>  
> +#ifdef CONFIG_ARM_SMMU_V3_SVA
> +bool arm_smmu_sva_supported(struct arm_smmu_device *smmu);
> +#else /* CONFIG_ARM_SMMU_V3_SVA */
> +static inline bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
> +{
> +	return false;
> +}
> +#endif /* CONFIG_ARM_SMMU_V3_SVA */
>  #endif /* _ARM_SMMU_V3_H */
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index e919ce894dd1..bf81d91ce71e 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -153,3 +153,46 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
>  		kfree(cd);
>  	}
>  }
> +
> +bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
> +{
> +	unsigned long reg, fld;
> +	unsigned long oas;
> +	unsigned long asid_bits;
> +
> +	u32 feat_mask = ARM_SMMU_FEAT_BTM | ARM_SMMU_FEAT_COHERENCY;
> +
> +	if ((smmu->features & feat_mask) != feat_mask)
> +		return false;
> +
> +	if (!(smmu->pgsize_bitmap & PAGE_SIZE))
> +		return false;
If we were to check VA_BITS versus SMMU capabilities I guess this would
be here?
> +
> +	/*
> +	 * Get the smallest PA size of all CPUs (sanitized by cpufeature). We're
> +	 * not even pretending to support AArch32 here. Abort if the MMU outputs
> +	 * addresses larger than what we support.
> +	 */
> +	reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
> +	fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_PARANGE_SHIFT);
> +	oas = id_aa64mmfr0_parange_to_phys_shift(fld);
> +	if (smmu->oas < oas)
> +		return false;
> +
> +	/* We can support bigger ASIDs than the CPU, but not smaller */
> +	fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_ASID_SHIFT);
> +	asid_bits = fld ? 16 : 8;
> +	if (smmu->asid_bits < asid_bits)
> +		return false;
> +
> +	/*
> +	 * See max_pinned_asids in arch/arm64/mm/context.c. The following is
> +	 * generally the maximum number of bindable processes.
> +	 */
> +	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0))
Out of curiosity, What is the rationale behind using
arm64_kernel_unmapped_at_el0() versus
IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)?
CPU caps being finalized? Is that why you say "generally" here?
> +		asid_bits--;
> +	dev_dbg(smmu->dev, "%d shared contexts\n", (1 << asid_bits) -> +		num_possible_cpus() - 2);
nit: s/shared/bindable?
> +
> +	return true;
> +}
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 9e755caea525..15cb3d9c1a5d 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -3258,6 +3258,9 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
>  
>  	smmu->ias = max(smmu->ias, smmu->oas);
>  
> +	if (arm_smmu_sva_supported(smmu))
> +		smmu->features |= ARM_SMMU_FEAT_SVA;
> +
>  	dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n",
>  		 smmu->ias, smmu->oas, smmu->features);
>  	return 0;
> 
Thanks

Eric
Jean-Philippe Brucker Sept. 17, 2020, 2:39 p.m. UTC | #2
On Tue, Sep 08, 2020 at 11:38:31AM +0200, Auger Eric wrote:
> Hi Jean,
> On 8/17/20 7:15 PM, Jean-Philippe Brucker wrote:
> > Aggregate all sanity-checks for sharing CPU page tables with the SMMU
> > under a single ARM_SMMU_FEAT_SVA bit. For PCIe SVA, users also need to
> > check FEAT_ATS and FEAT_PRI. For platform SVA, they will have to check
> > FEAT_STALLS.
> > 
> > Introduce ARM_SMMU_FEAT_BTM (Broadcast TLB Maintenance), but don't
> > enable it at the moment. Since the entire VMID space is shared with the
> > CPU, enabling DVM (by clearing SMMU_CR2.PTM) could result in
> > over-invalidation and affect performance of stage-2 mappings.
> In which series do you plan to enable it?

In the third part, after the PRI+stall series. I still haven't had time to
look at solving the stage-2 DVM problem (pinning VMIDs through KVM), so it
might be a while.

[...]
> > +	/*
> > +	 * See max_pinned_asids in arch/arm64/mm/context.c. The following is
> > +	 * generally the maximum number of bindable processes.
> > +	 */
> > +	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0))
> Out of curiosity, What is the rationale behind using
> arm64_kernel_unmapped_at_el0() versus
> IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)?
> CPU caps being finalized?

I'm not sure. The caps are finalized at this point. I'll change it.

> Is that why you say "generally" here?

I said "generally" because having less PASIDs than ASIDs is in theory
possible, but hardware will normally support 20-bit PASIDs.

> > +		asid_bits--;
> > +	dev_dbg(smmu->dev, "%d shared contexts\n", (1 << asid_bits) -> +		num_possible_cpus() - 2);
> nit: s/shared/bindable?

I find "shared" clearer, with regard to contexts

Thanks,
Jean
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 90c08f156b43..7b14b48a26c7 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -602,6 +602,8 @@  struct arm_smmu_device {
 #define ARM_SMMU_FEAT_STALL_FORCE	(1 << 13)
 #define ARM_SMMU_FEAT_VAX		(1 << 14)
 #define ARM_SMMU_FEAT_RANGE_INV		(1 << 15)
+#define ARM_SMMU_FEAT_BTM		(1 << 16)
+#define ARM_SMMU_FEAT_SVA		(1 << 17)
 	u32				features;
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
@@ -683,4 +685,12 @@  int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
 void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid);
 bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd);
 
+#ifdef CONFIG_ARM_SMMU_V3_SVA
+bool arm_smmu_sva_supported(struct arm_smmu_device *smmu);
+#else /* CONFIG_ARM_SMMU_V3_SVA */
+static inline bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
+{
+	return false;
+}
+#endif /* CONFIG_ARM_SMMU_V3_SVA */
 #endif /* _ARM_SMMU_V3_H */
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index e919ce894dd1..bf81d91ce71e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -153,3 +153,46 @@  static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
 		kfree(cd);
 	}
 }
+
+bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
+{
+	unsigned long reg, fld;
+	unsigned long oas;
+	unsigned long asid_bits;
+
+	u32 feat_mask = ARM_SMMU_FEAT_BTM | ARM_SMMU_FEAT_COHERENCY;
+
+	if ((smmu->features & feat_mask) != feat_mask)
+		return false;
+
+	if (!(smmu->pgsize_bitmap & PAGE_SIZE))
+		return false;
+
+	/*
+	 * Get the smallest PA size of all CPUs (sanitized by cpufeature). We're
+	 * not even pretending to support AArch32 here. Abort if the MMU outputs
+	 * addresses larger than what we support.
+	 */
+	reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+	fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_PARANGE_SHIFT);
+	oas = id_aa64mmfr0_parange_to_phys_shift(fld);
+	if (smmu->oas < oas)
+		return false;
+
+	/* We can support bigger ASIDs than the CPU, but not smaller */
+	fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_ASID_SHIFT);
+	asid_bits = fld ? 16 : 8;
+	if (smmu->asid_bits < asid_bits)
+		return false;
+
+	/*
+	 * See max_pinned_asids in arch/arm64/mm/context.c. The following is
+	 * generally the maximum number of bindable processes.
+	 */
+	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0))
+		asid_bits--;
+	dev_dbg(smmu->dev, "%d shared contexts\n", (1 << asid_bits) -
+		num_possible_cpus() - 2);
+
+	return true;
+}
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 9e755caea525..15cb3d9c1a5d 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3258,6 +3258,9 @@  static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
 
 	smmu->ias = max(smmu->ias, smmu->oas);
 
+	if (arm_smmu_sva_supported(smmu))
+		smmu->features |= ARM_SMMU_FEAT_SVA;
+
 	dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n",
 		 smmu->ias, smmu->oas, smmu->features);
 	return 0;