[v2] iommu/arm-smmu-v3: Add SMMUv3.2 range invalidation support

Message ID 20200117211628.27888-1-robh@kernel.org (mailing list archive)
State New, archived
Series [v2] iommu/arm-smmu-v3: Add SMMUv3.2 range invalidation support

Commit Message

Rob Herring Jan. 17, 2020, 9:16 p.m. UTC
Arm SMMUv3.2 adds support for TLB range invalidate operations.
Support for range invalidate is determined by the RIL bit in the IDR3
register.

The range invalidate is in units of the leaf page size and operates on
1-32 chunks of a power-of-2 multiple of pages. First, we determine from the
size what power-of-2 multiple we can use. Then we calculate how many
chunks (1-31) of that power-of-2 size make up the range on this iteration. On
each iteration, we move up in size by at least 5 bits.
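
For illustration only (not part of the patch), a minimal user-space sketch
of the decomposition described above; the decompose() helper is hypothetical
and __builtin_ctzl() stands in for the kernel's __ffs():

#include <stdio.h>

/* Split a range of num_pages leaf pages into NUM/SCALE-sized commands. */
static void decompose(unsigned long num_pages)
{
	while (num_pages) {
		/* Largest power-of-2 multiple the remaining count is aligned to */
		unsigned long scale = __builtin_ctzl(num_pages);
		/* How many chunks (1-31) of 2^scale pages go in one command */
		unsigned long num = (num_pages >> scale) & 31;

		printf("num=%lu scale=%lu -> %lu pages\n",
		       num, scale, num << scale);

		/* Clear out the bits covered by this command */
		num_pages -= num << scale;
	}
}

int main(void)
{
	decompose(1023);	/* 31 * 2^0 pages, then 31 * 2^5 pages */
	return 0;
}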

Cc: Eric Auger <eric.auger@redhat.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Will Deacon <will@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Joerg Roedel <joro@8bytes.org>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/iommu/arm-smmu-v3.c | 66 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

Comments

Eric Auger Jan. 30, 2020, 3:06 p.m. UTC | #1
Hi Rob,
On 1/17/20 10:16 PM, Rob Herring wrote:
> Arm SMMUv3.2 adds support for TLB range invalidate operations.
> Support for range invalidate is determined by the RIL bit in the IDR3
> register.
> 
> The range invalidate is in units of the leaf page size and operates on
> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
> size what power of 2 multiple we can use. Then we calculate how many
> chunks (1-31) of the power of 2 size for the range on the iteration. On
> each iteration, we move up in size by at least 5 bits.
> 
> Cc: Eric Auger <eric.auger@redhat.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Cc: Will Deacon <will@kernel.org>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Signed-off-by: Rob Herring <robh@kernel.org>
> ---
>  drivers/iommu/arm-smmu-v3.c | 66 ++++++++++++++++++++++++++++++++++++-
>  1 file changed, 65 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index e91b4a098215..0ee561db7149 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -70,6 +70,9 @@
>  #define IDR1_SSIDSIZE			GENMASK(10, 6)
>  #define IDR1_SIDSIZE			GENMASK(5, 0)
>  
> +#define ARM_SMMU_IDR3			0xc
> +#define IDR3_RIL			(1 << 10)
> +
>  #define ARM_SMMU_IDR5			0x14
>  #define IDR5_STALL_MAX			GENMASK(31, 16)
>  #define IDR5_GRAN64K			(1 << 6)
> @@ -327,9 +330,14 @@
>  #define CMDQ_CFGI_1_LEAF		(1UL << 0)
>  #define CMDQ_CFGI_1_RANGE		GENMASK_ULL(4, 0)
>  
> +#define CMDQ_TLBI_0_NUM			GENMASK_ULL(16, 12)
> +#define CMDQ_TLBI_RANGE_NUM_MAX		31
> +#define CMDQ_TLBI_0_SCALE		GENMASK_ULL(24, 20)
>  #define CMDQ_TLBI_0_VMID		GENMASK_ULL(47, 32)
>  #define CMDQ_TLBI_0_ASID		GENMASK_ULL(63, 48)
>  #define CMDQ_TLBI_1_LEAF		(1UL << 0)
> +#define CMDQ_TLBI_1_TTL			GENMASK_ULL(9, 8)
> +#define CMDQ_TLBI_1_TG			GENMASK_ULL(11, 10)
>  #define CMDQ_TLBI_1_VA_MASK		GENMASK_ULL(63, 12)
>  #define CMDQ_TLBI_1_IPA_MASK		GENMASK_ULL(51, 12)
>  
> @@ -455,9 +463,13 @@ struct arm_smmu_cmdq_ent {
>  		#define CMDQ_OP_TLBI_S2_IPA	0x2a
>  		#define CMDQ_OP_TLBI_NSNH_ALL	0x30
>  		struct {
> +			u8			num;
> +			u8			scale;
>  			u16			asid;
>  			u16			vmid;
>  			bool			leaf;
> +			u8			ttl;
> +			u8			tg;
>  			u64			addr;
>  		} tlbi;
>  
> @@ -595,6 +607,7 @@ struct arm_smmu_device {
>  #define ARM_SMMU_FEAT_HYP		(1 << 12)
>  #define ARM_SMMU_FEAT_STALL_FORCE	(1 << 13)
>  #define ARM_SMMU_FEAT_VAX		(1 << 14)
> +#define ARM_SMMU_FEAT_RANGE_INV		(1 << 15)
>  	u32				features;
>  
>  #define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
> @@ -856,13 +869,21 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>  		cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
>  		break;
>  	case CMDQ_OP_TLBI_NH_VA:
> +		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
> +		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
>  		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
>  		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
> +		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
> +		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
>  		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
>  		break;
>  	case CMDQ_OP_TLBI_S2_IPA:
> +		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
> +		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
>  		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
>  		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
> +		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
> +		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
>  		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
>  		break;
>  	case CMDQ_OP_TLBI_NH_ASID:
> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>  {
>  	u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
>  	struct arm_smmu_device *smmu = smmu_domain->smmu;
> -	unsigned long start = iova, end = iova + size;
> +	unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
>  	int i = 0;
>  	struct arm_smmu_cmdq_ent cmd = {
>  		.tlbi = {
> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>  		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
>  	}
>  
> +	if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> +		/* Get the leaf page size */
> +		tg = __ffs(smmu_domain->domain.pgsize_bitmap);
> +
> +		/* Convert page size of 12,14,16 (log2) to 1,2,3 */
> +		cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
> +
> +		/* Determine what level the granule is at */
> +		cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
> +
> +		num_pages = size / (1UL << tg);
> +	}
> +
>  	while (iova < end) {
>  		if (i == CMDQ_BATCH_ENTRIES) {
>  			arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
>  			i = 0;
>  		}
>  
> +		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> +			/*
> +			 * On each iteration of the loop, the range is 5 bits
> +			 * worth of the aligned size remaining.
> +			 * The range in pages is:
> +			 *
> +			 * range = (num_pages & (0x1f << __ffs(num_pages)))
> +			 */
> +			unsigned long scale, num;
> +
> +			/* Determine the power of 2 multiple number of pages */
> +			scale = __ffs(num_pages);
> +			cmd.tlbi.scale = scale;
> +
> +			/* Determine how many chunks of 2^scale size we have */
> +			num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
> +			cmd.tlbi.num = num - 1;
> +
> +			/* range is num * 2^scale * pgsize */
> +			granule = num << (scale + tg);
> +
> +			/* Clear out the lower order bits for the next iteration */
> +			num_pages -= num << scale;
Regarding the 2 options given in
https://lore.kernel.org/linux-arm-kernel/CAL_JsqKABoE+0crGwyZdNogNgEoG=MOOpf6deQgH6s73c0UNdA@mail.gmail.com/raw,

I understand you implemented 2) but I still do not understand why you
preferred that one against 1).

In your case of 1023*4k pages this will invalidate by 31 32*2^0*4K +
31*2^0*4K pages
whereas you could achieve that with 10 invalidations with the 1st algo.
I did not get the case where it is more efficient. Please can you detail.

Also a question about TG. Reading the spec again & again, it is said
entries to be invalidated were inserted using this
Granule size. Here you pick the lowest granule supported by the domain.
Does it mean this was the one being used?

Thanks

Eric
> +		}
> +
>  		cmd.tlbi.addr = iova;
>  		arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd);
>  		iova += granule;
> @@ -3449,6 +3508,11 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
>  	if (smmu->sid_bits <= STRTAB_SPLIT)
>  		smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB;
>  
> +	/* IDR3 */
> +	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
> +	if (FIELD_GET(IDR3_RIL, reg))
> +		smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
> +
>  	/* IDR5 */
>  	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
>  
>
Robin Murphy Jan. 30, 2020, 5:34 p.m. UTC | #2
On 30/01/2020 3:06 pm, Auger Eric wrote:
> Hi Rob,
> On 1/17/20 10:16 PM, Rob Herring wrote:
>> Arm SMMUv3.2 adds support for TLB range invalidate operations.
>> Support for range invalidate is determined by the RIL bit in the IDR3
>> register.
>>
>> The range invalidate is in units of the leaf page size and operates on
>> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
>> size what power of 2 multiple we can use. Then we calculate how many
>> chunks (1-31) of the power of 2 size for the range on the iteration. On
>> each iteration, we move up in size by at least 5 bits.
>>
>> Cc: Eric Auger <eric.auger@redhat.com>
>> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Robin Murphy <robin.murphy@arm.com>
>> Cc: Joerg Roedel <joro@8bytes.org>
>> Signed-off-by: Rob Herring <robh@kernel.org>
>> ---
>>   drivers/iommu/arm-smmu-v3.c | 66 ++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 65 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>> index e91b4a098215..0ee561db7149 100644
>> --- a/drivers/iommu/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm-smmu-v3.c
>> @@ -70,6 +70,9 @@
>>   #define IDR1_SSIDSIZE			GENMASK(10, 6)
>>   #define IDR1_SIDSIZE			GENMASK(5, 0)
>>   
>> +#define ARM_SMMU_IDR3			0xc
>> +#define IDR3_RIL			(1 << 10)
>> +
>>   #define ARM_SMMU_IDR5			0x14
>>   #define IDR5_STALL_MAX			GENMASK(31, 16)
>>   #define IDR5_GRAN64K			(1 << 6)
>> @@ -327,9 +330,14 @@
>>   #define CMDQ_CFGI_1_LEAF		(1UL << 0)
>>   #define CMDQ_CFGI_1_RANGE		GENMASK_ULL(4, 0)
>>   
>> +#define CMDQ_TLBI_0_NUM			GENMASK_ULL(16, 12)
>> +#define CMDQ_TLBI_RANGE_NUM_MAX		31
>> +#define CMDQ_TLBI_0_SCALE		GENMASK_ULL(24, 20)
>>   #define CMDQ_TLBI_0_VMID		GENMASK_ULL(47, 32)
>>   #define CMDQ_TLBI_0_ASID		GENMASK_ULL(63, 48)
>>   #define CMDQ_TLBI_1_LEAF		(1UL << 0)
>> +#define CMDQ_TLBI_1_TTL			GENMASK_ULL(9, 8)
>> +#define CMDQ_TLBI_1_TG			GENMASK_ULL(11, 10)
>>   #define CMDQ_TLBI_1_VA_MASK		GENMASK_ULL(63, 12)
>>   #define CMDQ_TLBI_1_IPA_MASK		GENMASK_ULL(51, 12)
>>   
>> @@ -455,9 +463,13 @@ struct arm_smmu_cmdq_ent {
>>   		#define CMDQ_OP_TLBI_S2_IPA	0x2a
>>   		#define CMDQ_OP_TLBI_NSNH_ALL	0x30
>>   		struct {
>> +			u8			num;
>> +			u8			scale;
>>   			u16			asid;
>>   			u16			vmid;
>>   			bool			leaf;
>> +			u8			ttl;
>> +			u8			tg;
>>   			u64			addr;
>>   		} tlbi;
>>   
>> @@ -595,6 +607,7 @@ struct arm_smmu_device {
>>   #define ARM_SMMU_FEAT_HYP		(1 << 12)
>>   #define ARM_SMMU_FEAT_STALL_FORCE	(1 << 13)
>>   #define ARM_SMMU_FEAT_VAX		(1 << 14)
>> +#define ARM_SMMU_FEAT_RANGE_INV		(1 << 15)
>>   	u32				features;
>>   
>>   #define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
>> @@ -856,13 +869,21 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>>   		cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
>>   		break;
>>   	case CMDQ_OP_TLBI_NH_VA:
>> +		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
>> +		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
>>   		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
>>   		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
>> +		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
>> +		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
>>   		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
>>   		break;
>>   	case CMDQ_OP_TLBI_S2_IPA:
>> +		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
>> +		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
>>   		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
>>   		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
>> +		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
>> +		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
>>   		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
>>   		break;
>>   	case CMDQ_OP_TLBI_NH_ASID:
>> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>   {
>>   	u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
>>   	struct arm_smmu_device *smmu = smmu_domain->smmu;
>> -	unsigned long start = iova, end = iova + size;
>> +	unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
>>   	int i = 0;
>>   	struct arm_smmu_cmdq_ent cmd = {
>>   		.tlbi = {
>> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>   		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
>>   	}
>>   
>> +	if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>> +		/* Get the leaf page size */
>> +		tg = __ffs(smmu_domain->domain.pgsize_bitmap);
>> +
>> +		/* Convert page size of 12,14,16 (log2) to 1,2,3 */
>> +		cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;

Given the comment, I think "(tg - 10) / 2" would suffice ;)

>> +
>> +		/* Determine what level the granule is at */
>> +		cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));

Is it possible to rephrase that with logs and shifts to avoid the division?

>> +
>> +		num_pages = size / (1UL << tg);

Similarly, in my experience GCC has always seemed too cautious to elide 
non-constant division even in a seemingly-obvious case like this, so 
explicit "size >> tg" might be preferable.

>> +	}
>> +
>>   	while (iova < end) {
>>   		if (i == CMDQ_BATCH_ENTRIES) {
>>   			arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
>>   			i = 0;
>>   		}
>>   
>> +		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>> +			/*
>> +			 * On each iteration of the loop, the range is 5 bits
>> +			 * worth of the aligned size remaining.
>> +			 * The range in pages is:
>> +			 *
>> +			 * range = (num_pages & (0x1f << __ffs(num_pages)))
>> +			 */
>> +			unsigned long scale, num;
>> +
>> +			/* Determine the power of 2 multiple number of pages */
>> +			scale = __ffs(num_pages);
>> +			cmd.tlbi.scale = scale;
>> +
>> +			/* Determine how many chunks of 2^scale size we have */
>> +			num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
>> +			cmd.tlbi.num = num - 1;
>> +
>> +			/* range is num * 2^scale * pgsize */
>> +			granule = num << (scale + tg);
>> +
>> +			/* Clear out the lower order bits for the next iteration */
>> +			num_pages -= num << scale;
> Regarding the 2 options given in
> https://lore.kernel.org/linux-arm-kernel/CAL_JsqKABoE+0crGwyZdNogNgEoG=MOOpf6deQgH6s73c0UNdA@mail.gmail.com/raw,
> 
> I understand you implemented 2) but I still do not understand why you
> preferred that one against 1).
> 
> In your case of 1023*4k pages this will invalidate by 31 32*2^0*4K +
> 31*2^0*4K pages
> whereas you could achieve that with 10 invalidations with the 1st algo.
> I did not get the case where it is more efficient. Please can you detail.

I guess essentially we want to solve for two variables to get a range as 
close to size as possible. There might be a more elegant numerical 
method, but for the numbers involved brute force is probably good enough 
for the real world. 5 minutes alone with the architecture spec and a 
blank editor begat this pseudo-implementation:

	size_t npages = size >> pgshift;
	while (npages) {
		unsigned long range;
		unsigned int delta, best = UINT_MAX;
		int num, scale = min(31, __ffs(npages));

		while (scale) {
			num = min(32, npages >> scale);
			if (num == 32)
				break;

			delta = npages & ((1 << scale) - 1);
			if (!delta || delta > best)
				break;

			best = delta;
			scale--;
		}

		//invalidate

		range = num << scale;
		npages -= range;
		iova += (range) << pgshift;
	}

Modulo any obvious thinkos, what do you reckon?

> Also a question about TG. Reading the spec again & again, it is said
> entries to be invalidated were inserted using this
> Granule size. Here you pick the lowest granule supported by the domain.
> Does it mean this was the one being used?

Yes - in arm_smmu_domain_finalise(), the domain's pgsize_bitmap is 
updated to reflect whichever granule alloc_io_pgtable_ops() actually picked.
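
For example (illustrative values only, assuming a 4K-granule table whose
finalised bitmap ends up as SZ_4K | SZ_2M | SZ_1G):

	tg = __ffs(smmu_domain->domain.pgsize_bitmap);	/* __ffs(0x40201000) = 12 */

i.e. TG ends up encoding the leaf size actually in use for the domain, not
merely the smallest size the hardware could support.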

Robin.
Rob Herring Feb. 13, 2020, 7:54 p.m. UTC | #3
On Thu, Jan 30, 2020 at 9:06 AM Auger Eric <eric.auger@redhat.com> wrote:
>
> Hi Rob,
> On 1/17/20 10:16 PM, Rob Herring wrote:
> > Arm SMMUv3.2 adds support for TLB range invalidate operations.
> > Support for range invalidate is determined by the RIL bit in the IDR3
> > register.
> >
> > The range invalidate is in units of the leaf page size and operates on
> > 1-32 chunks of a power of 2 multiple pages. First, we determine from the
> > size what power of 2 multiple we can use. Then we calculate how many
> > chunks (1-31) of the power of 2 size for the range on the iteration. On
> > each iteration, we move up in size by at least 5 bits.
> >
> > Cc: Eric Auger <eric.auger@redhat.com>
> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Robin Murphy <robin.murphy@arm.com>
> > Cc: Joerg Roedel <joro@8bytes.org>
> > Signed-off-by: Rob Herring <robh@kernel.org>
> > ---
> >  drivers/iommu/arm-smmu-v3.c | 66 ++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 65 insertions(+), 1 deletion(-)


> > @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
> >  {
> >       u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
> >       struct arm_smmu_device *smmu = smmu_domain->smmu;
> > -     unsigned long start = iova, end = iova + size;
> > +     unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
> >       int i = 0;
> >       struct arm_smmu_cmdq_ent cmd = {
> >               .tlbi = {
> > @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
> >               cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
> >       }
> >
> > +     if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> > +             /* Get the leaf page size */
> > +             tg = __ffs(smmu_domain->domain.pgsize_bitmap);
> > +
> > +             /* Convert page size of 12,14,16 (log2) to 1,2,3 */
> > +             cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
> > +
> > +             /* Determine what level the granule is at */
> > +             cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
> > +
> > +             num_pages = size / (1UL << tg);
> > +     }
> > +
> >       while (iova < end) {
> >               if (i == CMDQ_BATCH_ENTRIES) {
> >                       arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
> >                       i = 0;
> >               }
> >
> > +             if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> > +                     /*
> > +                      * On each iteration of the loop, the range is 5 bits
> > +                      * worth of the aligned size remaining.
> > +                      * The range in pages is:
> > +                      *
> > +                      * range = (num_pages & (0x1f << __ffs(num_pages)))
> > +                      */
> > +                     unsigned long scale, num;
> > +
> > +                     /* Determine the power of 2 multiple number of pages */
> > +                     scale = __ffs(num_pages);
> > +                     cmd.tlbi.scale = scale;
> > +
> > +                     /* Determine how many chunks of 2^scale size we have */
> > +                     num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
> > +                     cmd.tlbi.num = num - 1;
> > +
> > +                     /* range is num * 2^scale * pgsize */
> > +                     granule = num << (scale + tg);
> > +
> > +                     /* Clear out the lower order bits for the next iteration */
> > +                     num_pages -= num << scale;
> Regarding the 2 options given in
> https://lore.kernel.org/linux-arm-kernel/CAL_JsqKABoE+0crGwyZdNogNgEoG=MOOpf6deQgH6s73c0UNdA@mail.gmail.com/raw,
>
> I understand you implemented 2) but I still do not understand why you
> preferred that one against 1).
>
> In your case of 1023*4k pages this will invalidate by 31 32*2^0*4K +
> 31*2^0*4K pages
> whereas you could achieve that with 10 invalidations with the 1st algo.
> I did not get the case where it is more efficient. Please can you detail.

No, it's only 2 commands. We do 31*4K and then 31*2^5*4K. Here's the
output of a test case:

iova=10001000, num_pages=0x3e0, granule=1f000, num=31, scale=0, ttl=3
iova=10020000, num_pages=0x0, granule=3e0000, num=31, scale=5, ttl=3

(num_pages being what's left at end of the loop)

As I mentioned on v1, the worst case is 4 commands for up to 4GB. That's
20 bits of size (32 - 12), and each loop iteration processes a minimum of
5 bits. Each iteration covers a larger aligned size, so scale goes up each pass.
This is what I tried to explain in the top comment.
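
As a hand-worked illustration of that worst case (0xfffff 4K pages, just
under 4GB), with num_pages shown at the start of each pass:

	num_pages=0xfffff -> num=31, scale=0   (31 * 2^0  * 4K)
	num_pages=0xfffe0 -> num=31, scale=5   (31 * 2^5  * 4K)
	num_pages=0xffc00 -> num=31, scale=10  (31 * 2^10 * 4K)
	num_pages=0xf8000 -> num=31, scale=15  (31 * 2^15 * 4K)
	num_pages=0x0     -> done, 4 commands in total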

Rob
Rob Herring Feb. 13, 2020, 9:49 p.m. UTC | #4
On Thu, Jan 30, 2020 at 11:34 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 30/01/2020 3:06 pm, Auger Eric wrote:
> > Hi Rob,
> > On 1/17/20 10:16 PM, Rob Herring wrote:
> >> Arm SMMUv3.2 adds support for TLB range invalidate operations.
> >> Support for range invalidate is determined by the RIL bit in the IDR3
> >> register.
> >>
> >> The range invalidate is in units of the leaf page size and operates on
> >> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
> >> size what power of 2 multiple we can use. Then we calculate how many
> >> chunks (1-31) of the power of 2 size for the range on the iteration. On
> >> each iteration, we move up in size by at least 5 bits.
> >>
> >> Cc: Eric Auger <eric.auger@redhat.com>
> >> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> >> Cc: Will Deacon <will@kernel.org>
> >> Cc: Robin Murphy <robin.murphy@arm.com>
> >> Cc: Joerg Roedel <joro@8bytes.org>
> >> Signed-off-by: Rob Herring <robh@kernel.org>


> >> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
> >>   {
> >>      u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
> >>      struct arm_smmu_device *smmu = smmu_domain->smmu;
> >> -    unsigned long start = iova, end = iova + size;
> >> +    unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
> >>      int i = 0;
> >>      struct arm_smmu_cmdq_ent cmd = {
> >>              .tlbi = {
> >> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
> >>              cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
> >>      }
> >>
> >> +    if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> >> +            /* Get the leaf page size */
> >> +            tg = __ffs(smmu_domain->domain.pgsize_bitmap);
> >> +
> >> +            /* Convert page size of 12,14,16 (log2) to 1,2,3 */
> >> +            cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
>
> Given the comment, I think "(tg - 10) / 2" would suffice ;)

Well, duh...

>
> >> +
> >> +            /* Determine what level the granule is at */
> >> +            cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
>
> Is it possible to rephrase that with logs and shifts to avoid the division?

Well, a loop is the only other way I came up with:

bpl = tg - 3;
ttl = 3;
mask = BIT(bpl) - 1;
while ((granule & (mask << ((4 - ttl) * bpl + 3))) == 0)
    ttl--;

Doesn't seem like much improvement to me given we have h/w divide...

>
> >> +
> >> +            num_pages = size / (1UL << tg);
>
> Similarly, in my experience GCC has always seemed too cautious to elide
> non-constant division even in a seemingly-obvious case like this, so
> explicit "size >> tg" might be preferable.

My experience has been gcc is smart enough. I checked and there's only
one divide and it is the prior one. But I'll change it anyways.

>
> >> +    }
> >> +
> >>      while (iova < end) {
> >>              if (i == CMDQ_BATCH_ENTRIES) {
> >>                      arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
> >>                      i = 0;
> >>              }
> >>
> >> +            if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> >> +                    /*
> >> +                     * On each iteration of the loop, the range is 5 bits
> >> +                     * worth of the aligned size remaining.
> >> +                     * The range in pages is:
> >> +                     *
> >> +                     * range = (num_pages & (0x1f << __ffs(num_pages)))
> >> +                     */
> >> +                    unsigned long scale, num;
> >> +
> >> +                    /* Determine the power of 2 multiple number of pages */
> >> +                    scale = __ffs(num_pages);
> >> +                    cmd.tlbi.scale = scale;
> >> +
> >> +                    /* Determine how many chunks of 2^scale size we have */
> >> +                    num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
> >> +                    cmd.tlbi.num = num - 1;
> >> +
> >> +                    /* range is num * 2^scale * pgsize */
> >> +                    granule = num << (scale + tg);
> >> +
> >> +                    /* Clear out the lower order bits for the next iteration */
> >> +                    num_pages -= num << scale;
> > Regarding the 2 options given in
> > https://lore.kernel.org/linux-arm-kernel/CAL_JsqKABoE+0crGwyZdNogNgEoG=MOOpf6deQgH6s73c0UNdA@mail.gmail.com/raw,
> >
> > I understand you implemented 2) but I still do not understand why you
> > preferred that one against 1).
> >
> > In your case of 1023*4k pages this will invalidate by 31 32*2^0*4K +
> > 31*2^0*4K pages
> > whereas you could achieve that with 10 invalidations with the 1st algo.
> > I did not get the case where it is more efficient. Please can you detail.
>
> I guess essentially we want to solve for two variables to get a range as
> close to size as possible. There might be a more elegant numerical
> method, but for the numbers involved brute force is probably good enough
> for the real world. 5 minutes alone with the architecture spec and a
> blank editor begat this pseudo-implementation:
>
>         size_t npages = size >> pgshift;
>         while (npages) {
>                 unsigned long range;
>                 unsigned int delta, best = UINT_MAX;
>                 int num, scale = min(31, __ffs(npages));
>
>                 while (scale) {
>                         num = min(32, npages >> scale);
>                         if (num == 32)
>                                 break;
>
>                         delta = npages & ((1 << scale) - 1);
>                         if (!delta || delta > best)
>                                 break;
>
>                         best = delta;
>                         scale--;
>                 }
>
>                 //invalidate
>
>                 range = num << scale;
>                 npages -= range;
>                 iova += (range) << pgshift;
>         }
>
> Modulo any obvious thinkos, what do you reckon?

I don't think this is an improvement. See my other reply.

Rob
Robin Murphy Feb. 17, 2020, 7:17 p.m. UTC | #5
On 13/02/2020 9:49 pm, Rob Herring wrote:
> On Thu, Jan 30, 2020 at 11:34 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>
>> On 30/01/2020 3:06 pm, Auger Eric wrote:
>>> Hi Rob,
>>> On 1/17/20 10:16 PM, Rob Herring wrote:
>>>> Arm SMMUv3.2 adds support for TLB range invalidate operations.
>>>> Support for range invalidate is determined by the RIL bit in the IDR3
>>>> register.
>>>>
>>>> The range invalidate is in units of the leaf page size and operates on
>>>> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
>>>> size what power of 2 multiple we can use. Then we calculate how many
>>>> chunks (1-31) of the power of 2 size for the range on the iteration. On
>>>> each iteration, we move up in size by at least 5 bits.
>>>>
>>>> Cc: Eric Auger <eric.auger@redhat.com>
>>>> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
>>>> Cc: Will Deacon <will@kernel.org>
>>>> Cc: Robin Murphy <robin.murphy@arm.com>
>>>> Cc: Joerg Roedel <joro@8bytes.org>
>>>> Signed-off-by: Rob Herring <robh@kernel.org>
> 
> 
>>>> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>>>    {
>>>>       u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
>>>>       struct arm_smmu_device *smmu = smmu_domain->smmu;
>>>> -    unsigned long start = iova, end = iova + size;
>>>> +    unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
>>>>       int i = 0;
>>>>       struct arm_smmu_cmdq_ent cmd = {
>>>>               .tlbi = {
>>>> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>>>               cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
>>>>       }
>>>>
>>>> +    if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>>>> +            /* Get the leaf page size */
>>>> +            tg = __ffs(smmu_domain->domain.pgsize_bitmap);
>>>> +
>>>> +            /* Convert page size of 12,14,16 (log2) to 1,2,3 */
>>>> +            cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
>>
>> Given the comment, I think "(tg - 10) / 2" would suffice ;)
> 
> Well, duh...
> 
>>
>>>> +
>>>> +            /* Determine what level the granule is at */
>>>> +            cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
>>
>> Is it possible to rephrase that with logs and shifts to avoid the division?
> 
> Well, with a loop is the only other way I came up with:
> 
> bpl = tg - 3;
> ttl = 3;
> mask = BIT(bpl) - 1;
> while ((granule & (mask << ((4 - ttl) * bpl + 3))) == 0)
>      ttl--;
> 
> Doesn't seem like much improvement to me given we have h/w divide...

Sure, it doesn't take too many extra instructions to start matching 
typical IDIV latency, so there's no point being silly if there really 
isn't a clean alternative.

Some quick scribbling suggests "4 - ilog2(granule) / 10" might actually 
be close enough, but perhaps that's a bit too cheeky.

>>
>>>> +
>>>> +            num_pages = size / (1UL << tg);
>>
>> Similarly, in my experience GCC has always seemed too cautious to elide
>> non-constant division even in a seemingly-obvious case like this, so
>> explicit "size >> tg" might be preferable.
> 
> My experience has been gcc is smart enough. I checked and there's only
> one divide and it is the prior one. But I'll change it anyways.

Now that I think about it, the case that frustrated me may have had the 
shift and divide in separate statements - that's probably a lot harder 
to analyse than a single expression. Either way, the simple right shift 
is easier to read IMO.

>>>> +    }
>>>> +
>>>>       while (iova < end) {
>>>>               if (i == CMDQ_BATCH_ENTRIES) {
>>>>                       arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
>>>>                       i = 0;
>>>>               }
>>>>
>>>> +            if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>>>> +                    /*
>>>> +                     * On each iteration of the loop, the range is 5 bits
>>>> +                     * worth of the aligned size remaining.
>>>> +                     * The range in pages is:
>>>> +                     *
>>>> +                     * range = (num_pages & (0x1f << __ffs(num_pages)))
>>>> +                     */
>>>> +                    unsigned long scale, num;
>>>> +
>>>> +                    /* Determine the power of 2 multiple number of pages */
>>>> +                    scale = __ffs(num_pages);
>>>> +                    cmd.tlbi.scale = scale;
>>>> +
>>>> +                    /* Determine how many chunks of 2^scale size we have */
>>>> +                    num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
>>>> +                    cmd.tlbi.num = num - 1;
>>>> +
>>>> +                    /* range is num * 2^scale * pgsize */
>>>> +                    granule = num << (scale + tg);
>>>> +
>>>> +                    /* Clear out the lower order bits for the next iteration */
>>>> +                    num_pages -= num << scale;
>>> Regarding the 2 options given in
>>> https://lore.kernel.org/linux-arm-kernel/CAL_JsqKABoE+0crGwyZdNogNgEoG=MOOpf6deQgH6s73c0UNdA@mail.gmail.com/raw,
>>>
>>> I understand you implemented 2) but I still do not understand why you
>>> preferred that one against 1).
>>>
>>> In your case of 1023*4k pages this will invalidate by 31 32*2^0*4K +
>>> 31*2^0*4K pages
>>> whereas you could achieve that with 10 invalidations with the 1st algo.
>>> I did not get the case where it is more efficient. Please can you detail.
>>
>> I guess essentially we want to solve for two variables to get a range as
>> close to size as possible. There might be a more elegant numerical
>> method, but for the numbers involved brute force is probably good enough
>> for the real world. 5 minutes alone with the architecture spec and a
>> blank editor begat this pseudo-implementation:
>>
>>          size_t npages = size >> pgshift;
>>          while (npages) {
>>                  unsigned long range;
>>                  unsigned int delta, best = UINT_MAX;
>>                  int num, scale = min(31, __ffs(npages));
>>
>>                  while (scale) {
>>                          num = min(32, npages >> scale);
>>                          if (num == 32)
>>                                  break;
>>
>>                          delta = npages & ((1 << scale) - 1);
>>                          if (!delta || delta > best)
>>                                  break;
>>
>>                          best = delta;
>>                          scale--;
>>                  }
>>
>>                  //invalidate
>>
>>                  range = num << scale;
>>                  npages -= range;
>>                  iova += (range) << pgshift;
>>          }
>>
>> Modulo any obvious thinkos, what do you reckon?
> 
> I don't think this is an improvement. See my other reply.

Indeed, I hadn't quite got my head round your algorithm at first, so 
deriving this was as much to help me get a better feel for the problem 
as anything. Now I see that "minimise the remainder" really boils down 
to "remove up to 5 high-order bits each time", which in turn is 
essentially the same thing just done in the other direction (and in a 
slightly more cumbersome manner). Now that I do get it, your algorithm 
is in fact really neat, sorry for doubting :)

FWIW it might be a little more efficient to maintain scale outside the 
loop, such that num_pages can simply be shifted right to lose the 
low-order bits each iteration, but other than that I think it's pretty 
much as good as it can get.

Thanks,
Robin.
Rob Herring Feb. 20, 2020, 5:54 p.m. UTC | #6
On Mon, Feb 17, 2020 at 1:17 PM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 13/02/2020 9:49 pm, Rob Herring wrote:
> > On Thu, Jan 30, 2020 at 11:34 AM Robin Murphy <robin.murphy@arm.com> wrote:
> >>
> >> On 30/01/2020 3:06 pm, Auger Eric wrote:
> >>> Hi Rob,
> >>> On 1/17/20 10:16 PM, Rob Herring wrote:
> >>>> Arm SMMUv3.2 adds support for TLB range invalidate operations.
> >>>> Support for range invalidate is determined by the RIL bit in the IDR3
> >>>> register.
> >>>>
> >>>> The range invalidate is in units of the leaf page size and operates on
> >>>> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
> >>>> size what power of 2 multiple we can use. Then we calculate how many
> >>>> chunks (1-31) of the power of 2 size for the range on the iteration. On
> >>>> each iteration, we move up in size by at least 5 bits.
> >>>>
> >>>> Cc: Eric Auger <eric.auger@redhat.com>
> >>>> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> >>>> Cc: Will Deacon <will@kernel.org>
> >>>> Cc: Robin Murphy <robin.murphy@arm.com>
> >>>> Cc: Joerg Roedel <joro@8bytes.org>
> >>>> Signed-off-by: Rob Herring <robh@kernel.org>
> >
> >
> >>>> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
> >>>>    {
> >>>>       u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
> >>>>       struct arm_smmu_device *smmu = smmu_domain->smmu;
> >>>> -    unsigned long start = iova, end = iova + size;
> >>>> +    unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
> >>>>       int i = 0;
> >>>>       struct arm_smmu_cmdq_ent cmd = {
> >>>>               .tlbi = {
> >>>> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
> >>>>               cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
> >>>>       }
> >>>>
> >>>> +    if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> >>>> +            /* Get the leaf page size */
> >>>> +            tg = __ffs(smmu_domain->domain.pgsize_bitmap);
> >>>> +
> >>>> +            /* Convert page size of 12,14,16 (log2) to 1,2,3 */
> >>>> +            cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
> >>
> >> Given the comment, I think "(tg - 10) / 2" would suffice ;)
> >
> > Well, duh...
> >
> >>
> >>>> +
> >>>> +            /* Determine what level the granule is at */
> >>>> +            cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
> >>
> >> Is it possible to rephrase that with logs and shifts to avoid the division?
> >
> > Well, with a loop is the only other way I came up with:
> >
> > bpl = tg - 3;
> > ttl = 3;
> > mask = BIT(bpl) - 1;
> > while ((granule & (mask << ((4 - ttl) * bpl + 3))) == 0)
> >      ttl--;
> >
> > Doesn't seem like much improvement to me given we have h/w divide...
>
> Sure, it doesn't take too many extra instructions to start matching
> typical IDIV latency, so there's no point being silly if there really
> isn't a clean alternative.
>
> Some quick scribbling suggests "4 - ilog2(granule) / 10" might actually
> be close enough, but perhaps that's a bit too cheeky.

How does divide by 10 save a divide?

> >>>> +            num_pages = size / (1UL << tg);
> >>
> >> Similarly, in my experience GCC has always seemed too cautious to elide
> >> non-constant division even in a seemingly-obvious case like this, so
> >> explicit "size >> tg" might be preferable.
> >
> > My experience has been gcc is smart enough. I checked and there's only
> > one divide and it is the prior one. But I'll change it anyways.
>
> Now that I think about it, the case that frustrated me may have had the
> shift and divide in separate statements - that's probably a lot harder
> to analyse than a single expression. Either way, the simple right shift
> is easier to read IMO.
>
> >>>> +    }
> >>>> +
> >>>>       while (iova < end) {
> >>>>               if (i == CMDQ_BATCH_ENTRIES) {
> >>>>                       arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
> >>>>                       i = 0;
> >>>>               }
> >>>>
> >>>> +            if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> >>>> +                    /*
> >>>> +                     * On each iteration of the loop, the range is 5 bits
> >>>> +                     * worth of the aligned size remaining.
> >>>> +                     * The range in pages is:
> >>>> +                     *
> >>>> +                     * range = (num_pages & (0x1f << __ffs(num_pages)))
> >>>> +                     */
> >>>> +                    unsigned long scale, num;
> >>>> +
> >>>> +                    /* Determine the power of 2 multiple number of pages */
> >>>> +                    scale = __ffs(num_pages);
> >>>> +                    cmd.tlbi.scale = scale;
> >>>> +
> >>>> +                    /* Determine how many chunks of 2^scale size we have */
> >>>> +                    num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
> >>>> +                    cmd.tlbi.num = num - 1;
> >>>> +
> >>>> +                    /* range is num * 2^scale * pgsize */
> >>>> +                    granule = num << (scale + tg);
> >>>> +
> >>>> +                    /* Clear out the lower order bits for the next iteration */
> >>>> +                    num_pages -= num << scale;
> >>> Regarding the 2 options given in
> >>> https://lore.kernel.org/linux-arm-kernel/CAL_JsqKABoE+0crGwyZdNogNgEoG=MOOpf6deQgH6s73c0UNdA@mail.gmail.com/raw,
> >>>
> >>> I understand you implemented 2) but I still do not understand why you
> >>> preferred that one against 1).
> >>>
> >>> In your case of 1023*4k pages this will invalidate by 31 32*2^0*4K +
> >>> 31*2^0*4K pages
> >>> whereas you could achieve that with 10 invalidations with the 1st algo.
> >>> I did not get the case where it is more efficient. Please can you detail.
> >>
> >> I guess essentially we want to solve for two variables to get a range as
> >> close to size as possible. There might be a more elegant numerical
> >> method, but for the numbers involved brute force is probably good enough
> >> for the real world. 5 minutes alone with the architecture spec and a
> >> blank editor begat this pseudo-implementation:
> >>
> >>          size_t npages = size >> pgshift;
> >>          while (npages) {
> >>                  unsigned long range;
> >>                  unsigned int delta, best = UINT_MAX;
> >>                  int num, scale = min(31, __ffs(npages));
> >>
> >>                  while (scale) {
> >>                          num = min(32, npages >> scale);
> >>                          if (num == 32)
> >>                                  break;
> >>
> >>                          delta = npages & ((1 << scale) - 1);
> >>                          if (!delta || delta > best)
> >>                                  break;
> >>
> >>                          best = delta;
> >>                          scale--;
> >>                  }
> >>
> >>                  //invalidate
> >>
> >>                  range = num << scale;
> >>                  npages -= range;
> >>                  iova += (range) << pgshift;
> >>          }
> >>
> >> Modulo any obvious thinkos, what do you reckon?
> >
> > I don't think this is an improvement. See my other reply.
>
> Indeed, I hadn't quite got my head round your algorithm at first, so
> deriving this was as much to help me get a better feel for the problem
> as anything. Now I see that "minimise the remainder" really boils down
> to "remove up to 5 high-order bits each time", which in turn is
> essentially the same thing just done in the other direction (and in a
> slightly more cumbersome manner). Now that I do get it, your algorithm
> is in fact really neat, sorry for doubting :)
>
> FWIW it might be a little more efficient to maintain scale outside the
> loop, such that num_pages can simply be shifted right to lose the
> low-order bits each iteration, but other than that I think it's pretty
> much as good as it can get.

I think I understand what you're getting at. We'd need to have the
total running 'scale' and the incremental 'scale' for each loop
iteration. We could do that by using cmd.tlbi.scale to keep the
running total. So something like this (untested)?:

if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
    unsigned long scale, num;

    /* Determine the power of 2 multiple number of pages */
    scale = __ffs(num_pages);
    cmd.tlbi.scale += scale;

    /* Determine how many chunks of 2^scale size we have */
    num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
    cmd.tlbi.num = num - 1;

    /* range is num * 2^scale * pgsize */
    granule = num << (cmd.tlbi.scale + tg);

    /* Clear out the lower order bits for the next iteration */
    num_pages >>= 5 + scale;
}

That's actually worse by 1 add and probably a memory (re)load of
cmd.tlbi.scale AFAICT.

Rob
Robin Murphy Feb. 21, 2020, 4:18 p.m. UTC | #7
On 20/02/2020 5:54 pm, Rob Herring wrote:
> On Mon, Feb 17, 2020 at 1:17 PM Robin Murphy <robin.murphy@arm.com> wrote:
>>
>> On 13/02/2020 9:49 pm, Rob Herring wrote:
>>> On Thu, Jan 30, 2020 at 11:34 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>>>
>>>> On 30/01/2020 3:06 pm, Auger Eric wrote:
>>>>> Hi Rob,
>>>>> On 1/17/20 10:16 PM, Rob Herring wrote:
>>>>>> Arm SMMUv3.2 adds support for TLB range invalidate operations.
>>>>>> Support for range invalidate is determined by the RIL bit in the IDR3
>>>>>> register.
>>>>>>
>>>>>> The range invalidate is in units of the leaf page size and operates on
>>>>>> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
>>>>>> size what power of 2 multiple we can use. Then we calculate how many
>>>>>> chunks (1-31) of the power of 2 size for the range on the iteration. On
>>>>>> each iteration, we move up in size by at least 5 bits.
>>>>>>
>>>>>> Cc: Eric Auger <eric.auger@redhat.com>
>>>>>> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
>>>>>> Cc: Will Deacon <will@kernel.org>
>>>>>> Cc: Robin Murphy <robin.murphy@arm.com>
>>>>>> Cc: Joerg Roedel <joro@8bytes.org>
>>>>>> Signed-off-by: Rob Herring <robh@kernel.org>
>>>
>>>
>>>>>> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>>>>>     {
>>>>>>        u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
>>>>>>        struct arm_smmu_device *smmu = smmu_domain->smmu;
>>>>>> -    unsigned long start = iova, end = iova + size;
>>>>>> +    unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
>>>>>>        int i = 0;
>>>>>>        struct arm_smmu_cmdq_ent cmd = {
>>>>>>                .tlbi = {
>>>>>> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>>>>>                cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
>>>>>>        }
>>>>>>
>>>>>> +    if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>>>>>> +            /* Get the leaf page size */
>>>>>> +            tg = __ffs(smmu_domain->domain.pgsize_bitmap);
>>>>>> +
>>>>>> +            /* Convert page size of 12,14,16 (log2) to 1,2,3 */
>>>>>> +            cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
>>>>
>>>> Given the comment, I think "(tg - 10) / 2" would suffice ;)
>>>
>>> Well, duh...
>>>
>>>>
>>>>>> +
>>>>>> +            /* Determine what level the granule is at */
>>>>>> +            cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
>>>>
>>>> Is it possible to rephrase that with logs and shifts to avoid the division?
>>>
>>> Well, with a loop is the only other way I came up with:
>>>
>>> bpl = tg - 3;
>>> ttl = 3;
>>> mask = BIT(bpl) - 1;
>>> while ((granule & (mask << ((4 - ttl) * bpl + 3))) == 0)
>>>       ttl--;
>>>
>>> Doesn't seem like much improvement to me given we have h/w divide...
>>
>> Sure, it doesn't take too many extra instructions to start matching
>> typical IDIV latency, so there's no point being silly if there really
>> isn't a clean alternative.
>>
>> Some quick scribbling suggests "4 - ilog2(granule) / 10" might actually
>> be close enough, but perhaps that's a bit too cheeky.
> 
> How does divide by 10 save a divide?

Well, by that point I was more just thinking about the smallest 
expression, since *some* division seems unavoidable. Although GCC does 
apparently still think that transforming constant division is a win ;)

$ cat test.c
int y(int x) {
	return x / 10;
}

int z(int x) {
	return 10 / x;
}
$ gcc -O2 -c test.c && objdump -d test.o

test.o:     file format elf64-littleaarch64


Disassembly of section .text:

0000000000000000 <y>:
    0:	528ccce1 	mov	w1, #0x6667                	// #26215
    4:	72acccc1 	movk	w1, #0x6666, lsl #16
    8:	9b217c01 	smull	x1, w0, w1
    c:	9362fc21 	asr	x1, x1, #34
   10:	4b807c20 	sub	w0, w1, w0, asr #31
   14:	d65f03c0 	ret

0000000000000018 <z>:
   18:	52800141 	mov	w1, #0xa                   	// #10
   1c:	1ac00c20 	sdiv	w0, w1, w0
   20:	d65f03c0 	ret

> 
>>>>>> +            num_pages = size / (1UL << tg);
>>>>
>>>> Similarly, in my experience GCC has always seemed too cautious to elide
>>>> non-constant division even in a seemingly-obvious case like this, so
>>>> explicit "size >> tg" might be preferable.
>>>
>>> My experience has been gcc is smart enough. I checked and there's only
>>> one divide and it is the prior one. But I'll change it anyways.
>>
>> Now that I think about it, the case that frustrated me may have had the
>> shift and divide in separate statements - that's probably a lot harder
>> to analyse than a single expression. Either way, the simple right shift
>> is easier to read IMO.
>>
>>>>>> +    }
>>>>>> +
>>>>>>        while (iova < end) {
>>>>>>                if (i == CMDQ_BATCH_ENTRIES) {
>>>>>>                        arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
>>>>>>                        i = 0;
>>>>>>                }
>>>>>>
>>>>>> +            if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>>>>>> +                    /*
>>>>>> +                     * On each iteration of the loop, the range is 5 bits
>>>>>> +                     * worth of the aligned size remaining.
>>>>>> +                     * The range in pages is:
>>>>>> +                     *
>>>>>> +                     * range = (num_pages & (0x1f << __ffs(num_pages)))
>>>>>> +                     */
>>>>>> +                    unsigned long scale, num;
>>>>>> +
>>>>>> +                    /* Determine the power of 2 multiple number of pages */
>>>>>> +                    scale = __ffs(num_pages);
>>>>>> +                    cmd.tlbi.scale = scale;
>>>>>> +
>>>>>> +                    /* Determine how many chunks of 2^scale size we have */
>>>>>> +                    num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
>>>>>> +                    cmd.tlbi.num = num - 1;
>>>>>> +
>>>>>> +                    /* range is num * 2^scale * pgsize */
>>>>>> +                    granule = num << (scale + tg);
>>>>>> +
>>>>>> +                    /* Clear out the lower order bits for the next iteration */
>>>>>> +                    num_pages -= num << scale;
>>>>> Regarding the 2 options given in
>>>>> https://lore.kernel.org/linux-arm-kernel/CAL_JsqKABoE+0crGwyZdNogNgEoG=MOOpf6deQgH6s73c0UNdA@mail.gmail.com/raw,
>>>>>
>>>>> I understand you implemented 2) but I still do not understand why you
>>>>> preferred that one against 1).
>>>>>
>>>>> In your case of 1023*4k pages this will invalidate by 31 32*2^0*4K +
>>>>> 31*2^0*4K pages
>>>>> whereas you could achieve that with 10 invalidations with the 1st algo.
>>>>> I did not get the case where it is more efficient. Please can you detail.
>>>>
>>>> I guess essentially we want to solve for two variables to get a range as
>>>> close to size as possible. There might be a more elegant numerical
>>>> method, but for the numbers involved brute force is probably good enough
>>>> for the real world. 5 minutes alone with the architecture spec and a
>>>> blank editor begat this pseudo-implementation:
>>>>
>>>>           size_t npages = size >> pgshift;
>>>>           while (npages) {
>>>>                   unsigned long range;
>>>>                   unsigned int delta, best = UINT_MAX;
>>>>                   int num, scale = min(31, __ffs(npages));
>>>>
>>>>                   while (scale) {
>>>>                           num = min(32, npages >> scale);
>>>>                           if (num == 32)
>>>>                                   break;
>>>>
>>>>                           delta = npages & ((1 << scale) - 1);
>>>>                           if (!delta || delta > best)
>>>>                                   break;
>>>>
>>>>                           best = delta;
>>>>                           scale--;
>>>>                   }
>>>>
>>>>                   //invalidate
>>>>
>>>>                   range = num << scale;
>>>>                   npages -= range;
>>>>                   iova += (range) << pgshift;
>>>>           }
>>>>
>>>> Modulo any obvious thinkos, what do you reckon?
>>>
>>> I don't think this is an improvement. See my other reply.
>>
>> Indeed, I hadn't quite got my head round your algorithm at first, so
>> deriving this was as much to help me get a better feel for the problem
>> as anything. Now I see that "minimise the remainder" really boils down
>> to "remove up to 5 high-order bits each time", which in turn is
>> essentially the same thing just done in the other direction (and in a
>> slightly more cumbersome manner). Now that I do get it, your algorithm
>> is in fact really neat, sorry for doubting :)
>>
>> FWIW it might be a little more efficient to maintain scale outside the
>> loop, such that num_pages can simply be shifted right to lose the
>> low-order bits each iteration, but other than that I think it's pretty
>> much as good as it can get.
> 
> I think I understand what you're getting at. We'd need to have the
> total running 'scale' and the incremental 'scale' for each loop
> iteration. We could do that by using cmd.tlbi.scale to keep the
> running total. So something like this (untested)?:
> 
> if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>      unsigned long scale, num;
> 
>      /* Determine the power of 2 multiple number of pages */
>      scale = __ffs(num_pages);
>      cmd.tlbi.scale += scale;
> 
>      /* Determine how many chunks of 2^scale size we have */
>      num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
>      cmd.tlbi.num = num - 1;
> 
>      /* range is num * 2^scale * pgsize */
>      granule = num << (cmd.tlbi.scale + tg);
> 
>      /* Clear out the lower order bits for the next iteration */
>      num_pages >>= 5 + scale;
> }
> 
> That's actually worse by 1 add and probably a memory (re)load of
> cmd.tlbi.scale AFAICT.

That's not quite what I was picturing, but when I started trying to 
write that out it quickly got messier than my mental picture anyway, so 
let's step back and not worry too much about the overall structure - the 
current shape is pretty nice and clear, and we can always come back 
later to consider whether the whole loop deserves surgery to help 
streamline both cases.

Thanks,
Robin.
Rob Herring Feb. 21, 2020, 4:47 p.m. UTC | #8
On Fri, Feb 21, 2020 at 10:19 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 20/02/2020 5:54 pm, Rob Herring wrote:
> > On Mon, Feb 17, 2020 at 1:17 PM Robin Murphy <robin.murphy@arm.com> wrote:
> >>
> >> On 13/02/2020 9:49 pm, Rob Herring wrote:
> >>> On Thu, Jan 30, 2020 at 11:34 AM Robin Murphy <robin.murphy@arm.com> wrote:
> >>>>
> >>>> On 30/01/2020 3:06 pm, Auger Eric wrote:
> >>>>> Hi Rob,
> >>>>> On 1/17/20 10:16 PM, Rob Herring wrote:
> >>>>>> Arm SMMUv3.2 adds support for TLB range invalidate operations.
> >>>>>> Support for range invalidate is determined by the RIL bit in the IDR3
> >>>>>> register.
> >>>>>>
> >>>>>> The range invalidate is in units of the leaf page size and operates on
> >>>>>> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
> >>>>>> size what power of 2 multiple we can use. Then we calculate how many
> >>>>>> chunks (1-31) of the power of 2 size for the range on the iteration. On
> >>>>>> each iteration, we move up in size by at least 5 bits.
> >>>>>>
> >>>>>> Cc: Eric Auger <eric.auger@redhat.com>
> >>>>>> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> >>>>>> Cc: Will Deacon <will@kernel.org>
> >>>>>> Cc: Robin Murphy <robin.murphy@arm.com>
> >>>>>> Cc: Joerg Roedel <joro@8bytes.org>
> >>>>>> Signed-off-by: Rob Herring <robh@kernel.org>
> >>>
> >>>
> >>>>>> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
> >>>>>>     {
> >>>>>>        u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
> >>>>>>        struct arm_smmu_device *smmu = smmu_domain->smmu;
> >>>>>> -    unsigned long start = iova, end = iova + size;
> >>>>>> +    unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
> >>>>>>        int i = 0;
> >>>>>>        struct arm_smmu_cmdq_ent cmd = {
> >>>>>>                .tlbi = {
> >>>>>> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
> >>>>>>                cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
> >>>>>>        }
> >>>>>>
> >>>>>> +    if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> >>>>>> +            /* Get the leaf page size */
> >>>>>> +            tg = __ffs(smmu_domain->domain.pgsize_bitmap);
> >>>>>> +
> >>>>>> +            /* Convert page size of 12,14,16 (log2) to 1,2,3 */
> >>>>>> +            cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
> >>>>
> >>>> Given the comment, I think "(tg - 10) / 2" would suffice ;)
> >>>
> >>> Well, duh...
> >>>
> >>>>
> >>>>>> +
> >>>>>> +            /* Determine what level the granule is at */
> >>>>>> +            cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
> >>>>
> >>>> Is it possible to rephrase that with logs and shifts to avoid the division?
> >>>
> >>> Well, with a loop is the only other way I came up with:
> >>>
> >>> bpl = tg - 3;
> >>> ttl = 3;
> >>> mask = BIT(bpl) - 1;
> >>> while ((granule & (mask << ((4 - ttl) * bpl + 3))) == 0)
> >>>       ttl--;
> >>>
> >>> Doesn't seem like much improvement to me given we have h/w divide...
> >>
> >> Sure, it doesn't take too many extra instructions to start matching
> >> typical IDIV latency, so there's no point being silly if there really
> >> isn't a clean alternative.
> >>
> >> Some quick scribbling suggests "4 - ilog2(granule) / 10" might actually
> >> be close enough, but perhaps that's a bit too cheeky.
> >
> > How does divide by 10 save a divide?
>
> Well, by that point I was more just thinking about the smallest
> expression, since *some* division seems unavoidable. Although GCC does
> apparently still think that transforming constant division is a win ;)

Okay. Still, divide by 10 happens to work, but it is very much not
obvious. It also doesn't work for level 1 with 16 or 64KB pages (though
we'll never see that granule). Subtracting 3 is not that obvious
either, but is at least in the spec for calculating the bits per
level.
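
To spell out what the formulas in the patch give for the supported
configurations (just the arithmetic, integer division throughout):

  cmd.tlbi.tg  (page size 12,14,16 log2 -> 1,2,3):
        4K -> 1,  16K -> 2,  64K -> 3

  cmd.tlbi.ttl = 4 - (ilog2(granule) - 3) / (tg - 3):
        4K  leaf, 4K   granule:  4 - (12 - 3) / 9  = 3  (level 3 page)
        4K  leaf, 2M   granule:  4 - (21 - 3) / 9  = 2  (level 2 block)
        4K  leaf, 1G   granule:  4 - (30 - 3) / 9  = 1  (level 1 block)
        64K leaf, 64K  granule:  4 - (16 - 3) / 13 = 3  (level 3 page)
        64K leaf, 512M granule:  4 - (29 - 3) / 13 = 2  (level 2 block)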

I think we've spent enough time micro-optimizing this and have better
things to worry about.

Rob
Robin Murphy Feb. 21, 2020, 5:19 p.m. UTC | #9
On 21/02/2020 4:47 pm, Rob Herring wrote:
> On Fri, Feb 21, 2020 at 10:19 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>
>> On 20/02/2020 5:54 pm, Rob Herring wrote:
>>> On Mon, Feb 17, 2020 at 1:17 PM Robin Murphy <robin.murphy@arm.com> wrote:
>>>>
>>>> On 13/02/2020 9:49 pm, Rob Herring wrote:
>>>>> On Thu, Jan 30, 2020 at 11:34 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>>>>>
>>>>>> On 30/01/2020 3:06 pm, Auger Eric wrote:
>>>>>>> Hi Rob,
>>>>>>> On 1/17/20 10:16 PM, Rob Herring wrote:
>>>>>>>> Arm SMMUv3.2 adds support for TLB range invalidate operations.
>>>>>>>> Support for range invalidate is determined by the RIL bit in the IDR3
>>>>>>>> register.
>>>>>>>>
>>>>>>>> The range invalidate is in units of the leaf page size and operates on
>>>>>>>> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
>>>>>>>> size what power of 2 multiple we can use. Then we calculate how many
>>>>>>>> chunks (1-31) of the power of 2 size for the range on the iteration. On
>>>>>>>> each iteration, we move up in size by at least 5 bits.
>>>>>>>>
>>>>>>>> Cc: Eric Auger <eric.auger@redhat.com>
>>>>>>>> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
>>>>>>>> Cc: Will Deacon <will@kernel.org>
>>>>>>>> Cc: Robin Murphy <robin.murphy@arm.com>
>>>>>>>> Cc: Joerg Roedel <joro@8bytes.org>
>>>>>>>> Signed-off-by: Rob Herring <robh@kernel.org>
>>>>>
>>>>>
>>>>>>>> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>>>>>>>      {
>>>>>>>>         u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
>>>>>>>>         struct arm_smmu_device *smmu = smmu_domain->smmu;
>>>>>>>> -    unsigned long start = iova, end = iova + size;
>>>>>>>> +    unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
>>>>>>>>         int i = 0;
>>>>>>>>         struct arm_smmu_cmdq_ent cmd = {
>>>>>>>>                 .tlbi = {
>>>>>>>> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>>>>>>>                 cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
>>>>>>>>         }
>>>>>>>>
>>>>>>>> +    if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>>>>>>>> +            /* Get the leaf page size */
>>>>>>>> +            tg = __ffs(smmu_domain->domain.pgsize_bitmap);
>>>>>>>> +
>>>>>>>> +            /* Convert page size of 12,14,16 (log2) to 1,2,3 */
>>>>>>>> +            cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
>>>>>>
>>>>>> Given the comment, I think "(tg - 10) / 2" would suffice ;)
>>>>>
>>>>> Well, duh...
>>>>>
>>>>>>
>>>>>>>> +
>>>>>>>> +            /* Determine what level the granule is at */
>>>>>>>> +            cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
>>>>>>
>>>>>> Is it possible to rephrase that with logs and shifts to avoid the division?
>>>>>
>>>>> Well, with a loop is the only other way I came up with:
>>>>>
>>>>> bpl = tg - 3;
>>>>> ttl = 3;
>>>>> mask = BIT(bpl) - 1;
>>>>> while ((granule & (mask << ((4 - ttl) * bpl + 3))) == 0)
>>>>>        ttl--;
>>>>>
>>>>> Doesn't seem like much improvement to me given we have h/w divide...
>>>>
>>>> Sure, it doesn't take too many extra instructions to start matching
>>>> typical IDIV latency, so there's no point being silly if there really
>>>> isn't a clean alternative.
>>>>
>>>> Some quick scribbling suggests "4 - ilog2(granule) / 10" might actually
>>>> be close enough, but perhaps that's a bit too cheeky.
>>>
>>> How does divide by 10 save a divide?
>>
>> Well, by that point I was more just thinking about the smallest
>> expression, since *some* division seems unavoidable. Although GCC does
>> apparently still think that transforming constant division is a win ;)
> 
> Okay. Still, divide by 10 happens to work, but it is very much not
> obvious. It also doesn't work for level 1 with 16 or 64KB pages (though

(FWIW the more subtle part of the trick is that 16K doesn't allow level 
1 blocks, and even in the vanishingly unlikely 64K level 1 case TTL=0 
isn't "wrong")
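
To spell out the cheeky version, with integer division
"4 - ilog2(granule) / 10" comes out as:

  4K   granule (level 3 page, 4K leaf):   4 - 12/10 = 3
  2M   granule (level 2 block, 4K leaf):  4 - 21/10 = 2
  1G   granule (level 1 block, 4K leaf):  4 - 30/10 = 1
  512M granule (level 2 block, 64K leaf): 4 - 29/10 = 2
  4T   granule (level 1 block, 64K leaf): 4 - 42/10 = 0, i.e. no level
                                          hint, which is valid if not useful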

> we'll never see that granule). Subtracting 3 is not that obvious
> either, but is at least in the spec for calculating the bits per
> level.
> 
> I think we've spent enough time micro-optimizing this and have better
> things to worry about.

Agreed! Invalidation performance has just been the bogeyman for so long 
that it's an area to which I've become overly sensitive.

Cheers,
Robin.
Eric Auger Feb. 24, 2020, 11:41 a.m. UTC | #10
Hi Rob,

On 2/13/20 8:54 PM, Rob Herring wrote:
> On Thu, Jan 30, 2020 at 9:06 AM Auger Eric <eric.auger@redhat.com> wrote:
>>
>> Hi Rob,
>> On 1/17/20 10:16 PM, Rob Herring wrote:
>>> Arm SMMUv3.2 adds support for TLB range invalidate operations.
>>> Support for range invalidate is determined by the RIL bit in the IDR3
>>> register.
>>>
>>> The range invalidate is in units of the leaf page size and operates on
>>> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
>>> size what power of 2 multiple we can use. Then we calculate how many
>>> chunks (1-31) of the power of 2 size for the range on the iteration. On
>>> each iteration, we move up in size by at least 5 bits.
>>>
>>> Cc: Eric Auger <eric.auger@redhat.com>
>>> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
>>> Cc: Will Deacon <will@kernel.org>
>>> Cc: Robin Murphy <robin.murphy@arm.com>
>>> Cc: Joerg Roedel <joro@8bytes.org>
>>> Signed-off-by: Rob Herring <robh@kernel.org>
>>> ---
>>>  drivers/iommu/arm-smmu-v3.c | 66 ++++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 65 insertions(+), 1 deletion(-)
> 
> 
>>> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>>  {
>>>       u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
>>>       struct arm_smmu_device *smmu = smmu_domain->smmu;
>>> -     unsigned long start = iova, end = iova + size;
>>> +     unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
>>>       int i = 0;
>>>       struct arm_smmu_cmdq_ent cmd = {
>>>               .tlbi = {
>>> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>>>               cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
>>>       }
>>>
>>> +     if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>>> +             /* Get the leaf page size */
>>> +             tg = __ffs(smmu_domain->domain.pgsize_bitmap);
>>> +
>>> +             /* Convert page size of 12,14,16 (log2) to 1,2,3 */
>>> +             cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
>>> +
>>> +             /* Determine what level the granule is at */
>>> +             cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
>>> +
>>> +             num_pages = size / (1UL << tg);
>>> +     }
>>> +
>>>       while (iova < end) {
>>>               if (i == CMDQ_BATCH_ENTRIES) {
>>>                       arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
>>>                       i = 0;
>>>               }
>>>
>>> +             if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>>> +                     /*
>>> +                      * On each iteration of the loop, the range is 5 bits
>>> +                      * worth of the aligned size remaining.
>>> +                      * The range in pages is:
>>> +                      *
>>> +                      * range = (num_pages & (0x1f << __ffs(num_pages)))
>>> +                      */
>>> +                     unsigned long scale, num;
>>> +
>>> +                     /* Determine the power of 2 multiple number of pages */
>>> +                     scale = __ffs(num_pages);
>>> +                     cmd.tlbi.scale = scale;
>>> +
>>> +                     /* Determine how many chunks of 2^scale size we have */
>>> +                     num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
>>> +                     cmd.tlbi.num = num - 1;
>>> +
>>> +                     /* range is num * 2^scale * pgsize */
>>> +                     granule = num << (scale + tg);
>>> +
>>> +                     /* Clear out the lower order bits for the next iteration */
>>> +                     num_pages -= num << scale;
>> Regarding the 2 options given in
>> https://lore.kernel.org/linux-arm-kernel/CAL_JsqKABoE+0crGwyZdNogNgEoG=MOOpf6deQgH6s73c0UNdA@mail.gmail.com/raw,
>>
>> I understand you implemented 2) but I still do not understand why you
>> preferred that one against 1).
>>
>> In your case of 1023*4K pages this would mean 31 invalidations of
>> 32*2^0*4K pages plus one of 31*2^0*4K pages, whereas you could achieve
>> that with 10 invalidations with the 1st algo. I did not get the case
>> where it is more efficient. Could you please give more detail?
> 
> No, it's only 2 commands. We do 31*4K and then 31*2^5*4K. Here's a the
> output of a test case:
> 
> iova=10001000, num_pages=0x3e0, granule=1f000, num=31, scale=0, ttl=3
> iova=10020000, num_pages=0x0, granule=3e0000, num=31, scale=5, ttl=3
> 
> (num_pages being what's left at end of the loop)

> 
> As I mentioned on v1, worst case is 4 commands for up to 4GB. It's
> 20-bits of size (32-12) and each loop processes a minimum of 5 bits.
> Each loop becomes a larger aligned size, so scale goes up each pass.
> This is what I tried to explain in the top comment.

Sorry for the delay, I was out of the office. Indeed, I misunderstood the
code; the algorithm looks good to me now.
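
To double-check my understanding, I traced the arithmetic for the
1023-page case by hand, and it does collapse to the two commands in your
trace above:

  num_pages = 0x3ff (1023 pages, 4K leaf)
  pass 1: scale = __ffs(0x3ff) = 0
          num   = (0x3ff >> 0) & 0x1f = 31  ->  31 * 2^0 * 4K = 0x1f000
          num_pages -= 31 << 0              ->  0x3e0
  pass 2: scale = __ffs(0x3e0) = 5
          num   = (0x3e0 >> 5) & 0x1f = 31  ->  31 * 2^5 * 4K = 0x3e0000
          num_pages -= 31 << 5              ->  0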

Just a minor comment: I would use a dedicated "inv_range" local variable
inside the loop instead of reusing "granule"; see the sketch below.
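
Something like this (untested, just to illustrate the renaming; everything
else is as in your patch):

	while (iova < end) {
		size_t inv_range = granule;

		if (i == CMDQ_BATCH_ENTRIES) {
			arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
			i = 0;
		}

		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
			unsigned long scale, num;

			/* Determine the power of 2 multiple number of pages */
			scale = __ffs(num_pages);
			cmd.tlbi.scale = scale;

			/* Determine how many chunks of 2^scale size we have */
			num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
			cmd.tlbi.num = num - 1;

			/* range is num * 2^scale * pgsize */
			inv_range = num << (scale + tg);

			/* Clear out the lower order bits for the next iteration */
			num_pages -= num << scale;
		}

		cmd.tlbi.addr = iova;
		arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd);
		iova += inv_range;
		/* ... rest of the loop unchanged ... */
	}

That way "granule" keeps its caller-provided meaning throughout the function.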

Besides
Reviewed-by: Eric Auger <eric.auger@redhat.com>

Thanks

Eric




> 
> Rob
>
diff mbox series

Patch

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index e91b4a098215..0ee561db7149 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -70,6 +70,9 @@ 
 #define IDR1_SSIDSIZE			GENMASK(10, 6)
 #define IDR1_SIDSIZE			GENMASK(5, 0)
 
+#define ARM_SMMU_IDR3			0xc
+#define IDR3_RIL			(1 << 10)
+
 #define ARM_SMMU_IDR5			0x14
 #define IDR5_STALL_MAX			GENMASK(31, 16)
 #define IDR5_GRAN64K			(1 << 6)
@@ -327,9 +330,14 @@ 
 #define CMDQ_CFGI_1_LEAF		(1UL << 0)
 #define CMDQ_CFGI_1_RANGE		GENMASK_ULL(4, 0)
 
+#define CMDQ_TLBI_0_NUM			GENMASK_ULL(16, 12)
+#define CMDQ_TLBI_RANGE_NUM_MAX		31
+#define CMDQ_TLBI_0_SCALE		GENMASK_ULL(24, 20)
 #define CMDQ_TLBI_0_VMID		GENMASK_ULL(47, 32)
 #define CMDQ_TLBI_0_ASID		GENMASK_ULL(63, 48)
 #define CMDQ_TLBI_1_LEAF		(1UL << 0)
+#define CMDQ_TLBI_1_TTL			GENMASK_ULL(9, 8)
+#define CMDQ_TLBI_1_TG			GENMASK_ULL(11, 10)
 #define CMDQ_TLBI_1_VA_MASK		GENMASK_ULL(63, 12)
 #define CMDQ_TLBI_1_IPA_MASK		GENMASK_ULL(51, 12)
 
@@ -455,9 +463,13 @@  struct arm_smmu_cmdq_ent {
 		#define CMDQ_OP_TLBI_S2_IPA	0x2a
 		#define CMDQ_OP_TLBI_NSNH_ALL	0x30
 		struct {
+			u8			num;
+			u8			scale;
 			u16			asid;
 			u16			vmid;
 			bool			leaf;
+			u8			ttl;
+			u8			tg;
 			u64			addr;
 		} tlbi;
 
@@ -595,6 +607,7 @@  struct arm_smmu_device {
 #define ARM_SMMU_FEAT_HYP		(1 << 12)
 #define ARM_SMMU_FEAT_STALL_FORCE	(1 << 13)
 #define ARM_SMMU_FEAT_VAX		(1 << 14)
+#define ARM_SMMU_FEAT_RANGE_INV		(1 << 15)
 	u32				features;
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
@@ -856,13 +869,21 @@  static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 		cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
 		break;
 	case CMDQ_OP_TLBI_NH_VA:
+		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
+		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
 		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
 		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
+		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
 		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
 		break;
 	case CMDQ_OP_TLBI_S2_IPA:
+		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
+		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
 		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
 		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
+		cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
 		cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
 		break;
 	case CMDQ_OP_TLBI_NH_ASID:
@@ -2003,7 +2024,7 @@  static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
 {
 	u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
 	struct arm_smmu_device *smmu = smmu_domain->smmu;
-	unsigned long start = iova, end = iova + size;
+	unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
 	int i = 0;
 	struct arm_smmu_cmdq_ent cmd = {
 		.tlbi = {
@@ -2022,12 +2043,50 @@  static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
 		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
 	}
 
+	if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
+		/* Get the leaf page size */
+		tg = __ffs(smmu_domain->domain.pgsize_bitmap);
+
+		/* Convert page size of 12,14,16 (log2) to 1,2,3 */
+		cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
+
+		/* Determine what level the granule is at */
+		cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
+
+		num_pages = size / (1UL << tg);
+	}
+
 	while (iova < end) {
 		if (i == CMDQ_BATCH_ENTRIES) {
 			arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
 			i = 0;
 		}
 
+		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
+			/*
+			 * On each iteration of the loop, the range is 5 bits
+			 * worth of the aligned size remaining.
+			 * The range in pages is:
+			 *
+			 * range = (num_pages & (0x1f << __ffs(num_pages)))
+			 */
+			unsigned long scale, num;
+
+			/* Determine the power of 2 multiple number of pages */
+			scale = __ffs(num_pages);
+			cmd.tlbi.scale = scale;
+
+			/* Determine how many chunks of 2^scale size we have */
+			num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
+			cmd.tlbi.num = num - 1;
+
+			/* range is num * 2^scale * pgsize */
+			granule = num << (scale + tg);
+
+			/* Clear out the lower order bits for the next iteration */
+			num_pages -= num << scale;
+		}
+
 		cmd.tlbi.addr = iova;
 		arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd);
 		iova += granule;
@@ -3449,6 +3508,11 @@  static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
 	if (smmu->sid_bits <= STRTAB_SPLIT)
 		smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB;
 
+	/* IDR3 */
+	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
+	if (FIELD_GET(IDR3_RIL, reg))
+		smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
+
 	/* IDR5 */
 	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);