[v2,3/4] iommu/arm-smmu-v3: Use CMD_SYNC completion MSI

Message ID dbf0ce00f8e249c64f3d2041acd8d91818178e52.1504182142.git.robin.murphy@arm.com (mailing list archive)
State New, archived

Commit Message

Robin Murphy Aug. 31, 2017, 1:44 p.m. UTC
As an IRQ, the CMD_SYNC interrupt is not particularly useful, not least
because we often need to wait for sync completion within someone else's
IRQ handler anyway. However, when the SMMU is both coherent and supports
MSIs, we can have a lot more fun by not using it as an interrupt at all.
Following the example suggested in the architecture and using a write
targeting normal memory, we can let callers wait on a status variable
outside the lock instead of having to stall the entire queue or even
touch MMIO registers. Since multiple sync commands are guaranteed to
complete in order, a simple incrementing sequence count is all we need
to unambiguously support any realistic number of overlapping waiters.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v2: Remove redundant 'bool msi' command member, other cosmetic tweaks

 drivers/iommu/arm-smmu-v3.c | 47 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)
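
To make the sequence-count mechanism described above concrete, here is a
minimal standalone C11 sketch (illustrative only, not kernel code; the names
mirror the patch but these helpers are hypothetical): each issuer takes the
next sequence number, embeds it in its CMD_SYNC as the MSI payload, and the
waiter spins until the completion write has reached or passed that number.

#include <stdatomic.h>
#include <stdint.h>

static atomic_uint sync_nr;		/* next sequence number to hand out */
static volatile uint32_t sync_count;	/* the CMD_SYNC completion write lands here */

/* Issuer side: returns the value to embed as MSIData in the CMD_SYNC. */
static uint32_t issue_sync(void)
{
	return atomic_fetch_add(&sync_nr, 1) + 1;
}

/* Waiter side: sync 'idx' is complete once sync_count has reached it.
 * The signed difference keeps the test correct across u32 wraparound,
 * provided fewer than 2^31 syncs are ever outstanding at once. (The
 * real driver waits with smp_cond_load_acquire() rather than polling
 * a volatile variable.)
 */
static int sync_done(uint32_t idx)
{
	return (int32_t)(sync_count - idx) >= 0;
}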

Comments

Will Deacon Oct. 13, 2017, 6:32 p.m. UTC | #1
Hi Robin,

This mostly looks good. Just a few comments below.

On Thu, Aug 31, 2017 at 02:44:27PM +0100, Robin Murphy wrote:
> As an IRQ, the CMD_SYNC interrupt is not particularly useful, not least
> because we often need to wait for sync completion within someone else's
> IRQ handler anyway. However, when the SMMU is both coherent and supports
> MSIs, we can have a lot more fun by not using it as an interrupt at all.
> Following the example suggested in the architecture and using a write
> targeting normal memory, we can let callers wait on a status variable
> outside the lock instead of having to stall the entire queue or even
> touch MMIO registers. Since multiple sync commands are guaranteed to
> complete in order, a simple incrementing sequence count is all we need
> to unambiguously support any realistic number of overlapping waiters.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> v2: Remove redundant 'bool msi' command member, other cosmetic tweaks
> 
>  drivers/iommu/arm-smmu-v3.c | 47 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index f066725298cd..311f482b93d5 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -377,7 +377,16 @@
>  
>  #define CMDQ_SYNC_0_CS_SHIFT		12
>  #define CMDQ_SYNC_0_CS_NONE		(0UL << CMDQ_SYNC_0_CS_SHIFT)
> +#define CMDQ_SYNC_0_CS_IRQ		(1UL << CMDQ_SYNC_0_CS_SHIFT)
>  #define CMDQ_SYNC_0_CS_SEV		(2UL << CMDQ_SYNC_0_CS_SHIFT)
> +#define CMDQ_SYNC_0_MSH_SHIFT		22
> +#define CMDQ_SYNC_0_MSH_ISH		(3UL << CMDQ_SYNC_0_MSH_SHIFT)
> +#define CMDQ_SYNC_0_MSIATTR_SHIFT	24
> +#define CMDQ_SYNC_0_MSIATTR_OIWB	(0xfUL << CMDQ_SYNC_0_MSIATTR_SHIFT)
> +#define CMDQ_SYNC_0_MSIDATA_SHIFT	32
> +#define CMDQ_SYNC_0_MSIDATA_MASK	0xffffffffUL
> +#define CMDQ_SYNC_1_MSIADDR_SHIFT	0
> +#define CMDQ_SYNC_1_MSIADDR_MASK	0xffffffffffffcUL
>  
>  /* Event queue */
>  #define EVTQ_ENT_DWORDS			4
> @@ -409,6 +418,7 @@
>  /* High-level queue structures */
>  #define ARM_SMMU_POLL_TIMEOUT_US	100
>  #define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US	1000000 /* 1s! */
> +#define ARM_SMMU_SYNC_TIMEOUT_US	1000000 /* 1s! */

We only ever do this when waiting for the queue to drain, so may as well
just reuse the drain timeout.

>  #define MSI_IOVA_BASE			0x8000000
>  #define MSI_IOVA_LENGTH			0x100000
> @@ -504,6 +514,10 @@ struct arm_smmu_cmdq_ent {
>  		} pri;
>  
>  		#define CMDQ_OP_CMD_SYNC	0x46
> +		struct {
> +			u32			msidata;
> +			u64			msiaddr;
> +		} sync;
>  	};
>  };
>  
> @@ -617,6 +631,9 @@ struct arm_smmu_device {
>  	int				gerr_irq;
>  	int				combined_irq;
>  
> +	atomic_t			sync_nr;
> +	u32				sync_count;

It's probably worth sticking these in separate cachelines so we don't
get spurious wakeups when sync_nr is incremented. (yes, I know it should
be the ERG, but that can be unreasonably huge!).

> +
>  	unsigned long			ias; /* IPA */
>  	unsigned long			oas; /* PA */
>  	unsigned long			pgsize_bitmap;
> @@ -878,7 +895,13 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>  		}
>  		break;
>  	case CMDQ_OP_CMD_SYNC:
> -		cmd[0] |= CMDQ_SYNC_0_CS_SEV;
> +		if (ent->sync.msiaddr)
> +			cmd[0] |= CMDQ_SYNC_0_CS_IRQ;
> +		else
> +			cmd[0] |= CMDQ_SYNC_0_CS_SEV;
> +		cmd[0] |= CMDQ_SYNC_0_MSH_ISH | CMDQ_SYNC_0_MSIATTR_OIWB;
> +		cmd[0] |= (u64)ent->sync.msidata << CMDQ_SYNC_0_MSIDATA_SHIFT;
> +		cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
>  		break;
>  	default:
>  		return -ENOENT;
> @@ -964,21 +987,40 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>  }
>  
> +static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
> +{
> +	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US);
> +	u32 val = smp_cond_load_acquire(&smmu->sync_count,
> +					(int)(VAL - sync_idx) >= 0 ||
> +					!ktime_before(ktime_get(), timeout));
> +
> +	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;

There are some theoretical overflow issues here which I don't think will
ever occur in practice, but deserve at least a comment to explain why.

> +}
> +
>  static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>  	u64 cmd[CMDQ_ENT_DWORDS];
>  	unsigned long flags;
>  	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
> +	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
> +		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);

I don't think this is sufficient for the case where we fail to set up MSIs
and fall back on legacy IRQs.

>  	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>  	int ret;
>  
> +	if (msi) {
> +		ent.sync.msidata = atomic_inc_return(&smmu->sync_nr);

I don't think you need barrier semantics here.

> +		ent.sync.msiaddr = virt_to_phys(&smmu->sync_count);
> +	}
>  	arm_smmu_cmdq_build_cmd(cmd, &ent);
>  
>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
>  	arm_smmu_cmdq_insert_cmd(smmu, cmd);
> -	ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
> +	if (!msi)
> +		ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>  
> +	if (msi)
> +		ret = arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);

This looks like the queue polling should be wrapped up in a function that
returns with the spinlock released.

Will
Robin Murphy Oct. 16, 2017, 12:25 p.m. UTC | #2
On 13/10/17 19:32, Will Deacon wrote:
> Hi Robin,
> 
> This mostly looks good. Just a few comments below.
> 
> On Thu, Aug 31, 2017 at 02:44:27PM +0100, Robin Murphy wrote:
>> As an IRQ, the CMD_SYNC interrupt is not particularly useful, not least
>> because we often need to wait for sync completion within someone else's
>> IRQ handler anyway. However, when the SMMU is both coherent and supports
>> MSIs, we can have a lot more fun by not using it as an interrupt at all.
>> Following the example suggested in the architecture and using a write
>> targeting normal memory, we can let callers wait on a status variable
>> outside the lock instead of having to stall the entire queue or even
>> touch MMIO registers. Since multiple sync commands are guaranteed to
>> complete in order, a simple incrementing sequence count is all we need
>> to unambiguously support any realistic number of overlapping waiters.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> ---
>>
>> v2: Remove redundant 'bool msi' command member, other cosmetic tweaks
>>
>>  drivers/iommu/arm-smmu-v3.c | 47 +++++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 45 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>> index f066725298cd..311f482b93d5 100644
>> --- a/drivers/iommu/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm-smmu-v3.c
>> @@ -377,7 +377,16 @@
>>  
>>  #define CMDQ_SYNC_0_CS_SHIFT		12
>>  #define CMDQ_SYNC_0_CS_NONE		(0UL << CMDQ_SYNC_0_CS_SHIFT)
>> +#define CMDQ_SYNC_0_CS_IRQ		(1UL << CMDQ_SYNC_0_CS_SHIFT)
>>  #define CMDQ_SYNC_0_CS_SEV		(2UL << CMDQ_SYNC_0_CS_SHIFT)
>> +#define CMDQ_SYNC_0_MSH_SHIFT		22
>> +#define CMDQ_SYNC_0_MSH_ISH		(3UL << CMDQ_SYNC_0_MSH_SHIFT)
>> +#define CMDQ_SYNC_0_MSIATTR_SHIFT	24
>> +#define CMDQ_SYNC_0_MSIATTR_OIWB	(0xfUL << CMDQ_SYNC_0_MSIATTR_SHIFT)
>> +#define CMDQ_SYNC_0_MSIDATA_SHIFT	32
>> +#define CMDQ_SYNC_0_MSIDATA_MASK	0xffffffffUL
>> +#define CMDQ_SYNC_1_MSIADDR_SHIFT	0
>> +#define CMDQ_SYNC_1_MSIADDR_MASK	0xffffffffffffcUL
>>  
>>  /* Event queue */
>>  #define EVTQ_ENT_DWORDS			4
>> @@ -409,6 +418,7 @@
>>  /* High-level queue structures */
>>  #define ARM_SMMU_POLL_TIMEOUT_US	100
>>  #define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US	1000000 /* 1s! */
>> +#define ARM_SMMU_SYNC_TIMEOUT_US	1000000 /* 1s! */
> 
> We only ever do this when waiting for the queue to drain, so may as well
> just reuse the drain timeout.

As you've discovered, we remove the "drain" case entirely in the end.

>>  #define MSI_IOVA_BASE			0x8000000
>>  #define MSI_IOVA_LENGTH			0x100000
>> @@ -504,6 +514,10 @@ struct arm_smmu_cmdq_ent {
>>  		} pri;
>>  
>>  		#define CMDQ_OP_CMD_SYNC	0x46
>> +		struct {
>> +			u32			msidata;
>> +			u64			msiaddr;
>> +		} sync;
>>  	};
>>  };
>>  
>> @@ -617,6 +631,9 @@ struct arm_smmu_device {
>>  	int				gerr_irq;
>>  	int				combined_irq;
>>  
>> +	atomic_t			sync_nr;
>> +	u32				sync_count;
> 
> It's probably worth sticking these in separate cachelines so we don't
> get spurious wakeups when sync_nr is incremented. (yes, I know it should
> be the ERG, but that can be unreasonably huge!).

Good point - we've got 8K of bitmaps embedded in the structure anyway,
so even maximum ERG separation is easily possible.
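
For illustration, the kind of separation being discussed might look roughly
like the fragment below (placement and padding are illustrative only, not the
final patch; ____cacheline_aligned gives generic cacheline separation rather
than full ERG-sized padding):

struct arm_smmu_device {
	/* ... */
	atomic_t			sync_nr;	/* bumped by every CMD_SYNC issuer */
	/* ... */
	/* Own cacheline, so waiters spinning on the completion write are
	 * not spuriously woken by updates to sync_nr or its neighbours.
	 */
	u32				sync_count ____cacheline_aligned;
	/* ... */
};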

>> +
>>  	unsigned long			ias; /* IPA */
>>  	unsigned long			oas; /* PA */
>>  	unsigned long			pgsize_bitmap;
>> @@ -878,7 +895,13 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>>  		}
>>  		break;
>>  	case CMDQ_OP_CMD_SYNC:
>> -		cmd[0] |= CMDQ_SYNC_0_CS_SEV;
>> +		if (ent->sync.msiaddr)
>> +			cmd[0] |= CMDQ_SYNC_0_CS_IRQ;
>> +		else
>> +			cmd[0] |= CMDQ_SYNC_0_CS_SEV;
>> +		cmd[0] |= CMDQ_SYNC_0_MSH_ISH | CMDQ_SYNC_0_MSIATTR_OIWB;
>> +		cmd[0] |= (u64)ent->sync.msidata << CMDQ_SYNC_0_MSIDATA_SHIFT;
>> +		cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
>>  		break;
>>  	default:
>>  		return -ENOENT;
>> @@ -964,21 +987,40 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>>  }
>>  
>> +static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
>> +{
>> +	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US);
>> +	u32 val = smp_cond_load_acquire(&smmu->sync_count,
>> +					(int)(VAL - sync_idx) >= 0 ||
>> +					!ktime_before(ktime_get(), timeout));
>> +
>> +	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
> 
> There are some theoretical overflow issues here which I don't think will
> ever occur in practice, but deserve at least a comment to explain why.

Even if we did have 2^31 or more CPUs, the size of a queue is bounded at
2^20, so we can never have enough in-flight syncs to get near to an
overflow problem. I can certainly document that if you like.
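
As a standalone illustration of why the comparison is wrap-safe (plain C, not
kernel code; values chosen to straddle the u32 wrap):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t sync_count = 0xfffffffeU;	/* last value the SMMU wrote */
	uint32_t sync_idx   = 0x00000001U;	/* our sequence number, post-wrap */

	/* Still pending: the completion write has not reached our index yet. */
	printf("%d\n", (int32_t)(sync_count - sync_idx) >= 0);	/* prints 0 */

	sync_count = 0x00000003U;		/* SMMU has now written a later value */
	printf("%d\n", (int32_t)(sync_count - sync_idx) >= 0);	/* prints 1 */

	return 0;
}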

>> +}
>> +
>>  static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>  {
>>  	u64 cmd[CMDQ_ENT_DWORDS];
>>  	unsigned long flags;
>>  	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>> +	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
>> +		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
> 
> I don't think this is sufficient for the case where we fail to set up MSIs
> and fall back on legacy IRQs.

Remember this 'MSI' is going nowhere near the GIC, so the IRQ
configuration is irrelevant (especially after patch #2) - the feature
bits tell us "is the SMMU capable of generating sync-completion writes?"
and "are those writes coherent?", which is precisely what matters here.

>>  	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>>  	int ret;
>>  
>> +	if (msi) {
>> +		ent.sync.msidata = atomic_inc_return(&smmu->sync_nr);
> 
> I don't think you need barrier semantics here.

You mean atomic_inc_return_relaxed() would be sufficient? TBH I don't
think I'd given this any thought - I guess the coherency protocols make
it impossible to do an atomic op on stale data, so that seems reasonable.
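
Concretely, the change being suggested would presumably be along these lines
(sketch only):

	if (msi) {
		/* No ordering against other accesses is needed here: the value
		 * is only consumed via the command, which is inserted under
		 * cmdq.lock.
		 */
		ent.sync.msidata = atomic_inc_return_relaxed(&smmu->sync_nr);
		ent.sync.msiaddr = virt_to_phys(&smmu->sync_count);
	}
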
>> +		ent.sync.msiaddr = virt_to_phys(&smmu->sync_count);
>> +	}
>>  	arm_smmu_cmdq_build_cmd(cmd, &ent);
>>  
>>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
>>  	arm_smmu_cmdq_insert_cmd(smmu, cmd);
>> -	ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
>> +	if (!msi)
>> +		ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
>>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>>  
>> +	if (msi)
>> +		ret = arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
> 
> This looks like the queue polling should be wrapped up in a function that
> returns with the spinlock released.

I did ponder that, but I can't help finding such asymmetric interfaces
pretty grim, and things do get better again once both cases can wait
outside the lock.
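
For context, the asymmetric shape under discussion would look something like
this (hypothetical helper, not part of the patch): entered with cmdq.lock held
and returning with it released.

/* Called with cmdq.lock held; returns with it released. */
static int arm_smmu_cmdq_poll_and_unlock(struct arm_smmu_device *smmu,
					 bool wfe, unsigned long flags)
	__releases(&smmu->cmdq.lock)
{
	int ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);

	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
	return ret;
}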

Robin.

Patch

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f066725298cd..311f482b93d5 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -377,7 +377,16 @@ 
 
 #define CMDQ_SYNC_0_CS_SHIFT		12
 #define CMDQ_SYNC_0_CS_NONE		(0UL << CMDQ_SYNC_0_CS_SHIFT)
+#define CMDQ_SYNC_0_CS_IRQ		(1UL << CMDQ_SYNC_0_CS_SHIFT)
 #define CMDQ_SYNC_0_CS_SEV		(2UL << CMDQ_SYNC_0_CS_SHIFT)
+#define CMDQ_SYNC_0_MSH_SHIFT		22
+#define CMDQ_SYNC_0_MSH_ISH		(3UL << CMDQ_SYNC_0_MSH_SHIFT)
+#define CMDQ_SYNC_0_MSIATTR_SHIFT	24
+#define CMDQ_SYNC_0_MSIATTR_OIWB	(0xfUL << CMDQ_SYNC_0_MSIATTR_SHIFT)
+#define CMDQ_SYNC_0_MSIDATA_SHIFT	32
+#define CMDQ_SYNC_0_MSIDATA_MASK	0xffffffffUL
+#define CMDQ_SYNC_1_MSIADDR_SHIFT	0
+#define CMDQ_SYNC_1_MSIADDR_MASK	0xffffffffffffcUL
 
 /* Event queue */
 #define EVTQ_ENT_DWORDS			4
@@ -409,6 +418,7 @@ 
 /* High-level queue structures */
 #define ARM_SMMU_POLL_TIMEOUT_US	100
 #define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US	1000000 /* 1s! */
+#define ARM_SMMU_SYNC_TIMEOUT_US	1000000 /* 1s! */
 
 #define MSI_IOVA_BASE			0x8000000
 #define MSI_IOVA_LENGTH			0x100000
@@ -504,6 +514,10 @@  struct arm_smmu_cmdq_ent {
 		} pri;
 
 		#define CMDQ_OP_CMD_SYNC	0x46
+		struct {
+			u32			msidata;
+			u64			msiaddr;
+		} sync;
 	};
 };
 
@@ -617,6 +631,9 @@  struct arm_smmu_device {
 	int				gerr_irq;
 	int				combined_irq;
 
+	atomic_t			sync_nr;
+	u32				sync_count;
+
 	unsigned long			ias; /* IPA */
 	unsigned long			oas; /* PA */
 	unsigned long			pgsize_bitmap;
@@ -878,7 +895,13 @@  static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 		}
 		break;
 	case CMDQ_OP_CMD_SYNC:
-		cmd[0] |= CMDQ_SYNC_0_CS_SEV;
+		if (ent->sync.msiaddr)
+			cmd[0] |= CMDQ_SYNC_0_CS_IRQ;
+		else
+			cmd[0] |= CMDQ_SYNC_0_CS_SEV;
+		cmd[0] |= CMDQ_SYNC_0_MSH_ISH | CMDQ_SYNC_0_MSIATTR_OIWB;
+		cmd[0] |= (u64)ent->sync.msidata << CMDQ_SYNC_0_MSIDATA_SHIFT;
+		cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
 		break;
 	default:
 		return -ENOENT;
@@ -964,21 +987,40 @@  static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 }
 
+static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
+{
+	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US);
+	u32 val = smp_cond_load_acquire(&smmu->sync_count,
+					(int)(VAL - sync_idx) >= 0 ||
+					!ktime_before(ktime_get(), timeout));
+
+	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
+}
+
 static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
 	u64 cmd[CMDQ_ENT_DWORDS];
 	unsigned long flags;
 	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
+		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
 	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
 	int ret;
 
+	if (msi) {
+		ent.sync.msidata = atomic_inc_return(&smmu->sync_nr);
+		ent.sync.msiaddr = virt_to_phys(&smmu->sync_count);
+	}
 	arm_smmu_cmdq_build_cmd(cmd, &ent);
 
 	spin_lock_irqsave(&smmu->cmdq.lock, flags);
 	arm_smmu_cmdq_insert_cmd(smmu, cmd);
-	ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
+	if (!msi)
+		ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
 	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 
+	if (msi)
+		ret = arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
 	if (ret)
 		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
 }
@@ -2156,6 +2198,7 @@  static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
 {
 	int ret;
 
+	atomic_set(&smmu->sync_nr, 0);
 	ret = arm_smmu_init_queues(smmu);
 	if (ret)
 		return ret;