diff mbox series

iommu/arm-smmu-v3: remove the approach of MSI polling for CMD SYNC

Message ID 20200716230709.32820-1-song.bao.hua@hisilicon.com (mailing list archive)
State New, archived
Headers show
Series iommu/arm-smmu-v3: remove the approach of MSI polling for CMD SYNC | expand

Commit Message

Song Bao Hua (Barry Song) July 16, 2020, 11:07 p.m. UTC
Before commit 587e6c10a7ce ("iommu/arm-smmu-v3: Reduce contention during
command-queue insertion"), msi polling perhaps performed better since
it could run outside the spin_lock_irqsave() while the code polling cons
reg was running in the lock.

But after the great reorganization of smmu queue, neither of these two
polling methods are running in a spinlock. And real tests show polling
cons reg via sev means smaller latency. It is probably because polling
by msi will ask hardware to write memory but sev polling depends on the
update of register only.

Using 16 threads to run netperf on hns3 100G NIC with UDP packet size
in 32768bytes and set iommu to strict, TX throughput can improve from
25227.74Mbps to 27145.59Mbps by this patch. In this case, SMMU is super
busy as hns3 sends map/unmap requests extremely frequently.

Cc: Prime Zeng <prime.zeng@hisilicon.com>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 drivers/iommu/arm-smmu-v3.c | 46 +------------------------------------
 1 file changed, 1 insertion(+), 45 deletions(-)

Comments

Robin Murphy July 17, 2020, 8:54 a.m. UTC | #1
On 2020-07-17 00:07, Barry Song wrote:
> Before commit 587e6c10a7ce ("iommu/arm-smmu-v3: Reduce contention during
> command-queue insertion"), msi polling perhaps performed better since
> it could run outside the spin_lock_irqsave() while the code polling cons
> reg was running in the lock.
> 
> But after the great reorganization of smmu queue, neither of these two
> polling methods are running in a spinlock. And real tests show polling
> cons reg via sev means smaller latency. It is probably because polling
> by msi will ask hardware to write memory but sev polling depends on the
> update of register only.
> 
> Using 16 threads to run netperf on hns3 100G NIC with UDP packet size
> in 32768bytes and set iommu to strict, TX throughput can improve from
> 25227.74Mbps to 27145.59Mbps by this patch. In this case, SMMU is super
> busy as hns3 sends map/unmap requests extremely frequently.

How many different systems and SMMU implementations are those numbers 
representative of? Given that we may have cases where the SMMU can use 
MSIs but can't use SEV, so would have to fall back to inefficient 
busy-polling, I'd be wary of removing this entirely. Allowing particular 
platforms or SMMU implementations to suppress MSI functionality if they 
know for sure it makes sense seems like a safer bet.

Robin.

> Cc: Prime Zeng <prime.zeng@hisilicon.com>
> Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> ---
>   drivers/iommu/arm-smmu-v3.c | 46 +------------------------------------
>   1 file changed, 1 insertion(+), 45 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index f578677a5c41..e55282a636c8 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -964,12 +964,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>   		cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
>   		break;
>   	case CMDQ_OP_CMD_SYNC:
> -		if (ent->sync.msiaddr) {
> -			cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
> -			cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
> -		} else {
> -			cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
> -		}
> +		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
>   		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
>   		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
>   		break;
> @@ -983,21 +978,10 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>   static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
>   					 u32 prod)
>   {
> -	struct arm_smmu_queue *q = &smmu->cmdq.q;
>   	struct arm_smmu_cmdq_ent ent = {
>   		.opcode = CMDQ_OP_CMD_SYNC,
>   	};
>   
> -	/*
> -	 * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
> -	 * payload, so the write will zero the entire command on that platform.
> -	 */
> -	if (smmu->features & ARM_SMMU_FEAT_MSI &&
> -	    smmu->features & ARM_SMMU_FEAT_COHERENCY) {
> -		ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
> -				   q->ent_dwords * 8;
> -	}
> -
>   	arm_smmu_cmdq_build_cmd(cmd, &ent);
>   }
>   
> @@ -1251,30 +1235,6 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
>   	return ret;
>   }
>   
> -/*
> - * Wait until the SMMU signals a CMD_SYNC completion MSI.
> - * Must be called with the cmdq lock held in some capacity.
> - */
> -static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
> -					  struct arm_smmu_ll_queue *llq)
> -{
> -	int ret = 0;
> -	struct arm_smmu_queue_poll qp;
> -	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
> -	u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
> -
> -	queue_poll_init(smmu, &qp);
> -
> -	/*
> -	 * The MSI won't generate an event, since it's being written back
> -	 * into the command queue.
> -	 */
> -	qp.wfe = false;
> -	smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
> -	llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
> -	return ret;
> -}
> -
>   /*
>    * Wait until the SMMU cons index passes llq->prod.
>    * Must be called with the cmdq lock held in some capacity.
> @@ -1332,10 +1292,6 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
>   static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
>   					 struct arm_smmu_ll_queue *llq)
>   {
> -	if (smmu->features & ARM_SMMU_FEAT_MSI &&
> -	    smmu->features & ARM_SMMU_FEAT_COHERENCY)
> -		return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
> -
>   	return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
>   }
>   
>
Song Bao Hua (Barry Song) July 17, 2020, 9:07 a.m. UTC | #2
> -----Original Message-----
> From: Robin Murphy [mailto:robin.murphy@arm.com]
> Sent: Friday, July 17, 2020 8:55 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; will@kernel.org;
> joro@8bytes.org
> Cc: linux-kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>;
> linux-arm-kernel@lists.infradead.org; iommu@lists.linux-foundation.org;
> Zengtao (B) <prime.zeng@hisilicon.com>
> Subject: Re: [PATCH] iommu/arm-smmu-v3: remove the approach of MSI
> polling for CMD SYNC
> 
> On 2020-07-17 00:07, Barry Song wrote:
> > Before commit 587e6c10a7ce ("iommu/arm-smmu-v3: Reduce contention
> during
> > command-queue insertion"), msi polling perhaps performed better since
> > it could run outside the spin_lock_irqsave() while the code polling cons
> > reg was running in the lock.
> >
> > But after the great reorganization of smmu queue, neither of these two
> > polling methods are running in a spinlock. And real tests show polling
> > cons reg via sev means smaller latency. It is probably because polling
> > by msi will ask hardware to write memory but sev polling depends on the
> > update of register only.
> >
> > Using 16 threads to run netperf on hns3 100G NIC with UDP packet size
> > in 32768bytes and set iommu to strict, TX throughput can improve from
> > 25227.74Mbps to 27145.59Mbps by this patch. In this case, SMMU is super
> > busy as hns3 sends map/unmap requests extremely frequently.
> 
> How many different systems and SMMU implementations are those numbers
> representative of? Given that we may have cases where the SMMU can use
> MSIs but can't use SEV, so would have to fall back to inefficient
> busy-polling, I'd be wary of removing this entirely. Allowing particular
> platforms or SMMU implementations to suppress MSI functionality if they
> know for sure it makes sense seems like a safer bet.
> 
Hello Robin,

Thanks for taking a look. Actually I was really struggling with the good way to make every platform happy.
And I don't have other platforms to test and check if those platforms run better by sev polling. Even two
platforms have completely same SMMU features, it is still possible they behave differently.
So I simply sent this patch to get the discussion started to get opinions.

At the first beginning, I wanted to have a module parameter for users to decide if msi polling should be disabled.
But the module parameter might be totally ignored by linux distro.

--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -418,6 +418,11 @@ module_param_named(disable_bypass, disable_bypass, bool, S_IRUGO);  MODULE_PARM_DESC(disable_bypass,
 	"Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
 
+static bool disable_msipolling = 1;
+module_param_named(disable_msipolling, disable_msipolling, bool, 
+S_IRUGO); MODULE_PARM_DESC(disable_msipolling,
+	"Don't use MSI to poll the completion of CMD_SYNC if it is slower than 
+SEV");
+
 enum pri_resp {
 	PRI_RESP_DENY = 0,
 	PRI_RESP_FAIL = 1,
@@ -992,7 +997,7 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
 	 * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
 	 * payload, so the write will zero the entire command on that platform.
 	 */
-	if (smmu->features & ARM_SMMU_FEAT_MSI &&
+	if (!disable_msipolling && smmu->features & ARM_SMMU_FEAT_MSI &&
 	    smmu->features & ARM_SMMU_FEAT_COHERENCY) {
 		ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
 				   q->ent_dwords * 8;
@@ -1332,7 +1337,7 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,  static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
 					 struct arm_smmu_ll_queue *llq)
 {
-	if (smmu->features & ARM_SMMU_FEAT_MSI &&
+	if (!disable_msipolling && smmu->features & ARM_SMMU_FEAT_MSI &&
 	    smmu->features & ARM_SMMU_FEAT_COHERENCY)
 		return __arm_smmu_cmdq_poll_until_msi(smmu, llq);


Another option is that we don't use module parameter, alternatively, we check the vendor/chip ID,
if the chip has better performance on sev polling, it may set disable_msipolling to true.

You are very welcome to give your suggestions.

Thanks
Barry
Song Bao Hua (Barry Song) July 21, 2020, 12:44 a.m. UTC | #3
> -----Original Message-----
> From: Song Bao Hua (Barry Song)
> Sent: Friday, July 17, 2020 9:06 PM
> To: 'Robin Murphy' <robin.murphy@arm.com>; will@kernel.org;
> joro@8bytes.org
> Cc: linux-kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>;
> linux-arm-kernel@lists.infradead.org; iommu@lists.linux-foundation.org;
> Zengtao (B) <prime.zeng@hisilicon.com>
> Subject: RE: [PATCH] iommu/arm-smmu-v3: remove the approach of MSI
> polling for CMD SYNC
> 
> 
> 
> > -----Original Message-----
> > From: Robin Murphy [mailto:robin.murphy@arm.com]
> > Sent: Friday, July 17, 2020 8:55 PM
> > To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>;
> will@kernel.org;
> > joro@8bytes.org
> > Cc: linux-kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>;
> > linux-arm-kernel@lists.infradead.org; iommu@lists.linux-foundation.org;
> > Zengtao (B) <prime.zeng@hisilicon.com>
> > Subject: Re: [PATCH] iommu/arm-smmu-v3: remove the approach of MSI
> > polling for CMD SYNC
> >
> > On 2020-07-17 00:07, Barry Song wrote:
> > > Before commit 587e6c10a7ce ("iommu/arm-smmu-v3: Reduce contention
> > during
> > > command-queue insertion"), msi polling perhaps performed better since
> > > it could run outside the spin_lock_irqsave() while the code polling cons
> > > reg was running in the lock.
> > >
> > > But after the great reorganization of smmu queue, neither of these two
> > > polling methods are running in a spinlock. And real tests show polling
> > > cons reg via sev means smaller latency. It is probably because polling
> > > by msi will ask hardware to write memory but sev polling depends on the
> > > update of register only.
> > >
> > > Using 16 threads to run netperf on hns3 100G NIC with UDP packet size
> > > in 32768bytes and set iommu to strict, TX throughput can improve from
> > > 25227.74Mbps to 27145.59Mbps by this patch. In this case, SMMU is
> super
> > > busy as hns3 sends map/unmap requests extremely frequently.
> >
> > How many different systems and SMMU implementations are those numbers
> > representative of? Given that we may have cases where the SMMU can use
> > MSIs but can't use SEV, so would have to fall back to inefficient
> > busy-polling, I'd be wary of removing this entirely. Allowing particular
> > platforms or SMMU implementations to suppress MSI functionality if they
> > know for sure it makes sense seems like a safer bet.
> >
> Hello Robin,
> 
> Thanks for taking a look. Actually I was really struggling with the good way to
> make every platform happy.
> And I don't have other platforms to test and check if those platforms run
> better by sev polling. Even two
> platforms have completely same SMMU features, it is still possible they
> behave differently.
> So I simply sent this patch to get the discussion started to get opinions.
> 
> At the first beginning, I wanted to have a module parameter for users to decide
> if msi polling should be disabled.
> But the module parameter might be totally ignored by linux distro.
> 
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -418,6 +418,11 @@ module_param_named(disable_bypass,
> disable_bypass, bool, S_IRUGO);  MODULE_PARM_DESC(disable_bypass,
>  	"Disable bypass streams such that incoming transactions from devices
> that are not attached to an iommu domain will report an abort back to the
> device and will not be allowed to pass through the SMMU.");
> 
> +static bool disable_msipolling = 1;
> +module_param_named(disable_msipolling, disable_msipolling, bool,
> +S_IRUGO); MODULE_PARM_DESC(disable_msipolling,
> +	"Don't use MSI to poll the completion of CMD_SYNC if it is slower than
> +SEV");
> +
>  enum pri_resp {
>  	PRI_RESP_DENY = 0,
>  	PRI_RESP_FAIL = 1,
> @@ -992,7 +997,7 @@ static void arm_smmu_cmdq_build_sync_cmd(u64
> *cmd, struct arm_smmu_device *smmu,
>  	 * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
>  	 * payload, so the write will zero the entire command on that platform.
>  	 */
> -	if (smmu->features & ARM_SMMU_FEAT_MSI &&
> +	if (!disable_msipolling && smmu->features & ARM_SMMU_FEAT_MSI &&
>  	    smmu->features & ARM_SMMU_FEAT_COHERENCY) {
>  		ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
>  				   q->ent_dwords * 8;
> @@ -1332,7 +1337,7 @@ static int
> __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
> static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
>  					 struct arm_smmu_ll_queue *llq)
>  {
> -	if (smmu->features & ARM_SMMU_FEAT_MSI &&
> +	if (!disable_msipolling && smmu->features & ARM_SMMU_FEAT_MSI &&
>  	    smmu->features & ARM_SMMU_FEAT_COHERENCY)
>  		return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
> 
> 
> Another option is that we don't use module parameter, alternatively, we check
> the vendor/chip ID,
> if the chip has better performance on sev polling, it may set disable_msipolling
> to true.
> 
> You are very welcome to give your suggestions.

A possible way to do some chip-specific configuration would be setting smmu->options according to model ID:

static void acpi_smmu_get_options(u32 model, struct arm_smmu_device *smmu)
{
	switch (model) {
	case ACPI_IORT_SMMU_V3_CAVIUM_CN99XX:
		smmu->options |= ARM_SMMU_OPT_PAGE0_REGS_ONLY;
		break;
	case ACPI_IORT_SMMU_V3_HISILICON_HI161X:
		smmu->options |= ARM_SMMU_OPT_SKIP_PREFETCH;
		break;
+	case ACPI_IORT_SMMU_V3_HISILICON_HI162X:
+		smmu->options |= ARM_SMMU_OPT_DISABLE_MSIPOLL;
+		break;
	}

	dev_notice(smmu->dev, "option mask 0x%x\n", smmu->options);
}

I dumped the model id, but unluckily the id is just zero.

#define ACPI_IORT_SMMU_V3_GENERIC           0x00000000	/* Generic SMMUv3 */

Robin, 
would you like to think applying for a new model ID is a right way to set this chip-specific option?

Thanks
Barry
diff mbox series

Patch

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f578677a5c41..e55282a636c8 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -964,12 +964,7 @@  static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 		cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
 		break;
 	case CMDQ_OP_CMD_SYNC:
-		if (ent->sync.msiaddr) {
-			cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
-			cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
-		} else {
-			cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
-		}
+		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
 		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
 		cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
 		break;
@@ -983,21 +978,10 @@  static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
 					 u32 prod)
 {
-	struct arm_smmu_queue *q = &smmu->cmdq.q;
 	struct arm_smmu_cmdq_ent ent = {
 		.opcode = CMDQ_OP_CMD_SYNC,
 	};
 
-	/*
-	 * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
-	 * payload, so the write will zero the entire command on that platform.
-	 */
-	if (smmu->features & ARM_SMMU_FEAT_MSI &&
-	    smmu->features & ARM_SMMU_FEAT_COHERENCY) {
-		ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
-				   q->ent_dwords * 8;
-	}
-
 	arm_smmu_cmdq_build_cmd(cmd, &ent);
 }
 
@@ -1251,30 +1235,6 @@  static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
 	return ret;
 }
 
-/*
- * Wait until the SMMU signals a CMD_SYNC completion MSI.
- * Must be called with the cmdq lock held in some capacity.
- */
-static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
-					  struct arm_smmu_ll_queue *llq)
-{
-	int ret = 0;
-	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
-	u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
-
-	queue_poll_init(smmu, &qp);
-
-	/*
-	 * The MSI won't generate an event, since it's being written back
-	 * into the command queue.
-	 */
-	qp.wfe = false;
-	smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
-	llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
-	return ret;
-}
-
 /*
  * Wait until the SMMU cons index passes llq->prod.
  * Must be called with the cmdq lock held in some capacity.
@@ -1332,10 +1292,6 @@  static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
 static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
 					 struct arm_smmu_ll_queue *llq)
 {
-	if (smmu->features & ARM_SMMU_FEAT_MSI &&
-	    smmu->features & ARM_SMMU_FEAT_COHERENCY)
-		return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
-
 	return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
 }