diff mbox series

[v6,1/4] iommu/arm-smmu: add NVIDIA implementation for dual ARM MMU-500 usage

Message ID 20200604234414.21912-2-vdumpa@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Nvidia Arm SMMUv2 Implementation | expand

Commit Message

Krishna Reddy June 4, 2020, 11:44 p.m. UTC
NVIDIA's Tegra194 soc uses two ARM MMU-500s together to interleave
IOVA accesses across them.
Add NVIDIA implementation for dual ARM MMU-500s and add new compatible
string for Tegra194 soc.

Signed-off-by: Krishna Reddy <vdumpa@nvidia.com>
---
 MAINTAINERS                     |   2 +
 drivers/iommu/Makefile          |   2 +-
 drivers/iommu/arm-smmu-impl.c   |   3 +
 drivers/iommu/arm-smmu-nvidia.c | 161 ++++++++++++++++++++++++++++++++
 drivers/iommu/arm-smmu.h        |   1 +
 5 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iommu/arm-smmu-nvidia.c

Comments

Thierry Reding June 23, 2020, 10:29 a.m. UTC | #1
On Thu, Jun 04, 2020 at 04:44:11PM -0700, Krishna Reddy wrote:
> NVIDIA's Tegra194 soc uses two ARM MMU-500s together to interleave

s/soc/SoC/

> IOVA accesses across them.
> Add NVIDIA implementation for dual ARM MMU-500s and add new compatible
> string for Tegra194 soc.

Same here.

> 
> Signed-off-by: Krishna Reddy <vdumpa@nvidia.com>
> ---
>  MAINTAINERS                     |   2 +
>  drivers/iommu/Makefile          |   2 +-
>  drivers/iommu/arm-smmu-impl.c   |   3 +
>  drivers/iommu/arm-smmu-nvidia.c | 161 ++++++++++++++++++++++++++++++++
>  drivers/iommu/arm-smmu.h        |   1 +
>  5 files changed, 168 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/iommu/arm-smmu-nvidia.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 50659d76976b7..118da0893c964 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -16572,9 +16572,11 @@ F:	drivers/i2c/busses/i2c-tegra.c
>  
>  TEGRA IOMMU DRIVERS
>  M:	Thierry Reding <thierry.reding@gmail.com>
> +R:	Krishna Reddy <vdumpa@nvidia.com>
>  L:	linux-tegra@vger.kernel.org
>  S:	Supported
>  F:	drivers/iommu/tegra*
> +F:	drivers/iommu/arm-smmu-nvidia.c
>  
>  TEGRA KBC DRIVER
>  M:	Laxman Dewangan <ldewangan@nvidia.com>
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 57cf4ba5e27cb..35542df00da72 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -15,7 +15,7 @@ obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o amd_iommu_quirks.o
>  obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o
>  obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
>  obj-$(CONFIG_ARM_SMMU) += arm_smmu.o
> -arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-qcom.o
> +arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-qcom.o arm-smmu-nvidia.o
>  obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
>  obj-$(CONFIG_DMAR_TABLE) += dmar.o
>  obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
> diff --git a/drivers/iommu/arm-smmu-impl.c b/drivers/iommu/arm-smmu-impl.c
> index c75b9d957b702..52c84c30f83e4 100644
> --- a/drivers/iommu/arm-smmu-impl.c
> +++ b/drivers/iommu/arm-smmu-impl.c
> @@ -160,6 +160,9 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
>  	 */
>  	switch (smmu->model) {
>  	case ARM_MMU500:
> +		if (of_device_is_compatible(smmu->dev->of_node,
> +					    "nvidia,tegra194-smmu-500"))
> +			return nvidia_smmu_impl_init(smmu);

Should NVIDIA_TEGRA194_SMMU be a separate value for smmu->model,
perhaps? That way we avoid this somewhat odd check here.

>  		smmu->impl = &arm_mmu500_impl;
>  		break;
>  	case CAVIUM_SMMUV2:
> diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c

I wonder if it would be better to name this arm-smmu-tegra.c to make it
clearer that this is for a Tegra chip. We do have regular expressions in
MAINTAINERS that catch anything with "tegra" in it to make this easier.

> new file mode 100644
> index 0000000000000..dafc293a45217
> --- /dev/null
> +++ b/drivers/iommu/arm-smmu-nvidia.c
> @@ -0,0 +1,161 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +// Nvidia ARM SMMU v2 implementation quirks

s/Nvidia/NVIDIA/

> +// Copyright (C) 2019 NVIDIA CORPORATION.  All rights reserved.

I suppose this should now also include 2020.

> +
> +#define pr_fmt(fmt) "nvidia-smmu: " fmt

Same here. Might be worth making this "tegra-smmu: " for consistency.

> +
> +#include <linux/bitfield.h>
> +#include <linux/delay.h>
> +#include <linux/of.h>
> +#include <linux/platform_device.h>
> +#include <linux/slab.h>
> +
> +#include "arm-smmu.h"
> +
> +/* Tegra194 has three ARM MMU-500 Instances.
> + * Two of them are used together for Interleaved IOVA accesses and
> + * used by Non-Isochronous Hw devices for SMMU translations.

"non-isochronous", s/Hw/HW/

> + * Third one is used for SMMU translations from Isochronous HW devices.

"isochronous"

> + * It is possible to use this Implementation to program either

"implementation"

> + * all three or two of the instances identically as desired through
> + * DT node.
> + *
> + * Programming all the three instances identically comes with redundant tlb

s/tlb/TLB/

> + * invalidations as all three never need to be tlb invalidated for a HW device.

Same here.

> + *
> + * When Linux Kernel supports multiple SMMU devices, The SMMU device used for

"kernel" and "..., the SMMU device"

> + * Isochornous HW devices should be added as a separate ARM MMU-500 device

"isochronous"

> + * in DT and be programmed independently for efficient tlb invalidates.

"TLB"

> + *
> + */
> +#define MAX_SMMU_INSTANCES 3
> +
> +#define TLB_LOOP_TIMEOUT		1000000	/* 1s! */

USEC_PER_SEC?

> +#define TLB_SPIN_COUNT			10
> +
> +struct nvidia_smmu {
> +	struct arm_smmu_device	smmu;
> +	unsigned int		num_inst;
> +	void __iomem		*bases[MAX_SMMU_INSTANCES];
> +};
> +
> +#define to_nvidia_smmu(s) container_of(s, struct nvidia_smmu, smmu)

Making this static inline can make error messages more readable.

> +
> +#define nsmmu_page(smmu, inst, page) \
> +	(((inst) ? to_nvidia_smmu(smmu)->bases[(inst)] : smmu->base) + \
> +	((page) << smmu->pgshift))

Can we simply define to_nvidia_smmu(smmu)->bases[0] = smmu->base in
nvidia_smmu_impl_init()? Then this would become just:

	to_nvidia_smmu(smmu)->bases[inst] + ((page) << (smmu)->pgshift)

Also, the nsmmu_ prefix looks somewhat odd here. You already use struct
nvidia_smmu as the name of the structure, so why not be consistent and
continue to use nvidia_smmu_ as the prefix for function names?

Or perhaps even use tegra_smmu_ as the prefix to match the filename
change I suggested earlier.

> +
> +static u32 nsmmu_read_reg(struct arm_smmu_device *smmu,
> +			      int page, int offset)
> +{
> +	return readl_relaxed(nsmmu_page(smmu, 0, page) + offset);
> +}
> +
> +static void nsmmu_write_reg(struct arm_smmu_device *smmu,
> +			    int page, int offset, u32 val)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++)
> +		writel_relaxed(val, nsmmu_page(smmu, i, page) + offset);
> +}
> +
> +static u64 nsmmu_read_reg64(struct arm_smmu_device *smmu,
> +				int page, int offset)
> +{
> +	return readq_relaxed(nsmmu_page(smmu, 0, page) + offset);
> +}
> +
> +static void nsmmu_write_reg64(struct arm_smmu_device *smmu,
> +				  int page, int offset, u64 val)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++)
> +		writeq_relaxed(val, nsmmu_page(smmu, i, page) + offset);
> +}
> +
> +static void nsmmu_tlb_sync(struct arm_smmu_device *smmu, int page,
> +			   int sync, int status)
> +{
> +	u32 reg;

I see this is being used to store the value read from a register. I find
it clearer to call this "value" or "val" (or in this case perhaps even
"status") because whenever I read "reg" I immediately think this is
meant to be a register offset, which can then be confusing when I see it
used in I/O accessors because it is in the wrong position.

But anyway, that's just my opinion and this is a bit subjective, so feel
free to ignore.

> +	unsigned int i;
> +	unsigned int spin_cnt, delay;
> +
> +	arm_smmu_writel(smmu, page, sync, 0);
> +
> +	for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
> +		for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
> +			reg = 0;

You may want to declare the variable at this scope since you never need
it outside. Also, use a space between variable initialization and the
for block below for better readability.

> +			for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++) {
> +				reg |= readl_relaxed(
> +					nsmmu_page(smmu, i, page) + status);
> +			}

Maybe add a separate variable for the page address so this can be a bit
uncluttered. Also, I'd prefer a blank line after the block for
readability.

> +			if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE))
> +				return;
> +			cpu_relax();

Blank line above cpu_relax() for readability.

> +		}
> +		udelay(delay);

Again, a blank line between blocks and subsequent statements can help
readability.

> +	}
> +	dev_err_ratelimited(smmu->dev,
> +			    "TLB sync timed out -- SMMU may be deadlocked\n");

Same here.

Also, is there anything we can do when this happens?

> +}
> +
> +static int nsmmu_reset(struct arm_smmu_device *smmu)
> +{
> +	u32 reg;
> +	unsigned int i;
> +
> +	for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++) {
> +		/* clear global FSR */
> +		reg = readl_relaxed(nsmmu_page(smmu, i, ARM_SMMU_GR0) +
> +				    ARM_SMMU_GR0_sGFSR);
> +		writel_relaxed(reg, nsmmu_page(smmu, i, ARM_SMMU_GR0) +
> +				    ARM_SMMU_GR0_sGFSR);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct arm_smmu_impl nvidia_smmu_impl = {
> +	.read_reg = nsmmu_read_reg,
> +	.write_reg = nsmmu_write_reg,
> +	.read_reg64 = nsmmu_read_reg64,
> +	.write_reg64 = nsmmu_write_reg64,
> +	.reset = nsmmu_reset,
> +	.tlb_sync = nsmmu_tlb_sync,
> +};
> +
> +struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu)
> +{
> +	unsigned int i;
> +	struct nvidia_smmu *nsmmu;
> +	struct resource *res;
> +	struct device *dev = smmu->dev;
> +	struct platform_device *pdev = to_platform_device(smmu->dev);
> +
> +	nsmmu = devm_kzalloc(smmu->dev, sizeof(*nsmmu), GFP_KERNEL);
> +	if (!nsmmu)
> +		return ERR_PTR(-ENOMEM);
> +
> +	nsmmu->smmu = *smmu;
> +	/* Instance 0 is ioremapped by arm-smmu.c */
> +	nsmmu->num_inst = 1;

Maybe add this here to simplify the nsmmu_page() macro above:

	nsmmu->bases[0] = smmu->base;

> +
> +	for (i = 1; i < MAX_SMMU_INSTANCES; i++) {
> +		res = platform_get_resource(pdev, IORESOURCE_MEM, i);
> +		if (!res)
> +			break;
> +		nsmmu->bases[i] = devm_ioremap_resource(dev, res);
> +		if (IS_ERR(nsmmu->bases[i]))
> +			return (struct arm_smmu_device *)nsmmu->bases[i];

ERR_CAST()?

> +		nsmmu->num_inst++;
> +	}

More blank lines would make this much easier to read, in my opinion.

> +
> +	nsmmu->smmu.impl = &nvidia_smmu_impl;
> +	devm_kfree(smmu->dev, smmu);

Maybe a comment here would be useful for readers to immediately
understand why you're doing this here.

> +	pr_info("NVIDIA ARM SMMU Implementation, Instances=%d\n",
> +		nsmmu->num_inst);

I think I'd just omit this. In general you should only let the user know
when things go wrong, but the above is printed when everything goes as
expected.

Thierry
Robin Murphy June 23, 2020, 11:16 a.m. UTC | #2
On 2020-06-23 11:29, Thierry Reding wrote:
[...]
>> diff --git a/drivers/iommu/arm-smmu-impl.c b/drivers/iommu/arm-smmu-impl.c
>> index c75b9d957b702..52c84c30f83e4 100644
>> --- a/drivers/iommu/arm-smmu-impl.c
>> +++ b/drivers/iommu/arm-smmu-impl.c
>> @@ -160,6 +160,9 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
>>   	 */
>>   	switch (smmu->model) {
>>   	case ARM_MMU500:
>> +		if (of_device_is_compatible(smmu->dev->of_node,
>> +					    "nvidia,tegra194-smmu-500"))
>> +			return nvidia_smmu_impl_init(smmu);
> 
> Should NVIDIA_TEGRA194_SMMU be a separate value for smmu->model,
> perhaps? That way we avoid this somewhat odd check here.

No, this is simply in the wrong place. The design here is that we pick 
up anything related to the basic SMMU IP (model) first, then make any 
platform-specific integration checks. That way a platform-specific init 
function can see the model impl set and subclass it if necessary 
(although nobody's actually done that yet). The setup for Cavium is just 
a short-cut since their model is unique to their integration, so the 
lines get a bit blurred and there's little benefit to trying to separate 
it out.

In short, put this down below with the other of_device_is_compatible() 
checks.

>>   		smmu->impl = &arm_mmu500_impl;
>>   		break;
>>   	case CAVIUM_SMMUV2:
>> diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c
> 
> I wonder if it would be better to name this arm-smmu-tegra.c to make it
> clearer that this is for a Tegra chip. We do have regular expressions in
> MAINTAINERS that catch anything with "tegra" in it to make this easier.

There was a notion that these would be grouped by vendor, but if there's 
a strong preference for all NVIDIA-SoC-related stuff to be named "Tegra" 
then I'm not going to complain too much.

>> new file mode 100644
>> index 0000000000000..dafc293a45217
>> --- /dev/null
>> +++ b/drivers/iommu/arm-smmu-nvidia.c
>> @@ -0,0 +1,161 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +// Nvidia ARM SMMU v2 implementation quirks
> 
> s/Nvidia/NVIDIA/
> 
>> +// Copyright (C) 2019 NVIDIA CORPORATION.  All rights reserved.
> 
> I suppose this should now also include 2020.
> 
>> +
>> +#define pr_fmt(fmt) "nvidia-smmu: " fmt
> 
> Same here. Might be worth making this "tegra-smmu: " for consistency.

On the other hand, a log prefix that is literally the name of a 
completely unrelated driver seems more confusing to users than useful. 
Same for the function naming - the tegra_smmu_* namespace is already 
owned by that driver.

Robin.
Thierry Reding June 23, 2020, 12:47 p.m. UTC | #3
On Tue, Jun 23, 2020 at 12:16:55PM +0100, Robin Murphy wrote:
> On 2020-06-23 11:29, Thierry Reding wrote:
> [...]
> > > diff --git a/drivers/iommu/arm-smmu-impl.c b/drivers/iommu/arm-smmu-impl.c
> > > index c75b9d957b702..52c84c30f83e4 100644
> > > --- a/drivers/iommu/arm-smmu-impl.c
> > > +++ b/drivers/iommu/arm-smmu-impl.c
> > > @@ -160,6 +160,9 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
> > >   	 */
> > >   	switch (smmu->model) {
> > >   	case ARM_MMU500:
> > > +		if (of_device_is_compatible(smmu->dev->of_node,
> > > +					    "nvidia,tegra194-smmu-500"))
> > > +			return nvidia_smmu_impl_init(smmu);
> > 
> > Should NVIDIA_TEGRA194_SMMU be a separate value for smmu->model,
> > perhaps? That way we avoid this somewhat odd check here.
> 
> No, this is simply in the wrong place. The design here is that we pick up
> anything related to the basic SMMU IP (model) first, then make any
> platform-specific integration checks. That way a platform-specific init
> function can see the model impl set and subclass it if necessary (although
> nobody's actually done that yet). The setup for Cavium is just a short-cut
> since their model is unique to their integration, so the lines get a bit
> blurred and there's little benefit to trying to separate it out.
> 
> In short, put this down below with the other of_device_is_compatible()
> checks.
> 
> > >   		smmu->impl = &arm_mmu500_impl;
> > >   		break;
> > >   	case CAVIUM_SMMUV2:
> > > diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c
> > 
> > I wonder if it would be better to name this arm-smmu-tegra.c to make it
> > clearer that this is for a Tegra chip. We do have regular expressions in
> > MAINTAINERS that catch anything with "tegra" in it to make this easier.
> 
> There was a notion that these would be grouped by vendor, but if there's a
> strong preference for all NVIDIA-SoC-related stuff to be named "Tegra" then
> I'm not going to complain too much.

Maybe I was being overly cautious. I was just trying to avoid adding
something called nvidia-arm-smmu which might eventually turn out to be
ambiguous if there was ever a non-Tegra chip and the ARM SMMU
implementation was not compatible with the one instantiated on Tegra.

Note that I have no knowledge of such a chip being designed, so this may
never actually become an issue.

In either case, the compatible string already identifies this as Tegra-
specific, so we could always change the driver name later on if we have
to.

> > > new file mode 100644
> > > index 0000000000000..dafc293a45217
> > > --- /dev/null
> > > +++ b/drivers/iommu/arm-smmu-nvidia.c
> > > @@ -0,0 +1,161 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +// Nvidia ARM SMMU v2 implementation quirks
> > 
> > s/Nvidia/NVIDIA/
> > 
> > > +// Copyright (C) 2019 NVIDIA CORPORATION.  All rights reserved.
> > 
> > I suppose this should now also include 2020.
> > 
> > > +
> > > +#define pr_fmt(fmt) "nvidia-smmu: " fmt
> > 
> > Same here. Might be worth making this "tegra-smmu: " for consistency.
> 
> On the other hand, a log prefix that is literally the name of a completely
> unrelated driver seems more confusing to users than useful. Same for the
> function naming - the tegra_smmu_* namespace is already owned by that
> driver.

The ARM SMMU replaced the Tegra SMMU on Tegra186 and later, so both
drivers are never going to run concurrently. The only "problem" might be
that both drivers have symbols with the same prefix, but since they
don't export any of those symbols I don't see how that would become a
real issue.

But then again, the Tegra SMMU is also technically an NVIDIA SMMU, so
sticking with the current name might also be confusing.

Perhaps a good compromise would be to use a "tegra194{-,_}smmu" prefix
instead, which would make this fairly specific to just Tegra194 (and
compatible chips). That's a fairly common pattern we've been following
on Tegra, as you can for example see in drivers/gpio/gpio-tegra186.c,
drivers/dma/tegra210-adma.c, drivers/memory/tegra/tegra186-emc.c, etc.

Thierry
Krishna Reddy June 25, 2020, 11:13 p.m. UTC | #4
>Should NVIDIA_TEGRA194_SMMU be a separate value for smmu->model, perhaps? That way we avoid this somewhat odd check here.

NVIDIA haven't made any changes to arm,mmu-500. It is only used in different topology.  New model would be mis-leading here.
As suggested by Robin, It can just be moved to end of function.

>> diff --git a/drivers/iommu/arm-smmu-nvidia.c 
>> b/drivers/iommu/arm-smmu-nvidia.c
>I wonder if it would be better to name this arm-smmu-tegra.c to make it clearer that this is for a Tegra chip. We do have regular expressions in MAINTAINERS that catch anything with "tegra" in it to make this easier.
>Also, the nsmmu_ prefix looks somewhat odd here. You already use struct nvidia_smmu as the name of the structure, so why not be consistent and continue to use nvidia_smmu_ as the prefix for function names?
>Or perhaps even use tegra_smmu_ as the prefix to match the filename change I suggested earlier.

Prefix can be updated to nvidia_smmu as we seem to be okay for now to keep file name as arm-smmu-nvidia.c after the vendor name.  

>> +#define TLB_LOOP_TIMEOUT		1000000	/* 1s! */
>USEC_PER_SEC?

It is not meant for a conversion. Reused Timeout variable from arm-smmu.c for tlb_sync implementation.  Can rename it to TLB_LOOP_TIMEOUT_IN_US.


>> +	}
>> +	dev_err_ratelimited(smmu->dev,
>> +			    "TLB sync timed out -- SMMU may be deadlocked\n");
>Same here.
>Also, is there anything we can do when this happens?

This is never expected to happen on Silicon. This code and message is reused from arm-smmu.c.


>> +#define nsmmu_page(smmu, inst, page) \
>> +	(((inst) ? to_nvidia_smmu(smmu)->bases[(inst)] : smmu->base) + \
>> +	((page) << smmu->pgshift))

>Can we simply define to_nvidia_smmu(smmu)->bases[0] = smmu->base in nvidia_smmu_impl_init()? Then this would become just:
>	to_nvidia_smmu(smmu)->bases[inst] + ((page) << (smmu)->pgshift)
> +
>Maybe add this here to simplify the nsmmu_page() macro above:
>	nsmmu->bases[0] = smmu->base;

This preferred to avoid the check in nsmmu_page(). But, smmu->base is not yet populated when nvidia_smmu_impl_init() is called.  
Let me look at the alternative place to set it.

-KR
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 50659d76976b7..118da0893c964 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16572,9 +16572,11 @@  F:	drivers/i2c/busses/i2c-tegra.c
 
 TEGRA IOMMU DRIVERS
 M:	Thierry Reding <thierry.reding@gmail.com>
+R:	Krishna Reddy <vdumpa@nvidia.com>
 L:	linux-tegra@vger.kernel.org
 S:	Supported
 F:	drivers/iommu/tegra*
+F:	drivers/iommu/arm-smmu-nvidia.c
 
 TEGRA KBC DRIVER
 M:	Laxman Dewangan <ldewangan@nvidia.com>
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 57cf4ba5e27cb..35542df00da72 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -15,7 +15,7 @@  obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o amd_iommu_quirks.o
 obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o
 obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
 obj-$(CONFIG_ARM_SMMU) += arm_smmu.o
-arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-qcom.o
+arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-qcom.o arm-smmu-nvidia.o
 obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
diff --git a/drivers/iommu/arm-smmu-impl.c b/drivers/iommu/arm-smmu-impl.c
index c75b9d957b702..52c84c30f83e4 100644
--- a/drivers/iommu/arm-smmu-impl.c
+++ b/drivers/iommu/arm-smmu-impl.c
@@ -160,6 +160,9 @@  struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
 	 */
 	switch (smmu->model) {
 	case ARM_MMU500:
+		if (of_device_is_compatible(smmu->dev->of_node,
+					    "nvidia,tegra194-smmu-500"))
+			return nvidia_smmu_impl_init(smmu);
 		smmu->impl = &arm_mmu500_impl;
 		break;
 	case CAVIUM_SMMUV2:
diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c
new file mode 100644
index 0000000000000..dafc293a45217
--- /dev/null
+++ b/drivers/iommu/arm-smmu-nvidia.c
@@ -0,0 +1,161 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+// Nvidia ARM SMMU v2 implementation quirks
+// Copyright (C) 2019 NVIDIA CORPORATION.  All rights reserved.
+
+#define pr_fmt(fmt) "nvidia-smmu: " fmt
+
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include "arm-smmu.h"
+
+/* Tegra194 has three ARM MMU-500 Instances.
+ * Two of them are used together for Interleaved IOVA accesses and
+ * used by Non-Isochronous Hw devices for SMMU translations.
+ * Third one is used for SMMU translations from Isochronous HW devices.
+ * It is possible to use this Implementation to program either
+ * all three or two of the instances identically as desired through
+ * DT node.
+ *
+ * Programming all the three instances identically comes with redundant tlb
+ * invalidations as all three never need to be tlb invalidated for a HW device.
+ *
+ * When Linux Kernel supports multiple SMMU devices, The SMMU device used for
+ * Isochornous HW devices should be added as a separate ARM MMU-500 device
+ * in DT and be programmed independently for efficient tlb invalidates.
+ *
+ */
+#define MAX_SMMU_INSTANCES 3
+
+#define TLB_LOOP_TIMEOUT		1000000	/* 1s! */
+#define TLB_SPIN_COUNT			10
+
+struct nvidia_smmu {
+	struct arm_smmu_device	smmu;
+	unsigned int		num_inst;
+	void __iomem		*bases[MAX_SMMU_INSTANCES];
+};
+
+#define to_nvidia_smmu(s) container_of(s, struct nvidia_smmu, smmu)
+
+#define nsmmu_page(smmu, inst, page) \
+	(((inst) ? to_nvidia_smmu(smmu)->bases[(inst)] : smmu->base) + \
+	((page) << smmu->pgshift))
+
+static u32 nsmmu_read_reg(struct arm_smmu_device *smmu,
+			      int page, int offset)
+{
+	return readl_relaxed(nsmmu_page(smmu, 0, page) + offset);
+}
+
+static void nsmmu_write_reg(struct arm_smmu_device *smmu,
+			    int page, int offset, u32 val)
+{
+	unsigned int i;
+
+	for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++)
+		writel_relaxed(val, nsmmu_page(smmu, i, page) + offset);
+}
+
+static u64 nsmmu_read_reg64(struct arm_smmu_device *smmu,
+				int page, int offset)
+{
+	return readq_relaxed(nsmmu_page(smmu, 0, page) + offset);
+}
+
+static void nsmmu_write_reg64(struct arm_smmu_device *smmu,
+				  int page, int offset, u64 val)
+{
+	unsigned int i;
+
+	for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++)
+		writeq_relaxed(val, nsmmu_page(smmu, i, page) + offset);
+}
+
+static void nsmmu_tlb_sync(struct arm_smmu_device *smmu, int page,
+			   int sync, int status)
+{
+	u32 reg;
+	unsigned int i;
+	unsigned int spin_cnt, delay;
+
+	arm_smmu_writel(smmu, page, sync, 0);
+
+	for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
+		for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
+			reg = 0;
+			for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++) {
+				reg |= readl_relaxed(
+					nsmmu_page(smmu, i, page) + status);
+			}
+			if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE))
+				return;
+			cpu_relax();
+		}
+		udelay(delay);
+	}
+	dev_err_ratelimited(smmu->dev,
+			    "TLB sync timed out -- SMMU may be deadlocked\n");
+}
+
+static int nsmmu_reset(struct arm_smmu_device *smmu)
+{
+	u32 reg;
+	unsigned int i;
+
+	for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++) {
+		/* clear global FSR */
+		reg = readl_relaxed(nsmmu_page(smmu, i, ARM_SMMU_GR0) +
+				    ARM_SMMU_GR0_sGFSR);
+		writel_relaxed(reg, nsmmu_page(smmu, i, ARM_SMMU_GR0) +
+				    ARM_SMMU_GR0_sGFSR);
+	}
+
+	return 0;
+}
+
+static const struct arm_smmu_impl nvidia_smmu_impl = {
+	.read_reg = nsmmu_read_reg,
+	.write_reg = nsmmu_write_reg,
+	.read_reg64 = nsmmu_read_reg64,
+	.write_reg64 = nsmmu_write_reg64,
+	.reset = nsmmu_reset,
+	.tlb_sync = nsmmu_tlb_sync,
+};
+
+struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu)
+{
+	unsigned int i;
+	struct nvidia_smmu *nsmmu;
+	struct resource *res;
+	struct device *dev = smmu->dev;
+	struct platform_device *pdev = to_platform_device(smmu->dev);
+
+	nsmmu = devm_kzalloc(smmu->dev, sizeof(*nsmmu), GFP_KERNEL);
+	if (!nsmmu)
+		return ERR_PTR(-ENOMEM);
+
+	nsmmu->smmu = *smmu;
+	/* Instance 0 is ioremapped by arm-smmu.c */
+	nsmmu->num_inst = 1;
+
+	for (i = 1; i < MAX_SMMU_INSTANCES; i++) {
+		res = platform_get_resource(pdev, IORESOURCE_MEM, i);
+		if (!res)
+			break;
+		nsmmu->bases[i] = devm_ioremap_resource(dev, res);
+		if (IS_ERR(nsmmu->bases[i]))
+			return (struct arm_smmu_device *)nsmmu->bases[i];
+		nsmmu->num_inst++;
+	}
+
+	nsmmu->smmu.impl = &nvidia_smmu_impl;
+	devm_kfree(smmu->dev, smmu);
+	pr_info("NVIDIA ARM SMMU Implementation, Instances=%d\n",
+		nsmmu->num_inst);
+
+	return &nsmmu->smmu;
+}
diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
index d172c024be618..d88374b68da31 100644
--- a/drivers/iommu/arm-smmu.h
+++ b/drivers/iommu/arm-smmu.h
@@ -451,6 +451,7 @@  static inline void arm_smmu_writeq(struct arm_smmu_device *smmu, int page,
 
 struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu);
+struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu);
 
 int arm_mmu500_reset(struct arm_smmu_device *smmu);