
[05/33] drm/omap: partial workaround for DRA7 DMM errata i878

Message ID 1455875288-4370-6-git-send-email-tomi.valkeinen@ti.com (mailing list archive)
State New, archived

Commit Message

Tomi Valkeinen Feb. 19, 2016, 9:47 a.m. UTC
Errata i878 says that MPU should not be used to access RAM and DMM at
the same time. As it's not possible to prevent MPU accessing RAM, we
need to access DMM via a proxy.

This patch changes DMM driver to access DMM registers via sDMA. Instead
of doing a normal readl/writel call to read/write a register, we use
sDMA to copy 4 bytes from/to the DMM registers.

This patch provides only a partial workaround for i878, as not only DMM
register reads/writes are affected, but also accesses to the DMM mapped
buffers (framebuffers, usually).

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
---
 Documentation/devicetree/bindings/arm/omap/dmm.txt |   3 +-
 drivers/gpu/drm/omapdrm/omap_dmm_priv.h            |   8 ++
 drivers/gpu/drm/omapdrm/omap_dmm_tiler.c           | 141 ++++++++++++++++++++-
 3 files changed, 149 insertions(+), 3 deletions(-)

Comments

Laurent Pinchart Feb. 23, 2016, 9:57 p.m. UTC | #1
Hi Tomi,

Thank you for the patch.

On Friday 19 February 2016 11:47:40 Tomi Valkeinen wrote:
> Errata i878 says that MPU should not be used to access RAM and DMM at
> the same time. As it's not possible to prevent MPU accessing RAM, we
> need to access DMM via a proxy.
> 
> This patch changes DMM driver to access DMM registers via sDMA. Instead
> of doing a normal readl/writel call to read/write a register, we use
> sDMA to copy 4 bytes from/to the DMM registers.
> 
> This patch provides only a partial workaround for i878, as not only DMM
> register reads/writes are affected, but also accesses to the DMM mapped
> buffers (framebuffers, usually).

Will this patch really improve the situation if the DMM mapping is accessed 
anyway ?

> Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
> ---
>  Documentation/devicetree/bindings/arm/omap/dmm.txt |   3 +-
>  drivers/gpu/drm/omapdrm/omap_dmm_priv.h            |   8 ++
>  drivers/gpu/drm/omapdrm/omap_dmm_tiler.c           | 141 +++++++++++++++++-
>  3 files changed, 149 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/arm/omap/dmm.txt
> b/Documentation/devicetree/bindings/arm/omap/dmm.txt index
> 8bd6d0a238a8..e5fc2eb7f4da 100644
> --- a/Documentation/devicetree/bindings/arm/omap/dmm.txt
> +++ b/Documentation/devicetree/bindings/arm/omap/dmm.txt
> @@ -8,7 +8,8 @@ translation for initiators which need contiguous dma bus
> addresses.
> 
>  Required properties:
>  - compatible:	Should contain "ti,omap4-dmm" for OMAP4 family
> -		Should contain "ti,omap5-dmm" for OMAP5 and DRA7x family
> +		Should contain "ti,omap5-dmm" for OMAP5 family
> +		Should contain "ti,dra7-dmm" for DRA7x family

Isn't it DRA7xx instead of DRA7x ?

>  - reg:		Contains DMM register address range (base address and length)
>  - interrupts:	Should contain an interrupt-specifier for DMM_IRQ.
>  - ti,hwmods:	Name of the hwmod associated to DMM, which is typically "dmm"
> diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
> b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h index 9f32a83ca507..4d292ba5bcca
> 100644
> --- a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
> +++ b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
> @@ -155,10 +155,12 @@ struct refill_engine {
> 
>  struct dmm_platform_data {
>  	uint32_t cpu_cache_flags;
> +	bool errata_i878_wa;
>  };
> 
>  struct dmm {
>  	struct device *dev;
> +	u32 phys_base;
>  	void __iomem *base;
>  	int irq;
> 
> @@ -189,6 +191,12 @@ struct dmm {
>  	struct list_head alloc_head;
> 
>  	const struct dmm_platform_data *plat_data;
> +
> +	bool dmm_workaround;
> +	spinlock_t wa_lock;
> +	u32 *wa_dma_data;
> +	dma_addr_t wa_dma_handle;
> +	struct dma_chan *wa_dma_chan;
>  };
> 
>  #endif
> diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
> b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c index fe5260477b52..3ec5c6633585
> 100644
> --- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
> +++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
> @@ -19,6 +19,7 @@
>  #include <linux/completion.h>
>  #include <linux/delay.h>
>  #include <linux/dma-mapping.h>
> +#include <linux/dmaengine.h>
>  #include <linux/errno.h>
>  #include <linux/init.h>
>  #include <linux/interrupt.h>
> @@ -79,14 +80,124 @@ static const uint32_t reg[][4] = {
>  			DMM_PAT_DESCR__2, DMM_PAT_DESCR__3},
>  };
> 
> +static int dmm_dma_copy(struct dmm *dmm, u32 src, u32 dst)

Anything wrong with using dma_addr_t ?

> +{
> +	struct dma_device *dma_dev = dmm->wa_dma_chan->device;
> +	struct dma_async_tx_descriptor *tx = NULL;

No need to initialize tx to NULL.

> +	enum dma_status status;
> +	dma_cookie_t cookie;
> +
> +	tx = dma_dev->device_prep_dma_memcpy(dmm->wa_dma_chan, dst, src, 4, 0);

Given that you're transferring between an I/O mapped register and system 
memory, I believe you should use dmaengine_prep_slave_single() instead, with a 
call to dmaengine_slave_config() to set the I/O mapped register physical 
address. You will also need to request a slave DMA channel instead of a memcpy 
DMA channel.
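
Something along these lines, perhaps (an untested sketch for the read
path, reusing the names from your patch, error handling omitted):

	struct dma_slave_config cfg = { 0 };

	/* point the slave side at the DMM register's physical address */
	cfg.src_addr = dmm->phys_base + reg;
	cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
	cfg.direction = DMA_DEV_TO_MEM;
	dmaengine_slave_config(dmm->wa_dma_chan, &cfg);

	tx = dmaengine_prep_slave_single(dmm->wa_dma_chan, dmm->wa_dma_handle,
					 4, DMA_DEV_TO_MEM, 0);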

> +	if (!tx) {
> +		dev_err(dmm->dev, "Failed to prepare DMA memcpy\n");
> +		return -EIO;
> +	}
> +
> +	cookie = tx->tx_submit(tx);
> +	if (dma_submit_error(cookie)) {
> +		dev_err(dmm->dev, "Failed to do DMA tx_submit\n");
> +		return -EIO;
> +	}
> +
> +	dma_async_issue_pending(dmm->wa_dma_chan);
> +	status = dma_sync_wait(dmm->wa_dma_chan, cookie);
> +	if (status != DMA_COMPLETE)
> +		dev_err(dmm->dev, "i878 wa DMA copy failure\n");
> +
> +	dmaengine_terminate_all(dmm->wa_dma_chan);
> +	return 0;
> +}
> +
> +static u32 dmm_read_wa(struct dmm *dmm, u32 reg)
> +{
> +	u32 src, dst;
> +	int r;
> +
> +	src = (u32)(dmm->phys_base + reg);
> +	dst = (u32)dmm->wa_dma_handle;
> +
> +	r = dmm_dma_copy(dmm, src, dst);
> +	if (r) {
> +		dev_err(dmm->dev, "sDMA read transfer timeout\n");
> +		return readl(dmm->base + reg);
> +	}
> +
> +	return readl(dmm->wa_dma_data);

readl() has a memory barrier *after* the read, not before. Similarly writel() 
has its memory barrier *before* the write. I don't think that's what you want, 
shouldn't you use readl_relaxed() and writel_relaxed() with explicit barriers 
?

I'm also unsure whether readl and writel are the right primitives, as they 
target access to I/O memory, not system memory. Can't you use explicit 
barriers and direct access (return *dmm->wa_dma_data) ?
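
E.g. something like this (just a sketch):

	/* read side: make sure we observe what the DMA engine wrote */
	rmb();
	return *dmm->wa_dma_data;

and on the write side:

	*dmm->wa_dma_data = val;
	/* make the store visible to the DMA engine before kicking it */
	wmb();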

> +}
> +
> +static void dmm_write_wa(struct dmm *dmm, u32 val, u32 reg)
> +{
> +	u32 src, dst;
> +	int r;
> +
> +	writel(val, dmm->wa_dma_data);

Same comment here.

> +	src = (u32)dmm->wa_dma_handle;
> +	dst = (u32)(dmm->phys_base + reg);
> +
> +	r = dmm_dma_copy(dmm, src, dst);
> +	if (r) {
> +		dev_err(dmm->dev, "sDMA write transfer timeout\n");
> +		writel(val, dmm->base + reg);
> +	}
> +}
> +
>  static u32 dmm_read(struct dmm *dmm, u32 reg)
>  {
> -	return readl(dmm->base + reg);
> +	if (dmm->dmm_workaround) {
> +		u32 v;
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&dmm->wa_lock, flags);
> +		v = dmm_read_wa(dmm, reg);

dma_sync_wait() with a spinlock held doesn't seem like the best idea to me.

> +		spin_unlock_irqrestore(&dmm->wa_lock, flags);
> +
> +		return v;
> +	} else {
> +		return readl(dmm->base + reg);
> +	}
>  }
> 
>  static void dmm_write(struct dmm *dmm, u32 val, u32 reg)
>  {
> -	writel(val, dmm->base + reg);
> +	if (dmm->dmm_workaround) {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&dmm->wa_lock, flags);
> +		dmm_write_wa(dmm, val, reg);

Ditto.

> +		spin_unlock_irqrestore(&dmm->wa_lock, flags);
> +	} else {
> +		writel(val, dmm->base + reg);
> +	}
> +}
> +
> +static int dmm_workaround_init(struct dmm *dmm)
> +{
> +	dma_cap_mask_t mask;
> +
> +	dma_cap_zero(mask);
> +	dma_cap_set(DMA_MEMCPY, mask);

Nitpicking, I'd move those two lines down right before the 
dma_request_channel() call, as they are part of the same logical block of 
code.

> +	spin_lock_init(&dmm->wa_lock);
> +
> +	dmm->wa_dma_data = dma_alloc_coherent(dmm->dev, 4, &dmm->wa_dma_handle,
> GFP_KERNEL);

Maybe a macro instead of 4 ?

> +	if (!dmm->wa_dma_data)
> +		return -ENOMEM;
> +
> +	dmm->wa_dma_chan = dma_request_channel(mask, NULL, NULL);
> +	if (!dmm->wa_dma_chan) {
> +		dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle);
> +		return -ENODEV;
> +	}
> +
> +	return 0;
> +}
> +
> +static void dmm_workaround_uninit(struct dmm *dmm)
> +{
> +	dma_release_channel(dmm->wa_dma_chan);
> +
> +	dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle);
>  }
> 
>  /* simple allocator to grab next 16 byte aligned memory from txn */
> @@ -606,6 +717,9 @@ static int omap_dmm_remove(struct platform_device *dev)
>  		if (omap_dmm->dummy_page)
>  			__free_page(omap_dmm->dummy_page);
> 
> +		if (omap_dmm->dmm_workaround)
> +			dmm_workaround_uninit(omap_dmm);
> +
>  		if (omap_dmm->irq > 0)
>  			free_irq(omap_dmm->irq, omap_dmm);
> 
> @@ -653,6 +767,7 @@ static int omap_dmm_probe(struct platform_device *dev)
>  		goto fail;
>  	}
> 
> +	omap_dmm->phys_base = mem->start;
>  	omap_dmm->base = ioremap(mem->start, SZ_2K);
> 
>  	if (!omap_dmm->base) {
> @@ -668,6 +783,19 @@ static int omap_dmm_probe(struct platform_device *dev)
> 
>  	omap_dmm->dev = &dev->dev;
> 
> +	omap_dmm->dmm_workaround = omap_dmm->plat_data->errata_i878_wa;
> +
> +	if (omap_dmm->dmm_workaround) {

I'd test omap_dmm->plat_data->errata_i878_wa here, omap_dmm->dmm_workaround is 
initialized to false by kzalloc().

> +		int r;
> +		r = dmm_workaround_init(omap_dmm);

Can't you use the ret variable ? I know it's pre-initialized to -EFAULT, but 
it seems better to me to set it to explicit values in the three goto fail; 
error cases that would be impacted, especially given that -EFAULT is the wrong 
error code to return (and as it would touch that code, the 
platform_get_resource() + ioremap() calls can be optimized in a 
devm_ioremap_resource() in another cleanup patch).
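
I.e. roughly, in that separate cleanup patch (untested):

	mem = platform_get_resource(dev, IORESOURCE_MEM, 0);
	/* devm_ioremap_resource() also validates mem for us */
	omap_dmm->base = devm_ioremap_resource(&dev->dev, mem);
	if (IS_ERR(omap_dmm->base)) {
		ret = PTR_ERR(omap_dmm->base);
		goto fail;
	}
	omap_dmm->phys_base = mem->start;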

> +		if (r) {
> +			omap_dmm->dmm_workaround = false;
> +			dev_err(&dev->dev, "failed to initialize work-around, WA not used\n");

As this is a non-fatal error, maybe dev_warn() ?

> +		} else {
> +			dev_info(&dev->dev, "workaround for errata i878 in use\n");
> +		}
> +	}
> +
>  	hwinfo = dmm_read(omap_dmm, DMM_PAT_HWINFO);
>  	omap_dmm->num_engines = (hwinfo >> 24) & 0x1F;
>  	omap_dmm->num_lut = (hwinfo >> 16) & 0x1F;
> @@ -1031,6 +1159,11 @@ static const struct dmm_platform_data
> dmm_omap5_platform_data = { .cpu_cache_flags = OMAP_BO_UNCACHED,
>  };
> 
> +static const struct dmm_platform_data dmm_dra7_platform_data = {
> +	.cpu_cache_flags = OMAP_BO_UNCACHED,
> +	.errata_i878_wa = true,
> +};
> +
>  static const struct of_device_id dmm_of_match[] = {
>  	{
>  		.compatible = "ti,omap4-dmm",
> @@ -1040,6 +1173,10 @@ static const struct of_device_id dmm_of_match[] = {
>  		.compatible = "ti,omap5-dmm",
>  		.data = &dmm_omap5_platform_data,
>  	},
> +	{
> +		.compatible = "ti,dra7-dmm",
> +		.data = &dmm_dra7_platform_data,
> +	},
>  	{},
>  };
>  #endif
Tomi Valkeinen Feb. 24, 2016, 9:14 a.m. UTC | #2
Hi,

On 23/02/16 23:57, Laurent Pinchart wrote:
> Hi Tomi,
> 
> Thank you for the patch.

Thanks for the review, good points. I'll drop this patch from the
series for now: this issue is rather difficult to reproduce and test,
so any changes I make to the patch will take time to verify.

I'll address the comments later.

> On Friday 19 February 2016 11:47:40 Tomi Valkeinen wrote:
>> Errata i878 says that MPU should not be used to access RAM and DMM at
>> the same time. As it's not possible to prevent MPU accessing RAM, we
>> need to access DMM via a proxy.
>>
>> This patch changes DMM driver to access DMM registers via sDMA. Instead
>> of doing a normal readl/writel call to read/write a register, we use
>> sDMA to copy 4 bytes from/to the DMM registers.
>>
>> This patch provides only a partial workaround for i878, as not only DMM
>> register reads/writes are affected, but also accesses to the DMM mapped
>> buffers (framebuffers, usually).
> 
> Will this patch really improve the situation if the DMM mapping is accessed 
> anyway ?

Depends. For TI environments it helps, as we don't draw with the CPU.
If drawing is done with the CPU, the issue may still happen. I have
never seen it happen in normal situations, but I did manage to trigger
it with explicit stress testing.

Also, I don't have any good solution for the framebuffer part... except
doing a memcpy, which would ruin the performance.

 Tomi
Peter Ujfalusi Sept. 29, 2017, 12:29 p.m. UTC | #3
Hi Laurent,


On 2016-02-23 23:57, Laurent Pinchart wrote:
> Hi Tomi,
> 
> Thank you for the patch.
> 
> On Friday 19 February 2016 11:47:40 Tomi Valkeinen wrote:
>> Errata i878 says that MPU should not be used to access RAM and DMM at
>> the same time. As it's not possible to prevent MPU accessing RAM, we
>> need to access DMM via a proxy.
>>
>> This patch changes DMM driver to access DMM registers via sDMA. Instead
>> of doing a normal readl/writel call to read/write a register, we use
>> sDMA to copy 4 bytes from/to the DMM registers.
>>
>> This patch provides only a partial workaround for i878, as not only DMM
>> register reads/writes are affected, but also accesses to the DMM mapped
>> buffers (framebuffers, usually).
> 
> Will this patch really improve the situation if the DMM mapping is accessed 
> anyway ?
> 
>> Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
>> ---
>>  Documentation/devicetree/bindings/arm/omap/dmm.txt |   3 +-
>>  drivers/gpu/drm/omapdrm/omap_dmm_priv.h            |   8 ++
>>  drivers/gpu/drm/omapdrm/omap_dmm_tiler.c           | 141 +++++++++++++++++-
>>  3 files changed, 149 insertions(+), 3 deletions(-)
>>
>> diff --git a/Documentation/devicetree/bindings/arm/omap/dmm.txt
>> b/Documentation/devicetree/bindings/arm/omap/dmm.txt index
>> 8bd6d0a238a8..e5fc2eb7f4da 100644
>> --- a/Documentation/devicetree/bindings/arm/omap/dmm.txt
>> +++ b/Documentation/devicetree/bindings/arm/omap/dmm.txt
>> @@ -8,7 +8,8 @@ translation for initiators which need contiguous dma bus
>> addresses.
>>
>>  Required properties:
>>  - compatible:	Should contain "ti,omap4-dmm" for OMAP4 family
>> -		Should contain "ti,omap5-dmm" for OMAP5 and DRA7x family
>> +		Should contain "ti,omap5-dmm" for OMAP5 family
>> +		Should contain "ti,dra7-dmm" for DRA7x family
> 
> Isn't it DRA7xx instead of DRA7x ?
> 
>>  - reg:		Contains DMM register address range (base address and length)
>>  - interrupts:	Should contain an interrupt-specifier for DMM_IRQ.
>>  - ti,hwmods:	Name of the hwmod associated to DMM, which is typically "dmm"
>> diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
>> b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h index 9f32a83ca507..4d292ba5bcca
>> 100644
>> --- a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
>> +++ b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
>> @@ -155,10 +155,12 @@ struct refill_engine {
>>
>>  struct dmm_platform_data {
>>  	uint32_t cpu_cache_flags;
>> +	bool errata_i878_wa;
>>  };
>>
>>  struct dmm {
>>  	struct device *dev;
>> +	u32 phys_base;
>>  	void __iomem *base;
>>  	int irq;
>>
>> @@ -189,6 +191,12 @@ struct dmm {
>>  	struct list_head alloc_head;
>>
>>  	const struct dmm_platform_data *plat_data;
>> +
>> +	bool dmm_workaround;
>> +	spinlock_t wa_lock;
>> +	u32 *wa_dma_data;
>> +	dma_addr_t wa_dma_handle;
>> +	struct dma_chan *wa_dma_chan;
>>  };
>>
>>  #endif
>> diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
>> b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c index fe5260477b52..3ec5c6633585
>> 100644
>> --- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
>> +++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
>> @@ -19,6 +19,7 @@
>>  #include <linux/completion.h>
>>  #include <linux/delay.h>
>>  #include <linux/dma-mapping.h>
>> +#include <linux/dmaengine.h>
>>  #include <linux/errno.h>
>>  #include <linux/init.h>
>>  #include <linux/interrupt.h>
>> @@ -79,14 +80,124 @@ static const uint32_t reg[][4] = {
>>  			DMM_PAT_DESCR__2, DMM_PAT_DESCR__3},
>>  };
>>
>> +static int dmm_dma_copy(struct dmm *dmm, u32 src, u32 dst)
> 
> Anything wrong with using dma_addr_t ?

Nothing wrong with the dma_addr_t.

>> +{
>> +	struct dma_device *dma_dev = dmm->wa_dma_chan->device;
>> +	struct dma_async_tx_descriptor *tx = NULL;
> 
> No need to initialize tx to NULL.

True.

>> +	enum dma_status status;
>> +	dma_cookie_t cookie;
>> +
>> +	tx = dma_dev->device_prep_dma_memcpy(dmm->wa_dma_chan, dst, src, 4, 0);
> 
> Given that you're transferring between an I/O mapped register and system 
> memory, I believe you should use dmaengine_prep_slave_single() instead, with a 
> call to dmaengine_slave_config() to set the I/O mapped register physical 
> address. You will also need to request a slave DMA channel instead of a memcpy 
> DMA channel.

No, a slave channel cannot be used in this case, as a slave channel
would need a hardware trigger to do the work. Here we are not
implementing DMA support for the DMM; we are using the DMA to read and
write the registers due to the errata.

The DMA setup (sDMA for example) is almost identical in both cases,
with the exception that no trigger is configured for the memcpy type.
> 
>> +	if (!tx) {
>> +		dev_err(dmm->dev, "Failed to prepare DMA memcpy\n");
>> +		return -EIO;
>> +	}
>> +
>> +	cookie = tx->tx_submit(tx);
>> +	if (dma_submit_error(cookie)) {
>> +		dev_err(dmm->dev, "Failed to do DMA tx_submit\n");
>> +		return -EIO;
>> +	}
>> +
>> +	dma_async_issue_pending(dmm->wa_dma_chan);
>> +	status = dma_sync_wait(dmm->wa_dma_chan, cookie);
>> +	if (status != DMA_COMPLETE)
>> +		dev_err(dmm->dev, "i878 wa DMA copy failure\n");
>> +
>> +	dmaengine_terminate_all(dmm->wa_dma_chan);
>> +	return 0;
>> +}
>> +
>> +static u32 dmm_read_wa(struct dmm *dmm, u32 reg)
>> +{
>> +	u32 src, dst;
>> +	int r;
>> +
>> +	src = (u32)(dmm->phys_base + reg);
>> +	dst = (u32)dmm->wa_dma_handle;
>> +
>> +	r = dmm_dma_copy(dmm, src, dst);
>> +	if (r) {
>> +		dev_err(dmm->dev, "sDMA read transfer timeout\n");
>> +		return readl(dmm->base + reg);
>> +	}
>> +
>> +	return readl(dmm->wa_dma_data);
> 
> readl() has a memory barrier *after* the read, not before. Similarly writel() 
> has its memory barrier *before* the write. I don't think that's what you want, 
> shouldn't you use readl_relaxed() and writel_relaxed() with explicit barriers 
> ?
> 
> I'm also unsure whether readl and writel are the right primitives, as they 
> target access to I/O memory, not system memory. Can't you use explicit 
> barriers and direct access (return *dmm->wa_dma_data) ?

We would also need to mark it as volatile to make sure we actually go
out to RAM for the data.

This is what readl/writel do.
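
A direct access would then have to be something like:

	return *(volatile u32 *)dmm->wa_dma_data;

which, on a little-endian system, is more or less what readl_relaxed()
boils down to anyway.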

>> +}
>> +
>> +static void dmm_write_wa(struct dmm *dmm, u32 val, u32 reg)
>> +{
>> +	u32 src, dst;
>> +	int r;
>> +
>> +	writel(val, dmm->wa_dma_data);
> 
> Same comment here.


> 
>> +	src = (u32)dmm->wa_dma_handle;
>> +	dst = (u32)(dmm->phys_base + reg);
>> +
>> +	r = dmm_dma_copy(dmm, src, dst);
>> +	if (r) {
>> +		dev_err(dmm->dev, "sDMA write transfer timeout\n");
>> +		writel(val, dmm->base + reg);
>> +	}
>> +}
>> +
>>  static u32 dmm_read(struct dmm *dmm, u32 reg)
>>  {
>> -	return readl(dmm->base + reg);
>> +	if (dmm->dmm_workaround) {
>> +		u32 v;
>> +		unsigned long flags;
>> +
>> +		spin_lock_irqsave(&dmm->wa_lock, flags);
>> +		v = dmm_read_wa(dmm, reg);
> 
> dma_sync_wait() with a spinlock held doesn't seem like the best idea to me.

We are transferring 4 bytes in one go, and interrupts might be
disabled as well when we want to read registers. So if we were to use
async DMA + callback + wait_for_completion_timeout() in dmm_read/write,
things might just fail on us.

With dma_sync_wait() we are checking the tx_status of the transfer.
Since the transfer is so short, the DMA has already finished by the
time we first check the status, so we are not going to spin for long.

>> +		spin_unlock_irqrestore(&dmm->wa_lock, flags);
>> +
>> +		return v;
>> +	} else {
>> +		return readl(dmm->base + reg);
>> +	}
>>  }
>>
>>  static void dmm_write(struct dmm *dmm, u32 val, u32 reg)
>>  {
>> -	writel(val, dmm->base + reg);
>> +	if (dmm->dmm_workaround) {
>> +		unsigned long flags;
>> +
>> +		spin_lock_irqsave(&dmm->wa_lock, flags);
>> +		dmm_write_wa(dmm, val, reg);
> 
> Ditto.
> 
>> +		spin_unlock_irqrestore(&dmm->wa_lock, flags);
>> +	} else {
>> +		writel(val, dmm->base + reg);
>> +	}
>> +}
>> +
>> +static int dmm_workaround_init(struct dmm *dmm)
>> +{
>> +	dma_cap_mask_t mask;
>> +
>> +	dma_cap_zero(mask);
>> +	dma_cap_set(DMA_MEMCPY, mask);
> 
> Nitpicking, I'd move those two lines down right before the 
> dma_request_channel() call, as they are part of the same logical block of 
> code.

OK.

>> +	spin_lock_init(&dmm->wa_lock);
>> +
>> +	dmm->wa_dma_data = dma_alloc_coherent(dmm->dev, 4, &dmm->wa_dma_handle,
>> GFP_KERNEL);
> 
> Maybe a macro instead of 4 ?

sizeof(u32)? The DMM registers are 32 bits.
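
So, something like this (DMM_REG_SIZE is just a made-up name):

#define DMM_REG_SIZE	sizeof(u32)	/* DMM registers are 32 bits wide */

	dmm->wa_dma_data = dma_alloc_coherent(dmm->dev, DMM_REG_SIZE,
					      &dmm->wa_dma_handle, GFP_KERNEL);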

> 
>> +	if (!dmm->wa_dma_data)
>> +		return -ENOMEM;
>> +
>> +	dmm->wa_dma_chan = dma_request_channel(mask, NULL, NULL);
>> +	if (!dmm->wa_dma_chan) {
>> +		dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle);
>> +		return -ENODEV;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void dmm_workaround_uninit(struct dmm *dmm)
>> +{
>> +	dma_release_channel(dmm->wa_dma_chan);
>> +
>> +	dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle);
>>  }
>>
>>  /* simple allocator to grab next 16 byte aligned memory from txn */
>> @@ -606,6 +717,9 @@ static int omap_dmm_remove(struct platform_device *dev)
>>  		if (omap_dmm->dummy_page)
>>  			__free_page(omap_dmm->dummy_page);
>>
>> +		if (omap_dmm->dmm_workaround)
>> +			dmm_workaround_uninit(omap_dmm);
>> +
>>  		if (omap_dmm->irq > 0)
>>  			free_irq(omap_dmm->irq, omap_dmm);
>>
>> @@ -653,6 +767,7 @@ static int omap_dmm_probe(struct platform_device *dev)
>>  		goto fail;
>>  	}
>>
>> +	omap_dmm->phys_base = mem->start;
>>  	omap_dmm->base = ioremap(mem->start, SZ_2K);
>>
>>  	if (!omap_dmm->base) {
>> @@ -668,6 +783,19 @@ static int omap_dmm_probe(struct platform_device *dev)
>>
>>  	omap_dmm->dev = &dev->dev;
>>
>> +	omap_dmm->dmm_workaround = omap_dmm->plat_data->errata_i878_wa;
>> +
>> +	if (omap_dmm->dmm_workaround) {
> 
> I'd test omap_dmm->plat_data->errata_i878_wa here, omap_dmm->dmm_workaround is 
> initialized to false by kzalloc().
> 
>> +		int r;
>> +		r = dmm_workaround_init(omap_dmm);
> 
> Can't you use the ret variable ? I know it's pre-initialized to -EFAULT, but 
> it seems better to me to set it to explicit values in the three goto fail; 
> error cases that would be impacted, especially given that -EFAULT is the wrong 
> error code to return (and as it would touch that code, the 
> platform_get_resource() + ioremap() calls can be optimized in a 
> devm_ioremap_resource() in another cleanup patch).
> 
>> +		if (r) {
>> +			omap_dmm->dmm_workaround = false;
>> +			dev_err(&dev->dev, "failed to initialize work-around, WA not used\n");
> 
> As this is a non-fatal error, maybe dev_warn() ?

Since it is not treated as a fatal error, it might be simpler to

if (omap_dmm->plat_data->errata_i878_wa) {
	if (!dmm_workaround_init(omap_dmm)) {
		omap_dmm->dmm_workaround = true;
		dev_info(&dev->dev,
			 "workaround for errata i878 in use\n");
	} else {
		dev_warn(&dev->dev,
			 "failed to initialize work-around, WA not used\n");
	}
}

> 
>> +		} else {
>> +			dev_info(&dev->dev, "workaround for errata i878 in use\n");
>> +		}
>> +	}
>> +
>>  	hwinfo = dmm_read(omap_dmm, DMM_PAT_HWINFO);
>>  	omap_dmm->num_engines = (hwinfo >> 24) & 0x1F;
>>  	omap_dmm->num_lut = (hwinfo >> 16) & 0x1F;
>> @@ -1031,6 +1159,11 @@ static const struct dmm_platform_data
>> dmm_omap5_platform_data = { .cpu_cache_flags = OMAP_BO_UNCACHED,
>>  };
>>
>> +static const struct dmm_platform_data dmm_dra7_platform_data = {
>> +	.cpu_cache_flags = OMAP_BO_UNCACHED,
>> +	.errata_i878_wa = true,
>> +};
>> +
>>  static const struct of_device_id dmm_of_match[] = {
>>  	{
>>  		.compatible = "ti,omap4-dmm",
>> @@ -1040,6 +1173,10 @@ static const struct of_device_id dmm_of_match[] = {
>>  		.compatible = "ti,omap5-dmm",
>>  		.data = &dmm_omap5_platform_data,
>>  	},
>> +	{
>> +		.compatible = "ti,dra7-dmm",
>> +		.data = &dmm_dra7_platform_data,
>> +	},
>>  	{},
>>  };
>>  #endif
> 

- Péter

Patch

diff --git a/Documentation/devicetree/bindings/arm/omap/dmm.txt b/Documentation/devicetree/bindings/arm/omap/dmm.txt
index 8bd6d0a238a8..e5fc2eb7f4da 100644
--- a/Documentation/devicetree/bindings/arm/omap/dmm.txt
+++ b/Documentation/devicetree/bindings/arm/omap/dmm.txt
@@ -8,7 +8,8 @@  translation for initiators which need contiguous dma bus addresses.
 
 Required properties:
 - compatible:	Should contain "ti,omap4-dmm" for OMAP4 family
-		Should contain "ti,omap5-dmm" for OMAP5 and DRA7x family
+		Should contain "ti,omap5-dmm" for OMAP5 family
+		Should contain "ti,dra7-dmm" for DRA7x family
 - reg:		Contains DMM register address range (base address and length)
 - interrupts:	Should contain an interrupt-specifier for DMM_IRQ.
 - ti,hwmods:	Name of the hwmod associated to DMM, which is typically "dmm"
diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
index 9f32a83ca507..4d292ba5bcca 100644
--- a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
+++ b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
@@ -155,10 +155,12 @@  struct refill_engine {
 
 struct dmm_platform_data {
 	uint32_t cpu_cache_flags;
+	bool errata_i878_wa;
 };
 
 struct dmm {
 	struct device *dev;
+	u32 phys_base;
 	void __iomem *base;
 	int irq;
 
@@ -189,6 +191,12 @@  struct dmm {
 	struct list_head alloc_head;
 
 	const struct dmm_platform_data *plat_data;
+
+	bool dmm_workaround;
+	spinlock_t wa_lock;
+	u32 *wa_dma_data;
+	dma_addr_t wa_dma_handle;
+	struct dma_chan *wa_dma_chan;
 };
 
 #endif
diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
index fe5260477b52..3ec5c6633585 100644
--- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
+++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
@@ -19,6 +19,7 @@ 
 #include <linux/completion.h>
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
@@ -79,14 +80,124 @@  static const uint32_t reg[][4] = {
 			DMM_PAT_DESCR__2, DMM_PAT_DESCR__3},
 };
 
+static int dmm_dma_copy(struct dmm *dmm, u32 src, u32 dst)
+{
+	struct dma_device *dma_dev = dmm->wa_dma_chan->device;
+	struct dma_async_tx_descriptor *tx = NULL;
+	enum dma_status status;
+	dma_cookie_t cookie;
+
+	tx = dma_dev->device_prep_dma_memcpy(dmm->wa_dma_chan, dst, src, 4, 0);
+	if (!tx) {
+		dev_err(dmm->dev, "Failed to prepare DMA memcpy\n");
+		return -EIO;
+	}
+
+	cookie = tx->tx_submit(tx);
+	if (dma_submit_error(cookie)) {
+		dev_err(dmm->dev, "Failed to do DMA tx_submit\n");
+		return -EIO;
+	}
+
+	dma_async_issue_pending(dmm->wa_dma_chan);
+	status = dma_sync_wait(dmm->wa_dma_chan, cookie);
+	if (status != DMA_COMPLETE)
+		dev_err(dmm->dev, "i878 wa DMA copy failure\n");
+
+	dmaengine_terminate_all(dmm->wa_dma_chan);
+	return 0;
+}
+
+static u32 dmm_read_wa(struct dmm *dmm, u32 reg)
+{
+	u32 src, dst;
+	int r;
+
+	src = (u32)(dmm->phys_base + reg);
+	dst = (u32)dmm->wa_dma_handle;
+
+	r = dmm_dma_copy(dmm, src, dst);
+	if (r) {
+		dev_err(dmm->dev, "sDMA read transfer timeout\n");
+		return readl(dmm->base + reg);
+	}
+
+	return readl(dmm->wa_dma_data);
+}
+
+static void dmm_write_wa(struct dmm *dmm, u32 val, u32 reg)
+{
+	u32 src, dst;
+	int r;
+
+	writel(val, dmm->wa_dma_data);
+
+	src = (u32)dmm->wa_dma_handle;
+	dst = (u32)(dmm->phys_base + reg);
+
+	r = dmm_dma_copy(dmm, src, dst);
+	if (r) {
+		dev_err(dmm->dev, "sDMA write transfer timeout\n");
+		writel(val, dmm->base + reg);
+	}
+}
+
 static u32 dmm_read(struct dmm *dmm, u32 reg)
 {
-	return readl(dmm->base + reg);
+	if (dmm->dmm_workaround) {
+		u32 v;
+		unsigned long flags;
+
+		spin_lock_irqsave(&dmm->wa_lock, flags);
+		v = dmm_read_wa(dmm, reg);
+		spin_unlock_irqrestore(&dmm->wa_lock, flags);
+
+		return v;
+	} else {
+		return readl(dmm->base + reg);
+	}
 }
 
 static void dmm_write(struct dmm *dmm, u32 val, u32 reg)
 {
-	writel(val, dmm->base + reg);
+	if (dmm->dmm_workaround) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dmm->wa_lock, flags);
+		dmm_write_wa(dmm, val, reg);
+		spin_unlock_irqrestore(&dmm->wa_lock, flags);
+	} else {
+		writel(val, dmm->base + reg);
+	}
+}
+
+static int dmm_workaround_init(struct dmm *dmm)
+{
+	dma_cap_mask_t mask;
+
+	dma_cap_zero(mask);
+	dma_cap_set(DMA_MEMCPY, mask);
+
+	spin_lock_init(&dmm->wa_lock);
+
+	dmm->wa_dma_data = dma_alloc_coherent(dmm->dev, 4, &dmm->wa_dma_handle, GFP_KERNEL);
+	if (!dmm->wa_dma_data)
+		return -ENOMEM;
+
+	dmm->wa_dma_chan = dma_request_channel(mask, NULL, NULL);
+	if (!dmm->wa_dma_chan) {
+		dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static void dmm_workaround_uninit(struct dmm *dmm)
+{
+	dma_release_channel(dmm->wa_dma_chan);
+
+	dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle);
 }
 
 /* simple allocator to grab next 16 byte aligned memory from txn */
@@ -606,6 +717,9 @@  static int omap_dmm_remove(struct platform_device *dev)
 		if (omap_dmm->dummy_page)
 			__free_page(omap_dmm->dummy_page);
 
+		if (omap_dmm->dmm_workaround)
+			dmm_workaround_uninit(omap_dmm);
+
 		if (omap_dmm->irq > 0)
 			free_irq(omap_dmm->irq, omap_dmm);
 
@@ -653,6 +767,7 @@  static int omap_dmm_probe(struct platform_device *dev)
 		goto fail;
 	}
 
+	omap_dmm->phys_base = mem->start;
 	omap_dmm->base = ioremap(mem->start, SZ_2K);
 
 	if (!omap_dmm->base) {
@@ -668,6 +783,19 @@  static int omap_dmm_probe(struct platform_device *dev)
 
 	omap_dmm->dev = &dev->dev;
 
+	omap_dmm->dmm_workaround = omap_dmm->plat_data->errata_i878_wa;
+
+	if (omap_dmm->dmm_workaround) {
+		int r;
+		r = dmm_workaround_init(omap_dmm);
+		if (r) {
+			omap_dmm->dmm_workaround = false;
+			dev_err(&dev->dev, "failed to initialize work-around, WA not used\n");
+		} else {
+			dev_info(&dev->dev, "workaround for errata i878 in use\n");
+		}
+	}
+
 	hwinfo = dmm_read(omap_dmm, DMM_PAT_HWINFO);
 	omap_dmm->num_engines = (hwinfo >> 24) & 0x1F;
 	omap_dmm->num_lut = (hwinfo >> 16) & 0x1F;
@@ -1031,6 +1159,11 @@  static const struct dmm_platform_data dmm_omap5_platform_data = {
 	.cpu_cache_flags = OMAP_BO_UNCACHED,
 };
 
+static const struct dmm_platform_data dmm_dra7_platform_data = {
+	.cpu_cache_flags = OMAP_BO_UNCACHED,
+	.errata_i878_wa = true,
+};
+
 static const struct of_device_id dmm_of_match[] = {
 	{
 		.compatible = "ti,omap4-dmm",
@@ -1040,6 +1173,10 @@  static const struct of_device_id dmm_of_match[] = {
 		.compatible = "ti,omap5-dmm",
 		.data = &dmm_omap5_platform_data,
 	},
+	{
+		.compatible = "ti,dra7-dmm",
+		.data = &dmm_dra7_platform_data,
+	},
 	{},
 };
 #endif