diff mbox

[v4,07/11] soc/fsl/qbman: Rework portal mapping calls for ARM/PPC

Message ID 1503607075-28970-8-git-send-email-roy.pledge@nxp.com (mailing list archive)
State New, archived
Headers show

Commit Message

Roy Pledge Aug. 24, 2017, 8:37 p.m. UTC
Rework portal mapping for PPC and ARM. The PPC devices require a
cacheable coherent mapping while ARM will work with a non-cacheable/write
combine mapping. This also eliminates the need for manual cache
flushes on ARM.

Signed-off-by: Roy Pledge <roy.pledge@nxp.com>
---
 drivers/soc/fsl/qbman/bman.c        |  6 +++---
 drivers/soc/fsl/qbman/bman_portal.c | 36 +++++++++++++++++++++++-------------
 drivers/soc/fsl/qbman/bman_priv.h   |  8 +++-----
 drivers/soc/fsl/qbman/dpaa_sys.h    |  8 ++++----
 drivers/soc/fsl/qbman/qman.c        |  6 +++---
 drivers/soc/fsl/qbman/qman_portal.c | 36 +++++++++++++++++++++++-------------
 drivers/soc/fsl/qbman/qman_priv.h   |  8 +++-----
 7 files changed, 62 insertions(+), 46 deletions(-)

Comments

Catalin Marinas Sept. 14, 2017, 2 p.m. UTC | #1
On Thu, Aug 24, 2017 at 04:37:51PM -0400, Roy Pledge wrote:
> diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c
> index ff8998f..e31c843 100644
> --- a/drivers/soc/fsl/qbman/bman.c
> +++ b/drivers/soc/fsl/qbman/bman.c
> @@ -154,7 +154,7 @@ struct bm_mc {
>  };
>  
>  struct bm_addr {
> -	void __iomem *ce;	/* cache-enabled */
> +	void *ce;		/* cache-enabled */
>  	void __iomem *ci;	/* cache-inhibited */
>  };

You dropped __iomem from ce, which is fine since it is now set via
memremap. However, I haven't seen (at least not in this patch), a change
to bm_ce_in() which still uses __raw_readl().

(it may be worth checking this code with sparse, it may warn about this)

> diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c
> index 39b39c8..bb03503 100644
> --- a/drivers/soc/fsl/qbman/bman_portal.c
> +++ b/drivers/soc/fsl/qbman/bman_portal.c
> @@ -91,7 +91,6 @@ static int bman_portal_probe(struct platform_device *pdev)
>  	struct device_node *node = dev->of_node;
>  	struct bm_portal_config *pcfg;
>  	struct resource *addr_phys[2];
> -	void __iomem *va;
>  	int irq, cpu;
>  
>  	pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL);
> @@ -123,23 +122,34 @@ static int bman_portal_probe(struct platform_device *pdev)
>  	}
>  	pcfg->irq = irq;
>  
> -	va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0);
> -	if (!va) {
> -		dev_err(dev, "ioremap::CE failed\n");
> +	/*
> +	 * TODO: Ultimately we would like to use a cacheable/non-shareable
> +	 * (coherent) mapping for the portal on both architectures but that
> +	 * isn't currently available in the kernel.  Because of HW differences
> +	 * PPC needs to be mapped cacheable while ARM SoCs will work with non
> +	 * cacheable mappings
> +	 */

This comment mentions "cacheable/non-shareable (coherent)". Was this
meant for ARM platforms? Because non-shareable is not coherent, nor is
this combination guaranteed to work with different CPUs and
interconnects.

> +#ifdef CONFIG_PPC
> +	/* PPC requires a cacheable/non-coherent mapping of the portal */
> +	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
> +				resource_size(addr_phys[0]), MEMREMAP_WB);
> +#else
> +	/* ARM can use a write combine mapping. */
> +	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
> +				resource_size(addr_phys[0]), MEMREMAP_WC);
> +#endif

Nitpick: you could define something like QBMAN_MAP_ATTR to be different
between PPC and the rest and just keep a single memremap() call.

One may complain that "ce" is no longer "cache enabled" but I'm
personally fine to keep the same name for historical reasons.

> diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h
> index 81a9a5e..0a1d573 100644
> --- a/drivers/soc/fsl/qbman/dpaa_sys.h
> +++ b/drivers/soc/fsl/qbman/dpaa_sys.h
> @@ -51,12 +51,12 @@
>  
>  static inline void dpaa_flush(void *p)
>  {
> +	/*
> +	 * Only PPC needs to flush the cache currently - on ARM the mapping
> +	 * is non cacheable
> +	 */
>  #ifdef CONFIG_PPC
>  	flush_dcache_range((unsigned long)p, (unsigned long)p+64);
> -#elif defined(CONFIG_ARM)
> -	__cpuc_flush_dcache_area(p, 64);
> -#elif defined(CONFIG_ARM64)
> -	__flush_dcache_area(p, 64);
>  #endif
>  }

Dropping the private API cache maintenance is fine and the memory is WC
now for ARM (mapping to Normal NonCacheable). However, do you require
any barriers here? Normal NC doesn't guarantee any ordering.

> diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c
> index cbacdf4..41fe33a 100644
> --- a/drivers/soc/fsl/qbman/qman_portal.c
> +++ b/drivers/soc/fsl/qbman/qman_portal.c
> @@ -224,7 +224,6 @@ static int qman_portal_probe(struct platform_device *pdev)
>  	struct device_node *node = dev->of_node;
>  	struct qm_portal_config *pcfg;
>  	struct resource *addr_phys[2];
> -	void __iomem *va;
>  	int irq, cpu, err;
>  	u32 val;
>  
> @@ -262,23 +261,34 @@ static int qman_portal_probe(struct platform_device *pdev)
>  	}
>  	pcfg->irq = irq;
>  
> -	va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0);
> -	if (!va) {
> -		dev_err(dev, "ioremap::CE failed\n");
> +	/*
> +	 * TODO: Ultimately we would like to use a cacheable/non-shareable
> +	 * (coherent) mapping for the portal on both architectures but that
> +	 * isn't currently available in the kernel.  Because of HW differences
> +	 * PPC needs to be mapped cacheable while ARM SoCs will work with non
> +	 * cacheable mappings
> +	 */

Same comment as above on non-shareable.

> +#ifdef CONFIG_PPC
> +	/* PPC requires a cacheable mapping of the portal */
> +	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
> +				resource_size(addr_phys[0]), MEMREMAP_WB);
> +#else
> +	/* ARM can use write combine mapping for the cacheable area */
> +	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
> +				resource_size(addr_phys[0]), MEMREMAP_WT);
> +#endif

Same nitpick: a single memremap() call.
Roy Pledge Sept. 14, 2017, 7:07 p.m. UTC | #2
On 9/14/2017 10:00 AM, Catalin Marinas wrote:
> On Thu, Aug 24, 2017 at 04:37:51PM -0400, Roy Pledge wrote:
>> diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c
>> index ff8998f..e31c843 100644
>> --- a/drivers/soc/fsl/qbman/bman.c
>> +++ b/drivers/soc/fsl/qbman/bman.c
>> @@ -154,7 +154,7 @@ struct bm_mc {
>>   };
>>   
>>   struct bm_addr {
>> -	void __iomem *ce;	/* cache-enabled */
>> +	void *ce;		/* cache-enabled */
>>   	void __iomem *ci;	/* cache-inhibited */
>>   };
> 
> You dropped __iomem from ce, which is fine since it is now set via
> memremap. However, I haven't seen (at least not in this patch), a change
> to bm_ce_in() which still uses __raw_readl().
> 
> (it may be worth checking this code with sparse, it may warn about this)
Thanks, you're correct I missed this. I will fix this (and the qman 
version) and run sparse.
> 
>> diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c
>> index 39b39c8..bb03503 100644
>> --- a/drivers/soc/fsl/qbman/bman_portal.c
>> +++ b/drivers/soc/fsl/qbman/bman_portal.c
>> @@ -91,7 +91,6 @@ static int bman_portal_probe(struct platform_device *pdev)
>>   	struct device_node *node = dev->of_node;
>>   	struct bm_portal_config *pcfg;
>>   	struct resource *addr_phys[2];
>> -	void __iomem *va;
>>   	int irq, cpu;
>>   
>>   	pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL);
>> @@ -123,23 +122,34 @@ static int bman_portal_probe(struct platform_device *pdev)
>>   	}
>>   	pcfg->irq = irq;
>>   
>> -	va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0);
>> -	if (!va) {
>> -		dev_err(dev, "ioremap::CE failed\n");
>> +	/*
>> +	 * TODO: Ultimately we would like to use a cacheable/non-shareable
>> +	 * (coherent) mapping for the portal on both architectures but that
>> +	 * isn't currently available in the kernel.  Because of HW differences
>> +	 * PPC needs to be mapped cacheable while ARM SoCs will work with non
>> +	 * cacheable mappings
>> +	 */
> 
> This comment mentions "cacheable/non-shareable (coherent)". Was this
> meant for ARM platforms? Because non-shareable is not coherent, nor is
> this combination guaranteed to work with different CPUs and
> interconnects.
My wording is poor I should have been clearer that non-shareable == 
non-coherent.  I will fix this.

We do understand that cacheable/non-shareable isn't supported on all
CPU/interconnect combinations, but we have verified with ARM that our use
is OK for the CPU/interconnects on which we have integrated QBMan. The note is
here to try to explain why the mapping is different right now. Once we 
get the basic QBMan support integrated for ARM we do plan to try to have 
patches integrated that enable the cacheable mapping as it gives a 
significant performance boost.  This is a step 2 as we understand the 
topic is complex and a little controversial so I think treating it as an 
independent change will be easier than mixing it with the less 
interesting changes in this patchset.

> 
>> +#ifdef CONFIG_PPC
>> +	/* PPC requires a cacheable/non-coherent mapping of the portal */
>> +	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
>> +				resource_size(addr_phys[0]), MEMREMAP_WB);
>> +#else
>> +	/* ARM can use a write combine mapping. */
>> +	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
>> +				resource_size(addr_phys[0]), MEMREMAP_WC);
>> +#endif
> 
> Nitpick: you could define something like QBMAN_MAP_ATTR to be different
> between PPC and the rest and just keep a single memremap() call.
I will change this - it will be a little more compact.
> 
> One may complain that "ce" is no longer "cache enabled" but I'm
> personally fine to keep the same name for historical reasons.
Cache Enabled is also how the 'data sheet' for the processor describes 
the region and I think it is useful to keep it aligned so that anyone 
looking at the manual and the code can easily correlate the terminology.
>
>> diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h
>> index 81a9a5e..0a1d573 100644
>> --- a/drivers/soc/fsl/qbman/dpaa_sys.h
>> +++ b/drivers/soc/fsl/qbman/dpaa_sys.h
>> @@ -51,12 +51,12 @@
>>   
>>   static inline void dpaa_flush(void *p)
>>   {
>> +	/*
>> +	 * Only PPC needs to flush the cache currently - on ARM the mapping
>> +	 * is non cacheable
>> +	 */
>>   #ifdef CONFIG_PPC
>>   	flush_dcache_range((unsigned long)p, (unsigned long)p+64);
>> -#elif defined(CONFIG_ARM)
>> -	__cpuc_flush_dcache_area(p, 64);
>> -#elif defined(CONFIG_ARM64)
>> -	__flush_dcache_area(p, 64);
>>   #endif
>>   }
> 
> Dropping the private API cache maintenance is fine and the memory is WC
> now for ARM (mapping to Normal NonCacheable). However, do you require
> any barriers here? Normal NC doesn't guarantee any ordering.
The barrier is done in the code where the command is formed. We follow 
this pattern
a) Zero the command cache line (the device never reacts to a 0 command 
verb so a cast out of this will have no effect)
b) Fill in everything in the command except the command verb (byte 0)
c) Execute a memory barrier
d) Set the command verb (byte 0)
e) Flush the command
If a castout happens between d) and e) it doesn't matter since it was about
to be flushed anyway.  Any castout before d) will not cause HW to
process the command because verb is still 0. The barrier at c) prevents 
reordering so the HW cannot see the verb set before the command is formed.

> 
>> diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c
>> index cbacdf4..41fe33a 100644
>> --- a/drivers/soc/fsl/qbman/qman_portal.c
>> +++ b/drivers/soc/fsl/qbman/qman_portal.c
>> @@ -224,7 +224,6 @@ static int qman_portal_probe(struct platform_device *pdev)
>>   	struct device_node *node = dev->of_node;
>>   	struct qm_portal_config *pcfg;
>>   	struct resource *addr_phys[2];
>> -	void __iomem *va;
>>   	int irq, cpu, err;
>>   	u32 val;
>>   
>> @@ -262,23 +261,34 @@ static int qman_portal_probe(struct platform_device *pdev)
>>   	}
>>   	pcfg->irq = irq;
>>   
>> -	va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0);
>> -	if (!va) {
>> -		dev_err(dev, "ioremap::CE failed\n");
>> +	/*
>> +	 * TODO: Ultimately we would like to use a cacheable/non-shareable
>> +	 * (coherent) mapping for the portal on both architectures but that
>> +	 * isn't currently available in the kernel.  Because of HW differences
>> +	 * PPC needs to be mapped cacheable while ARM SoCs will work with non
>> +	 * cacheable mappings
>> +	 */
> 
> Same comment as above non non-shareable.
> 
>> +#ifdef CONFIG_PPC
>> +	/* PPC requires a cacheable mapping of the portal */
>> +	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
>> +				resource_size(addr_phys[0]), MEMREMAP_WB);
>> +#else
>> +	/* ARM can use write combine mapping for the cacheable area */
>> +	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
>> +				resource_size(addr_phys[0]), MEMREMAP_WT);
>> +#endif
> 
> Same nitpick: a single memremap() call.
>
Catalin Marinas Sept. 15, 2017, 9:49 p.m. UTC | #3
On Thu, Sep 14, 2017 at 07:07:50PM +0000, Roy Pledge wrote:
> On 9/14/2017 10:00 AM, Catalin Marinas wrote:
> > On Thu, Aug 24, 2017 at 04:37:51PM -0400, Roy Pledge wrote:
> >> @@ -123,23 +122,34 @@ static int bman_portal_probe(struct platform_device *pdev)
> >>   	}
> >>   	pcfg->irq = irq;
> >>   
> >> -	va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0);
> >> -	if (!va) {
> >> -		dev_err(dev, "ioremap::CE failed\n");
> >> +	/*
> >> +	 * TODO: Ultimately we would like to use a cacheable/non-shareable
> >> +	 * (coherent) mapping for the portal on both architectures but that
> >> +	 * isn't currently available in the kernel.  Because of HW differences
> >> +	 * PPC needs to be mapped cacheable while ARM SoCs will work with non
> >> +	 * cacheable mappings
> >> +	 */
> > 
> > This comment mentions "cacheable/non-shareable (coherent)". Was this
> > meant for ARM platforms? Because non-shareable is not coherent, nor is
> > this combination guaranteed to work with different CPUs and
> > interconnects.
> 
> My wording is poor I should have been clearer that non-shareable == 
> non-coherent.  I will fix this.
> 
> We do understand that cacheable/non shareable isn't supported on all 
> CPU/interconnect combinations but we have verified with ARM that for the 
> CPU/interconnects we have integrated QBMan on our use is OK. The note is 
> here to try to explain why the mapping is different right now. Once we 
> get the basic QBMan support integrated for ARM we do plan to try to have 
> patches integrated that enable the cacheable mapping as it gives a 
> significant performance boost.

I will definitely not ack those patches (at least not in the form I've
seen, assuming certain eviction order of the bytes in a cacheline). The
reason is that it is incredibly fragile, highly dependent on the CPU
microarchitecture and interconnects. Assuming that you ever only have a
single SoC with this device, you may get away with #ifdefs in the
driver. But if you support two or more SoCs with different behaviours,
you'd have to make run-time decisions in the driver or run-time code
patching. We are very keen on single kernel binary image/drivers and
architecturally compliant code (the cacheable mapping hacks are well
outside the architecture behaviour).

> >> diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h
> >> index 81a9a5e..0a1d573 100644
> >> --- a/drivers/soc/fsl/qbman/dpaa_sys.h
> >> +++ b/drivers/soc/fsl/qbman/dpaa_sys.h
> >> @@ -51,12 +51,12 @@
> >>   
> >>   static inline void dpaa_flush(void *p)
> >>   {
> >> +	/*
> >> +	 * Only PPC needs to flush the cache currently - on ARM the mapping
> >> +	 * is non cacheable
> >> +	 */
> >>   #ifdef CONFIG_PPC
> >>   	flush_dcache_range((unsigned long)p, (unsigned long)p+64);
> >> -#elif defined(CONFIG_ARM)
> >> -	__cpuc_flush_dcache_area(p, 64);
> >> -#elif defined(CONFIG_ARM64)
> >> -	__flush_dcache_area(p, 64);
> >>   #endif
> >>   }
> > 
> > Dropping the private API cache maintenance is fine and the memory is WC
> > now for ARM (mapping to Normal NonCacheable). However, do you require
> > any barriers here? Normal NC doesn't guarantee any ordering.
> 
> The barrier is done in the code where the command is formed. We follow 
> this pattern
> a) Zero the command cache line (the device never reacts to a 0 command 
> verb so a cast out of this will have no effect)
> b) Fill in everything in the command except the command verb (byte 0)
> c) Execute a memory barrier
> d) Set the command verb (byte 0)
> e) Flush the command
> If a castout happens between d) and e) doesn't matter since it was about 
> to be flushed anyway .  Any castout before d) will not cause HW to 
> process the command because verb is still 0. The barrier at c) prevents 
> reordering so the HW cannot see the verb set before the command is formed.

I think that's fine, the dpaa_flush() can be a no-op with non-cacheable
memory (I had forgotten the details).
Roy Pledge Sept. 18, 2017, 6:48 p.m. UTC | #4
On 9/15/2017 5:49 PM, Catalin Marinas wrote:
> On Thu, Sep 14, 2017 at 07:07:50PM +0000, Roy Pledge wrote:
>> On 9/14/2017 10:00 AM, Catalin Marinas wrote:
>>> On Thu, Aug 24, 2017 at 04:37:51PM -0400, Roy Pledge wrote:
>>>> @@ -123,23 +122,34 @@ static int bman_portal_probe(struct platform_device *pdev)
>>>>    	}
>>>>    	pcfg->irq = irq;
>>>>    
>>>> -	va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0);
>>>> -	if (!va) {
>>>> -		dev_err(dev, "ioremap::CE failed\n");
>>>> +	/*
>>>> +	 * TODO: Ultimately we would like to use a cacheable/non-shareable
>>>> +	 * (coherent) mapping for the portal on both architectures but that
>>>> +	 * isn't currently available in the kernel.  Because of HW differences
>>>> +	 * PPC needs to be mapped cacheable while ARM SoCs will work with non
>>>> +	 * cacheable mappings
>>>> +	 */
>>>
>>> This comment mentions "cacheable/non-shareable (coherent)". Was this
>>> meant for ARM platforms? Because non-shareable is not coherent, nor is
>>> this combination guaranteed to work with different CPUs and
>>> interconnects.
>>
>> My wording is poor I should have been clearer that non-shareable ==
>> non-coherent.  I will fix this.
>>
>> We do understand that cacheable/non shareable isn't supported on all
>> CPU/interconnect combinations but we have verified with ARM that for the
>> CPU/interconnects we have integrated QBMan on our use is OK. The note is
>> here to try to explain why the mapping is different right now. Once we
>> get the basic QBMan support integrated for ARM we do plan to try to have
>> patches integrated that enable the cacheable mapping as it gives a
>> significant performance boost.
> 
> I will definitely not ack those patches (at least not in the form I've
> seen, assuming certain eviction order of the bytes in a cacheline). The
> reason is that it is incredibly fragile, highly dependent on the CPU
> microarchitecture and interconnects. Assuming that you ever only have a
> single SoC with this device, you may get away with #ifdefs in the
> driver. But if you support two or more SoCs with different behaviours,
> you'd have to make run-time decisions in the driver or run-time code
> patching. We are very keen on single kernel binary image/drivers and
> architecturally compliant code (the cacheable mapping hacks are well
> outside the architecture behaviour).
> 

Let's put this particular point on hold for now, I would like to focus 
on getting the basic functions merged in ASAP. I removed the comment in 
question (it sort of happened naturally when I applied your other 
comments) in the next revision of the patchset.  I have submitted the 
patches to our automated test system for sanity checking and I will send
a new patchset once I get the results.

Thanks again for your comments - they have been very useful and have 
improved the quality of the code for sure.

>>>> diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h
>>>> index 81a9a5e..0a1d573 100644
>>>> --- a/drivers/soc/fsl/qbman/dpaa_sys.h
>>>> +++ b/drivers/soc/fsl/qbman/dpaa_sys.h
>>>> @@ -51,12 +51,12 @@
>>>>    
>>>>    static inline void dpaa_flush(void *p)
>>>>    {
>>>> +	/*
>>>> +	 * Only PPC needs to flush the cache currently - on ARM the mapping
>>>> +	 * is non cacheable
>>>> +	 */
>>>>    #ifdef CONFIG_PPC
>>>>    	flush_dcache_range((unsigned long)p, (unsigned long)p+64);
>>>> -#elif defined(CONFIG_ARM)
>>>> -	__cpuc_flush_dcache_area(p, 64);
>>>> -#elif defined(CONFIG_ARM64)
>>>> -	__flush_dcache_area(p, 64);
>>>>    #endif
>>>>    }
>>>
>>> Dropping the private API cache maintenance is fine and the memory is WC
>>> now for ARM (mapping to Normal NonCacheable). However, do you require
>>> any barriers here? Normal NC doesn't guarantee any ordering.
>>
>> The barrier is done in the code where the command is formed. We follow
>> this pattern
>> a) Zero the command cache line (the device never reacts to a 0 command
>> verb so a cast out of this will have no effect)
>> b) Fill in everything in the command except the command verb (byte 0)
>> c) Execute a memory barrier
>> d) Set the command verb (byte 0)
>> e) Flush the command
>> If a castout happens between d) and e) doesn't matter since it was about
>> to be flushed anyway .  Any castout before d) will not cause HW to
>> process the command because verb is still 0. The barrier at c) prevents
>> reordering so the HW cannot see the verb set before the command is formed.
> 
> I think that's fine, the dpaa_flush() can be a no-op with non-cacheable
> memory (I had forgotten the details).
>
diff mbox

Patch

diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c
index ff8998f..e31c843 100644
--- a/drivers/soc/fsl/qbman/bman.c
+++ b/drivers/soc/fsl/qbman/bman.c
@@ -154,7 +154,7 @@  struct bm_mc {
 };
 
 struct bm_addr {
-	void __iomem *ce;	/* cache-enabled */
+	void *ce;		/* cache-enabled */
 	void __iomem *ci;	/* cache-inhibited */
 };
 
@@ -512,8 +512,8 @@  static int bman_create_portal(struct bman_portal *portal,
 	 * config, everything that follows depends on it and "config" is more
 	 * for (de)reference...
 	 */
-	p->addr.ce = c->addr_virt[DPAA_PORTAL_CE];
-	p->addr.ci = c->addr_virt[DPAA_PORTAL_CI];
+	p->addr.ce = c->addr_virt_ce;
+	p->addr.ci = c->addr_virt_ci;
 	if (bm_rcr_init(p, bm_rcr_pvb, bm_rcr_cce)) {
 		dev_err(c->dev, "RCR initialisation failed\n");
 		goto fail_rcr;
diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c
index 39b39c8..bb03503 100644
--- a/drivers/soc/fsl/qbman/bman_portal.c
+++ b/drivers/soc/fsl/qbman/bman_portal.c
@@ -91,7 +91,6 @@  static int bman_portal_probe(struct platform_device *pdev)
 	struct device_node *node = dev->of_node;
 	struct bm_portal_config *pcfg;
 	struct resource *addr_phys[2];
-	void __iomem *va;
 	int irq, cpu;
 
 	pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL);
@@ -123,23 +122,34 @@  static int bman_portal_probe(struct platform_device *pdev)
 	}
 	pcfg->irq = irq;
 
-	va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0);
-	if (!va) {
-		dev_err(dev, "ioremap::CE failed\n");
+	/*
+	 * TODO: Ultimately we would like to use a cacheable/non-shareable
+	 * (coherent) mapping for the portal on both architectures but that
+	 * isn't currently available in the kernel.  Because of HW differences
+	 * PPC needs to be mapped cacheable while ARM SoCs will work with non
+	 * cacheable mappings
+	 */
+#ifdef CONFIG_PPC
+	/* PPC requires a cacheable/non-coherent mapping of the portal */
+	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
+				resource_size(addr_phys[0]), MEMREMAP_WB);
+#else
+	/* ARM can use a write combine mapping. */
+	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
+				resource_size(addr_phys[0]), MEMREMAP_WC);
+#endif
+	if (!pcfg->addr_virt_ce) {
+		dev_err(dev, "memremap::CE failed\n");
 		goto err_ioremap1;
 	}
 
-	pcfg->addr_virt[DPAA_PORTAL_CE] = va;
-
-	va = ioremap_prot(addr_phys[1]->start, resource_size(addr_phys[1]),
-			  _PAGE_GUARDED | _PAGE_NO_CACHE);
-	if (!va) {
+	pcfg->addr_virt_ci = ioremap(addr_phys[1]->start,
+					resource_size(addr_phys[1]));
+	if (!pcfg->addr_virt_ci) {
 		dev_err(dev, "ioremap::CI failed\n");
 		goto err_ioremap2;
 	}
 
-	pcfg->addr_virt[DPAA_PORTAL_CI] = va;
-
 	spin_lock(&bman_lock);
 	cpu = cpumask_next_zero(-1, &portal_cpus);
 	if (cpu >= nr_cpu_ids) {
@@ -164,9 +174,9 @@  static int bman_portal_probe(struct platform_device *pdev)
 	return 0;
 
 err_portal_init:
-	iounmap(pcfg->addr_virt[DPAA_PORTAL_CI]);
+	iounmap(pcfg->addr_virt_ci);
 err_ioremap2:
-	iounmap(pcfg->addr_virt[DPAA_PORTAL_CE]);
+	memunmap(pcfg->addr_virt_ce);
 err_ioremap1:
 	return -ENXIO;
 }
diff --git a/drivers/soc/fsl/qbman/bman_priv.h b/drivers/soc/fsl/qbman/bman_priv.h
index 765a4bf..c48e6eb 100644
--- a/drivers/soc/fsl/qbman/bman_priv.h
+++ b/drivers/soc/fsl/qbman/bman_priv.h
@@ -49,11 +49,9 @@  extern u16 bman_ip_rev;	/* 0 if uninitialised, otherwise BMAN_REVx */
 extern struct gen_pool *bm_bpalloc;
 
 struct bm_portal_config {
-	/*
-	 * Corenet portal addresses;
-	 * [0]==cache-enabled, [1]==cache-inhibited.
-	 */
-	void __iomem *addr_virt[2];
+	/* Portal addresses */
+	void  *addr_virt_ce;
+	void __iomem *addr_virt_ci;
 	/* Allow these to be joined in lists */
 	struct list_head list;
 	struct device *dev;
diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h
index 81a9a5e..0a1d573 100644
--- a/drivers/soc/fsl/qbman/dpaa_sys.h
+++ b/drivers/soc/fsl/qbman/dpaa_sys.h
@@ -51,12 +51,12 @@ 
 
 static inline void dpaa_flush(void *p)
 {
+	/*
+	 * Only PPC needs to flush the cache currently - on ARM the mapping
+	 * is non cacheable
+	 */
 #ifdef CONFIG_PPC
 	flush_dcache_range((unsigned long)p, (unsigned long)p+64);
-#elif defined(CONFIG_ARM)
-	__cpuc_flush_dcache_area(p, 64);
-#elif defined(CONFIG_ARM64)
-	__flush_dcache_area(p, 64);
 #endif
 }
 
diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
index 25419e1..668fab1 100644
--- a/drivers/soc/fsl/qbman/qman.c
+++ b/drivers/soc/fsl/qbman/qman.c
@@ -300,7 +300,7 @@  struct qm_mc {
 };
 
 struct qm_addr {
-	void __iomem *ce;	/* cache-enabled */
+	void *ce;		/* cache-enabled */
 	void __iomem *ci;	/* cache-inhibited */
 };
 
@@ -1123,8 +1123,8 @@  static int qman_create_portal(struct qman_portal *portal,
 	 * config, everything that follows depends on it and "config" is more
 	 * for (de)reference
 	 */
-	p->addr.ce = c->addr_virt[DPAA_PORTAL_CE];
-	p->addr.ci = c->addr_virt[DPAA_PORTAL_CI];
+	p->addr.ce = c->addr_virt_ce;
+	p->addr.ci = c->addr_virt_ci;
 	/*
 	 * If CI-stashing is used, the current defaults use a threshold of 3,
 	 * and stash with high-than-DQRR priority.
diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c
index cbacdf4..41fe33a 100644
--- a/drivers/soc/fsl/qbman/qman_portal.c
+++ b/drivers/soc/fsl/qbman/qman_portal.c
@@ -224,7 +224,6 @@  static int qman_portal_probe(struct platform_device *pdev)
 	struct device_node *node = dev->of_node;
 	struct qm_portal_config *pcfg;
 	struct resource *addr_phys[2];
-	void __iomem *va;
 	int irq, cpu, err;
 	u32 val;
 
@@ -262,23 +261,34 @@  static int qman_portal_probe(struct platform_device *pdev)
 	}
 	pcfg->irq = irq;
 
-	va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0);
-	if (!va) {
-		dev_err(dev, "ioremap::CE failed\n");
+	/*
+	 * TODO: Ultimately we would like to use a cacheable/non-shareable
+	 * (coherent) mapping for the portal on both architectures but that
+	 * isn't currently available in the kernel.  Because of HW differences
+	 * PPC needs to be mapped cacheable while ARM SoCs will work with non
+	 * cacheable mappings
+	 */
+#ifdef CONFIG_PPC
+	/* PPC requires a cacheable mapping of the portal */
+	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
+				resource_size(addr_phys[0]), MEMREMAP_WB);
+#else
+	/* ARM can use write combine mapping for the cacheable area */
+	pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
+				resource_size(addr_phys[0]), MEMREMAP_WT);
+#endif
+	if (!pcfg->addr_virt_ce) {
+		dev_err(dev, "memremap::CE failed\n");
 		goto err_ioremap1;
 	}
 
-	pcfg->addr_virt[DPAA_PORTAL_CE] = va;
-
-	va = ioremap_prot(addr_phys[1]->start, resource_size(addr_phys[1]),
-			  _PAGE_GUARDED | _PAGE_NO_CACHE);
-	if (!va) {
+	pcfg->addr_virt_ci = ioremap(addr_phys[1]->start,
+				resource_size(addr_phys[1]));
+	if (!pcfg->addr_virt_ci) {
 		dev_err(dev, "ioremap::CI failed\n");
 		goto err_ioremap2;
 	}
 
-	pcfg->addr_virt[DPAA_PORTAL_CI] = va;
-
 	pcfg->pools = qm_get_pools_sdqcr();
 
 	spin_lock(&qman_lock);
@@ -310,9 +320,9 @@  static int qman_portal_probe(struct platform_device *pdev)
 	return 0;
 
 err_portal_init:
-	iounmap(pcfg->addr_virt[DPAA_PORTAL_CI]);
+	iounmap(pcfg->addr_virt_ci);
 err_ioremap2:
-	iounmap(pcfg->addr_virt[DPAA_PORTAL_CE]);
+	memunmap(pcfg->addr_virt_ce);
 err_ioremap1:
 	return -ENXIO;
 }
diff --git a/drivers/soc/fsl/qbman/qman_priv.h b/drivers/soc/fsl/qbman/qman_priv.h
index 957ef54..bab7f15 100644
--- a/drivers/soc/fsl/qbman/qman_priv.h
+++ b/drivers/soc/fsl/qbman/qman_priv.h
@@ -155,11 +155,9 @@  static inline void qman_cgrs_xor(struct qman_cgrs *dest,
 void qman_init_cgr_all(void);
 
 struct qm_portal_config {
-	/*
-	 * Corenet portal addresses;
-	 * [0]==cache-enabled, [1]==cache-inhibited.
-	 */
-	void __iomem *addr_virt[2];
+	/* Portal addresses */
+	void *addr_virt_ce;
+	void __iomem *addr_virt_ci;
 	struct device *dev;
 	struct iommu_domain *iommu_domain;
 	/* Allow these to be joined in lists */