[1/4] ARM: tlb: don't perform inner-shareable invalidation for local TLB ops

Message ID 1364235581-17900-2-git-send-email-will.deacon@arm.com (mailing list archive)

Commit Message

Will Deacon March 25, 2013, 6:19 p.m. UTC
Inner-shareable TLB invalidation is typically more expensive than local
(non-shareable) invalidation, so performing the broadcasting for
local_flush_tlb_* operations is a waste of cycles and needlessly
clobbers entries in the TLBs of other CPUs.

This patch introduces __flush_tlb_* versions for many of the TLB
invalidation functions, which only respect inner-shareable variants of
the invalidation instructions. This allows us to modify the v7 SMP TLB
flags to include *both* inner-shareable and non-shareable operations and
then check the relevant flags depending on whether the operation is
local or not.

This gains us around 0.5% in hackbench scores for a dual-core A15, but I
would expect this to improve as more cores (and clusters) are added to
the equation.
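
The effect on the flush_tlb_*() dispatchers in smp_tlb.c can be seen in
the diff below: when no IPI broadcast is required, they now call the
inner-shareable variant explicitly, e.g.

	void flush_tlb_all(void)
	{
		if (tlb_ops_need_broadcast())
			on_each_cpu(ipi_flush_tlb_all, NULL, 1);
		else
			__flush_tlb_all();	/* inner-shareable invalidation */
	}

while local_flush_tlb_all() keeps using only the non-shareable (local)
invalidation instructions.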

Reported-by: Albin Tonnerre <Albin.Tonnerre@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm/include/asm/tlbflush.h | 67 ++++++++++++++++++++++++++++++++++++++---
 arch/arm/kernel/smp_tlb.c       |  8 ++---
 arch/arm/mm/context.c           |  5 +--
 3 files changed, 68 insertions(+), 12 deletions(-)

Comments

Catalin Marinas March 27, 2013, 10:34 a.m. UTC | #1
On Mon, Mar 25, 2013 at 06:19:38PM +0000, Will Deacon wrote:
> @@ -352,22 +369,33 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
>  		dsb();
>  
>  	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
> -		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
> +		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) {
>  			tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
>  			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
>  			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
>  			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
>  		}
> -		put_cpu();

Why is this change needed? You only flush the local TLB if the mm was
never active on this processor?
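
For background, get_cpu() is preempt_disable() followed by
smp_processor_id(), with put_cpu() re-enabling preemption, whereas a
bare smp_processor_id() leaves preemption handling to the caller.
Roughly:

	cpu = get_cpu();	/* preemption disabled from here... */
	/* ... per-CPU work ... */
	put_cpu();		/* ...until here */

	cpu = smp_processor_id();	/* caller must have preemption disabled */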

> @@ -398,6 +426,21 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
>  	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr);
>  	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr);
>  	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr);
> +
> +	if (tlb_flag(TLB_BARRIER))
> +		dsb();
> +}
> +
> +static inline void
> +__flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
> +{
> +	const unsigned int __tlb_flag = __cpu_tlb_flags;
> +
> +	uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm);
> +
> +	if (tlb_flag(TLB_WB))
> +		dsb();
> +

I guess here we could just have a single *_tlb_page() variant. I
couldn't find any place where we call the local_flush_tlb_page()
explicitly, so I guess we don't really need local semantics. On ARMv6 SMP,
they are local anyway.

If we have a single *_tlb_page() function, you would need to drop the
TLB_V6_*_PAGE from the v7 possible TLB ops.

>  #ifdef CONFIG_ARM_ERRATA_720789
>  	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
>  #else
> @@ -428,6 +471,22 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
>  	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
>  	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
>  	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
> +
> +	if (tlb_flag(TLB_BARRIER)) {
> +		dsb();
> +		isb();
> +	}
> +}

I have some worries with this function. It is used by set_top_pte() and
it really doesn't look like it has local-only semantics. For example,
you use it to flush the I-cache aliases and this must target all the
CPUs because of speculative prefetches, which means that set_top_pte()
must set the new alias on all the CPUs.

Highmem mappings need to be revisited as well.

> --- a/arch/arm/mm/context.c
> +++ b/arch/arm/mm/context.c
> @@ -134,10 +134,7 @@ static void flush_context(unsigned int cpu)
>  	}
>  
>  	/* Queue a TLB invalidate and flush the I-cache if necessary. */
> -	if (!tlb_ops_need_broadcast())
> -		cpumask_set_cpu(cpu, &tlb_flush_pending);
> -	else
> -		cpumask_setall(&tlb_flush_pending);
> +	cpumask_setall(&tlb_flush_pending);

That's a good change ;)
Will Deacon March 27, 2013, 12:07 p.m. UTC | #2
Hi Catalin,

Cheers for looking at this.

On Wed, Mar 27, 2013 at 10:34:30AM +0000, Catalin Marinas wrote:
> On Mon, Mar 25, 2013 at 06:19:38PM +0000, Will Deacon wrote:
> > @@ -352,22 +369,33 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
> >  		dsb();
> >  
> >  	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
> > -		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
> > +		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) {
> >  			tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
> >  			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
> >  			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
> >  			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
> >  		}
> > -		put_cpu();
> 
> Why is this change needed? You only flush the local TLB if the mm was
> never active on this processor?

Ouch, that's a cock-up, sorry. I'll remove the '!'.

> > @@ -398,6 +426,21 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
> >  	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr);
> >  	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr);
> >  	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr);
> > +
> > +	if (tlb_flag(TLB_BARRIER))
> > +		dsb();
> > +}
> > +
> > +static inline void
> > +__flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
> > +{
> > +	const unsigned int __tlb_flag = __cpu_tlb_flags;
> > +
> > +	uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm);
> > +
> > +	if (tlb_flag(TLB_WB))
> > +		dsb();
> > +
> 
> I guess here we could just have a single *_tlb_page() variant. I
> couldn't find any place where we call the local_flush_tlb_page()
> explicitly, so I guess we don't really need local semantics. On ARMv6 SMP,
> they are local anyway.
> 
> If we have a single *_tlb_page() function, you would need to drop the
> TLB_V6_*_PAGE from the v7 possible TLB ops.

Having the local variant doesn't hurt though, and provides the same symmetry
as other architectures (powerpc, sh, tile, mips, ...).

> >  #ifdef CONFIG_ARM_ERRATA_720789
> >  	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
> >  #else
> > @@ -428,6 +471,22 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
> >  	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
> >  	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
> >  	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
> > +
> > +	if (tlb_flag(TLB_BARRIER)) {
> > +		dsb();
> > +		isb();
> > +	}
> > +}
> 
> I have some worries with this function. It is used by set_top_pte() and
> it really doesn't look like it has local-only semantics. For example,
> you use it to flush the I-cache aliases and this must target all the
> CPUs because of speculative prefetches, which means that set_top_pte()
> must set the new alias on all the CPUs.

This looks like a bug in set_top_pte when it's called for cache-flushing.
However, the only core this would affect is 11MPCore, which uses the
ipi-based flushing anyway, so I think we're ok.

> Highmem mappings need to be revisited as well.

I think they're ok. Everything is either done in atomic context or under a
raw spinlock, so the mappings aren't expected to be used by other CPUs.

Cheers,

Will
Catalin Marinas March 27, 2013, 12:30 p.m. UTC | #3
On Wed, Mar 27, 2013 at 12:07:37PM +0000, Will Deacon wrote:
> Cheers for looking at this.

Just warming up for Marc's KVM patches ;)

> On Wed, Mar 27, 2013 at 10:34:30AM +0000, Catalin Marinas wrote:
> > On Mon, Mar 25, 2013 at 06:19:38PM +0000, Will Deacon wrote:
> > > @@ -352,22 +369,33 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
> > >  		dsb();
> > >  
> > >  	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
> > > -		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
> > > +		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) {
> > >  			tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
> > >  			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
> > >  			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
> > >  			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
> > >  		}
> > > -		put_cpu();
> > 
> > Why is this change needed? You only flush the local TLB if the mm was
> > never active on this processor?
> 
> Ouch, that's a cock-up, sorry. I'll remove the '!'.

Do we also need to disable preemption?

> > > @@ -398,6 +426,21 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
> > >  	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr);
> > >  	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr);
> > >  	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr);
> > > +
> > > +	if (tlb_flag(TLB_BARRIER))
> > > +		dsb();
> > > +}
> > > +
> > > +static inline void
> > > +__flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
> > > +{
> > > +	const unsigned int __tlb_flag = __cpu_tlb_flags;
> > > +
> > > +	uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm);
> > > +
> > > +	if (tlb_flag(TLB_WB))
> > > +		dsb();
> > > +
> > 
> > I guess here we could just have a single *_tlb_page() variant. I
> > couldn't find any place where we call the local_flush_tlb_page()
> > explicitly, so I guess we don't really need local semantics. On ARMv6 SMP,
> > they are local anyway.
> > 
> > If we have a single *_tlb_page() function, you would need to drop the
> > TLB_V6_*_PAGE from the v7 possible TLB ops.
> 
> Having the local variant doesn't hurt though, and provides the same symmetry
> as other architectures (powerpc, sh, tile, mips, ...).

It's probably harmless to have them, though they may not get used.

> > >  #ifdef CONFIG_ARM_ERRATA_720789
> > >  	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
> > >  #else
> > > @@ -428,6 +471,22 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
> > >  	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
> > >  	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
> > >  	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
> > > +
> > > +	if (tlb_flag(TLB_BARRIER)) {
> > > +		dsb();
> > > +		isb();
> > > +	}
> > > +}
> > 
> > I have some worries with this function. It is used by set_top_pte() and
> > it really doesn't look like it has local-only semantics. For example,
> > you use it to flush the I-cache aliases and this must target all the
> > CPUs because of speculative prefetches, which means that set_top_pte()
> > must set the new alias on all the CPUs.
> 
> This looks like a bug in set_top_pte when it's called for cache-flushing.
> However, the only core this would affect is 11MPCore, which uses the
> ipi-based flushing anyway, so I think we're ok.

I don't think it's 11MPCore only; set_top_pte() is called by
flush_icache_alias() from flush_ptrace_access() even on ARMv7.

> > Highmem mappings need to be revisited as well.
> 
> I think they're ok. Everything is either done in atomic context or under a
> raw spinlock, so the mappings aren't expected to be used by other CPUs.

It's not whether they are used explicitly but whether a speculative TLB
load can bring them in on a different CPU. I don't immediately see a
problem with non-aliasing caches but it needs some more thinking.
Will Deacon March 27, 2013, 12:56 p.m. UTC | #4
On Wed, Mar 27, 2013 at 12:30:55PM +0000, Catalin Marinas wrote:
> On Wed, Mar 27, 2013 at 12:07:37PM +0000, Will Deacon wrote:
> > On Wed, Mar 27, 2013 at 10:34:30AM +0000, Catalin Marinas wrote:
> > > On Mon, Mar 25, 2013 at 06:19:38PM +0000, Will Deacon wrote:
> > > > @@ -352,22 +369,33 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
> > > >  		dsb();
> > > >  
> > > >  	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
> > > > -		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
> > > > +		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) {
> > > >  			tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
> > > >  			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
> > > >  			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
> > > >  			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
> > > >  		}
> > > > -		put_cpu();
> > > 
> > > Why is this change needed? You only flush the local TLB if the mm was
> > > never active on this processor?
> > 
> > Ouch, that's a cock-up, sorry. I'll remove the '!'.
> 
> Do we also need to disable preemption?

I don't think so; that should be taken care of by the caller if they are
issuing the local_ operation (otherwise it's racy anyway).
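
That is, a caller wanting local-only semantics would be expected to do
something like this (a sketch, not a real call site):

	preempt_disable();
	local_flush_tlb_mm(mm);		/* must not migrate mid-flush */
	preempt_enable();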

> > > >  #ifdef CONFIG_ARM_ERRATA_720789
> > > >  	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
> > > >  #else
> > > > @@ -428,6 +471,22 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
> > > >  	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
> > > >  	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
> > > >  	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
> > > > +
> > > > +	if (tlb_flag(TLB_BARRIER)) {
> > > > +		dsb();
> > > > +		isb();
> > > > +	}
> > > > +}
> > > 
> > > I have some worries with this function. It is used by set_top_pte() and
> > > it really doesn't look like it has local-only semantics. For example,
> > > you use it to flush the I-cache aliases and this must target all the
> > > CPUs because of speculative prefetches, which means that set_top_pte()
> > > must set the new alias on all the CPUs.
> > 
> > This looks like a bug in set_top_pte when it's called for cache-flushing.
> > However, the only core this would affect is 11MPCore, which uses the
> > ipi-based flushing anyway, so I think we're ok.
> 
> I don't think it's 11MPCore only; set_top_pte() is called by
> flush_icache_alias() from flush_ptrace_access() even on ARMv7.

Damn, yes, I missed those. Perhaps we should add set_top_pte_atomic, which
just does the local flush, and then promote the current flush to be IS?
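
Something like the following, as a purely hypothetical sketch of the
proposal (the body mirrors what set_top_pte() does today):

	static void set_top_pte_atomic(unsigned long va, pte_t pte)
	{
		pte_t *ptep = pte_offset_kernel(top_pmd, va);
		set_pte_ext(ptep, pte, 0);
		local_flush_tlb_kernel_page(va);	/* local CPU only */
	}

with set_top_pte() itself switched over to the broadcast (IS) flush.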

> > > Highmem mappings need to be revisited as well.
> > 
> > I think they're ok. Everything is either done in atomic context or under a
> > raw spinlock, so the mappings aren't expected to be used by other CPUs.
> 
> It's not whether they are used explicitly but whether a speculative TLB
> load can bring them in on a different CPU. I don't immediately see a
> problem with non-aliasing caches but it needs some more thinking.

But why do we care about the speculation? If the core doing the speculating
is always going to write a new pte before dereferencing anything mapped
there, then it will invalidate its own TLB at that point.

Will
Catalin Marinas March 27, 2013, 1:40 p.m. UTC | #5
On Wed, Mar 27, 2013 at 12:56:39PM +0000, Will Deacon wrote:
> On Wed, Mar 27, 2013 at 12:30:55PM +0000, Catalin Marinas wrote:
> > On Wed, Mar 27, 2013 at 12:07:37PM +0000, Will Deacon wrote:
> > > On Wed, Mar 27, 2013 at 10:34:30AM +0000, Catalin Marinas wrote:
> > > > On Mon, Mar 25, 2013 at 06:19:38PM +0000, Will Deacon wrote:
> > > > > @@ -352,22 +369,33 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
> > > > >  		dsb();
> > > > >  
> > > > >  	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
> > > > > -		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
> > > > > +		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) {
> > > > >  			tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
> > > > >  			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
> > > > >  			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
> > > > >  			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
> > > > >  		}
> > > > > -		put_cpu();
> > > > 
> > > > Why is this change needed? You only flush the local TLB if the mm was
> > > > never active on this processor?
> > > 
> > > Ouch, that's a cock-up, sorry. I'll remove the '!'.
> > 
> > Do we also need to disable preemption?
> 
> I don't think so; that should be taken care of by the caller if they are
> issuing the local_ operation (otherwise it's racy anyway).

OK.

> > > > >  #ifdef CONFIG_ARM_ERRATA_720789
> > > > >  	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
> > > > >  #else
> > > > > @@ -428,6 +471,22 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
> > > > >  	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
> > > > >  	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
> > > > >  	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
> > > > > +
> > > > > +	if (tlb_flag(TLB_BARRIER)) {
> > > > > +		dsb();
> > > > > +		isb();
> > > > > +	}
> > > > > +}
> > > > 
> > > > I have some worries with this function. It is used by set_top_pte() and
> > > > it really doesn't look like it has local-only semantics. For example,
> > > > you use it to flush the I-cache aliases and this must target all the
> > > > CPUs because of speculative prefetches, which means that set_top_pte()
> > > > must set the new alias on all the CPUs.
> > > 
> > > This looks like a bug in set_top_pte when it's called for cache-flushing.
> > > However, the only core this would affect is 11MPCore, which uses the
> > > ipi-based flushing anyway, so I think we're ok.
> > 
> > I don't think it's 11MPCore only; set_top_pte() is called by
> > flush_icache_alias() from flush_ptrace_access() even on ARMv7.
> 
> Damn, yes, I missed those. Perhaps we should add set_top_pte_atomic, which
> just does the local flush, and then promote the current flush to be IS?

Where would we use the set_top_pte_atomic() on ARMv7?

> > > > Highmem mappings need to be revisited as well.
> > > 
> > > I think they're ok. Everything is either done in atomic context or under a
> > > raw spinlock, so the mappings aren't expected to be used by other CPUs.
> > 
> > It's not whether they are used explicitly but whether a speculative TLB
> > load can bring them in on a different CPU. I don't immediately see a
> > problem with non-aliasing caches but it needs some more thinking.
> 
> But why do we care about the speculation? If the core doing the speculating
> is always going to write a new pte before dereferencing anything mapped
> there, then it will invalidate its own TLB at that point.

It's about speculation on another CPU.

Let's say CPU0 does several kmap_atomic() calls which in turn call
set_top_pte(). The same page tables are visible to CPU1 which
speculatively loads some top pte (not the latest). At this point we have
a VA pointing to different PAs on CPU0 and CPU1. CPU1 would not access
this VA, so not a problem here, but whether this matters for
inner-shareable cache maintenance (dma_cache_maint_page), I can't tell
yet (internal thread with the architecture guys).
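
Roughly, as a hypothetical interleaving (not actual code):

	/*
	 * CPU0                           CPU1
	 * set_top_pte(va, pte_A)
	 * accesses va -> PA_A
	 * set_top_pte(va, pte_B)         speculative table walk caches
	 * accesses va -> PA_B            the stale va -> PA_A translation
	 *
	 * va now maps PA_B on CPU0 but PA_A in CPU1's TLB.
	 */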
Will Deacon March 27, 2013, 1:54 p.m. UTC | #6
On Wed, Mar 27, 2013 at 01:40:29PM +0000, Catalin Marinas wrote:
> On Wed, Mar 27, 2013 at 12:56:39PM +0000, Will Deacon wrote:
> > Damn, yes, I missed those. Perhaps we should add set_top_pte_atomic, which
> > just does the local flush, and then promote the current flush to be IS?
> 
> Where would we use the set_top_pte_atomic() on ARMv7?

I was thinking of kmap_atomic{_pfn}, to avoid adding further overhead to
highmem.

> > > It's not whether they are used explicitly but whether a speculative TLB
> > > load can bring them in on a different CPU. I don't immediately see a
> > > problem with non-aliasing caches but it needs some more thinking.
> > 
> > But why do we care about the speculation? If the core doing the speculating
> > is always going to write a new pte before dereferencing anything mapped
> > there, then it will invalidate its own TLB at that point.
> 
> It's about speculation on another CPU.
> 
> Let's say CPU0 does several kmap_atomic() calls which in turn call
> set_top_pte(). The same page tables are visible to CPU1 which
> speculatively loads some top pte (not the latest). At this point we have
> a VA pointing to different PAs on CPU0 and CPU1. CPU1 would not access
> this VA, so not a problem here, but whether this matters for
> inner-shareable cache maintenance (dma_cache_maint_page), I can't tell
> yet (internal thread with the architecture guys).

Ok... given that the (speculated) lines won't be dirty, I don't see why
this matters for cache maintenance, but I'll wait to see what the
architecture guys come back with.

Will

Patch

diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index 4db8c88..ae9f34f 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -170,6 +170,8 @@ 
 #endif
 
 #define v7wbi_tlb_flags_smp	(TLB_WB | TLB_DCLEAN | TLB_BARRIER | \
+				 TLB_V6_U_FULL | TLB_V6_U_PAGE | \
+				 TLB_V6_U_ASID | \
 				 TLB_V7_UIS_FULL | TLB_V7_UIS_PAGE | \
 				 TLB_V7_UIS_ASID | TLB_V7_UIS_BP)
 #define v7wbi_tlb_flags_up	(TLB_WB | TLB_DCLEAN | TLB_BARRIER | \
@@ -334,6 +336,21 @@  static inline void local_flush_tlb_all(void)
 	tlb_op(TLB_V4_U_FULL | TLB_V6_U_FULL, "c8, c7, 0", zero);
 	tlb_op(TLB_V4_D_FULL | TLB_V6_D_FULL, "c8, c6, 0", zero);
 	tlb_op(TLB_V4_I_FULL | TLB_V6_I_FULL, "c8, c5, 0", zero);
+
+	if (tlb_flag(TLB_BARRIER)) {
+		dsb();
+		isb();
+	}
+}
+
+static inline void __flush_tlb_all(void)
+{
+	const int zero = 0;
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
 	tlb_op(TLB_V7_UIS_FULL, "c8, c3, 0", zero);
 
 	if (tlb_flag(TLB_BARRIER)) {
@@ -352,22 +369,33 @@  static inline void local_flush_tlb_mm(struct mm_struct *mm)
 		dsb();
 
 	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
-		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
+		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) {
 			tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
 			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
 			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
 			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
 		}
-		put_cpu();
 	}
 
 	tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid);
 	tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid);
 	tlb_op(TLB_V6_I_ASID, "c8, c5, 2", asid);
+
+	if (tlb_flag(TLB_BARRIER))
+		dsb();
+}
+
+static inline void __flush_tlb_mm(struct mm_struct *mm)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
 #ifdef CONFIG_ARM_ERRATA_720789
-	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 0", zero);
+	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 0", 0);
 #else
-	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", asid);
+	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", ASID(mm));
 #endif
 
 	if (tlb_flag(TLB_BARRIER))
@@ -398,6 +426,21 @@  local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr);
 	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr);
 	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr);
+
+	if (tlb_flag(TLB_BARRIER))
+		dsb();
+}
+
+static inline void
+__flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm);
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
 #ifdef CONFIG_ARM_ERRATA_720789
 	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
 #else
@@ -428,6 +471,22 @@  static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
 	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
 	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
 	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
+
+	if (tlb_flag(TLB_BARRIER)) {
+		dsb();
+		isb();
+	}
+}
+
+static inline void __flush_tlb_kernel_page(unsigned long kaddr)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	kaddr &= PAGE_MASK;
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
 	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", kaddr);
 
 	if (tlb_flag(TLB_BARRIER)) {
diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c
index bd03005..6ae08b1 100644
--- a/arch/arm/kernel/smp_tlb.c
+++ b/arch/arm/kernel/smp_tlb.c
@@ -74,7 +74,7 @@  void flush_tlb_all(void)
 	if (tlb_ops_need_broadcast())
 		on_each_cpu(ipi_flush_tlb_all, NULL, 1);
 	else
-		local_flush_tlb_all();
+		__flush_tlb_all();
 }
 
 void flush_tlb_mm(struct mm_struct *mm)
@@ -82,7 +82,7 @@  void flush_tlb_mm(struct mm_struct *mm)
 	if (tlb_ops_need_broadcast())
 		on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1);
 	else
-		local_flush_tlb_mm(mm);
+		__flush_tlb_mm(mm);
 }
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
@@ -94,7 +94,7 @@  void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 		on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page,
 					&ta, 1);
 	} else
-		local_flush_tlb_page(vma, uaddr);
+		__flush_tlb_page(vma, uaddr);
 }
 
 void flush_tlb_kernel_page(unsigned long kaddr)
@@ -104,7 +104,7 @@  void flush_tlb_kernel_page(unsigned long kaddr)
 		ta.ta_start = kaddr;
 		on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1);
 	} else
-		local_flush_tlb_kernel_page(kaddr);
+		__flush_tlb_kernel_page(kaddr);
 }
 
 void flush_tlb_range(struct vm_area_struct *vma,
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index a5a4b2bc..550acf8 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -134,10 +134,7 @@  static void flush_context(unsigned int cpu)
 	}
 
 	/* Queue a TLB invalidate and flush the I-cache if necessary. */
-	if (!tlb_ops_need_broadcast())
-		cpumask_set_cpu(cpu, &tlb_flush_pending);
-	else
-		cpumask_setall(&tlb_flush_pending);
+	cpumask_setall(&tlb_flush_pending);
 
 	if (icache_is_vivt_asid_tagged())
 		__flush_icache_all();