[RFC/PATCH] ppc: Batch TLB flushes on 32-bit 6xx/7xx/7xxx in hash mode

Message ID 1465208582.4274.47.camel@kernel.crashing.org (mailing list archive)
State New, archived

Commit Message

Benjamin Herrenschmidt June 6, 2016, 10:23 a.m. UTC
This ports the existing 64-bit mechanism to 32-bit, so that a series
of 64 tlbie's followed by a sync, which some versions of Darwin
(ab)use, will result in a single flush.

We apply a pending flush on any sync instruction though, as Darwin
doesn't use tlbsync on non-SMP systems.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

---

Note: I haven't done any performance impact measurements with this
one ... feel free to let me know what it does for you :-)

 target-ppc/cpu.h         |  2 +-
 target-ppc/helper_regs.h |  2 +-
 target-ppc/mmu_helper.c  | 44 ++++++++------------------------------------
 target-ppc/translate.c   | 27 +++++++++++++++++++++------
 4 files changed, 31 insertions(+), 44 deletions(-)
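
For readers skimming the diff below, here is a rough sketch of the batching
idea in plain C. The names follow the patch; the body of check_tlb_flush()
is assumed from the existing 64-bit mechanism (this diff only widens its
#if guard), and all other QEMU plumbing and #ifdefs are elided.

/* tlbie on the 32-bit hash MMUs no longer flushes the TCG TLB immediately,
 * it only marks it as needing a flush:
 */
void ppc_tlb_invalidate_one(CPUPPCState *env, target_ulong addr)
{
    /* ... other MMU models elided ... */
    env->tlb_need_flush = 1;   /* defer: one flush covers the whole batch */
}

/* The pending flush is applied later, from any sync on ppc32 (ptesync on
 * ppc64) or from a context synchronizing event:
 */
static inline void check_tlb_flush(CPUPPCState *env)
{
    if (env->tlb_need_flush) {
        env->tlb_need_flush = 0;
        tlb_flush(CPU(ppc_env_get_cpu(env)), 1);   /* one full TCG TLB flush */
    }
}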

Comments

Cédric Le Goater June 6, 2016, 11:13 a.m. UTC | #1
On 06/06/2016 12:23 PM, Benjamin Herrenschmidt wrote:
> This ports the existing 64-bit mechanism to 32-bit, so that a series
> of 64 tlbie's followed by a sync, which some versions of Darwin
> (ab)use, will result in a single flush.
> 
> We apply a pending flush on any sync instruction though, as Darwin
> doesn't use tlbsync on non-SMP systems.

Yes, this is the case right at the beginning of boot, but it does use
tlbsync afterwards, in hw_rem_map(), where pvr is only tested against 603.
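
(Editorial aside, a hypothetical sketch rather than anything in this patch:
if tlbsync also needs to apply the pending batched flush for that
hw_rem_map() case, the translate-time handler could simply reuse the same
check; the privilege/HV handling that the real gen_tlbsync() performs is
elided here.)

static void gen_tlbsync(DisasContext *ctx)
{
#if !defined(CONFIG_USER_ONLY)
    /* Hypothetical: flush now if a preceding tlbie marked the TLB dirty */
    gen_check_tlb_flush(ctx);
#endif
}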

> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
> 
> Note: I haven't done any performance impact measurements with this
> one ... feel free to let me know what it does for you :-)

It shaves a couple of seconds off a ~47s boot time on my ThinkPad,
so a 2-5% improvement I would say, but I haven't done much more perf
measurement.


Thanks,

C.

Mark Cave-Ayland June 6, 2016, 10:36 p.m. UTC | #2
On 06/06/16 11:23, Benjamin Herrenschmidt wrote:

> This ports the existing 64-bit mechanism to 32-bit, so that a series
> of 64 tlbie's followed by a sync, which some versions of Darwin
> (ab)use, will result in a single flush.
> 
> We apply a pending flush on any sync instruction though, as Darwin
> doesn't use tlbsync on non-SMP systems.
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
> 
> Note: I haven't done any performance impact measurements with this
> one ... feel free to let me know what it does for you :-)

After another run of the OpenBIOS tests with this patch applied on top
of the previous two patches, I see no regressions introduced. Like Cédric,
I don't get the feeling that the Mac machines necessarily run faster, but
the overall experience does feel smoother and more responsive.

Tested-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>


ATB,

Mark.
Benjamin Herrenschmidt June 6, 2016, 10:49 p.m. UTC | #3
On Mon, 2016-06-06 at 23:36 +0100, Mark Cave-Ayland wrote:
> 
> After another run of the OpenBIOS tests with this patch applied on top
> of the previous 2 patches, I see no regressions introduced. Like Cédric
> I don't get the feeling that the Mac machines necessarily run faster,
> however the overall experience does feel smoother and more responsive.
> 
> Tested-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>

Thanks !

Cheers,
Ben.

Patch

diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index d8f8f7e..c2962d7 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -959,7 +959,6 @@  struct CPUPPCState {
     ppc_slb_t slb[MAX_SLB_ENTRIES];
     int32_t slb_nr;
     /* tcg TLB needs flush (deferred slb inval instruction typically) */
-    uint32_t tlb_need_flush;
 #endif
     /* segment registers */
     hwaddr htab_base;
@@ -985,6 +984,7 @@  struct CPUPPCState {
     target_ulong pb[4];
     bool tlb_dirty;   /* Set to non-zero when modifying TLB                  */
     bool kvm_sw_tlb;  /* non-zero if KVM SW TLB API is active                */
+    uint32_t tlb_need_flush; /* Delayed flush needed */
 #endif
 
     /* Other registers */
diff --git a/target-ppc/helper_regs.h b/target-ppc/helper_regs.h
index 104b690..8fc0934 100644
--- a/target-ppc/helper_regs.h
+++ b/target-ppc/helper_regs.h
@@ -151,7 +151,7 @@  static inline int hreg_store_msr(CPUPPCState *env, target_ulong value,
     return excp;
 }
 
-#if !defined(CONFIG_USER_ONLY) && defined(TARGET_PPC64)
+#if !defined(CONFIG_USER_ONLY)
 static inline void check_tlb_flush(CPUPPCState *env)
 {
     CPUState *cs = CPU(ppc_env_get_cpu(env));
diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c
index a5e3878..485d5b8 100644
--- a/target-ppc/mmu_helper.c
+++ b/target-ppc/mmu_helper.c
@@ -1935,8 +1935,8 @@  void ppc_tlb_invalidate_all(CPUPPCState *env)
     case POWERPC_MMU_2_06a:
     case POWERPC_MMU_2_07:
     case POWERPC_MMU_2_07a:
-        env->tlb_need_flush = 0;
 #endif /* defined(TARGET_PPC64) */
+        env->tlb_need_flush = 0;
         tlb_flush(CPU(cpu), 1);
         break;
     default:
@@ -1949,9 +1949,6 @@  void ppc_tlb_invalidate_all(CPUPPCState *env)
 void ppc_tlb_invalidate_one(CPUPPCState *env, target_ulong addr)
 {
 #if !defined(FLUSH_ALL_TLBS)
-    PowerPCCPU *cpu = ppc_env_get_cpu(env);
-    CPUState *cs;
-
     addr &= TARGET_PAGE_MASK;
     switch (env->mmu_model) {
     case POWERPC_MMU_SOFT_6xx:
@@ -1963,36 +1960,12 @@  void ppc_tlb_invalidate_one(CPUPPCState *env, target_ulong addr)
         break;
     case POWERPC_MMU_32B:
     case POWERPC_MMU_601:
-        /* tlbie invalidate TLBs for all segments */
-        addr &= ~((target_ulong)-1ULL << 28);
-        cs = CPU(cpu);
-        /* XXX: this case should be optimized,
-         * giving a mask to tlb_flush_page
-         */
-        /* This is broken, some CPUs invalidate a whole congruence
-         * class on an even smaller subset of bits and some OSes take
-         * advantage of this. Just blow the whole thing away.
+        /* Actual CPUs invalidate entire congruence classes based on the
+         * geometry of their TLBs, and some OSes take that into account;
+         * we just mark the TLB to be flushed later (on a context
+         * synchronizing event or a sync instruction on 32-bit).
          */
-#if 0
-        tlb_flush_page(cs, addr | (0x0 << 28));
-        tlb_flush_page(cs, addr | (0x1 << 28));
-        tlb_flush_page(cs, addr | (0x2 << 28));
-        tlb_flush_page(cs, addr | (0x3 << 28));
-        tlb_flush_page(cs, addr | (0x4 << 28));
-        tlb_flush_page(cs, addr | (0x5 << 28));
-        tlb_flush_page(cs, addr | (0x6 << 28));
-        tlb_flush_page(cs, addr | (0x7 << 28));
-        tlb_flush_page(cs, addr | (0x8 << 28));
-        tlb_flush_page(cs, addr | (0x9 << 28));
-        tlb_flush_page(cs, addr | (0xA << 28));
-        tlb_flush_page(cs, addr | (0xB << 28));
-        tlb_flush_page(cs, addr | (0xC << 28));
-        tlb_flush_page(cs, addr | (0xD << 28));
-        tlb_flush_page(cs, addr | (0xE << 28));
-        tlb_flush_page(cs, addr | (0xF << 28));
-#else
-        tlb_flush(cs, 1);
-#endif
+        env->tlb_need_flush = 1;
         break;
 #if defined(TARGET_PPC64)
     case POWERPC_MMU_64B:
@@ -2058,13 +2031,12 @@  target_ulong helper_load_sr(CPUPPCState *env, target_ulong sr_num)
 
 void helper_store_sr(CPUPPCState *env, target_ulong srnum, target_ulong value)
 {
-    PowerPCCPU *cpu = ppc_env_get_cpu(env);
-
     qemu_log_mask(CPU_LOG_MMU,
             "%s: reg=%d " TARGET_FMT_lx " " TARGET_FMT_lx "\n", __func__,
             (int)srnum, value, env->sr[srnum]);
 #if defined(TARGET_PPC64)
     if (env->mmu_model & POWERPC_MMU_64) {
+        PowerPCCPU *cpu = ppc_env_get_cpu(env);
         uint64_t esid, vsid;
 
         /* ESID = srnum */
@@ -2093,7 +2065,7 @@  void helper_store_sr(CPUPPCState *env, target_ulong srnum, target_ulong value)
             }
         }
 #else
-        tlb_flush(CPU(cpu), 1);
+        env->tlb_need_flush = 1;
 #endif
     }
 }
diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index 7763431..ab5862f 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -193,6 +193,7 @@  struct DisasContext {
     uint32_t exception;
     /* Routine used to access memory */
     bool pr, hv;
+    bool lazy_tlb_flush;
     int mem_idx;
     int access_type;
     /* Translation flags */
@@ -3290,12 +3291,17 @@  static void gen_eieio(DisasContext *ctx)
 {
 }
 
-#if !defined(CONFIG_USER_ONLY) && defined(TARGET_PPC64)
+#if !defined(CONFIG_USER_ONLY)
 static inline void gen_check_tlb_flush(DisasContext *ctx)
 {
-    TCGv_i32 t = tcg_temp_new_i32();
-    TCGLabel *l = gen_new_label();
+    TCGv_i32 t;
+    TCGLabel *l;
 
+    if (!ctx->lazy_tlb_flush) {
+        return;
+    }
+    l = gen_new_label();
+    t = tcg_temp_new_i32();
     tcg_gen_ld_i32(t, cpu_env, offsetof(CPUPPCState, tlb_need_flush));
     tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, l);
     gen_helper_check_tlb_flush(cpu_env);
@@ -3475,10 +3481,14 @@  static void gen_sync(DisasContext *ctx)
     uint32_t l = (ctx->opcode >> 21) & 3;
 
     /*
-     * For l == 2, it's a ptesync, We need to check for a pending TLB flush.
-     * This can only happen in kernel mode however so check MSR_PR as well.
+     * We may need to check for a pending TLB flush.
+     *
+     * We do this on ptesync (l == 2) on ppc64 and on any sync on ppc32.
+     *
+     * Additionally, this can only happen in kernel mode, so check
+     * MSR_PR as well.
      */
-    if (l == 2 && !ctx->pr) {
+    if (((l == 2) || !(ctx->insns_flags & PPC_64B)) && !ctx->pr) {
         gen_check_tlb_flush(ctx);
     }
 }
@@ -11491,6 +11501,11 @@  void gen_intermediate_code(CPUPPCState *env, struct TranslationBlock *tb)
     ctx.sf_mode = msr_is_64bit(env, env->msr);
     ctx.has_cfar = !!(env->flags & POWERPC_FLAG_CFAR);
 #endif
+    if (env->mmu_model == POWERPC_MMU_32B ||
+        env->mmu_model == POWERPC_MMU_601 ||
+        (env->mmu_model & POWERPC_MMU_64B))
+            ctx.lazy_tlb_flush = true;
+
     ctx.fpu_enabled = msr_fp;
     if ((env->flags & POWERPC_FLAG_SPE) && msr_spe)
         ctx.spe_enabled = msr_spe;
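
For context, the guest-side pattern this patch targets looks roughly like
the sketch below (illustrative only, not taken from Darwin; the helper name
and constants are made up). Before this patch each tlbie caused a full TCG
TLB flush in QEMU; with it, each tlbie merely sets tlb_need_flush and the
final sync performs a single flush.

/* Hypothetical guest code: invalidate a 64-page range, then synchronize. */
static void flush_tlb_range(unsigned long va)
{
    int i;

    for (i = 0; i < 64; i++) {
        asm volatile("tlbie %0" : : "r"(va + i * 4096) : "memory");
    }
    asm volatile("sync" : : : "memory");
}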