diff mbox series

[RFC/PATCH] i386: Atomically update PTEs with mttcg

Message ID f4349b9ee95cbd9d5b16e9033fdf0d7c9e99aebf.camel@kernel.crashing.org (mailing list archive)
State New, archived
Headers show
Series [RFC/PATCH] i386: Atomically update PTEs with mttcg | expand

Commit Message

Benjamin Herrenschmidt Nov. 28, 2018, 11:15 p.m. UTC
AFAIK, this isn't well documented (at least it wasn't when I last looked),
but OSes such as Linux rely on this behaviour:

The HW updates to the page tables need to be done atomically with the
checking of the present bit (and other permissions).

This is what allows Linux to do simple xchg of PTEs with 0 and assume
the value read has "final" stable dirty and accessed bits (the TLB
invalidation is deferred).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

This is lightly tested at this point, mostly to gather comments on the
approach.

I noticed that RISC-V is attempting to do something similar but with endian
bugs, at least from my reading of the code.

 include/exec/memory_ldst.inc.h |   3 +
 memory_ldst.inc.c              |  38 ++++++++++++
 target/i386/excp_helper.c      | 104 +++++++++++++++++++++++++--------
 3 files changed, 121 insertions(+), 24 deletions(-)

Comments

Paolo Bonzini Nov. 29, 2018, 9:26 a.m. UTC | #1
On 29/11/18 00:15, Benjamin Herrenschmidt wrote:
> Afaik, this isn't well documented (at least it wasn't when I last looked)
> but OSes such as Linux rely on this behaviour:
> 
> The HW updates to the page tables need to be done atomically with the
> checking of the present bit (and other permissions).
> 
> This is what allows Linux to do simple xchg of PTEs with 0 and assume
> the value read has "final" stable dirty and accessed bits (the TLB
> invalidation is deferred).
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
> 
> This is lightly tested at this point, mostly to gather comments on the
> approach.

Looks good, but please kill the notdirty stuff first.  It's not needed
and it's a clear bug.  When migrating, it can lead to PTEs being
migrated without accessed and dirty bits.

Paolo

> I noticed that RiscV is attempting to do something similar but with endian
> bugs, at least from my reading of the code.
> 
>  include/exec/memory_ldst.inc.h |   3 +
>  memory_ldst.inc.c              |  38 ++++++++++++
>  target/i386/excp_helper.c      | 104 +++++++++++++++++++++++++--------
>  3 files changed, 121 insertions(+), 24 deletions(-)
> 
> diff --git a/include/exec/memory_ldst.inc.h b/include/exec/memory_ldst.inc.h
> index 272c20f02e..b7a41a0ad4 100644
> --- a/include/exec/memory_ldst.inc.h
> +++ b/include/exec/memory_ldst.inc.h
> @@ -28,6 +28,9 @@ extern uint64_t glue(address_space_ldq, SUFFIX)(ARG1_DECL,
>      hwaddr addr, MemTxAttrs attrs, MemTxResult *result);
>  extern void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
>      hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
> +extern uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
> +    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs,
> +    MemTxResult *result);
>  extern void glue(address_space_stw, SUFFIX)(ARG1_DECL,
>      hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
>  extern void glue(address_space_stl, SUFFIX)(ARG1_DECL,
> diff --git a/memory_ldst.inc.c b/memory_ldst.inc.c
> index acf865b900..63af8f7dd2 100644
> --- a/memory_ldst.inc.c
> +++ b/memory_ldst.inc.c
> @@ -320,6 +320,44 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
>      RCU_READ_UNLOCK();
>  }
>  
> +/* This is meant to be used for atomic PTE updates under MT-TCG */
> +uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
> +    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs, MemTxResult *result)
> +{
> +    uint8_t *ptr;
> +    MemoryRegion *mr;
> +    hwaddr l = 4;
> +    hwaddr addr1;
> +    MemTxResult r;
> +    uint8_t dirty_log_mask;
> +
> +    /* Must test result */
> +    assert(result);
> +
> +    RCU_READ_LOCK();
> +    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
> +    if (l < 4 || !memory_access_is_direct(mr, true)) {
> +        r = MEMTX_ERROR;
> +    } else {
> +        uint32_t orig = old;
> +
> +        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
> +        old = atomic_cmpxchg(ptr, orig, new);
> +
> +        if (old == orig) {
> +            dirty_log_mask = memory_region_get_dirty_log_mask(mr);
> +            dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
> +            cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
> +                                                4, dirty_log_mask);
> +        }
> +        r = MEMTX_OK;
> +    }
> +    *result = r;
> +    RCU_READ_UNLOCK();
> +
> +    return old;
> +}
> +
>  /* warning: addr must be aligned */
>  static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL,
>      hwaddr addr, uint32_t val, MemTxAttrs attrs,
> diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
> index 49231f6b69..5ccb9d6d6a 100644
> --- a/target/i386/excp_helper.c
> +++ b/target/i386/excp_helper.c
> @@ -157,11 +157,45 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>  
>  #else
>  
> +static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
> +                                    uint64_t orig_entry, uint32_t bits)
> +{
> +    uint64_t new_entry = orig_entry | bits;
> +
> +    /* Write the updated bottom 32-bits */
> +    if (qemu_tcg_mttcg_enabled()) {
> +        uint32_t old_le = cpu_to_le32(orig_entry);
> +        uint32_t new_le = cpu_to_le32(new_entry);
> +        MemTxResult result;
> +        uint32_t old_ret;
> +
> +        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
> +                                                  old_le, new_le,
> +                                                  MEMTXATTRS_UNSPECIFIED,
> +                                                  &result);
> +        if (result == MEMTX_OK) {
> +            if (old_ret != old_le) {
> +                new_entry = 0;
> +            }
> +            return new_entry;
> +        }
> +
> +        /* Do we need to support this case where PTEs aren't in RAM ?
> +         *
> +         * For now fallback to non-atomic case
> +         */
> +    }
> +
> +    x86_stl_phys_notdirty(cs, addr, new_entry);
> +
> +    return new_entry;
> +}
> +
>  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>                          int *prot)
>  {
>      CPUX86State *env = &X86_CPU(cs)->env;
> -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
> +    uint64_t rsvd_mask;
>      uint64_t ptep, pte;
>      uint64_t exit_info_1 = 0;
>      target_ulong pde_addr, pte_addr;
> @@ -172,6 +206,8 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>          return gphys;
>      }
>  
> + restart:
> +    rsvd_mask = PG_HI_RSVD_MASK;
>      if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
>          rsvd_mask |= PG_NX_MASK;
>      }
> @@ -198,8 +234,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>                  goto do_fault_rsvd;
>              }
>              if (!(pml4e & PG_ACCESSED_MASK)) {
> -                pml4e |= PG_ACCESSED_MASK;
> -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
> +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
> +                if (!pml4e) {
> +                    goto restart;
> +                }
>              }
>              ptep &= pml4e ^ PG_NX_MASK;
>              pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
> @@ -213,8 +251,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>              }
>              ptep &= pdpe ^ PG_NX_MASK;
>              if (!(pdpe & PG_ACCESSED_MASK)) {
> -                pdpe |= PG_ACCESSED_MASK;
> -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
> +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
> +                if (!pdpe) {
> +                    goto restart;
> +                }
>              }
>              if (pdpe & PG_PSE_MASK) {
>                  /* 1 GB page */
> @@ -256,8 +296,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>          }
>          /* 4 KB page */
>          if (!(pde & PG_ACCESSED_MASK)) {
> -            pde |= PG_ACCESSED_MASK;
> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> +            if (!pde) {
> +                goto restart;
> +            }
>          }
>          pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
>          pte = x86_ldq_phys(cs, pte_addr);
> @@ -295,8 +337,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>          }
>  
>          if (!(pde & PG_ACCESSED_MASK)) {
> -            pde |= PG_ACCESSED_MASK;
> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> +            if (!pde) {
> +                goto restart;
> +            }
>          }
>  
>          /* page directory entry */
> @@ -376,7 +420,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>      int error_code = 0;
>      int is_dirty, prot, page_size, is_write, is_user;
>      hwaddr paddr;
> -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
> +    uint64_t rsvd_mask;
>      uint32_t page_offset;
>      target_ulong vaddr;
>  
> @@ -401,6 +445,8 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>          goto do_mapping;
>      }
>  
> + restart:
> +    rsvd_mask = PG_HI_RSVD_MASK;
>      if (!(env->efer & MSR_EFER_NXE)) {
>          rsvd_mask |= PG_NX_MASK;
>      }
> @@ -436,8 +482,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>                      goto do_fault_rsvd;
>                  }
>                  if (!(pml5e & PG_ACCESSED_MASK)) {
> -                    pml5e |= PG_ACCESSED_MASK;
> -                    x86_stl_phys_notdirty(cs, pml5e_addr, pml5e);
> +                    pml5e = update_entry(cs, pml5e_addr, pml5e, PG_ACCESSED_MASK);
> +                    if (!pml5e) {
> +                        goto restart;
> +                    }
>                  }
>                  ptep = pml5e ^ PG_NX_MASK;
>              } else {
> @@ -456,8 +504,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>                  goto do_fault_rsvd;
>              }
>              if (!(pml4e & PG_ACCESSED_MASK)) {
> -                pml4e |= PG_ACCESSED_MASK;
> -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
> +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
> +                if (!pml4e) {
> +                    goto restart;
> +                }
>              }
>              ptep &= pml4e ^ PG_NX_MASK;
>              pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
> @@ -472,8 +522,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>              }
>              ptep &= pdpe ^ PG_NX_MASK;
>              if (!(pdpe & PG_ACCESSED_MASK)) {
> -                pdpe |= PG_ACCESSED_MASK;
> -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
> +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
> +                if (!pdpe) {
> +                    goto restart;
> +                }
>              }
>              if (pdpe & PG_PSE_MASK) {
>                  /* 1 GB page */
> @@ -520,8 +572,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>          }
>          /* 4 KB page */
>          if (!(pde & PG_ACCESSED_MASK)) {
> -            pde |= PG_ACCESSED_MASK;
> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> +            if (!pde) {
> +                goto restart;
> +            }
>          }
>          pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
>              a20_mask;
> @@ -563,8 +617,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>          }
>  
>          if (!(pde & PG_ACCESSED_MASK)) {
> -            pde |= PG_ACCESSED_MASK;
> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> +            if (!pde) {
> +                goto restart;
> +            }
>          }
>  
>          /* page directory entry */
> @@ -634,11 +690,11 @@ do_check_protect_pse36:
>      /* yes, it can! */
>      is_dirty = is_write && !(pte & PG_DIRTY_MASK);
>      if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
> -        pte |= PG_ACCESSED_MASK;
> -        if (is_dirty) {
> -            pte |= PG_DIRTY_MASK;
> +        pte = update_entry(cs, pte_addr, pte,
> +                           PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : 0));
> +        if (!pte) {
> +            goto restart;
>          }
> -        x86_stl_phys_notdirty(cs, pte_addr, pte);
>      }
>  
>      if (!(pte & PG_DIRTY_MASK)) {
> 
>
Benjamin Herrenschmidt Nov. 29, 2018, 10:04 a.m. UTC | #2
On Thu, 2018-11-29 at 10:26 +0100, Paolo Bonzini wrote:
> On 29/11/18 00:15, Benjamin Herrenschmidt wrote:
> > Afaik, this isn't well documented (at least it wasn't when I last looked)
> > but OSes such as Linux rely on this behaviour:
> > 
> > The HW updates to the page tables need to be done atomically with the
> > checking of the present bit (and other permissions).
> > 
> > This is what allows Linux to do simple xchg of PTEs with 0 and assume
> > the value read has "final" stable dirty and accessed bits (the TLB
> > invalidation is deferred).
> > 
> > Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > ---
> > 
> > This is lightly tested at this point, mostly to gather comments on the
> > approach.
> 
> Looks good, but please kill the notdirty stuff first.  It's not needed
> and it's a clear bug.  When migrating, it can lead to PTEs being
> migrated without accessed and dirty bits.

I thought that too but looking closely with rth, it seems it's still
setting *those* dirty bits, it just avoids the collision test with the
translator. Unless I missed something...

Do you still want to kill them? They are warty, no doubt...
 
For powerpc I need a cmpxchgq variant too, I'll probably split the
patch adding those mem ops from the rest as well.

Annoyingly, fixing riscv will need some tests on target_ulong size.

Cheers,
Ben.

> Paolo
> 
> > I noticed that RiscV is attempting to do something similar but with endian
> > bugs, at least from my reading of the code.
> > 
> >  include/exec/memory_ldst.inc.h |   3 +
> >  memory_ldst.inc.c              |  38 ++++++++++++
> >  target/i386/excp_helper.c      | 104 +++++++++++++++++++++++++--------
> >  3 files changed, 121 insertions(+), 24 deletions(-)
> > 
> > diff --git a/include/exec/memory_ldst.inc.h b/include/exec/memory_ldst.inc.h
> > index 272c20f02e..b7a41a0ad4 100644
> > --- a/include/exec/memory_ldst.inc.h
> > +++ b/include/exec/memory_ldst.inc.h
> > @@ -28,6 +28,9 @@ extern uint64_t glue(address_space_ldq, SUFFIX)(ARG1_DECL,
> >      hwaddr addr, MemTxAttrs attrs, MemTxResult *result);
> >  extern void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
> >      hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
> > +extern uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
> > +    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs,
> > +    MemTxResult *result);
> >  extern void glue(address_space_stw, SUFFIX)(ARG1_DECL,
> >      hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
> >  extern void glue(address_space_stl, SUFFIX)(ARG1_DECL,
> > diff --git a/memory_ldst.inc.c b/memory_ldst.inc.c
> > index acf865b900..63af8f7dd2 100644
> > --- a/memory_ldst.inc.c
> > +++ b/memory_ldst.inc.c
> > @@ -320,6 +320,44 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
> >      RCU_READ_UNLOCK();
> >  }
> >  
> > +/* This is meant to be used for atomic PTE updates under MT-TCG */
> > +uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
> > +    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs, MemTxResult *result)
> > +{
> > +    uint8_t *ptr;
> > +    MemoryRegion *mr;
> > +    hwaddr l = 4;
> > +    hwaddr addr1;
> > +    MemTxResult r;
> > +    uint8_t dirty_log_mask;
> > +
> > +    /* Must test result */
> > +    assert(result);
> > +
> > +    RCU_READ_LOCK();
> > +    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
> > +    if (l < 4 || !memory_access_is_direct(mr, true)) {
> > +        r = MEMTX_ERROR;
> > +    } else {
> > +        uint32_t orig = old;
> > +
> > +        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
> > +        old = atomic_cmpxchg(ptr, orig, new);
> > +
> > +        if (old == orig) {
> > +            dirty_log_mask = memory_region_get_dirty_log_mask(mr);
> > +            dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
> > +            cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
> > +                                                4, dirty_log_mask);
> > +        }
> > +        r = MEMTX_OK;
> > +    }
> > +    *result = r;
> > +    RCU_READ_UNLOCK();
> > +
> > +    return old;
> > +}
> > +
> >  /* warning: addr must be aligned */
> >  static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL,
> >      hwaddr addr, uint32_t val, MemTxAttrs attrs,
> > diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
> > index 49231f6b69..5ccb9d6d6a 100644
> > --- a/target/i386/excp_helper.c
> > +++ b/target/i386/excp_helper.c
> > @@ -157,11 +157,45 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
> >  
> >  #else
> >  
> > +static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
> > +                                    uint64_t orig_entry, uint32_t bits)
> > +{
> > +    uint64_t new_entry = orig_entry | bits;
> > +
> > +    /* Write the updated bottom 32-bits */
> > +    if (qemu_tcg_mttcg_enabled()) {
> > +        uint32_t old_le = cpu_to_le32(orig_entry);
> > +        uint32_t new_le = cpu_to_le32(new_entry);
> > +        MemTxResult result;
> > +        uint32_t old_ret;
> > +
> > +        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
> > +                                                  old_le, new_le,
> > +                                                  MEMTXATTRS_UNSPECIFIED,
> > +                                                  &result);
> > +        if (result == MEMTX_OK) {
> > +            if (old_ret != old_le) {
> > +                new_entry = 0;
> > +            }
> > +            return new_entry;
> > +        }
> > +
> > +        /* Do we need to support this case where PTEs aren't in RAM ?
> > +         *
> > +         * For now fallback to non-atomic case
> > +         */
> > +    }
> > +
> > +    x86_stl_phys_notdirty(cs, addr, new_entry);
> > +
> > +    return new_entry;
> > +}
> > +
> >  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
> >                          int *prot)
> >  {
> >      CPUX86State *env = &X86_CPU(cs)->env;
> > -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
> > +    uint64_t rsvd_mask;
> >      uint64_t ptep, pte;
> >      uint64_t exit_info_1 = 0;
> >      target_ulong pde_addr, pte_addr;
> > @@ -172,6 +206,8 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
> >          return gphys;
> >      }
> >  
> > + restart:
> > +    rsvd_mask = PG_HI_RSVD_MASK;
> >      if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
> >          rsvd_mask |= PG_NX_MASK;
> >      }
> > @@ -198,8 +234,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
> >                  goto do_fault_rsvd;
> >              }
> >              if (!(pml4e & PG_ACCESSED_MASK)) {
> > -                pml4e |= PG_ACCESSED_MASK;
> > -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
> > +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
> > +                if (!pml4e) {
> > +                    goto restart;
> > +                }
> >              }
> >              ptep &= pml4e ^ PG_NX_MASK;
> >              pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
> > @@ -213,8 +251,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
> >              }
> >              ptep &= pdpe ^ PG_NX_MASK;
> >              if (!(pdpe & PG_ACCESSED_MASK)) {
> > -                pdpe |= PG_ACCESSED_MASK;
> > -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
> > +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
> > +                if (!pdpe) {
> > +                    goto restart;
> > +                }
> >              }
> >              if (pdpe & PG_PSE_MASK) {
> >                  /* 1 GB page */
> > @@ -256,8 +296,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
> >          }
> >          /* 4 KB page */
> >          if (!(pde & PG_ACCESSED_MASK)) {
> > -            pde |= PG_ACCESSED_MASK;
> > -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> > +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> > +            if (!pde) {
> > +                goto restart;
> > +            }
> >          }
> >          pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
> >          pte = x86_ldq_phys(cs, pte_addr);
> > @@ -295,8 +337,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
> >          }
> >  
> >          if (!(pde & PG_ACCESSED_MASK)) {
> > -            pde |= PG_ACCESSED_MASK;
> > -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> > +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> > +            if (!pde) {
> > +                goto restart;
> > +            }
> >          }
> >  
> >          /* page directory entry */
> > @@ -376,7 +420,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
> >      int error_code = 0;
> >      int is_dirty, prot, page_size, is_write, is_user;
> >      hwaddr paddr;
> > -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
> > +    uint64_t rsvd_mask;
> >      uint32_t page_offset;
> >      target_ulong vaddr;
> >  
> > @@ -401,6 +445,8 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
> >          goto do_mapping;
> >      }
> >  
> > + restart:
> > +    rsvd_mask = PG_HI_RSVD_MASK;
> >      if (!(env->efer & MSR_EFER_NXE)) {
> >          rsvd_mask |= PG_NX_MASK;
> >      }
> > @@ -436,8 +482,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
> >                      goto do_fault_rsvd;
> >                  }
> >                  if (!(pml5e & PG_ACCESSED_MASK)) {
> > -                    pml5e |= PG_ACCESSED_MASK;
> > -                    x86_stl_phys_notdirty(cs, pml5e_addr, pml5e);
> > +                    pml5e = update_entry(cs, pml5e_addr, pml5e, PG_ACCESSED_MASK);
> > +                    if (!pml5e) {
> > +                        goto restart;
> > +                    }
> >                  }
> >                  ptep = pml5e ^ PG_NX_MASK;
> >              } else {
> > @@ -456,8 +504,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
> >                  goto do_fault_rsvd;
> >              }
> >              if (!(pml4e & PG_ACCESSED_MASK)) {
> > -                pml4e |= PG_ACCESSED_MASK;
> > -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
> > +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
> > +                if (!pml4e) {
> > +                    goto restart;
> > +                }
> >              }
> >              ptep &= pml4e ^ PG_NX_MASK;
> >              pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
> > @@ -472,8 +522,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
> >              }
> >              ptep &= pdpe ^ PG_NX_MASK;
> >              if (!(pdpe & PG_ACCESSED_MASK)) {
> > -                pdpe |= PG_ACCESSED_MASK;
> > -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
> > +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
> > +                if (!pdpe) {
> > +                    goto restart;
> > +                }
> >              }
> >              if (pdpe & PG_PSE_MASK) {
> >                  /* 1 GB page */
> > @@ -520,8 +572,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
> >          }
> >          /* 4 KB page */
> >          if (!(pde & PG_ACCESSED_MASK)) {
> > -            pde |= PG_ACCESSED_MASK;
> > -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> > +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> > +            if (!pde) {
> > +                goto restart;
> > +            }
> >          }
> >          pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
> >              a20_mask;
> > @@ -563,8 +617,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
> >          }
> >  
> >          if (!(pde & PG_ACCESSED_MASK)) {
> > -            pde |= PG_ACCESSED_MASK;
> > -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> > +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> > +            if (!pde) {
> > +                goto restart;
> > +            }
> >          }
> >  
> >          /* page directory entry */
> > @@ -634,11 +690,11 @@ do_check_protect_pse36:
> >      /* yes, it can! */
> >      is_dirty = is_write && !(pte & PG_DIRTY_MASK);
> >      if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
> > -        pte |= PG_ACCESSED_MASK;
> > -        if (is_dirty) {
> > -            pte |= PG_DIRTY_MASK;
> > +        pte = update_entry(cs, pte_addr, pte,
> > +                           PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : 0));
> > +        if (!pte) {
> > +            goto restart;
> >          }
> > -        x86_stl_phys_notdirty(cs, pte_addr, pte);
> >      }
> >  
> >      if (!(pte & PG_DIRTY_MASK)) {
> > 
> >
Paolo Bonzini Nov. 29, 2018, 10:10 a.m. UTC | #3
On 29/11/18 11:04, Benjamin Herrenschmidt wrote:
> On Thu, 2018-11-29 at 10:26 +0100, Paolo Bonzini wrote:
>> On 29/11/18 00:15, Benjamin Herrenschmidt wrote:
>>> Afaik, this isn't well documented (at least it wasn't when I last looked)
>>> but OSes such as Linux rely on this behaviour:
>>>
>>> The HW updates to the page tables need to be done atomically with the
>>> checking of the present bit (and other permissions).
>>>
>>> This is what allows Linux to do simple xchg of PTEs with 0 and assume
>>> the value read has "final" stable dirty and accessed bits (the TLB
>>> invalidation is deferred).
>>>
>>> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>>> ---
>>>
>>> This is lightly tested at this point, mostly to gather comments on the
>>> approach.
>>
>> Looks good, but please kill the notdirty stuff first.  It's not needed
>> and it's a clear bug.  When migrating, it can lead to PTEs being
>> migrated without accessed and dirty bits.
> 
> I thought that too but looking closely with rth, it seems it's still
> setting *those* dirty bits, it just avoids the collision test with the
> translator.

Ah, you're right.  I guess it's a minor performance optimization from
avoiding some pointer chasing in page_find (radix tree walk).

Paolo

> For powerpc I need a cmpxchgq variant too, I'll probably split the
> patch adding those mem ops from the rest as well.
> 
> Annoyingly, fixing riscv will need some tests on target_ulong size.
> 
> Cheers,
> Ben.
> 
>> Paolo
>>
>>> I noticed that RiscV is attempting to do something similar but with endian
>>> bugs, at least from my reading of the code.
>>>
>>>  include/exec/memory_ldst.inc.h |   3 +
>>>  memory_ldst.inc.c              |  38 ++++++++++++
>>>  target/i386/excp_helper.c      | 104 +++++++++++++++++++++++++--------
>>>  3 files changed, 121 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/include/exec/memory_ldst.inc.h b/include/exec/memory_ldst.inc.h
>>> index 272c20f02e..b7a41a0ad4 100644
>>> --- a/include/exec/memory_ldst.inc.h
>>> +++ b/include/exec/memory_ldst.inc.h
>>> @@ -28,6 +28,9 @@ extern uint64_t glue(address_space_ldq, SUFFIX)(ARG1_DECL,
>>>      hwaddr addr, MemTxAttrs attrs, MemTxResult *result);
>>>  extern void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
>>>      hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
>>> +extern uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
>>> +    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs,
>>> +    MemTxResult *result);
>>>  extern void glue(address_space_stw, SUFFIX)(ARG1_DECL,
>>>      hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
>>>  extern void glue(address_space_stl, SUFFIX)(ARG1_DECL,
>>> diff --git a/memory_ldst.inc.c b/memory_ldst.inc.c
>>> index acf865b900..63af8f7dd2 100644
>>> --- a/memory_ldst.inc.c
>>> +++ b/memory_ldst.inc.c
>>> @@ -320,6 +320,44 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
>>>      RCU_READ_UNLOCK();
>>>  }
>>>  
>>> +/* This is meant to be used for atomic PTE updates under MT-TCG */
>>> +uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
>>> +    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs, MemTxResult *result)
>>> +{
>>> +    uint8_t *ptr;
>>> +    MemoryRegion *mr;
>>> +    hwaddr l = 4;
>>> +    hwaddr addr1;
>>> +    MemTxResult r;
>>> +    uint8_t dirty_log_mask;
>>> +
>>> +    /* Must test result */
>>> +    assert(result);
>>> +
>>> +    RCU_READ_LOCK();
>>> +    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
>>> +    if (l < 4 || !memory_access_is_direct(mr, true)) {
>>> +        r = MEMTX_ERROR;
>>> +    } else {
>>> +        uint32_t orig = old;
>>> +
>>> +        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
>>> +        old = atomic_cmpxchg(ptr, orig, new);
>>> +
>>> +        if (old == orig) {
>>> +            dirty_log_mask = memory_region_get_dirty_log_mask(mr);
>>> +            dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
>>> +            cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
>>> +                                                4, dirty_log_mask);
>>> +        }
>>> +        r = MEMTX_OK;
>>> +    }
>>> +    *result = r;
>>> +    RCU_READ_UNLOCK();
>>> +
>>> +    return old;
>>> +}
>>> +
>>>  /* warning: addr must be aligned */
>>>  static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL,
>>>      hwaddr addr, uint32_t val, MemTxAttrs attrs,
>>> diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
>>> index 49231f6b69..5ccb9d6d6a 100644
>>> --- a/target/i386/excp_helper.c
>>> +++ b/target/i386/excp_helper.c
>>> @@ -157,11 +157,45 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>>  
>>>  #else
>>>  
>>> +static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
>>> +                                    uint64_t orig_entry, uint32_t bits)
>>> +{
>>> +    uint64_t new_entry = orig_entry | bits;
>>> +
>>> +    /* Write the updated bottom 32-bits */
>>> +    if (qemu_tcg_mttcg_enabled()) {
>>> +        uint32_t old_le = cpu_to_le32(orig_entry);
>>> +        uint32_t new_le = cpu_to_le32(new_entry);
>>> +        MemTxResult result;
>>> +        uint32_t old_ret;
>>> +
>>> +        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
>>> +                                                  old_le, new_le,
>>> +                                                  MEMTXATTRS_UNSPECIFIED,
>>> +                                                  &result);
>>> +        if (result == MEMTX_OK) {
>>> +            if (old_ret != old_le) {
>>> +                new_entry = 0;
>>> +            }
>>> +            return new_entry;
>>> +        }
>>> +
>>> +        /* Do we need to support this case where PTEs aren't in RAM ?
>>> +         *
>>> +         * For now fallback to non-atomic case
>>> +         */
>>> +    }
>>> +
>>> +    x86_stl_phys_notdirty(cs, addr, new_entry);
>>> +
>>> +    return new_entry;
>>> +}
>>> +
>>>  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>>                          int *prot)
>>>  {
>>>      CPUX86State *env = &X86_CPU(cs)->env;
>>> -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
>>> +    uint64_t rsvd_mask;
>>>      uint64_t ptep, pte;
>>>      uint64_t exit_info_1 = 0;
>>>      target_ulong pde_addr, pte_addr;
>>> @@ -172,6 +206,8 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>>          return gphys;
>>>      }
>>>  
>>> + restart:
>>> +    rsvd_mask = PG_HI_RSVD_MASK;
>>>      if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
>>>          rsvd_mask |= PG_NX_MASK;
>>>      }
>>> @@ -198,8 +234,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>>                  goto do_fault_rsvd;
>>>              }
>>>              if (!(pml4e & PG_ACCESSED_MASK)) {
>>> -                pml4e |= PG_ACCESSED_MASK;
>>> -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
>>> +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
>>> +                if (!pml4e) {
>>> +                    goto restart;
>>> +                }
>>>              }
>>>              ptep &= pml4e ^ PG_NX_MASK;
>>>              pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
>>> @@ -213,8 +251,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>>              }
>>>              ptep &= pdpe ^ PG_NX_MASK;
>>>              if (!(pdpe & PG_ACCESSED_MASK)) {
>>> -                pdpe |= PG_ACCESSED_MASK;
>>> -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
>>> +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
>>> +                if (!pdpe) {
>>> +                    goto restart;
>>> +                }
>>>              }
>>>              if (pdpe & PG_PSE_MASK) {
>>>                  /* 1 GB page */
>>> @@ -256,8 +296,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>>          }
>>>          /* 4 KB page */
>>>          if (!(pde & PG_ACCESSED_MASK)) {
>>> -            pde |= PG_ACCESSED_MASK;
>>> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
>>> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
>>> +            if (!pde) {
>>> +                goto restart;
>>> +            }
>>>          }
>>>          pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
>>>          pte = x86_ldq_phys(cs, pte_addr);
>>> @@ -295,8 +337,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>>          }
>>>  
>>>          if (!(pde & PG_ACCESSED_MASK)) {
>>> -            pde |= PG_ACCESSED_MASK;
>>> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
>>> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
>>> +            if (!pde) {
>>> +                goto restart;
>>> +            }
>>>          }
>>>  
>>>          /* page directory entry */
>>> @@ -376,7 +420,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>>      int error_code = 0;
>>>      int is_dirty, prot, page_size, is_write, is_user;
>>>      hwaddr paddr;
>>> -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
>>> +    uint64_t rsvd_mask;
>>>      uint32_t page_offset;
>>>      target_ulong vaddr;
>>>  
>>> @@ -401,6 +445,8 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>>          goto do_mapping;
>>>      }
>>>  
>>> + restart:
>>> +    rsvd_mask = PG_HI_RSVD_MASK;
>>>      if (!(env->efer & MSR_EFER_NXE)) {
>>>          rsvd_mask |= PG_NX_MASK;
>>>      }
>>> @@ -436,8 +482,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>>                      goto do_fault_rsvd;
>>>                  }
>>>                  if (!(pml5e & PG_ACCESSED_MASK)) {
>>> -                    pml5e |= PG_ACCESSED_MASK;
>>> -                    x86_stl_phys_notdirty(cs, pml5e_addr, pml5e);
>>> +                    pml5e = update_entry(cs, pml5e_addr, pml5e, PG_ACCESSED_MASK);
>>> +                    if (!pml5e) {
>>> +                        goto restart;
>>> +                    }
>>>                  }
>>>                  ptep = pml5e ^ PG_NX_MASK;
>>>              } else {
>>> @@ -456,8 +504,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>>                  goto do_fault_rsvd;
>>>              }
>>>              if (!(pml4e & PG_ACCESSED_MASK)) {
>>> -                pml4e |= PG_ACCESSED_MASK;
>>> -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
>>> +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
>>> +                if (!pml4e) {
>>> +                    goto restart;
>>> +                }
>>>              }
>>>              ptep &= pml4e ^ PG_NX_MASK;
>>>              pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
>>> @@ -472,8 +522,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>>              }
>>>              ptep &= pdpe ^ PG_NX_MASK;
>>>              if (!(pdpe & PG_ACCESSED_MASK)) {
>>> -                pdpe |= PG_ACCESSED_MASK;
>>> -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
>>> +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
>>> +                if (!pdpe) {
>>> +                    goto restart;
>>> +                }
>>>              }
>>>              if (pdpe & PG_PSE_MASK) {
>>>                  /* 1 GB page */
>>> @@ -520,8 +572,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>>          }
>>>          /* 4 KB page */
>>>          if (!(pde & PG_ACCESSED_MASK)) {
>>> -            pde |= PG_ACCESSED_MASK;
>>> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
>>> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
>>> +            if (!pde) {
>>> +                goto restart;
>>> +            }
>>>          }
>>>          pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
>>>              a20_mask;
>>> @@ -563,8 +617,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>>          }
>>>  
>>>          if (!(pde & PG_ACCESSED_MASK)) {
>>> -            pde |= PG_ACCESSED_MASK;
>>> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
>>> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
>>> +            if (!pde) {
>>> +                goto restart;
>>> +            }
>>>          }
>>>  
>>>          /* page directory entry */
>>> @@ -634,11 +690,11 @@ do_check_protect_pse36:
>>>      /* yes, it can! */
>>>      is_dirty = is_write && !(pte & PG_DIRTY_MASK);
>>>      if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
>>> -        pte |= PG_ACCESSED_MASK;
>>> -        if (is_dirty) {
>>> -            pte |= PG_DIRTY_MASK;
>>> +        pte = update_entry(cs, pte_addr, pte,
>>> +                           PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : 0));
>>> +        if (!pte) {
>>> +            goto restart;
>>>          }
>>> -        x86_stl_phys_notdirty(cs, pte_addr, pte);
>>>      }
>>>  
>>>      if (!(pte & PG_DIRTY_MASK)) {
>>>
>>>
>
Richard Henderson Nov. 29, 2018, 7:04 p.m. UTC | #4
On 11/28/18 3:15 PM, Benjamin Herrenschmidt wrote:
> +static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
> +                                    uint64_t orig_entry, uint32_t bits)
> +{
> +    uint64_t new_entry = orig_entry | bits;
> +
> +    /* Write the updated bottom 32-bits */
> +    if (qemu_tcg_mttcg_enabled()) {
> +        uint32_t old_le = cpu_to_le32(orig_entry);
> +        uint32_t new_le = cpu_to_le32(new_entry);
> +        MemTxResult result;
> +        uint32_t old_ret;
> +
> +        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
> +                                                  old_le, new_le,
> +                                                  MEMTXATTRS_UNSPECIFIED,
> +                                                  &result);
> +        if (result == MEMTX_OK) {
> +            if (old_ret != old_le) {
> +                new_entry = 0;
> +            }
> +            return new_entry;
> +        }
> +
> +        /* Do we need to support this case where PTEs aren't in RAM ?
> +         *
> +         * For now fallback to non-atomic case
> +         */
> +    }
> +
> +    x86_stl_phys_notdirty(cs, addr, new_entry);
> +
> +    return new_entry;
> +}

While I know the existing code just updates the low 32-bits, I'd rather update
the whole entry, and make use of the old value returned from the cmpxchg.

E.g.

static bool update_entry(CPUState *cs, target_ulong addr,
                         uint64_t *entry, uint32_t bits)
{
    uint64_t old_entry = *entry;
    uint64_t new_entry = old_entry | bits;

    if (qemu_tcg_mttcg_enabled()) {
        uint64_t cmp_le = cpu_to_le64(old_entry);
        uint64_t new_le = cpu_to_le64(new_entry);
        uint64_t old_le;
        MemTxResult result;

        old_le = address_space_cmpxchgl_notdirty(cs->as, addr,
                                                 cmp_le, new_le,
                                                 MEMTXATTRS_UNSPECIFIED,
                                                 &result);
        if (result == MEMTX_OK) {
            if (old_le == cmp_le) {
                return true;
            } else {
                *entry = le64_to_cpu(old_le);
                return false;
            }
        }
    }
    x86_stq_phys_notdirty(cs, addr, new_entry);
    return true;
}

....

    pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
                (((gphys >> 30) & 0x1ff) << 3);
    pdpe = x86_ldq_phys(cs, pdpe_addr);
    do {
        if (!(pdpe & PG_PRESENT_MASK)) {
            goto do_fault;
        }
        if (pdpe & rsvd_mask) {
            goto do_fault_rsvd;
        }
        if (pdpe & PG_ACCESSED_MASK) {
            break;
        }
    } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
    ptep &= pdpe ^ PG_NX_MASK;

....

Although I think it would be really great if we could figure out something that
allows us to promote this whole load/cmpxchg loop into a primitive that avoids
multiple translations of the address.

No, I don't know what that primitive would look like.  :-)


r~
Benjamin Herrenschmidt Nov. 29, 2018, 10:54 p.m. UTC | #5
On Thu, 2018-11-29 at 11:04 -0800, Richard Henderson wrote:
> On 11/28/18 3:15 PM, Benjamin Herrenschmidt wrote:
> > +static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
> > +                                    uint64_t orig_entry, uint32_t bits)
> > +{
> > +    uint64_t new_entry = orig_entry | bits;
> > +
> > +    /* Write the updated bottom 32-bits */
> > +    if (qemu_tcg_mttcg_enabled()) {
> > +        uint32_t old_le = cpu_to_le32(orig_entry);
> > +        uint32_t new_le = cpu_to_le32(new_entry);
> > +        MemTxResult result;
> > +        uint32_t old_ret;
> > +
> > +        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
> > +                                                  old_le, new_le,
> > +                                                  MEMTXATTRS_UNSPECIFIED,
> > +                                                  &result);
> > +        if (result == MEMTX_OK) {
> > +            if (old_ret != old_le) {
> > +                new_entry = 0;
> > +            }
> > +            return new_entry;
> > +        }
> > +
> > +        /* Do we need to support this case where PTEs aren't in RAM ?
> > +         *
> > +         * For now fallback to non-atomic case
> > +         */
> > +    }
> > +
> > +    x86_stl_phys_notdirty(cs, addr, new_entry);
> > +
> > +    return new_entry;
> > +}
> 
> While I know the existing code just updates the low 32-bits, I'd rather update
> the whole entry, and make use of the old value returned from the cmpxchg.

Not all PTEs on x86 are 64-bits, the old 32-bit format still exists, so
we'd have to create two helpers. Not a big deal mind you. I'm already
creating a second address_space_cmpxchgq_notdirty for use by powerpc

> static bool update_entry(CPUState *cs, target_ulong addr,
>                          uint64_t *entry, uint32_t bits)
> {
>     uint64_t old_entry = *entry;
>     uint64_t new_entry = old_entry | bits;
> 
>     if (qemu_tcg_mttcg_enabled()) {
>         uint64_t cmp_le = cpu_to_le64(old_entry);
>         uint64_t new_le = cpu_to_le64(new_entry);
>         uint64_t old_le;
>         MemTxResult result;
> 
>         old_le = address_space_cmpxchgl_notdirty(cs->as, addr,
>                                                  cmp_le, new_le,
>                                                  MEMTXATTRS_UNSPECIFIED,
>                                                  &result);
>         if (result == MEMTX_OK) {
>             if (old_le == cmp_le) {
>                 return true;
>             } else {
>                 *entry = le_to_cpu64(old_le);
>                 return false;
>             }
>         }
>     }
>     x86_stq_phys_notdirty(cs, addr, new_entry);
>     return true;
> }


> ....
> 
>     pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
>                 (((gphys >> 30) & 0x1ff) << 3);
>     pdpe = x86_ldq_phys(cs, pdpe_addr);
>     do {
>         if (!(pdpe & PG_PRESENT_MASK)) {
>             goto do_fault;
>         }
>         if (pdpe & rsvd_mask) {
>             goto do_fault_rsvd;
>         }
>         if (pdpe & PG_ACCESSED_MASK) {
>             break;
>         }
>     } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
>     ptep &= pdpe ^ PG_NX_MASK;
> 
> ....

Hrm.. I see. So not re-do the full walk. Not sure it's really worth it
though, how often do we expect to hit the failing case ?

> 
> Although I think it would be really great if we could figure out something that
> allows us to promote this whole load/cmpxchg loop into a primitive that avoids
> multiple translations of the address.
> 
> No, I don't know what that primitive would look like.  :-)

You mean translating once for the load and for the update ? Do you
expect that translation to have such a significant cost considering
that all it needs should be in L1 at that point ?

There are subtle differences in what happens between the load and the
update, and we hope for the update to be fairly rare (even though it's
done by HW it's costly even on actual chips).

So best way I could think of would be to use a macro, so we can open-
code the statements that test/manipulate the value.

Not sure I'll get to it today though, I'm off early this afternoon for
the week-end.

Cheers,
Ben.
> 
> r~
Richard Henderson Nov. 30, 2018, 12:12 a.m. UTC | #6
On 11/29/18 2:54 PM, Benjamin Herrenschmidt wrote:
>>     pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
>>                 (((gphys >> 30) & 0x1ff) << 3);
>>     pdpe = x86_ldq_phys(cs, pdpe_addr);
>>     do {
>>         if (!(pdpe & PG_PRESENT_MASK)) {
>>             goto do_fault;
>>         }
>>         if (pdpe & rsvd_mask) {
>>             goto do_fault_rsvd;
>>         }
>>         if (pdpe & PG_ACCESSED_MASK) {
>>             break;
>>         }
>>     } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
>>     ptep &= pdpe ^ PG_NX_MASK;
>>
>> ....
> 
> Hrm.. I see. So not re-do the full walk. Not sure it's really worth it
> though, how often do we expect to hit the failing case ?

It is probably rare-ish, I admit.

I suppose we could also signal "success" from update_entry if the cmpxchg
fails, but the value that was reloaded only differs in setting PG_ACCESSED_MASK
| PG_DIRTY_MASK, as long as 'bits' itself was set.

>> Although I think it would be really great if we could figure out something that
>> allows us to promote this whole load/cmpxchg loop into a primitive that avoids
>> multiple translations of the address.
>>
>> No, I don't know what that primitive would look like.  :-)
> 
> You mean translating once for the load and for the update ? Do you
> expect that translation to have such a significant cost considering
> that all it needs should be in L1 at that point ?

I guess if the update is rare-ish, the re-translating isn't a big deal.  And I
suppose we'd have to retain the RCU lock to hold on to the translation, which
probably isn't the best idea.

Nevermind on this.


r~
Benjamin Herrenschmidt Nov. 30, 2018, 2:12 a.m. UTC | #7
On Thu, 2018-11-29 at 16:12 -0800, Richard Henderson wrote:
> > You mean translating once for the load and for the update ? Do you
> > expect that translation to have such a significant cost considering
> > that all it needs should be in L1 at that point ?
> 
> I guess if the update is rare-ish, the re-translating isn't a big deal.  And I
> suppose we'd have to retain the RCU lock to hold on to the translation, which
> probably isn't the best idea.

Yeah, I toyed with this a bit but it ended up very messy real quick.

Cheers,
Ben.
Benjamin Herrenschmidt Nov. 30, 2018, 2:18 a.m. UTC | #8
On Thu, 2018-11-29 at 16:12 -0800, Richard Henderson wrote:
> On 11/29/18 2:54 PM, Benjamin Herrenschmidt wrote:
> > >      pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
> > >                  (((gphys >> 30) & 0x1ff) << 3);
> > >      pdpe = x86_ldq_phys(cs, pdpe_addr);
> > >      do {
> > >          if (!(pdpe & PG_PRESENT_MASK)) {
> > >              goto do_fault;
> > >          }
> > >          if (pdpe & rsvd_mask) {
> > >              goto do_fault_rsvd;
> > >          }
> > >          if (pdpe & PG_ACCESSED_MASK) {
> > >              break;
> > >          }
> > >      } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
> > >      ptep &= pdpe ^ PG_NX_MASK;
> > > 
> > > ....
> > 
> > Hrm.. I see. So not re-do the full walk. Not sure it's really worth it
> > though, how often do we expect to hit the failing case ?
> 
> It is probably rare-ish, I admit.
> 
> I suppose we could also signal "success" from update_entry if the cmpxchg
> fails, but the value that was reloaded only differs in setting PG_ACCESSED_MASK

The latter optimization is trivial. As for the former, replacing my
"goto restart" with those loops, it will make the patch significantly
bigger, though not adding another goto has its perks and the end result
might be slightly nicer ...

What way do you prefer ?

Cheers,
Ben.
Benjamin Herrenschmidt Dec. 13, 2018, 11:39 p.m. UTC | #9
On Thu, 2018-11-29 at 11:04 -0800, Richard Henderson wrote:
> 
> While I know the existing code just updates the low 32-bits, I'd rather update
> the whole entry, and make use of the old value returned from the cmpxchg.

So I had to put this on the back burner for a bit, but I'm back to it
now.

I've tried the above but it gets messy again. The i386 code can have
either 32 or 64 bits PDE/PTEs. The udpate code for the PTE is designed
so a single code path can work with either by exploiting that LE trick,
so changing that would involve more re-factoring and I'd rather avoid
changing more than strictly needed in there.

As for this:

>    pdpe = x86_ldq_phys(cs, pdpe_addr);
>     do {
>         if (!(pdpe & PG_PRESENT_MASK)) {
>             goto do_fault;
>         }
>         if (pdpe & rsvd_mask) {
>             goto do_fault_rsvd;
>         }
>         if (pdpe & PG_ACCESSED_MASK) {
>             break;
>         }
>     } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
>     ptep &= pdpe ^ PG_NX_MASK;

While that form of construct is nicer than the current goto restart in
my patch, in a similar way, it works for the intermediary cases but
would require some serious refactoring of the whole function for the
final PTE case.

At this point I'd rather not take chances and introduce more bugs, so
I'll post an updated variant with a few fixes but will postpone more
refactoring unless you really really want them now :)

Cheers,
Ben.
Richard Henderson Dec. 13, 2018, 11:47 p.m. UTC | #10
On 12/13/18 5:39 PM, Benjamin Herrenschmidt wrote:
> At this point I'd rather not take chances and introduce more bugs, so
> I'll post an updated variant with a few fixes but will postpone more
> refactoring unless you really really want them now :)

No, that's quite all right.  Thanks for the investigation.

r~
diff mbox series

Patch

diff --git a/include/exec/memory_ldst.inc.h b/include/exec/memory_ldst.inc.h
index 272c20f02e..b7a41a0ad4 100644
--- a/include/exec/memory_ldst.inc.h
+++ b/include/exec/memory_ldst.inc.h
@@ -28,6 +28,9 @@  extern uint64_t glue(address_space_ldq, SUFFIX)(ARG1_DECL,
     hwaddr addr, MemTxAttrs attrs, MemTxResult *result);
 extern void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
     hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
+extern uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
+    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs,
+    MemTxResult *result);
 extern void glue(address_space_stw, SUFFIX)(ARG1_DECL,
     hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
 extern void glue(address_space_stl, SUFFIX)(ARG1_DECL,
diff --git a/memory_ldst.inc.c b/memory_ldst.inc.c
index acf865b900..63af8f7dd2 100644
--- a/memory_ldst.inc.c
+++ b/memory_ldst.inc.c
@@ -320,6 +320,44 @@  void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
     RCU_READ_UNLOCK();
 }
 
+/* This is meant to be used for atomic PTE updates under MT-TCG */
+uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
+    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs, MemTxResult *result)
+{
+    uint8_t *ptr;
+    MemoryRegion *mr;
+    hwaddr l = 4;
+    hwaddr addr1;
+    MemTxResult r;
+    uint8_t dirty_log_mask;
+
+    /* Must test result */
+    assert(result);
+
+    RCU_READ_LOCK();
+    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
+    if (l < 4 || !memory_access_is_direct(mr, true)) {
+        r = MEMTX_ERROR;
+    } else {
+        uint32_t orig = old;
+
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
+        old = atomic_cmpxchg(ptr, orig, new);
+
+        if (old == orig) {
+            dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+            dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
+            cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
+                                                4, dirty_log_mask);
+        }
+        r = MEMTX_OK;
+    }
+    *result = r;
+    RCU_READ_UNLOCK();
+
+    return old;
+}
+
 /* warning: addr must be aligned */
 static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL,
     hwaddr addr, uint32_t val, MemTxAttrs attrs,
diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
index 49231f6b69..5ccb9d6d6a 100644
--- a/target/i386/excp_helper.c
+++ b/target/i386/excp_helper.c
@@ -157,11 +157,45 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
 
 #else
 
+static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
+                                    uint64_t orig_entry, uint32_t bits)
+{
+    uint64_t new_entry = orig_entry | bits;
+
+    /* Write the updated bottom 32-bits */
+    if (qemu_tcg_mttcg_enabled()) {
+        uint32_t old_le = cpu_to_le32(orig_entry);
+        uint32_t new_le = cpu_to_le32(new_entry);
+        MemTxResult result;
+        uint32_t old_ret;
+
+        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
+                                                  old_le, new_le,
+                                                  MEMTXATTRS_UNSPECIFIED,
+                                                  &result);
+        if (result == MEMTX_OK) {
+            if (old_ret != old_le) {
+                new_entry = 0;
+            }
+            return new_entry;
+        }
+
+        /* Do we need to support this case where PTEs aren't in RAM ?
+         *
+         * For now fallback to non-atomic case
+         */
+    }
+
+    x86_stl_phys_notdirty(cs, addr, new_entry);
+
+    return new_entry;
+}
+
 static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
                         int *prot)
 {
     CPUX86State *env = &X86_CPU(cs)->env;
-    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
+    uint64_t rsvd_mask;
     uint64_t ptep, pte;
     uint64_t exit_info_1 = 0;
     target_ulong pde_addr, pte_addr;
@@ -172,6 +206,8 @@  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         return gphys;
     }
 
+ restart:
+    rsvd_mask = PG_HI_RSVD_MASK;
     if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
         rsvd_mask |= PG_NX_MASK;
     }
@@ -198,8 +234,10 @@  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
                 goto do_fault_rsvd;
             }
             if (!(pml4e & PG_ACCESSED_MASK)) {
-                pml4e |= PG_ACCESSED_MASK;
-                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
+                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
+                if (!pml4e) {
+                    goto restart;
+                }
             }
             ptep &= pml4e ^ PG_NX_MASK;
             pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
@@ -213,8 +251,10 @@  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
             }
             ptep &= pdpe ^ PG_NX_MASK;
             if (!(pdpe & PG_ACCESSED_MASK)) {
-                pdpe |= PG_ACCESSED_MASK;
-                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
+                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
+                if (!pdpe) {
+                    goto restart;
+                }
             }
             if (pdpe & PG_PSE_MASK) {
                 /* 1 GB page */
@@ -256,8 +296,10 @@  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         }
         /* 4 KB page */
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
         pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
         pte = x86_ldq_phys(cs, pte_addr);
@@ -295,8 +337,10 @@  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         }
 
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
 
         /* page directory entry */
@@ -376,7 +420,7 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
     int error_code = 0;
     int is_dirty, prot, page_size, is_write, is_user;
     hwaddr paddr;
-    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
+    uint64_t rsvd_mask;
     uint32_t page_offset;
     target_ulong vaddr;
 
@@ -401,6 +445,8 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         goto do_mapping;
     }
 
+ restart:
+    rsvd_mask = PG_HI_RSVD_MASK;
     if (!(env->efer & MSR_EFER_NXE)) {
         rsvd_mask |= PG_NX_MASK;
     }
@@ -436,8 +482,10 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
                     goto do_fault_rsvd;
                 }
                 if (!(pml5e & PG_ACCESSED_MASK)) {
-                    pml5e |= PG_ACCESSED_MASK;
-                    x86_stl_phys_notdirty(cs, pml5e_addr, pml5e);
+                    pml5e = update_entry(cs, pml5e_addr, pml5e, PG_ACCESSED_MASK);
+                    if (!pml5e) {
+                        goto restart;
+                    }
                 }
                 ptep = pml5e ^ PG_NX_MASK;
             } else {
@@ -456,8 +504,10 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
                 goto do_fault_rsvd;
             }
             if (!(pml4e & PG_ACCESSED_MASK)) {
-                pml4e |= PG_ACCESSED_MASK;
-                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
+                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
+                if (!pml4e) {
+                    goto restart;
+                }
             }
             ptep &= pml4e ^ PG_NX_MASK;
             pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
@@ -472,8 +522,10 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
             }
             ptep &= pdpe ^ PG_NX_MASK;
             if (!(pdpe & PG_ACCESSED_MASK)) {
-                pdpe |= PG_ACCESSED_MASK;
-                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
+                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
+                if (!pdpe) {
+                    goto restart;
+                }
             }
             if (pdpe & PG_PSE_MASK) {
                 /* 1 GB page */
@@ -520,8 +572,10 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         }
         /* 4 KB page */
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
         pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
             a20_mask;
@@ -563,8 +617,10 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         }
 
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
 
         /* page directory entry */
@@ -634,11 +690,11 @@  do_check_protect_pse36:
     /* yes, it can! */
     is_dirty = is_write && !(pte & PG_DIRTY_MASK);
     if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
-        pte |= PG_ACCESSED_MASK;
-        if (is_dirty) {
-            pte |= PG_DIRTY_MASK;
+        pte = update_entry(cs, pte_addr, pte,
+                           PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : 0));
+        if (!pte) {
+            goto restart;
         }
-        x86_stl_phys_notdirty(cs, pte_addr, pte);
     }
 
     if (!(pte & PG_DIRTY_MASK)) {