| Message ID | f4349b9ee95cbd9d5b16e9033fdf0d7c9e99aebf.camel@kernel.crashing.org (mailing list archive) |
| --- | --- |
| State | New, archived |
| Series | [RFC/PATCH] i386: Atomically update PTEs with mttcg |
On 29/11/18 00:15, Benjamin Herrenschmidt wrote:
> Afaik, this isn't well documented (at least it wasn't when I last looked)
> but OSes such as Linux rely on this behaviour:
>
> The HW updates to the page tables need to be done atomically with the
> checking of the present bit (and other permissions).
>
> This is what allows Linux to do simple xchg of PTEs with 0 and assume
> the value read has "final" stable dirty and accessed bits (the TLB
> invalidation is deferred).
>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>
> This is lightly tested at this point, mostly to gather comments on the
> approach.

Looks good, but please kill the notdirty stuff first. It's not needed
and it's a clear bug. When migrating, it can lead to PTEs being
migrated without accessed and dirty bits.

Paolo

> I noticed that RiscV is attempting to do something similar but with endian
> bugs, at least from my reading of the code.
>
> [...]
On Thu, 2018-11-29 at 10:26 +0100, Paolo Bonzini wrote:
> On 29/11/18 00:15, Benjamin Herrenschmidt wrote:
> > Afaik, this isn't well documented (at least it wasn't when I last looked)
> > but OSes such as Linux rely on this behaviour:
> >
> > The HW updates to the page tables need to be done atomically with the
> > checking of the present bit (and other permissions).
> >
> > This is what allows Linux to do simple xchg of PTEs with 0 and assume
> > the value read has "final" stable dirty and accessed bits (the TLB
> > invalidation is deferred).
> >
> > [...]
>
> Looks good, but please kill the notdirty stuff first. It's not needed
> and it's a clear bug. When migrating, it can lead to PTEs being
> migrated without accessed and dirty bits.

I thought that too but looking closely with rth, it seems it's still
setting *those* dirty bits, it just avoids the collision test with the
translator.

Unless I missed something... Do you still want to kill them ? They are
warty, no doubt...

For powerpc I need a cmpxchgq variant too, I'll probably split the
patch adding those mem ops from the rest as well.

Annoyingly, fixing riscv will need some tests on target_ulong size.

Cheers,
Ben.
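As context for the riscv remark above, here is a hedged sketch of what a target_ulong-size-dependent PTE update could look like. This is hypothetical, not the actual riscv fix; `pte_cmpxchg` is an invented name, and `address_space_cmpxchgq_notdirty` is the 64-bit variant Ben mentions adding, which is not part of this patch:

```c
/*
 * Hypothetical sketch only: a riscv PTE is target_ulong sized, so an
 * atomic update helper would have to select the 32- or 64-bit cmpxchg
 * variant at build time.  address_space_cmpxchgq_notdirty is assumed
 * here; it does not exist in the posted patch.
 */
static target_ulong pte_cmpxchg(CPUState *cs, hwaddr pte_addr,
                                target_ulong old, target_ulong new_val,
                                MemTxResult *result)
{
#if TARGET_LONG_BITS == 64
    return le64_to_cpu(address_space_cmpxchgq_notdirty(cs->as, pte_addr,
                                                       cpu_to_le64(old),
                                                       cpu_to_le64(new_val),
                                                       MEMTXATTRS_UNSPECIFIED,
                                                       result));
#else
    return le32_to_cpu(address_space_cmpxchgl_notdirty(cs->as, pte_addr,
                                                       cpu_to_le32(old),
                                                       cpu_to_le32(new_val),
                                                       MEMTXATTRS_UNSPECIFIED,
                                                       result));
#endif
}
```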
On 29/11/18 11:04, Benjamin Herrenschmidt wrote:
> On Thu, 2018-11-29 at 10:26 +0100, Paolo Bonzini wrote:
>> On 29/11/18 00:15, Benjamin Herrenschmidt wrote:
>>> Afaik, this isn't well documented (at least it wasn't when I last looked)
>>> but OSes such as Linux rely on this behaviour:
>>>
>>> [...]
>>
>> Looks good, but please kill the notdirty stuff first. It's not needed
>> and it's a clear bug. When migrating, it can lead to PTEs being
>> migrated without accessed and dirty bits.
>
> I thought that too but looking closely with rth, it seems it's still
> setting *those* dirty bits, it just avoids the collision test with the
> translator.

Ah, you're right. I guess it's a minor performance optimization from
avoiding some pointer chasing in page_find (radix tree walk).

Paolo
On 11/28/18 3:15 PM, Benjamin Herrenschmidt wrote:
> +static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
> +                                    uint64_t orig_entry, uint32_t bits)
> +{
> +    uint64_t new_entry = orig_entry | bits;
> +
> +    /* Write the updated bottom 32-bits */
> +    if (qemu_tcg_mttcg_enabled()) {
> +        uint32_t old_le = cpu_to_le32(orig_entry);
> +        uint32_t new_le = cpu_to_le32(new_entry);
> +        MemTxResult result;
> +        uint32_t old_ret;
> +
> +        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
> +                                                  old_le, new_le,
> +                                                  MEMTXATTRS_UNSPECIFIED,
> +                                                  &result);
> +        if (result == MEMTX_OK) {
> +            if (old_ret != old_le) {
> +                new_entry = 0;
> +            }
> +            return new_entry;
> +        }
> +
> +        /* Do we need to support this case where PTEs aren't in RAM ?
> +         *
> +         * For now fallback to non-atomic case
> +         */
> +    }
> +
> +    x86_stl_phys_notdirty(cs, addr, new_entry);
> +
> +    return new_entry;
> +}

While I know the existing code just updates the low 32-bits, I'd rather
update the whole entry, and make use of the old value returned from the
cmpxchg. E.g.

static bool update_entry(CPUState *cs, target_ulong addr,
                         uint64_t *entry, uint32_t bits)
{
    uint64_t old_entry = *entry;
    uint64_t new_entry = old_entry | bits;

    if (qemu_tcg_mttcg_enabled()) {
        uint64_t cmp_le = cpu_to_le64(old_entry);
        uint64_t new_le = cpu_to_le64(new_entry);
        uint64_t old_le;
        MemTxResult result;

        old_le = address_space_cmpxchgl_notdirty(cs->as, addr,
                                                 cmp_le, new_le,
                                                 MEMTXATTRS_UNSPECIFIED,
                                                 &result);
        if (result == MEMTX_OK) {
            if (old_le == cmp_le) {
                return true;
            } else {
                *entry = le_to_cpu64(old_le);
                return false;
            }
        }
    }
    x86_stq_phys_notdirty(cs, addr, new_entry);
    return true;
}

....

    pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
                (((gphys >> 30) & 0x1ff) << 3);
    pdpe = x86_ldq_phys(cs, pdpe_addr);
    do {
        if (!(pdpe & PG_PRESENT_MASK)) {
            goto do_fault;
        }
        if (pdpe & rsvd_mask) {
            goto do_fault_rsvd;
        }
        if (pdpe & PG_ACCESSED_MASK) {
            break;
        }
    } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
    ptep &= pdpe ^ PG_NX_MASK;

....

Although I think it would be really great if we could figure out
something that allows us to promote this whole load/cmpxchg loop into a
primitive that avoids multiple translations of the address.

No, I don't know what that primitive would look like. :-)

r~
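For comparison, here is a hedged sketch of how the same bool-returning pattern could be driven at the leaf PTE, where both the accessed and (for writes) dirty bits may need setting. This is hypothetical and deliberately simplified: the real function runs its protection checks (do_check_protect) between loading the PTE and updating it, which this sketch omits entirely:

```c
/* Hypothetical sketch: the retry pattern above applied to the final
 * PTE.  The protection checks that sit between loading and updating
 * the entry in the real code are omitted. */
uint32_t set_bits = PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : 0);

do {
    if (!(pte & PG_PRESENT_MASK)) {
        goto do_fault;
    }
    if (pte & rsvd_mask) {
        goto do_fault_rsvd;
    }
    if ((pte & set_bits) == set_bits) {
        break;                      /* nothing left to set */
    }
} while (!update_entry(cs, pte_addr, &pte, set_bits));
```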
On Thu, 2018-11-29 at 11:04 -0800, Richard Henderson wrote:
> On 11/28/18 3:15 PM, Benjamin Herrenschmidt wrote:
> > +static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
> > +                                    uint64_t orig_entry, uint32_t bits)
> > [...]
>
> While I know the existing code just updates the low 32-bits, I'd rather
> update the whole entry, and make use of the old value returned from the
> cmpxchg.

Not all PTEs on x86 are 64-bits, the old 32-bit format still exists, so
we'd have to create two helpers. Not a big deal mind you. I'm already
creating a second address_space_cmpxchgq_notdirty for use by powerpc.

> static bool update_entry(CPUState *cs, target_ulong addr,
>                          uint64_t *entry, uint32_t bits)
> [...]
>
>     pdpe = x86_ldq_phys(cs, pdpe_addr);
>     do {
>         [...]
>     } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
>     ptep &= pdpe ^ PG_NX_MASK;

Hrm.. I see. So not re-do the full walk. Not sure it's really worth it
though, how often do we expect to hit the failing case ?

> Although I think it would be really great if we could figure out
> something that allows us to promote this whole load/cmpxchg loop into
> a primitive that avoids multiple translations of the address.
>
> No, I don't know what that primitive would look like. :-)

You mean translating once for the load and for the update ? Do you
expect that translation to have such a significant cost considering
that all it needs should be in L1 at that point ?

There are subtle differences in what happens between the load and the
update, and we hope for the update to be fairly rare (even though it's
done by HW it's costly even on actual chips).

So best way I could think of would be to use a macro, so we can
open-code the statements that test/manipulate the value. Not sure I'll
get to it today though, I'm off early this afternoon for the week-end.

Cheers,
Ben.
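A minimal sketch of what the address_space_cmpxchgq_notdirty variant mentioned above could look like, modeled directly on the 32-bit version in the patch. This is an assumption, not posted code, and it presumes a host with working 64-bit atomics (CONFIG_ATOMIC64):

```c
/* Hypothetical 64-bit sibling of address_space_cmpxchgl_notdirty,
 * mirrored from the 32-bit version in the patch; not part of the
 * posted series.  Assumes the host supports 64-bit atomic_cmpxchg. */
uint64_t glue(address_space_cmpxchgq_notdirty, SUFFIX)(ARG1_DECL,
    hwaddr addr, uint64_t old, uint64_t new, MemTxAttrs attrs,
    MemTxResult *result)
{
    uint64_t *ptr;
    MemoryRegion *mr;
    hwaddr l = 8;
    hwaddr addr1;
    MemTxResult r;
    uint8_t dirty_log_mask;

    /* Must test result */
    assert(result);

    RCU_READ_LOCK();
    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
    if (l < 8 || !memory_access_is_direct(mr, true)) {
        r = MEMTX_ERROR;
    } else {
        uint64_t orig = old;

        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
        old = atomic_cmpxchg(ptr, orig, new);

        if (old == orig) {
            dirty_log_mask = memory_region_get_dirty_log_mask(mr);
            dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
            cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr)
                                                + addr, 8, dirty_log_mask);
        }
        r = MEMTX_OK;
    }
    *result = r;
    RCU_READ_UNLOCK();

    return old;
}
```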
On 11/29/18 2:54 PM, Benjamin Herrenschmidt wrote:
>>     pdpe = x86_ldq_phys(cs, pdpe_addr);
>>     do {
>>         [...]
>>     } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
>>     ptep &= pdpe ^ PG_NX_MASK;
>
> Hrm.. I see. So not re-do the full walk. Not sure it's really worth it
> though, how often do we expect to hit the failing case ?

It is probably rare-ish, I admit.

I suppose we could also signal "success" from update_entry if the
cmpxchg fails, but the value that was reloaded only differs in setting
PG_ACCESSED_MASK | PG_DIRTY_MASK, as long as 'bits' itself was set.

>> Although I think it would be really great if we could figure out
>> something that allows us to promote this whole load/cmpxchg loop into
>> a primitive that avoids multiple translations of the address.
>>
>> No, I don't know what that primitive would look like. :-)
>
> You mean translating once for the load and for the update ? Do you
> expect that translation to have such a significant cost considering
> that all it needs should be in L1 at that point ?

I guess if the update is rare-ish, the re-translating isn't a big deal.
And I suppose we'd have to retain the RCU lock to hold on to the
translation, which probably isn't the best idea.

Nevermind on this.

r~
On Thu, 2018-11-29 at 16:12 -0800, Richard Henderson wrote:
> > You mean translating once for the load and for the update ? Do you
> > expect that translation to have such a significant cost considering
> > that all it needs should be in L1 at that point ?
>
> I guess if the update is rare-ish, the re-translating isn't a big deal.
> And I suppose we'd have to retain the RCU lock to hold on to the
> translation, which probably isn't the best idea.

Yeah, I toyed with this a bit but it ended up very messy real quick.

Cheers,
Ben.
On Thu, 2018-11-29 at 16:12 -0800, Richard Henderson wrote:
> On 11/29/18 2:54 PM, Benjamin Herrenschmidt wrote:
> > >     pdpe = x86_ldq_phys(cs, pdpe_addr);
> > >     do {
> > >         [...]
> > >     } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
> > >     ptep &= pdpe ^ PG_NX_MASK;
> >
> > Hrm.. I see. So not re-do the full walk. Not sure it's really worth it
> > though, how often do we expect to hit the failing case ?
>
> It is probably rare-ish, I admit.
>
> I suppose we could also signal "success" from update_entry if the
> cmpxchg fails, but the value that was reloaded only differs in setting
> PG_ACCESSED_MASK | PG_DIRTY_MASK, as long as 'bits' itself was set.

The latter optimization is trivial. As for the former, replacing my
"goto restart" with those loops, it will make the patch significantly
bigger, though not adding another goto has its perks and the end result
might be slightly nicer ...

What way do you prefer ?

Cheers,
Ben.
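A hedged sketch of the "latter" optimization being discussed, i.e. treating a failed cmpxchg as success when the reloaded value differs from the expected one only in the accessed/dirty bits. This is an illustration of the idea, not posted code; `pte_update_is_tolerable` is an invented name, and the masks are assumed to come from the usual target/i386 definitions:

```c
/* Hypothetical helper: after a failed cmpxchg, accept the result
 * anyway if the only difference is that another vCPU set A and/or D,
 * and the bits we were trying to set are present in the reloaded
 * value.  Any other difference (e.g. the PTE was cleared) still
 * requires a restart. */
static bool pte_update_is_tolerable(uint32_t expected, uint32_t reloaded,
                                    uint32_t bits)
{
    uint32_t diff = expected ^ reloaded;

    return (diff & ~(uint32_t)(PG_ACCESSED_MASK | PG_DIRTY_MASK)) == 0
        && (reloaded & bits) == bits;
}
```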
On Thu, 2018-11-29 at 11:04 -0800, Richard Henderson wrote:
> While I know the existing code just updates the low 32-bits, I'd rather
> update the whole entry, and make use of the old value returned from the
> cmpxchg.

So I had to put this on the back burner for a bit, but I'm back to it
now. I've tried the above but it gets messy again. The i386 code can
have either 32 or 64 bit PDEs/PTEs. The update code for the PTE is
designed so a single code path can work with either by exploiting that
LE trick, so changing that would involve more re-factoring and I'd
rather avoid changing more than strictly needed in there.

As for this:

> pdpe = x86_ldq_phys(cs, pdpe_addr);
> do {
>     if (!(pdpe & PG_PRESENT_MASK)) {
>         goto do_fault;
>     }
>     if (pdpe & rsvd_mask) {
>         goto do_fault_rsvd;
>     }
>     if (pdpe & PG_ACCESSED_MASK) {
>         break;
>     }
> } while (!update_entry(cs, pdpe_addr, &pdpe, PG_ACCESSED_MASK));
> ptep &= pdpe ^ PG_NX_MASK;

While that form of construct is nicer than the current "goto restart"
in my patch, in a similar way, it works for the intermediary cases but
would require some serious refactoring of the whole function for the
final PTE case.

At this point I'd rather not take chances and introduce more bugs, so
I'll post an updated variant with a few fixes but will postpone more
refactoring unless you really really want them now :)

Cheers,
Ben.
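For readers unfamiliar with the "LE trick" Ben refers to, a standalone illustration may help. This is not QEMU code and the constants are simplified: because x86 page-table entries are stored little-endian and the accessed (bit 5) and dirty (bit 6) flags sit in the low 32 bits of both the 4-byte legacy format and the 8-byte PAE/long-mode format, a single 32-bit access to the first four bytes of an entry updates A/D correctly for either format:

```c
#include <assert.h>
#include <stdint.h>

#define PG_ACCESSED_MASK (1u << 5)

int main(void)
{
    uint64_t entry64 = 0x8000000012345007ULL; /* hypothetical 64-bit PTE */
    uint8_t mem[8];

    /* Lay the entry out as it sits in little-endian guest memory. */
    for (int i = 0; i < 8; i++) {
        mem[i] = (entry64 >> (i * 8)) & 0xff;
    }

    /* Update only the first four bytes, as the patch's update_entry()
     * does with its 32-bit cmpxchg on the low word. */
    uint32_t lo = mem[0] | (mem[1] << 8) | (mem[2] << 16)
                | ((uint32_t)mem[3] << 24);
    lo |= PG_ACCESSED_MASK;
    for (int i = 0; i < 4; i++) {
        mem[i] = (lo >> (i * 8)) & 0xff;
    }

    /* The high half (upper address bits, NX) is untouched, so the same
     * code path is also valid for a 4-byte legacy entry at mem[0..3]. */
    uint64_t after = 0;
    for (int i = 0; i < 8; i++) {
        after |= (uint64_t)mem[i] << (i * 8);
    }
    assert(after == (entry64 | PG_ACCESSED_MASK));
    return 0;
}
```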
On 12/13/18 5:39 PM, Benjamin Herrenschmidt wrote:
> At this point I'd rather not take chances and introduce more bugs, so
> I'll post an updated variant with a few fixes but will postpone more
> refactoring unless you really really want them now :)

No, that's quite all right. Thanks for the investigation.

r~
diff --git a/include/exec/memory_ldst.inc.h b/include/exec/memory_ldst.inc.h
index 272c20f02e..b7a41a0ad4 100644
--- a/include/exec/memory_ldst.inc.h
+++ b/include/exec/memory_ldst.inc.h
@@ -28,6 +28,9 @@ extern uint64_t glue(address_space_ldq, SUFFIX)(ARG1_DECL,
     hwaddr addr, MemTxAttrs attrs, MemTxResult *result);
 extern void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
     hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
+extern uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
+    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs,
+    MemTxResult *result);
 extern void glue(address_space_stw, SUFFIX)(ARG1_DECL,
     hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
 extern void glue(address_space_stl, SUFFIX)(ARG1_DECL,
diff --git a/memory_ldst.inc.c b/memory_ldst.inc.c
index acf865b900..63af8f7dd2 100644
--- a/memory_ldst.inc.c
+++ b/memory_ldst.inc.c
@@ -320,6 +320,44 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
     RCU_READ_UNLOCK();
 }
 
+/* This is meant to be used for atomic PTE updates under MT-TCG */
+uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
+    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs, MemTxResult *result)
+{
+    uint8_t *ptr;
+    MemoryRegion *mr;
+    hwaddr l = 4;
+    hwaddr addr1;
+    MemTxResult r;
+    uint8_t dirty_log_mask;
+
+    /* Must test result */
+    assert(result);
+
+    RCU_READ_LOCK();
+    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
+    if (l < 4 || !memory_access_is_direct(mr, true)) {
+        r = MEMTX_ERROR;
+    } else {
+        uint32_t orig = old;
+
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
+        old = atomic_cmpxchg(ptr, orig, new);
+
+        if (old == orig) {
+            dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+            dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
+            cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
+                                                4, dirty_log_mask);
+        }
+        r = MEMTX_OK;
+    }
+    *result = r;
+    RCU_READ_UNLOCK();
+
+    return old;
+}
+
 /* warning: addr must be aligned */
 static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL,
     hwaddr addr, uint32_t val, MemTxAttrs attrs,
diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
index 49231f6b69..5ccb9d6d6a 100644
--- a/target/i386/excp_helper.c
+++ b/target/i386/excp_helper.c
@@ -157,11 +157,45 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
 
 #else
 
+static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
+                                    uint64_t orig_entry, uint32_t bits)
+{
+    uint64_t new_entry = orig_entry | bits;
+
+    /* Write the updated bottom 32-bits */
+    if (qemu_tcg_mttcg_enabled()) {
+        uint32_t old_le = cpu_to_le32(orig_entry);
+        uint32_t new_le = cpu_to_le32(new_entry);
+        MemTxResult result;
+        uint32_t old_ret;
+
+        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
+                                                  old_le, new_le,
+                                                  MEMTXATTRS_UNSPECIFIED,
+                                                  &result);
+        if (result == MEMTX_OK) {
+            if (old_ret != old_le) {
+                new_entry = 0;
+            }
+            return new_entry;
+        }
+
+        /* Do we need to support this case where PTEs aren't in RAM ?
+         *
+         * For now fallback to non-atomic case
+         */
+    }
+
+    x86_stl_phys_notdirty(cs, addr, new_entry);
+
+    return new_entry;
+}
+
 static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
                         int *prot)
 {
     CPUX86State *env = &X86_CPU(cs)->env;
-    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
+    uint64_t rsvd_mask;
     uint64_t ptep, pte;
     uint64_t exit_info_1 = 0;
     target_ulong pde_addr, pte_addr;
@@ -172,6 +206,8 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         return gphys;
     }
 
+ restart:
+    rsvd_mask = PG_HI_RSVD_MASK;
     if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
         rsvd_mask |= PG_NX_MASK;
     }
@@ -198,8 +234,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
             goto do_fault_rsvd;
         }
         if (!(pml4e & PG_ACCESSED_MASK)) {
-            pml4e |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
+            pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
+            if (!pml4e) {
+                goto restart;
+            }
         }
         ptep &= pml4e ^ PG_NX_MASK;
         pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
@@ -213,8 +251,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         }
         ptep &= pdpe ^ PG_NX_MASK;
         if (!(pdpe & PG_ACCESSED_MASK)) {
-            pdpe |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
+            pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
+            if (!pdpe) {
+                goto restart;
+            }
         }
         if (pdpe & PG_PSE_MASK) {
             /* 1 GB page */
@@ -256,8 +296,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         }
         /* 4 KB page */
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
         pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
         pte = x86_ldq_phys(cs, pte_addr);
@@ -295,8 +337,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         }
 
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
 
         /* page directory entry */
@@ -376,7 +420,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
     int error_code = 0;
     int is_dirty, prot, page_size, is_write, is_user;
     hwaddr paddr;
-    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
+    uint64_t rsvd_mask;
     uint32_t page_offset;
     target_ulong vaddr;
 
@@ -401,6 +445,8 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         goto do_mapping;
     }
 
+ restart:
+    rsvd_mask = PG_HI_RSVD_MASK;
     if (!(env->efer & MSR_EFER_NXE)) {
         rsvd_mask |= PG_NX_MASK;
     }
@@ -436,8 +482,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
                 goto do_fault_rsvd;
             }
             if (!(pml5e & PG_ACCESSED_MASK)) {
-                pml5e |= PG_ACCESSED_MASK;
-                x86_stl_phys_notdirty(cs, pml5e_addr, pml5e);
+                pml5e = update_entry(cs, pml5e_addr, pml5e, PG_ACCESSED_MASK);
+                if (!pml5e) {
+                    goto restart;
+                }
             }
             ptep = pml5e ^ PG_NX_MASK;
         } else {
@@ -456,8 +504,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
             goto do_fault_rsvd;
         }
         if (!(pml4e & PG_ACCESSED_MASK)) {
-            pml4e |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
+            pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
+            if (!pml4e) {
+                goto restart;
+            }
         }
         ptep &= pml4e ^ PG_NX_MASK;
         pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
@@ -472,8 +522,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         }
         ptep &= pdpe ^ PG_NX_MASK;
         if (!(pdpe & PG_ACCESSED_MASK)) {
-            pdpe |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
+            pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
+            if (!pdpe) {
+                goto restart;
+            }
         }
         if (pdpe & PG_PSE_MASK) {
             /* 1 GB page */
@@ -520,8 +572,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         }
         /* 4 KB page */
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
         pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
             a20_mask;
@@ -563,8 +617,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         }
 
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
 
         /* page directory entry */
@@ -634,11 +690,11 @@ do_check_protect_pse36:
     /* yes, it can! */
     is_dirty = is_write && !(pte & PG_DIRTY_MASK);
     if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
-        pte |= PG_ACCESSED_MASK;
-        if (is_dirty) {
-            pte |= PG_DIRTY_MASK;
+        pte = update_entry(cs, pte_addr, pte,
+                           PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : 0));
+        if (!pte) {
+            goto restart;
         }
-        x86_stl_phys_notdirty(cs, pte_addr, pte);
     }
 
     if (!(pte & PG_DIRTY_MASK)) {
Afaik, this isn't well documented (at least it wasn't when I last looked)
but OSes such as Linux rely on this behaviour:

The HW updates to the page tables need to be done atomically with the
checking of the present bit (and other permissions).

This is what allows Linux to do simple xchg of PTEs with 0 and assume
the value read has "final" stable dirty and accessed bits (the TLB
invalidation is deferred).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

This is lightly tested at this point, mostly to gather comments on the
approach.

I noticed that RiscV is attempting to do something similar but with endian
bugs, at least from my reading of the code.

 include/exec/memory_ldst.inc.h |   3 +
 memory_ldst.inc.c              |  38 ++++++++++++
 target/i386/excp_helper.c      | 104 +++++++++++++++++++++++++--------
 3 files changed, 121 insertions(+), 24 deletions(-)
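To illustrate the guest-side behaviour the commit message describes, here is a hedged sketch of the xchg-based PTE teardown pattern. The names are illustrative, not actual Linux identifiers; it only shows why atomic hardware A/D updates matter:

```c
#include <stdint.h>

/* Sketch of the guest OS pattern described above (not Linux source).
 * Because a real MMU sets the accessed/dirty bits atomically with its
 * present-bit check, once this xchg stores 0 (non-present) no further
 * A/D update can land on the entry: the returned value carries the
 * final, stable A/D state even though TLB invalidation is deferred. */

#define PTE_PRESENT  (1u << 0)
#define PTE_ACCESSED (1u << 5)
#define PTE_DIRTY    (1u << 6)

static inline uint64_t pte_xchg(uint64_t *ptep, uint64_t val)
{
    return __atomic_exchange_n(ptep, val, __ATOMIC_SEQ_CST);
}

/* Tear down one PTE and report whether the page needs writeback. */
static int clear_pte_and_test_dirty(uint64_t *ptep)
{
    uint64_t old = pte_xchg(ptep, 0);   /* entry is now non-present */

    /* TLB invalidation can be batched later; the A/D bits in 'old'
     * are already final, which is exactly the guarantee the patch
     * implements with cmpxchg on the emulated page tables. */
    return (old & PTE_PRESENT) && (old & PTE_DIRTY);
}
```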