diff mbox series

[RFC,1/1] accel/tcg: Clear PAGE_WRITE before translation

Message ID 20210804224633.154083-2-iii@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series accel/tcg: Clear PAGE_WRITE before translation | expand

Commit Message

Ilya Leoshkevich Aug. 4, 2021, 10:46 p.m. UTC
translate_insn() implementations fetch instruction bytes piecemeal,
which can cause qemu-user to generate inconsistent translations if
another thread modifies them concurrently [1].

Fix by marking translation block pages non-writable earlier.

[1] https://lists.nongnu.org/archive/html/qemu-devel/2021-08/msg00644.html

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
 accel/tcg/translate-all.c    | 59 +++++++++++++++++++++---------------
 accel/tcg/translator.c       | 26 ++++++++++++++--
 include/exec/translate-all.h |  1 +
 3 files changed, 59 insertions(+), 27 deletions(-)

Comments

Richard Henderson Aug. 5, 2021, 12:30 a.m. UTC | #1
On 8/4/21 12:46 PM, Ilya Leoshkevich wrote:
> translate_insn() implementations fetch instruction bytes piecemeal,
> which can cause qemu-user to generate inconsistent translations if
> another thread modifies them concurrently [1].
> 
> Fix by marking translation block pages non-writable earlier.
> 
> [1] https://lists.nongnu.org/archive/html/qemu-devel/2021-08/msg00644.html
> 
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>   accel/tcg/translate-all.c    | 59 +++++++++++++++++++++---------------
>   accel/tcg/translator.c       | 26 ++++++++++++++--
>   include/exec/translate-all.h |  1 +
>   3 files changed, 59 insertions(+), 27 deletions(-)
> 
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index bbfcfb698c..fb9ebfad9e 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1297,31 +1297,8 @@ static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
>       invalidate_page_bitmap(p);
>   
>   #if defined(CONFIG_USER_ONLY)
> -    if (p->flags & PAGE_WRITE) {
> -        target_ulong addr;
> -        PageDesc *p2;
> -        int prot;
> -
> -        /* force the host page as non writable (writes will have a
> -           page fault + mprotect overhead) */
> -        page_addr &= qemu_host_page_mask;
> -        prot = 0;
> -        for (addr = page_addr; addr < page_addr + qemu_host_page_size;
> -            addr += TARGET_PAGE_SIZE) {
> -
> -            p2 = page_find(addr >> TARGET_PAGE_BITS);
> -            if (!p2) {
> -                continue;
> -            }
> -            prot |= p2->flags;
> -            p2->flags &= ~PAGE_WRITE;
> -          }
> -        mprotect(g2h_untagged(page_addr), qemu_host_page_size,
> -                 (prot & PAGE_BITS) & ~PAGE_WRITE);
> -        if (DEBUG_TB_INVALIDATE_GATE) {
> -            printf("protecting code page: 0x" TB_PAGE_ADDR_FMT "\n", page_addr);
> -        }
> -    }
> +    /* translator_loop() must have made all TB pages non-writable */
> +    assert(!(p->flags & PAGE_WRITE));
>   #else
>       /* if some code is already present, then the pages are already
>          protected. So we handle the case where only the first TB is
> @@ -2394,6 +2371,38 @@ int page_check_range(target_ulong start, target_ulong len, int flags)
>       return 0;
>   }
>   
> +void page_protect(tb_page_addr_t page_addr)
> +{
> +    target_ulong addr;
> +    PageDesc *p;
> +    int prot;
> +
> +    p = page_find(page_addr >> TARGET_PAGE_BITS);
> +    if (p && (p->flags & PAGE_WRITE)) {
> +        /*
> +         * Force the host page as non writable (writes will have a page fault +
> +         * mprotect overhead).
> +         */
> +        page_addr &= qemu_host_page_mask;
> +        prot = 0;
> +        for (addr = page_addr; addr < page_addr + qemu_host_page_size;
> +             addr += TARGET_PAGE_SIZE) {
> +
> +            p = page_find(addr >> TARGET_PAGE_BITS);
> +            if (!p) {
> +                continue;
> +            }
> +            prot |= p->flags;
> +            p->flags &= ~PAGE_WRITE;
> +        }
> +        mprotect(g2h_untagged(page_addr), qemu_host_page_size,
> +                 (prot & PAGE_BITS) & ~PAGE_WRITE);
> +        if (DEBUG_TB_INVALIDATE_GATE) {
> +            printf("protecting code page: 0x" TB_PAGE_ADDR_FMT "\n", page_addr);
> +        }
> +    }
> +}
> +
>   /* called from signal handler: invalidate the code and unprotect the
>    * page. Return 0 if the fault was not handled, 1 if it was handled,
>    * and 2 if it was handled but the caller must cause the TB to be
> diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
> index c53a7f8e44..bfbe7d7ccf 100644
> --- a/accel/tcg/translator.c
> +++ b/accel/tcg/translator.c
> @@ -14,6 +14,7 @@
>   #include "exec/exec-all.h"
>   #include "exec/gen-icount.h"
>   #include "exec/log.h"
> +#include "exec/translate-all.h"
>   #include "exec/translator.h"
>   #include "exec/plugin-gen.h"
>   #include "sysemu/replay.h"
> @@ -47,6 +48,10 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
>   {
>       uint32_t cflags = tb_cflags(tb);
>       bool plugin_enabled;
> +    bool stop = false;
> +#ifdef CONFIG_USER_ONLY
> +    target_ulong page_addr = -1;
> +#endif
>   
>       /* Initialize DisasContext */
>       db->tb = tb;
> @@ -71,6 +76,21 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
>       plugin_enabled = plugin_gen_tb_start(cpu, tb, cflags & CF_MEMI_ONLY);
>   
>       while (true) {
> +#ifdef CONFIG_USER_ONLY
> +        /*
> +         * Make the page containing the next instruction non-writable in order
> +         * to get a consistent translation if another thread is modifying the
> +         * code while translate_insn() fetches the instruction bytes piecemeal.
> +         * Writer threads will wait for mmap_lock() in page_unprotect().
> +         */
> +        if ((db->pc_next & TARGET_PAGE_MASK) != page_addr) {
> +            page_addr = db->pc_next & TARGET_PAGE_MASK;
> +            page_protect(page_addr);
> +        }
> +#endif
> +        if (stop) {
> +            break;
> +        }

So... I think this isn't quite right.

(1) If we stop exactly at the page break, this protects the *next* page unnecessarily.

(2) This only protects the page of the start of the insn.  If the instruction crosses the 
page boundary, then the latter part of the insn is still victim to the race we're trying 
to fix.

I think a protect needs to happen in translator_ld*_swap, before reading the data.

I think that the translator_ld*_swap functions should be moved out of 
include/exec/translator.h into accel/tcg/translator.c.

I think that the translator_ld* functions should add a DisasContextBase argument, which 
should then contain the cache for the protection.  This will be a moderately large change, 
but it should be mostly mechanical.

I think that we should initialize the protection cache before translating the first insn, 
outside of that loop.  This will mean that you need not check for two pages 
simultaneously, when a single read crosses the page boundary.  You'll know that (at 
minimum) the first byte of the first read is already covered, and only need to check the 
last byte of each subsequent read.  I think the value you use for your cache should be the 
end of the page for which protection is known to apply, so that the check reduces to

   end = pc + len - 1;
   if (end > dcbase->page_protect_end) {
     dcbase->page_protect_end = end | ~TARGET_PAGE_MASK;
     page_protect(end);
   }


r~
Ilya Leoshkevich Aug. 5, 2021, 10:56 a.m. UTC | #2
On Wed, 2021-08-04 at 14:30 -1000, Richard Henderson wrote:
> On 8/4/21 12:46 PM, Ilya Leoshkevich wrote:
> > translate_insn() implementations fetch instruction bytes piecemeal,
> > which can cause qemu-user to generate inconsistent translations if
> > another thread modifies them concurrently [1].
> > 
> > Fix by marking translation block pages non-writable earlier.
> > 
> > [1] 
> > https://lists.nongnu.org/archive/html/qemu-devel/2021-08/msg00644.html
> > 
> > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > ---
> >   accel/tcg/translate-all.c    | 59 +++++++++++++++++++++----------
> > -----
> >   accel/tcg/translator.c       | 26 ++++++++++++++--
> >   include/exec/translate-all.h |  1 +
> >   3 files changed, 59 insertions(+), 27 deletions(-)
> > 
> > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> > index bbfcfb698c..fb9ebfad9e 100644
> > --- a/accel/tcg/translate-all.c
> > +++ b/accel/tcg/translate-all.c
> > @@ -1297,31 +1297,8 @@ static inline void tb_page_add(PageDesc *p,
> > TranslationBlock *tb,
> >       invalidate_page_bitmap(p);
> >   
> >   #if defined(CONFIG_USER_ONLY)
> > -    if (p->flags & PAGE_WRITE) {
> > -        target_ulong addr;
> > -        PageDesc *p2;
> > -        int prot;
> > -
> > -        /* force the host page as non writable (writes will have a
> > -           page fault + mprotect overhead) */
> > -        page_addr &= qemu_host_page_mask;
> > -        prot = 0;
> > -        for (addr = page_addr; addr < page_addr +
> > qemu_host_page_size;
> > -            addr += TARGET_PAGE_SIZE) {
> > -
> > -            p2 = page_find(addr >> TARGET_PAGE_BITS);
> > -            if (!p2) {
> > -                continue;
> > -            }
> > -            prot |= p2->flags;
> > -            p2->flags &= ~PAGE_WRITE;
> > -          }
> > -        mprotect(g2h_untagged(page_addr), qemu_host_page_size,
> > -                 (prot & PAGE_BITS) & ~PAGE_WRITE);
> > -        if (DEBUG_TB_INVALIDATE_GATE) {
> > -            printf("protecting code page: 0x" TB_PAGE_ADDR_FMT
> > "\n", page_addr);
> > -        }
> > -    }
> > +    /* translator_loop() must have made all TB pages non-writable
> > */
> > +    assert(!(p->flags & PAGE_WRITE));
> >   #else
> >       /* if some code is already present, then the pages are
> > already
> >          protected. So we handle the case where only the first TB
> > is
> > @@ -2394,6 +2371,38 @@ int page_check_range(target_ulong start,
> > target_ulong len, int flags)
> >       return 0;
> >   }
> >   
> > +void page_protect(tb_page_addr_t page_addr)
> > +{
> > +    target_ulong addr;
> > +    PageDesc *p;
> > +    int prot;
> > +
> > +    p = page_find(page_addr >> TARGET_PAGE_BITS);
> > +    if (p && (p->flags & PAGE_WRITE)) {
> > +        /*
> > +         * Force the host page as non writable (writes will have a
> > page fault +
> > +         * mprotect overhead).
> > +         */
> > +        page_addr &= qemu_host_page_mask;
> > +        prot = 0;
> > +        for (addr = page_addr; addr < page_addr +
> > qemu_host_page_size;
> > +             addr += TARGET_PAGE_SIZE) {
> > +
> > +            p = page_find(addr >> TARGET_PAGE_BITS);
> > +            if (!p) {
> > +                continue;
> > +            }
> > +            prot |= p->flags;
> > +            p->flags &= ~PAGE_WRITE;
> > +        }
> > +        mprotect(g2h_untagged(page_addr), qemu_host_page_size,
> > +                 (prot & PAGE_BITS) & ~PAGE_WRITE);
> > +        if (DEBUG_TB_INVALIDATE_GATE) {
> > +            printf("protecting code page: 0x" TB_PAGE_ADDR_FMT
> > "\n", page_addr);
> > +        }
> > +    }
> > +}
> > +
> >   /* called from signal handler: invalidate the code and unprotect
> > the
> >    * page. Return 0 if the fault was not handled, 1 if it was
> > handled,
> >    * and 2 if it was handled but the caller must cause the TB to be
> > diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
> > index c53a7f8e44..bfbe7d7ccf 100644
> > --- a/accel/tcg/translator.c
> > +++ b/accel/tcg/translator.c
> > @@ -14,6 +14,7 @@
> >   #include "exec/exec-all.h"
> >   #include "exec/gen-icount.h"
> >   #include "exec/log.h"
> > +#include "exec/translate-all.h"
> >   #include "exec/translator.h"
> >   #include "exec/plugin-gen.h"
> >   #include "sysemu/replay.h"
> > @@ -47,6 +48,10 @@ void translator_loop(const TranslatorOps *ops,
> > DisasContextBase *db,
> >   {
> >       uint32_t cflags = tb_cflags(tb);
> >       bool plugin_enabled;
> > +    bool stop = false;
> > +#ifdef CONFIG_USER_ONLY
> > +    target_ulong page_addr = -1;
> > +#endif
> >   
> >       /* Initialize DisasContext */
> >       db->tb = tb;
> > @@ -71,6 +76,21 @@ void translator_loop(const TranslatorOps *ops,
> > DisasContextBase *db,
> >       plugin_enabled = plugin_gen_tb_start(cpu, tb, cflags &
> > CF_MEMI_ONLY);
> >   
> >       while (true) {
> > +#ifdef CONFIG_USER_ONLY
> > +        /*
> > +         * Make the page containing the next instruction non-
> > writable in order
> > +         * to get a consistent translation if another thread is
> > modifying the
> > +         * code while translate_insn() fetches the instruction
> > bytes piecemeal.
> > +         * Writer threads will wait for mmap_lock() in
> > page_unprotect().
> > +         */
> > +        if ((db->pc_next & TARGET_PAGE_MASK) != page_addr) {
> > +            page_addr = db->pc_next & TARGET_PAGE_MASK;
> > +            page_protect(page_addr);
> > +        }
> > +#endif
> > +        if (stop) {
> > +            break;
> > +        }

Thanks, moving protection to just before instruction byte loads makes
perfect sense. I have just one question.

> So... I think this isn't quite right.
> 
> (1) If we stop exactly at the page break, this protects the *next*
> page unnecessarily.
> 
> (2) This only protects the page of the start of the insn.  If the
> instruction crosses the 
> page boundary, then the latter part of the insn is still victim to
> the race we're trying 
> to fix.
> 
> I think a protect needs to happen in translator_ld*_swap, before
> reading the data.
> 
> I think that the translator_ld*_swap functions should be moved out of
> include/exec/translator.h into accel/tcg/translator.c.

Do we really need this? In the end, the added code is not that large.

> I think that the translator_ld* functions should add a
> DisasContextBase argument, which 
> should then contain the cache for the protection.  This will be a
> moderately large change, 
> but it should be mostly mechanical.
> 
> I think that we should initialize the protection cache before
> translating the first insn, 
> outside of that loop.  This will mean that you need not check for two
> pages 
> simultaneously, when a single read crosses the page boundary.  You'll
> know that (at 
> minimum) the first byte of the first read is already covered, and
> only need to check the 
> last byte of each subsequent read.  I think the value you use for
> your cache should be the 
> end of the page for which protection is known to apply, so that the
> check reduces to
> 
>    end = pc + len - 1;
>    if (end > dcbase->page_protect_end) {
>      dcbase->page_protect_end = end | ~TARGET_PAGE_MASK;
>      page_protect(end);
>    }
> 
> 
> r~
Richard Henderson Aug. 5, 2021, 4:59 p.m. UTC | #3
On 8/5/21 12:56 AM, Ilya Leoshkevich wrote:
> On Wed, 2021-08-04 at 14:30 -1000, Richard Henderson wrote:
>> I think that the translator_ld*_swap functions should be moved out of
>> include/exec/translator.h into accel/tcg/translator.c.
> 
> Do we really need this? In the end, the added code is not that large.

I suppose it's not required, but they're already larger than they need to be.

In my opinion, if you're going to swap one out-of-line function call for two out-of-line 
function calls (cpu_ld*_code + plugin_insn_append), you've probably made a bad size trade-off.

With your added code, it'll be 3 out-of-line calls.


r~
Ilya Leoshkevich Aug. 5, 2021, 8:36 p.m. UTC | #4
On Thu, 2021-08-05 at 06:59 -1000, Richard Henderson wrote:
> On 8/5/21 12:56 AM, Ilya Leoshkevich wrote:
> > On Wed, 2021-08-04 at 14:30 -1000, Richard Henderson wrote:
> > > I think that the translator_ld*_swap functions should be moved
> > > out of
> > > include/exec/translator.h into accel/tcg/translator.c.
> > 
> > Do we really need this? In the end, the added code is not that
> > large.
> 
> I suppose it's not required, but they're already larger than they
> need to be.
> 
> In my opinion, if you're going to swap one out-of-line function call
> for two out-of-line 
> function calls (cpu_ld*_code + plugin_insn_append), you've probably
> made a bad size trade-off.
> 
> With your added code, it'll be 3 out-of-line calls.
> 
> 
> r~

Fair enough. Let me send a v3 then.

Best regards,
Ilya
diff mbox series

Patch

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index bbfcfb698c..fb9ebfad9e 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1297,31 +1297,8 @@  static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
     invalidate_page_bitmap(p);
 
 #if defined(CONFIG_USER_ONLY)
-    if (p->flags & PAGE_WRITE) {
-        target_ulong addr;
-        PageDesc *p2;
-        int prot;
-
-        /* force the host page as non writable (writes will have a
-           page fault + mprotect overhead) */
-        page_addr &= qemu_host_page_mask;
-        prot = 0;
-        for (addr = page_addr; addr < page_addr + qemu_host_page_size;
-            addr += TARGET_PAGE_SIZE) {
-
-            p2 = page_find(addr >> TARGET_PAGE_BITS);
-            if (!p2) {
-                continue;
-            }
-            prot |= p2->flags;
-            p2->flags &= ~PAGE_WRITE;
-          }
-        mprotect(g2h_untagged(page_addr), qemu_host_page_size,
-                 (prot & PAGE_BITS) & ~PAGE_WRITE);
-        if (DEBUG_TB_INVALIDATE_GATE) {
-            printf("protecting code page: 0x" TB_PAGE_ADDR_FMT "\n", page_addr);
-        }
-    }
+    /* translator_loop() must have made all TB pages non-writable */
+    assert(!(p->flags & PAGE_WRITE));
 #else
     /* if some code is already present, then the pages are already
        protected. So we handle the case where only the first TB is
@@ -2394,6 +2371,38 @@  int page_check_range(target_ulong start, target_ulong len, int flags)
     return 0;
 }
 
+void page_protect(tb_page_addr_t page_addr)
+{
+    target_ulong addr;
+    PageDesc *p;
+    int prot;
+
+    p = page_find(page_addr >> TARGET_PAGE_BITS);
+    if (p && (p->flags & PAGE_WRITE)) {
+        /*
+         * Force the host page as non writable (writes will have a page fault +
+         * mprotect overhead).
+         */
+        page_addr &= qemu_host_page_mask;
+        prot = 0;
+        for (addr = page_addr; addr < page_addr + qemu_host_page_size;
+             addr += TARGET_PAGE_SIZE) {
+
+            p = page_find(addr >> TARGET_PAGE_BITS);
+            if (!p) {
+                continue;
+            }
+            prot |= p->flags;
+            p->flags &= ~PAGE_WRITE;
+        }
+        mprotect(g2h_untagged(page_addr), qemu_host_page_size,
+                 (prot & PAGE_BITS) & ~PAGE_WRITE);
+        if (DEBUG_TB_INVALIDATE_GATE) {
+            printf("protecting code page: 0x" TB_PAGE_ADDR_FMT "\n", page_addr);
+        }
+    }
+}
+
 /* called from signal handler: invalidate the code and unprotect the
  * page. Return 0 if the fault was not handled, 1 if it was handled,
  * and 2 if it was handled but the caller must cause the TB to be
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index c53a7f8e44..bfbe7d7ccf 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -14,6 +14,7 @@ 
 #include "exec/exec-all.h"
 #include "exec/gen-icount.h"
 #include "exec/log.h"
+#include "exec/translate-all.h"
 #include "exec/translator.h"
 #include "exec/plugin-gen.h"
 #include "sysemu/replay.h"
@@ -47,6 +48,10 @@  void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
 {
     uint32_t cflags = tb_cflags(tb);
     bool plugin_enabled;
+    bool stop = false;
+#ifdef CONFIG_USER_ONLY
+    target_ulong page_addr = -1;
+#endif
 
     /* Initialize DisasContext */
     db->tb = tb;
@@ -71,6 +76,21 @@  void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
     plugin_enabled = plugin_gen_tb_start(cpu, tb, cflags & CF_MEMI_ONLY);
 
     while (true) {
+#ifdef CONFIG_USER_ONLY
+        /*
+         * Make the page containing the next instruction non-writable in order
+         * to get a consistent translation if another thread is modifying the
+         * code while translate_insn() fetches the instruction bytes piecemeal.
+         * Writer threads will wait for mmap_lock() in page_unprotect().
+         */
+        if ((db->pc_next & TARGET_PAGE_MASK) != page_addr) {
+            page_addr = db->pc_next & TARGET_PAGE_MASK;
+            page_protect(page_addr);
+        }
+#endif
+        if (stop) {
+            break;
+        }
         db->num_insns++;
         ops->insn_start(db, cpu);
         tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
@@ -95,7 +115,8 @@  void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
 
         /* Stop translation if translate_insn so indicated.  */
         if (db->is_jmp != DISAS_NEXT) {
-            break;
+            stop = true;
+            continue;
         }
 
         /*
@@ -110,7 +131,8 @@  void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
            or we have executed all of the allowed instructions.  */
         if (tcg_op_buf_full() || db->num_insns >= db->max_insns) {
             db->is_jmp = DISAS_TOO_MANY;
-            break;
+            stop = true;
+            continue;
         }
     }
 
diff --git a/include/exec/translate-all.h b/include/exec/translate-all.h
index a557b4e2bb..9f646389af 100644
--- a/include/exec/translate-all.h
+++ b/include/exec/translate-all.h
@@ -33,6 +33,7 @@  void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
 void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr);
 
 #ifdef CONFIG_USER_ONLY
+void page_protect(tb_page_addr_t page_addr);
 int page_unprotect(target_ulong address, uintptr_t pc);
 #endif