diff mbox

[PULL,5/8] target-sparc: Use global registers for the register window

Message ID 1456252389-4416-6-git-send-email-rth@twiddle.net (mailing list archive)
State New, archived
Headers show

Commit Message

Richard Henderson Feb. 23, 2016, 6:33 p.m. UTC
Via indirection off cpu_regwptr.

Tested-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-sparc/translate.c | 57 ++++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

Comments

Mark Cave-Ayland June 14, 2016, 9:52 p.m. UTC | #1
On 23/02/16 18:33, Richard Henderson wrote:

> Via indirection off cpu_regwptr.
> 
> Tested-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  target-sparc/translate.c | 57 ++++++++++++++++++++++++++++--------------------
>  1 file changed, 33 insertions(+), 24 deletions(-)
> 
> diff --git a/target-sparc/translate.c b/target-sparc/translate.c
> index 4be56dd..00d61ee 100644
> --- a/target-sparc/translate.c
> +++ b/target-sparc/translate.c
> @@ -43,7 +43,8 @@ static TCGv_ptr cpu_env, cpu_regwptr;
>  static TCGv cpu_cc_src, cpu_cc_src2, cpu_cc_dst;
>  static TCGv_i32 cpu_cc_op;
>  static TCGv_i32 cpu_psr;
> -static TCGv cpu_fsr, cpu_pc, cpu_npc, cpu_gregs[8];
> +static TCGv cpu_fsr, cpu_pc, cpu_npc;
> +static TCGv cpu_regs[32];
>  static TCGv cpu_y;
>  #ifndef CONFIG_USER_ONLY
>  static TCGv cpu_tbr;
> @@ -273,36 +274,31 @@ static inline void gen_address_mask(DisasContext *dc, TCGv addr)
>  
>  static inline TCGv gen_load_gpr(DisasContext *dc, int reg)
>  {
> -    if (reg == 0 || reg >= 8) {
> +    if (reg > 0) {
> +        assert(reg < 32);
> +        return cpu_regs[reg];
> +    } else {
>          TCGv t = get_temp_tl(dc);
> -        if (reg == 0) {
> -            tcg_gen_movi_tl(t, 0);
> -        } else {
> -            tcg_gen_ld_tl(t, cpu_regwptr, (reg - 8) * sizeof(target_ulong));
> -        }
> +        tcg_gen_movi_tl(t, 0);
>          return t;
> -    } else {
> -        return cpu_gregs[reg];
>      }
>  }
>  
>  static inline void gen_store_gpr(DisasContext *dc, int reg, TCGv v)
>  {
>      if (reg > 0) {
> -        if (reg < 8) {
> -            tcg_gen_mov_tl(cpu_gregs[reg], v);
> -        } else {
> -            tcg_gen_st_tl(v, cpu_regwptr, (reg - 8) * sizeof(target_ulong));
> -        }
> +        assert(reg < 32);
> +        tcg_gen_mov_tl(cpu_regs[reg], v);
>      }
>  }
>  
>  static inline TCGv gen_dest_gpr(DisasContext *dc, int reg)
>  {
> -    if (reg == 0 || reg >= 8) {
> -        return get_temp_tl(dc);
> +    if (reg > 0) {
> +        assert(reg < 32);
> +        return cpu_regs[reg];
>      } else {
> -        return cpu_gregs[reg];
> +        return get_temp_tl(dc);
>      }
>  }
>  
> @@ -2158,9 +2154,13 @@ static inline void gen_ldda_asi(DisasContext *dc, TCGv hi, TCGv addr,
>      tcg_temp_free_i32(r_size);
>      tcg_temp_free_i32(r_asi);
>  
> -    t = gen_dest_gpr(dc, rd + 1);
> +    /* ??? Work around an apparent bug in Ubuntu gcc 4.8.2-10ubuntu2+12,
> +       whereby "rd + 1" elicits "error: array subscript is above array".
> +       Since we have already asserted that rd is even, the semantics
> +       are unchanged.  */
> +    t = gen_dest_gpr(dc, rd | 1);
>      tcg_gen_trunc_i64_tl(t, t64);
> -    gen_store_gpr(dc, rd + 1, t);
> +    gen_store_gpr(dc, rd | 1, t);
>  
>      tcg_gen_shri_i64(t64, t64, 32);
>      tcg_gen_trunc_i64_tl(hi, t64);
> @@ -5330,8 +5330,11 @@ void gen_intermediate_code(CPUSPARCState * env, TranslationBlock * tb)
>  void gen_intermediate_code_init(CPUSPARCState *env)
>  {
>      static int inited;
> -    static const char gregnames[8][4] = {
> +    static const char gregnames[32][4] = {
>          "g0", "g1", "g2", "g3", "g4", "g5", "g6", "g7",
> +        "o0", "o1", "o2", "o3", "o4", "o5", "o6", "o7",
> +        "l0", "l1", "l2", "l3", "l4", "l5", "l6", "l7",
> +        "i0", "i1", "i2", "i3", "i4", "i5", "i6", "i7",
>      };
>      static const char fregnames[32][4] = {
>          "f0", "f2", "f4", "f6", "f8", "f10", "f12", "f14",
> @@ -5401,11 +5404,17 @@ void gen_intermediate_code_init(CPUSPARCState *env)
>          *rtl[i].ptr = tcg_global_mem_new(cpu_env, rtl[i].off, rtl[i].name);
>      }
>  
> -    TCGV_UNUSED(cpu_gregs[0]);
> +    TCGV_UNUSED(cpu_regs[0]);
>      for (i = 1; i < 8; ++i) {
> -        cpu_gregs[i] = tcg_global_mem_new(cpu_env,
> -                                          offsetof(CPUSPARCState, gregs[i]),
> -                                          gregnames[i]);
> +        cpu_regs[i] = tcg_global_mem_new(cpu_env,
> +                                         offsetof(CPUSPARCState, gregs[i]),
> +                                         gregnames[i]);
> +    }
> +
> +    for (i = 8; i < 32; ++i) {
> +        cpu_regs[i] = tcg_global_mem_new(cpu_regwptr,
> +                                         (i - 8) * sizeof(target_ulong),
> +                                         gregnames[i]);
>      }
>  
>      for (i = 0; i < TARGET_DPREGS; i++) {
> 

Hi Richard,

Following up the bug report at
https://bugs.launchpad.net/qemu/+bug/1588328, I bisected the regression
down to this particular commit. I can't see anything obvious here, so
perhaps this is exposing another bug somewhere else?


ATB,

Mark.
Richard Henderson June 16, 2016, 8:26 p.m. UTC | #2
On 06/14/2016 02:52 PM, Mark Cave-Ayland wrote:
> Following up the bug report at
> https://bugs.launchpad.net/qemu/+bug/1588328, I bisected the regression
> down to this particular commit. I can't see anything obvious here, so
> perhaps this is exposing another bug somewhere else?
> 

Probably.  I'm downloading the solaris image now.


r~
Mark Cave-Ayland June 16, 2016, 9:53 p.m. UTC | #3
On 16/06/16 21:26, Richard Henderson wrote:

> On 06/14/2016 02:52 PM, Mark Cave-Ayland wrote:
>> Following up the bug report at
>> https://bugs.launchpad.net/qemu/+bug/1588328, I bisected the regression
>> down to this particular commit. I can't see anything obvious here, so
>> perhaps this is exposing another bug somewhere else?
>>
> 
> Probably.  I'm downloading the solaris image now.
> 
> 
> r~

Thanks for taking a look - otherwise I won't be able to get to this
until next week. My thinking was that since the code makes access to
regwptr direct instead of copied into a temporary, something is
accidentally clobbering a destination register...


ATB,

Mark.
Richard Henderson June 24, 2016, 3:57 a.m. UTC | #4
On 06/16/2016 02:53 PM, Mark Cave-Ayland wrote:
> On 16/06/16 21:26, Richard Henderson wrote:
>
>> On 06/14/2016 02:52 PM, Mark Cave-Ayland wrote:
>>> Following up the bug report at
>>> https://bugs.launchpad.net/qemu/+bug/1588328, I bisected the regression
>>> down to this particular commit. I can't see anything obvious here, so
>>> perhaps this is exposing another bug somewhere else?
>>>
>>
>> Probably.  I'm downloading the solaris image now.
>>
>>
>> r~
>
> Thanks for taking a look - otherwise I won't be able to get to this
> until next week. My thinking was that since the code makes access to
> regwptr direct instead of copied into a temporary, something is
> accidentally clobbering a destination register...

I've been unable to find this.

Whatever happens, it happens after 10GB of logs, which is simply too much to 
sift through.  I've tried to narrow it down, but the lack of a hardware tlb 
refill means that we get hundreds of thousands of Data Access Faults that are 
simply TLB misses and not the actual Segmentation Fault in question.

It doesn't seem to affect other OSes, so I can't imagine what quirk is being 
exercised in this case.

As loath as I am to suggest it, we may have to revert the sparc indirect 
register patch for the release.

I do now ping the rest of my sparc improvements patchset.  It's completely 
independent of the use of indirect registers.


r~
Paolo Bonzini June 24, 2016, 6:36 a.m. UTC | #5
On 24/06/2016 05:57, Richard Henderson wrote:
> 
> Whatever happens, it happens after 10GB of logs, which is simply too
> much to sift through.  I've tried to narrow it down, but the lack of a
> hardware tlb refill means that we get hundreds of thousands of Data
> Access Faults that are simply TLB misses and not the actual Segmentation
> Fault in question.
> 
> It doesn't seem to affect other OSes, so I can't imagine what quirk is
> being exercised in this case.
> 
> As loath as I am to suggest it, we may have to revert the sparc indirect
> register patch for the release.

We have more than a month.  If it's reproducible, it can be fixed. :)

> I do now ping the rest of my sparc improvements patchset.  It's
> completely independent of the use of indirect registers.

Mark, perhaps you can try to use migration to reduce the amount of
logging?  (Start QEMU with -snapshot, try to stop the vm before it
fails.  If you succeed, do a "migrate exec:cat>foo.sav" followed by
"commit"; if you fail, try again).

It would be nice to have a mechanism to stop the VM after executing N
basic blocks.  Binary search on this value then can help with coming up
with a more easily debuggable snapshot, possibly to a point where the
difference between pre-patch and post-patch becomes deterministic.

Paolo
Peter Maydell June 24, 2016, 8:12 a.m. UTC | #6
On 24 June 2016 at 07:36, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Mark, perhaps you can try to use migration to reduce the amount of
> logging?  (Start QEMU with -snapshot, try to stop the vm before it
> fails.  If you succeed, do a "migrate exec:cat>foo.sav" followed by
> "commit"; if you fail, try again).

Why drag migration into it? I usually use 'savevm' and then
the -loadvm command line argument for this. (You need a
qcow2 disk image.)

> It would be nice to have a mechanism to stop the VM after executing N
> basic blocks.  Binary search on this value then can help with coming up
> with a more easily debuggable snapshot, possibly to a point where the
> difference between pre-patch and post-patch becomes deterministic.

You can use the monitor and an expect script to say "take a
snapshot 0.7 seconds into boot", which I've found to be
a good enough approximation:

https://translatedcode.wordpress.com/2015/07/06/tricks-for-debugging-qemu-savevm-snapshots/

thanks
-- PMM
Paolo Bonzini June 24, 2016, 8:16 a.m. UTC | #7
On 24/06/2016 10:12, Peter Maydell wrote:
> On 24 June 2016 at 07:36, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> Mark, perhaps you can try to use migration to reduce the amount of
>> logging?  (Start QEMU with -snapshot, try to stop the vm before it
>> fails.  If you succeed, do a "migrate exec:cat>foo.sav" followed by
>> "commit"; if you fail, try again).
> 
> Why drag migration into it? I usually use 'savevm' and then
> the -loadvm command line argument for this. (You need a
> qcow2 disk image.)

Well, migration and savevm are the same. :)  In this case IIUC the
failure happens from a CDROM so migration lets you avoid the qcow2
image.  In general I find it easier to manage migration files on disk
than saved snapshots.

Paolo

>> It would be nice to have a mechanism to stop the VM after executing N
>> basic blocks.  Binary search on this value then can help with coming up
>> with a more easily debuggable snapshot, possibly to a point where the
>> difference between pre-patch and post-patch becomes deterministic.
> 
> You can use the monitor and an expect script to say "take a
> snapshot 0.7 seconds into boot", which I've found to be
> a good enough approximation:
> 
> https://translatedcode.wordpress.com/2015/07/06/tricks-for-debugging-qemu-savevm-snapshots/
> 
> thanks
> -- PMM
>
Mark Cave-Ayland June 24, 2016, 10:42 a.m. UTC | #8
On 24/06/16 07:36, Paolo Bonzini wrote:

> On 24/06/2016 05:57, Richard Henderson wrote:
>>
>> Whatever happens, it happens after 10GB of logs, which is simply too
>> much to sift through.  I've tried to narrow it down, but the lack of a
>> hardware tlb refill means that we get hundreds of thousands of Data
>> Access Faults that are simply TLB misses and not the actual Segmentation
>> Fault in question.
>>
>> It doesn't seem to affect other OSes, so I can't imagine what quirk is
>> being exercised in this case.
>>
>> As loath as I am to suggest it, we may have to revert the sparc indirect
>> register patch for the release.
>
> We have more than a month.  If it's reproducible, it can be fixed. :)
>
>> I do now ping the rest of my sparc improvements patchset.  It's
>> completely independent of the use of indirect registers.
>
> Mark, perhaps you can try to use migration to reduce the amount of
> logging?  (Start QEMU with -snapshot, try to stop the vm before it
> fails.  If you succeed, do a "migrate exec:cat>foo.sav" followed by
> "commit"; if you fail, try again).

Yeah, given the improvements that Richard has made, I'd prefer not to 
revert if at all possible. Finally I have some spare time today so I'll 
try and get this down to an easily-testable qcow2 image that can 
reproduce the issue.


ATB,

Mark.
Paolo Bonzini June 24, 2016, 12:03 p.m. UTC | #9
On 24/06/2016 12:42, Mark Cave-Ayland wrote:
> On 24/06/16 07:36, Paolo Bonzini wrote:
> 
>> On 24/06/2016 05:57, Richard Henderson wrote:
>>>
>>> Whatever happens, it happens after 10GB of logs, which is simply too
>>> much to sift through.  I've tried to narrow it down, but the lack of a
>>> hardware tlb refill means that we get hundreds of thousands of Data
>>> Access Faults that are simply TLB misses and not the actual Segmentation
>>> Fault in question.
>>>
>>> It doesn't seem to affect other OSes, so I can't imagine what quirk is
>>> being exercised in this case.
>>>
>>> As loath as I am to suggest it, we may have to revert the sparc indirect
>>> register patch for the release.
>>
>> We have more than a month.  If it's reproducible, it can be fixed. :)
>>
>>> I do now ping the rest of my sparc improvements patchset.  It's
>>> completely independent of the use of indirect registers.
>>
>> Mark, perhaps you can try to use migration to reduce the amount of
>> logging?  (Start QEMU with -snapshot, try to stop the vm before it
>> fails.  If you succeed, do a "migrate exec:cat>foo.sav" followed by
>> "commit"; if you fail, try again).
> 
> Yeah, given the improvements that Richard has made, I'd prefer not to
> revert if at all possible. Finally I have some spare time today so I'll
> try and get this down to an easily-testable qcow2 image that can
> reproduce the issue.

I've gotten an image that reaches the segmentation fault in about 1
second but I cannot upload it anywhere in the next few hours.  The good
news is that it fails even without a hard disk (so it's a stateless vm)
and with -d nochain -singlestep.  The bad news is that the dump is not
very deterministic and that I failed to create images closer to the failure.

Paolo
Artyom Tarasenko June 24, 2016, 12:35 p.m. UTC | #10
On Fri, Jun 24, 2016 at 2:03 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 24/06/2016 12:42, Mark Cave-Ayland wrote:
>> On 24/06/16 07:36, Paolo Bonzini wrote:
>>
>>> On 24/06/2016 05:57, Richard Henderson wrote:
>>>>
>>>> Whatever happens, it happens after 10GB of logs, which is simply too
>>>> much to sift through.  I've tried to narrow it down, but the lack of a
>>>> hardware tlb refill means that we get hundreds of thousands of Data
>>>> Access Faults that are simply TLB misses and not the actual Segmentation
>>>> Fault in question.
>>>>
>>>> It doesn't seem to affect other OSes, so I can't imagine what quirk is
>>>> being exercised in this case.
>>>>
>>>> As loath as I am to suggest it, we may have to revert the sparc indirect
>>>> register patch for the release.
>>>
>>> We have more than a month.  If it's reproducible, it can be fixed. :)
>>>
>>>> I do now ping the rest of my sparc improvements patchset.  It's
>>>> completely independent of the use of indirect registers.
>>>
>>> Mark, perhaps you can try to use migration to reduce the amount of
>>> logging?  (Start QEMU with -snapshot, try to stop the vm before it
>>> fails.  If you succeed, do a "migrate exec:cat>foo.sav" followed by
>>> "commit"; if you fail, try again).
>>
>> Yeah, given the improvements that Richard has made, I'd prefer not to
>> revert if at all possible. Finally I have some spare time today so I'll
>> try and get this down to an easily-testable qcow2 image that can
>> reproduce the issue.
>
> I've gotten an image that reaches the segmentation fault in about 1
> second but I cannot upload it anywhere in the next few hours.  The good
> news is that it fails even without a hard disk (so it's a stateless vm)
> and with -d nochain -singlestep.  The bad news is that the dump is not
> very deterministic and that I failed to create images closer to the failure.

I have a fix for the bug, will post it shortly. This patch does reveal a bug in
the ldstub implementation, but I'm really surprised we haven't hit it before.

 I observe the following sequence under Solaris:

1. a memory page is mapped without the write permission.
2. the ldstub instruction is executed on this page.
3. ldstub raises an access exception
4b. Because of the bug in the ldstub, a register gets corrupted.
5b. Solaris kernel re-maps the page to have the write access
6b. ldstub is executed again with the corrupted register content
7b. Segmentation fault

With the ldstub fix it goes:
4a. Solaris kernel re-maps the page to have the write access
5a. ldstub is executed again, this time all is good.

There must be another bug in memory access handling.
Maybe cached TBs can be executed with the wrong mem idx?

Artyom
diff mbox

Patch

diff --git a/target-sparc/translate.c b/target-sparc/translate.c
index 4be56dd..00d61ee 100644
--- a/target-sparc/translate.c
+++ b/target-sparc/translate.c
@@ -43,7 +43,8 @@  static TCGv_ptr cpu_env, cpu_regwptr;
 static TCGv cpu_cc_src, cpu_cc_src2, cpu_cc_dst;
 static TCGv_i32 cpu_cc_op;
 static TCGv_i32 cpu_psr;
-static TCGv cpu_fsr, cpu_pc, cpu_npc, cpu_gregs[8];
+static TCGv cpu_fsr, cpu_pc, cpu_npc;
+static TCGv cpu_regs[32];
 static TCGv cpu_y;
 #ifndef CONFIG_USER_ONLY
 static TCGv cpu_tbr;
@@ -273,36 +274,31 @@  static inline void gen_address_mask(DisasContext *dc, TCGv addr)
 
 static inline TCGv gen_load_gpr(DisasContext *dc, int reg)
 {
-    if (reg == 0 || reg >= 8) {
+    if (reg > 0) {
+        assert(reg < 32);
+        return cpu_regs[reg];
+    } else {
         TCGv t = get_temp_tl(dc);
-        if (reg == 0) {
-            tcg_gen_movi_tl(t, 0);
-        } else {
-            tcg_gen_ld_tl(t, cpu_regwptr, (reg - 8) * sizeof(target_ulong));
-        }
+        tcg_gen_movi_tl(t, 0);
         return t;
-    } else {
-        return cpu_gregs[reg];
     }
 }
 
 static inline void gen_store_gpr(DisasContext *dc, int reg, TCGv v)
 {
     if (reg > 0) {
-        if (reg < 8) {
-            tcg_gen_mov_tl(cpu_gregs[reg], v);
-        } else {
-            tcg_gen_st_tl(v, cpu_regwptr, (reg - 8) * sizeof(target_ulong));
-        }
+        assert(reg < 32);
+        tcg_gen_mov_tl(cpu_regs[reg], v);
     }
 }
 
 static inline TCGv gen_dest_gpr(DisasContext *dc, int reg)
 {
-    if (reg == 0 || reg >= 8) {
-        return get_temp_tl(dc);
+    if (reg > 0) {
+        assert(reg < 32);
+        return cpu_regs[reg];
     } else {
-        return cpu_gregs[reg];
+        return get_temp_tl(dc);
     }
 }
 
@@ -2158,9 +2154,13 @@  static inline void gen_ldda_asi(DisasContext *dc, TCGv hi, TCGv addr,
     tcg_temp_free_i32(r_size);
     tcg_temp_free_i32(r_asi);
 
-    t = gen_dest_gpr(dc, rd + 1);
+    /* ??? Work around an apparent bug in Ubuntu gcc 4.8.2-10ubuntu2+12,
+       whereby "rd + 1" elicits "error: array subscript is above array".
+       Since we have already asserted that rd is even, the semantics
+       are unchanged.  */
+    t = gen_dest_gpr(dc, rd | 1);
     tcg_gen_trunc_i64_tl(t, t64);
-    gen_store_gpr(dc, rd + 1, t);
+    gen_store_gpr(dc, rd | 1, t);
 
     tcg_gen_shri_i64(t64, t64, 32);
     tcg_gen_trunc_i64_tl(hi, t64);
@@ -5330,8 +5330,11 @@  void gen_intermediate_code(CPUSPARCState * env, TranslationBlock * tb)
 void gen_intermediate_code_init(CPUSPARCState *env)
 {
     static int inited;
-    static const char gregnames[8][4] = {
+    static const char gregnames[32][4] = {
         "g0", "g1", "g2", "g3", "g4", "g5", "g6", "g7",
+        "o0", "o1", "o2", "o3", "o4", "o5", "o6", "o7",
+        "l0", "l1", "l2", "l3", "l4", "l5", "l6", "l7",
+        "i0", "i1", "i2", "i3", "i4", "i5", "i6", "i7",
     };
     static const char fregnames[32][4] = {
         "f0", "f2", "f4", "f6", "f8", "f10", "f12", "f14",
@@ -5401,11 +5404,17 @@  void gen_intermediate_code_init(CPUSPARCState *env)
         *rtl[i].ptr = tcg_global_mem_new(cpu_env, rtl[i].off, rtl[i].name);
     }
 
-    TCGV_UNUSED(cpu_gregs[0]);
+    TCGV_UNUSED(cpu_regs[0]);
     for (i = 1; i < 8; ++i) {
-        cpu_gregs[i] = tcg_global_mem_new(cpu_env,
-                                          offsetof(CPUSPARCState, gregs[i]),
-                                          gregnames[i]);
+        cpu_regs[i] = tcg_global_mem_new(cpu_env,
+                                         offsetof(CPUSPARCState, gregs[i]),
+                                         gregnames[i]);
+    }
+
+    for (i = 8; i < 32; ++i) {
+        cpu_regs[i] = tcg_global_mem_new(cpu_regwptr,
+                                         (i - 8) * sizeof(target_ulong),
+                                         gregnames[i]);
     }
 
     for (i = 0; i < TARGET_DPREGS; i++) {