diff mbox series

[10/25] target/i386: finish converting 0F AE to the new decoder

Message ID 20240608084113.2770363-11-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show
Series target/i386: more progress towards new decoder | expand

Commit Message

Paolo Bonzini June 8, 2024, 8:40 a.m. UTC
This is already partly implemented due to VLDMXCSR and VSTMXCSR; finish
the job.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/decode-new.h     |   7 ++
 target/i386/tcg/translate.c      | 188 -------------------------------
 target/i386/tcg/decode-new.c.inc |  48 +++++++-
 target/i386/tcg/emit.c.inc       |  80 +++++++++++++
 4 files changed, 129 insertions(+), 194 deletions(-)

Comments

Richard Henderson June 8, 2024, 6:42 p.m. UTC | #1
On 6/8/24 01:40, Paolo Bonzini wrote:
> +static void gen_FXRSTOR(DisasContext *s, X86DecodedInsn *decode)
> +{
> +    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
> +        gen_NM_exception(s);
> +    }
> +    gen_helper_fxrstor(tcg_env, s->A0);
> +}
> +
> +static void gen_FXSAVE(DisasContext *s, X86DecodedInsn *decode)
> +{
> +    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
> +        gen_NM_exception(s);
> +    }
> +    gen_helper_fxsave(tcg_env, s->A0);
> +}

You want `else` branches here, so that the fxsave/fxrstor helper call is not emitted after raising #NM.

Otherwise,
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
Guenter Roeck Oct. 21, 2024, 1:49 a.m. UTC | #2
Hi,

On Sat, Jun 08, 2024 at 10:40:58AM +0200, Paolo Bonzini wrote:
> This is already partly implemented due to VLDMXCSR and VSTMXCSR; finish
> the job.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

While testing qemu v9.1, I noticed the following crash when running qemu-system-i386
with the pentium3 CPU model.

Oops: invalid opcode: 0000 [#1] PREEMPT SMP
CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc4 #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
EIP: raid6_sse12_gen_syndrome+0xfa/0x10c
Code: 83 e8 01 73 bf 8b 4d ec 0f e7 53 f8 0f e7 1b 0f e7 61 f8 0f e7 31 8b 45 e8 83 c6 10 83 c3 10 83 c1 10 39 c6 0f 82 6a ff ff ff <0f> 9
EAX: 00001000 EBX: c1367008 ECX: c1368008 EDX: c119deb0
ESI: 00001000 EDI: 00000ff8 EBP: c119de84 ESP: c119de68
DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00010246
CR0: 80050033 CR2: ffd39000 CR3: 06144000 CR4: 000006d0
Call Trace:
 ? show_regs+0x4d/0x54
 ? die+0x2f/0x88
 ? do_trap+0xc6/0xc8
 ? do_error_trap+0x6c/0x100
 ? raid6_sse12_gen_syndrome+0xfa/0x10c
 ? exc_overflow+0x50/0x50
 ? exc_invalid_op+0x5b/0x70
 ? raid6_sse12_gen_syndrome+0xfa/0x10c
 ? handle_exception+0x14b/0x14b
 ? exc_overflow+0x50/0x50
 ? raid6_sse12_gen_syndrome+0xfa/0x10c
 ? exc_overflow+0x50/0x50
 ? raid6_sse12_gen_syndrome+0xfa/0x10c
 ? raid6_sse11_gen_syndrome+0xfc/0xfc
 raid6_select_algo+0x144/0x420
 ? libcrc32c_mod_init+0x24/0x24
 do_one_initcall+0x63/0x284
 ? rdinit_setup+0x40/0x40
 ? parse_args+0x14b/0x3f4
 kernel_init_freeable+0x238/0x44c
 ? rdinit_setup+0x40/0x40
 ? rest_init+0x164/0x164
 kernel_init+0x15/0x1dc
 ? schedule_tail+0x50/0x64
 ret_from_fork+0x38/0x44
 ? rest_init+0x164/0x164
 ret_from_fork_asm+0x12/0x18
 entry_INT80_32+0x108/0x108
Modules linked in:
---[ end trace 0000000000000000 ]---

Bisect points to this patch. Bisect log as well as the decoded stacktrace
is attached below.

The problem is still seen in qemu mainline (v9.1.0-997-g72b0b80714).

Reverting the patch is not straightforward and results in a number of conflicts,
so I was not able to test qemu with the patch reverted.

Guenter

---
Bisect log:

# bad: [fd1952d814da738ed107e05583b3e02ac11e88ff] Update version for v9.1.0 release
# good: [c25df57ae8f9fe1c72eee2dab37d76d904ac382e] Update version for 9.0.0 release
git bisect start 'v9.1.0' 'v9.0.0'
# bad: [2529ea2d561ea9fe359fb19ebdcfeb8b6cddd219] hw/acpi/ich9: Remove dead code related to 'acpi_memory_hotplug'
git bisect bad 2529ea2d561ea9fe359fb19ebdcfeb8b6cddd219
# good: [544595e73007c824b7435b52519cc578586783a6] tests/plugin/inline: add test for conditional callback
git bisect good 544595e73007c824b7435b52519cc578586783a6
# good: [039003995047b2f7911142c7c5cfb845fda044fd] hw/riscv/boot.c: Support 64-bit address for initrd
git bisect good 039003995047b2f7911142c7c5cfb845fda044fd
# good: [a73f7a00eea15c75fe9cfbeeaff5228f5ee24b61] tests/qtest: Add numa test for loongarch system
git bisect good a73f7a00eea15c75fe9cfbeeaff5228f5ee24b61
# bad: [11ffaf8c73aae1a70f4640ada14a437a78d06efb] target/i386: convert LZCNT/TZCNT/BSF/BSR/POPCNT to new decoder
git bisect bad 11ffaf8c73aae1a70f4640ada14a437a78d06efb
# good: [fc00123f3abeb027cd51eb58ea8845377794b3bc] python: mkvenv: remove ensure command
git bisect good fc00123f3abeb027cd51eb58ea8845377794b3bc
# good: [593aab332f048347bd19893071caf44e1fb742ff] Merge tag 'pull-hex-20240608' of https://github.com/quic/qemu into staging
git bisect good 593aab332f048347bd19893071caf44e1fb742ff
# good: [c2b6b6a65a227d2bb45e1b2694cf064b881543e4] target/i386: change X86_ENTRYr to use T0
git bisect good c2b6b6a65a227d2bb45e1b2694cf064b881543e4
# good: [10340080cd501b1aba23c3e502e2e0aa7c825fbf] target/i386: fix bad sorting of entries in the 0F table
git bisect good 10340080cd501b1aba23c3e502e2e0aa7c825fbf
# bad: [ae541c0eb47f2fbcfd975c8e2fcb0e3a2613dc1c] target/i386: convert non-grouped, helper-based 2-byte opcodes
git bisect bad ae541c0eb47f2fbcfd975c8e2fcb0e3a2613dc1c
# bad: [556c4c5cc44c3454f78d796b6050c6d574a35dd2] target/i386: split X86_CHECK_prot into PE and VM86 checks
git bisect bad 556c4c5cc44c3454f78d796b6050c6d574a35dd2
# bad: [ea89aa895e98fd8a1b9ebf7e3dc8bfcd863b9466] target/i386: finish converting 0F AE to the new decoder
git bisect bad ea89aa895e98fd8a1b9ebf7e3dc8bfcd863b9466
# first bad commit: [ea89aa895e98fd8a1b9ebf7e3dc8bfcd863b9466] target/i386: finish converting 0F AE to the new decoder

---
Decoded stacktrace:

CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc4 #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
EIP: raid6_sse12_gen_syndrome (lib/raid6/sse1.c:147)
Code: 83 e8 01 73 bf 8b 4d ec 0f e7 53 f8 0f e7 1b 0f e7 61 f8 0f e7 31 8b 45 e8 83 c6 10 83 c3 10 83 c1 10 39 c6 0f 82 6a ff ff ff <0f> 9
All code
========
   0:	83 e8 01             	sub    $0x1,%eax
   3:	73 bf                	jae    0xffffffffffffffc4
   5:	8b 4d ec             	mov    -0x14(%rbp),%ecx
   8:	0f e7 53 f8          	movntq %mm2,-0x8(%rbx)
   c:	0f e7 1b             	movntq %mm3,(%rbx)
   f:	0f e7 61 f8          	movntq %mm4,-0x8(%rcx)
  13:	0f e7 31             	movntq %mm6,(%rcx)
  16:	8b 45 e8             	mov    -0x18(%rbp),%eax
  19:	83 c6 10             	add    $0x10,%esi
  1c:	83 c3 10             	add    $0x10,%ebx
  1f:	83 c1 10             	add    $0x10,%ecx
  22:	39 c6                	cmp    %eax,%esi
  24:	0f 82 6a ff ff ff    	jb     0xffffffffffffff94
  2a:*	0f 09                	wbinvd 		<-- trapping instruction

Code starting with the faulting instruction
===========================================
   0:	0f 09                	wbinvd
EAX: 00001000 EBX: c1367008 ECX: c1368008 EDX: c119deb0
ESI: 00001000 EDI: 00000ff8 EBP: c119de84 ESP: c119de68
DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00010246
CR0: 80050033 CR2: ffd39000 CR3: 06144000 CR4: 000006d0
Call Trace:
? show_regs (arch/x86/kernel/dumpstack.c:479)
? die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434 arch/x86/kernel/dumpstack.c:447)
? do_trap (arch/x86/kernel/traps.c:156 arch/x86/kernel/traps.c:197)
? do_error_trap (arch/x86/include/asm/traps.h:58 arch/x86/kernel/traps.c:218)
? raid6_sse12_gen_syndrome (lib/raid6/sse1.c:147)
? exc_overflow (arch/x86/kernel/traps.c:301)
? exc_invalid_op (arch/x86/kernel/traps.c:316)
? raid6_sse12_gen_syndrome (lib/raid6/sse1.c:147)
? handle_exception (arch/x86/entry/entry_32.S:1055)
? exc_overflow (arch/x86/kernel/traps.c:301)
? raid6_sse12_gen_syndrome (lib/raid6/sse1.c:147)
? exc_overflow (arch/x86/kernel/traps.c:301)
? raid6_sse12_gen_syndrome (lib/raid6/sse1.c:147)
? raid6_sse11_gen_syndrome (lib/raid6/sse1.c:100)
raid6_select_algo (lib/raid6/algos.c:179 (discriminator 2) lib/raid6/algos.c:273 (discriminator 2))
? libcrc32c_mod_init (lib/raid6/algos.c:243)
do_one_initcall (init/main.c:1269)
? rdinit_setup (init/main.c:1317)
? parse_args (kernel/params.c:153 kernel/params.c:186)
kernel_init_freeable (init/main.c:1330 (discriminator 1) init/main.c:1347 (discriminator 1) init/main.c:1366 (discriminator 1) init/main.c:1580 (discrim
? rdinit_setup (init/main.c:1317)
? rest_init (init/main.c:1461)
kernel_init (init/main.c:1471)
? schedule_tail (kernel/sched/core.c:5266)
ret_from_fork (arch/x86/kernel/process.c:153)
? rest_init (init/main.c:1461)
ret_from_fork_asm (arch/x86/entry/entry_32.S:737)
entry_INT80_32 (arch/x86/entry/entry_32.S:945)
Paolo Bonzini Oct. 21, 2024, 6:57 a.m. UTC | #3
On 10/21/24 03:49, Guenter Roeck wrote:
> Hi,
> 
> On Sat, Jun 08, 2024 at 10:40:58AM +0200, Paolo Bonzini wrote:
>> This is already partly implemented due to VLDMXCSR and VSTMXCSR; finish
>> the job.
>>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> 
> While testing qemu v9.1, I noticed the following crash when testing qemu-system-i386
> with pentium3 CPU.

Is this enough to fix it?

diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index ee2a508ae9a..cda32ee6784 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -345,9 +345,9 @@ static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry,
          [1] = X86_OP_ENTRYw(RDxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3),
          [2] = X86_OP_ENTRYr(WRxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3 zextT0),
          [3] = X86_OP_ENTRYr(WRxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3 zextT0),
-        [5] = X86_OP_ENTRY0(LFENCE,          cpuid(SSE2) p_00),
+        [5] = X86_OP_ENTRY0(LFENCE,          cpuid(SSE) p_00),
          [6] = X86_OP_ENTRY0(MFENCE,          cpuid(SSE2) p_00),
-        [7] = X86_OP_ENTRY0(SFENCE,          cpuid(SSE2) p_00),
+        [7] = X86_OP_ENTRY0(SFENCE,          cpuid(SSE) p_00),
      };
  
      static const X86OpEntry group15_mem[8] = {

>    22:	39 c6                	cmp    %eax,%esi
>    24:	0f 82 6a ff ff ff    	jb     0xffffffffffffff94
>    2a:*	0f 09                	wbinvd 		<-- trapping instruction

This is a bit weird, as wbinvd is not affected by this patch.  However,
a checkout of Linux has

         asm volatile("sfence" : :: "memory");
         kernel_fpu_end();
}

at the end of lib/raid6/sse1.c, and that sfence would indeed be affected by this
patch: the new decoder entry requires SSE2 for SFENCE, whereas the old code only
required SSE.  SSE2 was not present in Pentium III, but SSE was.

Paolo

> Code starting with the faulting instruction
> ===========================================
>     0:	0f 09                	wbinvd
> EAX: 00001000 EBX: c1367008 ECX: c1368008 EDX: c119deb0
> ESI: 00001000 EDI: 00000ff8 EBP: c119de84 ESP: c119de68
> DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00010246
> CR0: 80050033 CR2: ffd39000 CR3: 06144000 CR4: 000006d0
> Call Trace:
> ? show_regs (arch/x86/kernel/dumpstack.c:479)
> ? die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434 arch/x86/kernel/dumpstack.c:447)
> ? do_trap (arch/x86/kernel/traps.c:156 arch/x86/kernel/traps.c:197)
> ? do_error_trap (arch/x86/include/asm/traps.h:58 arch/x86/kernel/traps.c:218)
> ? raid6_sse12_gen_syndrome (lib/raid6/sse1.c:147)
> ? exc_overflow (arch/x86/kernel/traps.c:301)
> ? exc_invalid_op (arch/x86/kernel/traps.c:316)
> ? raid6_sse12_gen_syndrome (lib/raid6/sse1.c:147)
> ? handle_exception (arch/x86/entry/entry_32.S:1055)
> ? exc_overflow (arch/x86/kernel/traps.c:301)
> ? raid6_sse12_gen_syndrome (lib/raid6/sse1.c:147)
> ? exc_overflow (arch/x86/kernel/traps.c:301)
> ? raid6_sse12_gen_syndrome (lib/raid6/sse1.c:147)
> ? raid6_sse11_gen_syndrome (lib/raid6/sse1.c:100)
> raid6_select_algo (lib/raid6/algos.c:179 (discriminator 2) lib/raid6/algos.c:273 (discriminator 2))
> ? libcrc32c_mod_init (lib/raid6/algos.c:243)
> do_one_initcall (init/main.c:1269)
> ? rdinit_setup (init/main.c:1317)
> ? parse_args (kernel/params.c:153 kernel/params.c:186)
> kernel_init_freeable (init/main.c:1330 (discriminator 1) init/main.c:1347 (discriminator 1) init/main.c:1366 (discriminator 1) init/main.c:1580 (discrim
> ? rdinit_setup (init/main.c:1317)
> ? rest_init (init/main.c:1461)
> kernel_init (init/main.c:1471)
> ? schedule_tail (kernel/sched/core.c:5266)
> ret_from_fork (arch/x86/kernel/process.c:153)
> ? rest_init (init/main.c:1461)
> ret_from_fork_asm (arch/x86/entry/entry_32.S:737)
> entry_INT80_32 (arch/x86/entry/entry_32.S:945)
> 
>
Guenter Roeck Oct. 21, 2024, 1:54 p.m. UTC | #4
On 10/20/24 23:57, Paolo Bonzini wrote:
> On 10/21/24 03:49, Guenter Roeck wrote:
>> Hi,
>>
>> On Sat, Jun 08, 2024 at 10:40:58AM +0200, Paolo Bonzini wrote:
>>> This is already partly implemented due to VLDMXCSR and VSTMXCSR; finish
>>> the job.
>>>
>>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>>> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
>>
>> While testing qemu v9.1, I noticed the following crash when testing qemu-system-i386
>> with pentium3 CPU.
> 
> Is this enough to fix it?
> 
Yes.

> diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
> index ee2a508ae9a..cda32ee6784 100644
> --- a/target/i386/tcg/decode-new.c.inc
> +++ b/target/i386/tcg/decode-new.c.inc
> @@ -345,9 +345,9 @@ static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry,
>           [1] = X86_OP_ENTRYw(RDxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3),
>           [2] = X86_OP_ENTRYr(WRxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3 zextT0),
>           [3] = X86_OP_ENTRYr(WRxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3 zextT0),
> -        [5] = X86_OP_ENTRY0(LFENCE,          cpuid(SSE2) p_00),
> +        [5] = X86_OP_ENTRY0(LFENCE,          cpuid(SSE) p_00),
>           [6] = X86_OP_ENTRY0(MFENCE,          cpuid(SSE2) p_00),
> -        [7] = X86_OP_ENTRY0(SFENCE,          cpuid(SSE2) p_00),
> +        [7] = X86_OP_ENTRY0(SFENCE,          cpuid(SSE) p_00),
>       };
> 
>       static const X86OpEntry group15_mem[8] = {
> 
>>    22:    39 c6                    cmp    %eax,%esi
>>    24:    0f 82 6a ff ff ff        jb     0xffffffffffffff94
>>    2a:*    0f 09                    wbinvd         <-- trapping instruction
> 
> This is a bit weird, as wbinvd is not affected by this patch.  However,
> a checkout of Linux has
> 
>          asm volatile("sfence" : :: "memory");
>          kernel_fpu_end();
> }
> 
> at the end of lib/raid6/sse1.c and it would indeed be affected by this
> patch.  SSE2 was not present in Pentium III, but SSE was.
> 

No idea how the 0x0f 0x09 ends up in the log. I wondered about that as well.

Thanks,
Guenter
diff mbox series

Patch

diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index 8465717ea21..5577f7509aa 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -108,10 +108,15 @@  typedef enum X86CPUIDFeature {
     X86_FEAT_AVX2,
     X86_FEAT_BMI1,
     X86_FEAT_BMI2,
+    X86_FEAT_CLFLUSH,
+    X86_FEAT_CLFLUSHOPT,
+    X86_FEAT_CLWB,
     X86_FEAT_CMOV,
     X86_FEAT_CMPCCXADD,
     X86_FEAT_F16C,
     X86_FEAT_FMA,
+    X86_FEAT_FSGSBASE,
+    X86_FEAT_FXSR,
     X86_FEAT_MOVBE,
     X86_FEAT_PCLMULQDQ,
     X86_FEAT_SHA_NI,
@@ -122,6 +127,8 @@  typedef enum X86CPUIDFeature {
     X86_FEAT_SSE41,
     X86_FEAT_SSE42,
     X86_FEAT_SSE4A,
+    X86_FEAT_XSAVE,
+    X86_FEAT_XSAVEOPT,
 } X86CPUIDFeature;
 
 /* Execution flags */
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 4958f4c45d5..ebae745ecba 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -4197,194 +4197,6 @@  static void disas_insn_old(DisasContext *s, CPUState *cpu, int b)
             s->base.is_jmp = DISAS_EOB_NEXT;
         }
         break;
-    /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4 support */
-    case 0x1ae:
-        modrm = x86_ldub_code(env, s);
-        switch (modrm) {
-        CASE_MODRM_MEM_OP(0): /* fxsave */
-            if (!(s->cpuid_features & CPUID_FXSR)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
-                gen_exception(s, EXCP07_PREX);
-                break;
-            }
-            gen_lea_modrm(env, s, modrm);
-            gen_helper_fxsave(tcg_env, s->A0);
-            break;
-
-        CASE_MODRM_MEM_OP(1): /* fxrstor */
-            if (!(s->cpuid_features & CPUID_FXSR)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
-                gen_exception(s, EXCP07_PREX);
-                break;
-            }
-            gen_lea_modrm(env, s, modrm);
-            gen_helper_fxrstor(tcg_env, s->A0);
-            break;
-
-        CASE_MODRM_MEM_OP(2): /* ldmxcsr */
-            if ((s->flags & HF_EM_MASK) || !(s->flags & HF_OSFXSR_MASK)) {
-                goto illegal_op;
-            }
-            if (s->flags & HF_TS_MASK) {
-                gen_exception(s, EXCP07_PREX);
-                break;
-            }
-            gen_lea_modrm(env, s, modrm);
-            tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
-            gen_helper_ldmxcsr(tcg_env, s->tmp2_i32);
-            break;
-
-        CASE_MODRM_MEM_OP(3): /* stmxcsr */
-            if ((s->flags & HF_EM_MASK) || !(s->flags & HF_OSFXSR_MASK)) {
-                goto illegal_op;
-            }
-            if (s->flags & HF_TS_MASK) {
-                gen_exception(s, EXCP07_PREX);
-                break;
-            }
-            gen_helper_update_mxcsr(tcg_env);
-            gen_lea_modrm(env, s, modrm);
-            tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, mxcsr));
-            gen_op_st_v(s, MO_32, s->T0, s->A0);
-            break;
-
-        CASE_MODRM_MEM_OP(4): /* xsave */
-            if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0
-                || (prefixes & (PREFIX_LOCK | PREFIX_DATA
-                                | PREFIX_REPZ | PREFIX_REPNZ))) {
-                goto illegal_op;
-            }
-            gen_lea_modrm(env, s, modrm);
-            tcg_gen_concat_tl_i64(s->tmp1_i64, cpu_regs[R_EAX],
-                                  cpu_regs[R_EDX]);
-            gen_helper_xsave(tcg_env, s->A0, s->tmp1_i64);
-            break;
-
-        CASE_MODRM_MEM_OP(5): /* xrstor */
-            if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0
-                || (prefixes & (PREFIX_LOCK | PREFIX_DATA
-                                | PREFIX_REPZ | PREFIX_REPNZ))) {
-                goto illegal_op;
-            }
-            gen_lea_modrm(env, s, modrm);
-            tcg_gen_concat_tl_i64(s->tmp1_i64, cpu_regs[R_EAX],
-                                  cpu_regs[R_EDX]);
-            gen_helper_xrstor(tcg_env, s->A0, s->tmp1_i64);
-            /* XRSTOR is how MPX is enabled, which changes how
-               we translate.  Thus we need to end the TB.  */
-            s->base.is_jmp = DISAS_EOB_NEXT;
-            break;
-
-        CASE_MODRM_MEM_OP(6): /* xsaveopt / clwb */
-            if (prefixes & PREFIX_LOCK) {
-                goto illegal_op;
-            }
-            if (prefixes & PREFIX_DATA) {
-                /* clwb */
-                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLWB)) {
-                    goto illegal_op;
-                }
-                gen_nop_modrm(env, s, modrm);
-            } else {
-                /* xsaveopt */
-                if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0
-                    || (s->cpuid_xsave_features & CPUID_XSAVE_XSAVEOPT) == 0
-                    || (prefixes & (PREFIX_REPZ | PREFIX_REPNZ))) {
-                    goto illegal_op;
-                }
-                gen_lea_modrm(env, s, modrm);
-                tcg_gen_concat_tl_i64(s->tmp1_i64, cpu_regs[R_EAX],
-                                      cpu_regs[R_EDX]);
-                gen_helper_xsaveopt(tcg_env, s->A0, s->tmp1_i64);
-            }
-            break;
-
-        CASE_MODRM_MEM_OP(7): /* clflush / clflushopt */
-            if (prefixes & PREFIX_LOCK) {
-                goto illegal_op;
-            }
-            if (prefixes & PREFIX_DATA) {
-                /* clflushopt */
-                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLFLUSHOPT)) {
-                    goto illegal_op;
-                }
-            } else {
-                /* clflush */
-                if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ))
-                    || !(s->cpuid_features & CPUID_CLFLUSH)) {
-                    goto illegal_op;
-                }
-            }
-            gen_nop_modrm(env, s, modrm);
-            break;
-
-        case 0xc0 ... 0xc7: /* rdfsbase (f3 0f ae /0) */
-        case 0xc8 ... 0xcf: /* rdgsbase (f3 0f ae /1) */
-        case 0xd0 ... 0xd7: /* wrfsbase (f3 0f ae /2) */
-        case 0xd8 ... 0xdf: /* wrgsbase (f3 0f ae /3) */
-            if (CODE64(s)
-                && (prefixes & PREFIX_REPZ)
-                && !(prefixes & PREFIX_LOCK)
-                && (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_FSGSBASE)) {
-                TCGv base, treg, src, dst;
-
-                /* Preserve hflags bits by testing CR4 at runtime.  */
-                tcg_gen_movi_i32(s->tmp2_i32, CR4_FSGSBASE_MASK);
-                gen_helper_cr4_testbit(tcg_env, s->tmp2_i32);
-
-                base = cpu_seg_base[modrm & 8 ? R_GS : R_FS];
-                treg = cpu_regs[(modrm & 7) | REX_B(s)];
-
-                if (modrm & 0x10) {
-                    /* wr*base */
-                    dst = base, src = treg;
-                } else {
-                    /* rd*base */
-                    dst = treg, src = base;
-                }
-
-                if (s->dflag == MO_32) {
-                    tcg_gen_ext32u_tl(dst, src);
-                } else {
-                    tcg_gen_mov_tl(dst, src);
-                }
-                break;
-            }
-            goto unknown_op;
-
-        case 0xf8 ... 0xff: /* sfence */
-            if (!(s->cpuid_features & CPUID_SSE)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            tcg_gen_mb(TCG_MO_ST_ST | TCG_BAR_SC);
-            break;
-        case 0xe8 ... 0xef: /* lfence */
-            if (!(s->cpuid_features & CPUID_SSE)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            tcg_gen_mb(TCG_MO_LD_LD | TCG_BAR_SC);
-            break;
-        case 0xf0 ... 0xf7: /* mfence */
-            if (!(s->cpuid_features & CPUID_SSE2)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
-            break;
-
-        default:
-            goto unknown_op;
-        }
-        break;
-
     case 0x1aa: /* rsm */
         gen_svm_check_intercept(s, SVM_EXIT_RSM);
         if (!(s->flags & HF_SMM_MASK))
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 4e745f10dd8..1c6fa39c3eb 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -269,20 +269,41 @@  static inline const X86OpEntry *decode_by_prefix(DisasContext *s, const X86OpEnt
 
 static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
-    /* only includes ldmxcsr and stmxcsr, because they have AVX variants.  */
     static const X86OpEntry group15_reg[8] = {
+        [0] = X86_OP_ENTRYw(RDxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3),
+        [1] = X86_OP_ENTRYw(RDxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3),
+        [2] = X86_OP_ENTRYr(WRxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3 zextT0),
+        [3] = X86_OP_ENTRYr(WRxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3 zextT0),
+        [5] = X86_OP_ENTRY0(LFENCE,          cpuid(SSE2) p_00),
+        [6] = X86_OP_ENTRY0(MFENCE,          cpuid(SSE2) p_00),
+        [7] = X86_OP_ENTRY0(SFENCE,          cpuid(SSE2) p_00),
     };
 
     static const X86OpEntry group15_mem[8] = {
-        [2] = X86_OP_ENTRYr(LDMXCSR,    E,d, vex5 chk(VEX128)),
-        [3] = X86_OP_ENTRYw(STMXCSR,    E,d, vex5 chk(VEX128)),
+        [0] = X86_OP_ENTRYw(FXSAVE,     M,y, cpuid(FXSR) p_00),
+        [1] = X86_OP_ENTRYr(FXRSTOR,    M,y, cpuid(FXSR) p_00),
+        [2] = X86_OP_ENTRYr(LDMXCSR,    E,d, vex5 chk(VEX128) p_00),
+        [3] = X86_OP_ENTRYw(STMXCSR,    E,d, vex5 chk(VEX128) p_00),
+        [4] = X86_OP_ENTRYw(XSAVE,      M,y, cpuid(XSAVE) p_00),
+        [5] = X86_OP_ENTRYr(XRSTOR,     M,y, cpuid(XSAVE) p_00),
+        [6] = X86_OP_ENTRYw(XSAVEOPT,   M,b, cpuid(XSAVEOPT) p_00),
+        [7] = X86_OP_ENTRYw(NOP,        M,b, cpuid(CLFLUSH) p_00),
+    };
+
+    static const X86OpEntry group15_mem_66[8] = {
+        [6] = X86_OP_ENTRYw(NOP,        M,b, cpuid(CLWB)),
+        [7] = X86_OP_ENTRYw(NOP,        M,b, cpuid(CLFLUSHOPT)),
     };
 
     uint8_t modrm = get_modrm(s, env);
+    int op = (modrm >> 3) & 7;
+
     if ((modrm >> 6) == 3) {
-        *entry = group15_reg[(modrm >> 3) & 7];
+        *entry = group15_reg[op];
+    } else if (s->prefix & PREFIX_DATA) {
+        *entry = group15_mem_66[op];
     } else {
-        *entry = group15_mem[(modrm >> 3) & 7];
+        *entry = group15_mem[op];
     }
 }
 
@@ -2094,6 +2115,10 @@  static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
         return true;
     case X86_FEAT_CMOV:
         return (s->cpuid_features & CPUID_CMOV);
+    case X86_FEAT_CLFLUSH:
+        return (s->cpuid_features & CPUID_CLFLUSH);
+    case X86_FEAT_FXSR:
+        return (s->cpuid_features & CPUID_FXSR);
     case X86_FEAT_F16C:
         return (s->cpuid_ext_features & CPUID_EXT_F16C);
     case X86_FEAT_FMA:
@@ -2127,6 +2152,8 @@  static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
 
     case X86_FEAT_AVX:
         return (s->cpuid_ext_features & CPUID_EXT_AVX);
+    case X86_FEAT_XSAVE:
+        return (s->cpuid_ext_features & CPUID_EXT_XSAVE);
 
     case X86_FEAT_3DNOW:
         return (s->cpuid_ext2_features & CPUID_EXT2_3DNOW);
@@ -2141,11 +2168,20 @@  static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2);
     case X86_FEAT_AVX2:
         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2);
+    case X86_FEAT_CLFLUSHOPT:
+        return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLFLUSHOPT);
+    case X86_FEAT_CLWB:
+        return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLWB);
+    case X86_FEAT_FSGSBASE:
+        return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_FSGSBASE);
     case X86_FEAT_SHA_NI:
         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_SHA_NI);
 
     case X86_FEAT_CMPCCXADD:
         return (s->cpuid_7_1_eax_features & CPUID_7_1_EAX_CMPCCXADD);
+
+    case X86_FEAT_XSAVEOPT:
+        return (s->cpuid_xsave_features & CPUID_XSAVE_XSAVEOPT);
     }
     g_assert_not_reached();
 }
@@ -2480,7 +2516,7 @@  static void disas_insn(DisasContext *s, CPUState *cpu)
             case 0x1a ... 0x1b: /* MPX */
             case 0x30 ... 0x35: /* more privileged instructions */
             case 0xa2 ... 0xa5: /* CPUID, BT, SHLD */
-            case 0xaa ... 0xae: /* RSM, SHRD, grp15 */
+            case 0xaa ... 0xad: /* RSM, SHRD */
             case 0xb0 ... 0xb1: /* cmpxchg */
             case 0xb3:          /* btr */
             case 0xb8:          /* integer ops */
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index bcb6bccbd75..5ca3764e006 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1653,6 +1653,22 @@  static void gen_EXTRQ_r(DisasContext *s, X86DecodedInsn *decode)
     gen_helper_extrq_r(tcg_env, OP_PTR0, OP_PTR2);
 }
 
+static void gen_FXRSTOR(DisasContext *s, X86DecodedInsn *decode)
+{
+    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
+        gen_NM_exception(s);
+    }
+    gen_helper_fxrstor(tcg_env, s->A0);
+}
+
+static void gen_FXSAVE(DisasContext *s, X86DecodedInsn *decode)
+{
+    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
+        gen_NM_exception(s);
+    }
+    gen_helper_fxsave(tcg_env, s->A0);
+}
+
 static void gen_HLT(DisasContext *s, X86DecodedInsn *decode)
 {
 #ifdef CONFIG_SYSTEM_ONLY
@@ -2000,6 +2016,11 @@  static void gen_LES(DisasContext *s, X86DecodedInsn *decode)
     gen_lxx_seg(s, decode, R_ES);
 }
 
+static void gen_LFENCE(DisasContext *s, X86DecodedInsn *decode)
+{
+    tcg_gen_mb(TCG_MO_LD_LD | TCG_BAR_SC);
+}
+
 static void gen_LFS(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_lxx_seg(s, decode, R_FS);
@@ -2059,6 +2080,11 @@  static void gen_LSS(DisasContext *s, X86DecodedInsn *decode)
     gen_lxx_seg(s, decode, R_SS);
 }
 
+static void gen_MFENCE(DisasContext *s, X86DecodedInsn *decode)
+{
+    tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+}
+
 static void gen_MOV(DisasContext *s, X86DecodedInsn *decode)
 {
     /* nothing to do! */
@@ -3092,6 +3118,15 @@  static void gen_RCR(DisasContext *s, X86DecodedInsn *decode)
     }
 }
 
+static void gen_RDxxBASE(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv base = cpu_seg_base[s->modrm & 8 ? R_GS : R_FS];
+
+    /* Preserve hflags bits by testing CR4 at runtime.  */
+    gen_helper_cr4_testbit(tcg_env, tcg_constant_i32(CR4_FSGSBASE_MASK));
+    tcg_gen_mov_tl(s->T0, base);
+}
+
 static void gen_RET(DisasContext *s, X86DecodedInsn *decode)
 {
     int16_t adjust = decode->e.op1 == X86_TYPE_I ? decode->immediate : 0;
@@ -3372,6 +3407,11 @@  static void gen_SETcc(DisasContext *s, X86DecodedInsn *decode)
     gen_setcc1(s, decode->b & 0xf, s->T0);
 }
 
+static void gen_SFENCE(DisasContext *s, X86DecodedInsn *decode)
+{
+    tcg_gen_mb(TCG_MO_ST_ST | TCG_BAR_SC);
+}
+
 static void gen_SHA1NEXTE(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_sha1nexte(OP_PTR0, OP_PTR1, OP_PTR2);
@@ -4042,6 +4082,15 @@  static void gen_WAIT(DisasContext *s, X86DecodedInsn *decode)
     }
 }
 
+static void gen_WRxxBASE(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv base = cpu_seg_base[s->modrm & 8 ? R_GS : R_FS];
+
+    /* Preserve hflags bits by testing CR4 at runtime.  */
+    gen_helper_cr4_testbit(tcg_env, tcg_constant_i32(CR4_FSGSBASE_MASK));
+    tcg_gen_mov_tl(base, s->T0);
+}
+
 static void gen_XCHG(DisasContext *s, X86DecodedInsn *decode)
 {
     if (s->prefix & PREFIX_LOCK) {
@@ -4084,3 +4133,34 @@  static void gen_XOR(DisasContext *s, X86DecodedInsn *decode)
         prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
     }
 }
+
+static void gen_XRSTOR(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv_i64 features = tcg_temp_new_i64();
+
+    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    gen_helper_xrstor(tcg_env, s->A0, features);
+    if (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_MPX) {
+        /*
+         * XRSTOR is how MPX is enabled, which changes how
+         * we translate.  Thus we need to end the TB.
+         */
+        s->base.is_jmp = DISAS_EOB_NEXT;
+    }
+}
+
+static void gen_XSAVE(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv_i64 features = tcg_temp_new_i64();
+
+    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    gen_helper_xsave(tcg_env, s->A0, features);
+}
+
+static void gen_XSAVEOPT(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv_i64 features = tcg_temp_new_i64();
+
+    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    gen_helper_xsave(tcg_env, s->A0, features);
+}