diff mbox series

[for-9.1,09/19] target/i386: move 60-BF opcodes to new decoder

Message ID 20240409164323.776660-10-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show
Series target/i386: convert 1-byte opcodes to new decoder | expand

Commit Message

Paolo Bonzini April 9, 2024, 4:43 p.m. UTC
Compared to the old decoder, the main differences in translation
are for the little-used ARPL instruction.  IMUL is adjusted a bit
to share more code to produce flags, but is otherwise very similar.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/decode-new.h     |   2 +
 target/i386/tcg/translate.c      |   9 +-
 target/i386/tcg/decode-new.c.inc | 171 +++++++++++++++++
 target/i386/tcg/emit.c.inc       | 317 +++++++++++++++++++++++++++++++
 4 files changed, 497 insertions(+), 2 deletions(-)

Comments

Richard Henderson April 11, 2024, 4:12 a.m. UTC | #1
On 4/9/24 06:43, Paolo Bonzini wrote:
> +static void gen_ARPL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
> +{
> +    TCGLabel *label1 = gen_new_label();
> +    TCGv rpl_adj = tcg_temp_new();
> +    TCGv flags = tcg_temp_new();
> +
> +    gen_mov_eflags(s, flags);
> +    tcg_gen_andi_tl(flags, flags, ~CC_Z);
> +
> +    /* Compute dest[rpl] - src[rpl], adjust if result <0.  */
> +    tcg_gen_andi_tl(rpl_adj, s->T0, 3);
> +    tcg_gen_andi_tl(s->T1, s->T1, 3);
> +    tcg_gen_sub_tl(rpl_adj, rpl_adj, s->T1);
> +
> +    tcg_gen_brcondi_tl(TCG_COND_LT, rpl_adj, 0, label1);

Comment is right, but branch condition is wrong.

I think this might be better as:

     /* SRC = DST with SRC[RPL] */
     tcg_gen_deposit_tl(s->T1, s->T0, s->T1, 0, 2);
     /* Z flag set if DST < SRC */
     tcg_gen_setcond_tl(TCG_COND_LTU, tmp, s->T0, s->T1);
     /* Install Z */
     tcg_gen_deposit_tl(flags, flags, tmp, ctz(CC_Z), 1);
     /* DST with maximum RPL */
     tcg_gen_umax_tl(s->T0, s->T0, s->T1);


> +    case MO_32:
> +#ifdef TARGET_X86_64
> +        /*
> +         * This could also use the same algorithm as MO_16.  It produces fewer
> +         * TCG ops and better code if flags are needed, but it requires a 64-bit
> +         * multiply even if they are not (and thus the high part of the multiply
> +         * is dead).
> +         */

Is 64-bit multiply ever slower these days?
My intuition says "slow" multiply is at least a decade out of date.

> +        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
> +        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);

Avoid s->tmp*, especially in new code.

> +        tcg_gen_muls2_i32(s->tmp2_i32, s->tmp3_i32,
> +                          s->tmp2_i32, s->tmp3_i32);
> +        tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
> +
> +        cc_src_rhs = tcg_temp_new();
> +        tcg_gen_extu_i32_tl(cc_src_rhs, s->tmp3_i32);
> +        /* Compare the high part to the sign bit of the truncated result */
> +        tcg_gen_negsetcondi_i32(TCG_COND_LT, s->tmp2_i32, s->tmp2_i32, 0);

This seems like something the optimizer should handle, but doesn't.
I'd write this as

     tcg_gen_sari_i32(tmp, tmp, 31);
or
     tcg_gen_sextract_i32(tmp, tmp, 31, 1);

which I know will expand to the same thing.

> +    case MO_64:
> +#endif
> +        cc_src_rhs = tcg_temp_new();
> +        tcg_gen_muls2_tl(s->T0, cc_src_rhs, s->T0, s->T1);
> +        /* Compare the high part to the sign bit of the truncated result */
> +        tcg_gen_negsetcondi_tl(TCG_COND_LT, s->T1, s->T0, 0);

Similarly.


r~
Paolo Bonzini April 11, 2024, 11:18 a.m. UTC | #2
On Thu, Apr 11, 2024 at 9:47 AM Richard Henderson
<richard.henderson@linaro.org> wrote:
> > +    case MO_32:
> > +#ifdef TARGET_X86_64
> > +        /*
> > +         * This could also use the same algorithm as MO_16.  It produces fewer
> > +         * TCG ops and better code if flags are needed, but it requires a 64-bit
> > +         * multiply even if they are not (and thus the high part of the multiply
> > +         * is dead).
> > +         */
>
> Is 64-bit multiply ever slower these days?
> My intuition says "slow" multiply is at least a decade out of date.

I was thinking more about TCG_TARGET_REG_BITS == 32.

> > +        tcg_gen_negsetcondi_i32(TCG_COND_LT, s->tmp2_i32, s->tmp2_i32, 0);
>
> This seems like something the optimizer should handle, but doesn't.

I wanted to avoid using TARGET_LONG_BITS - 1, but it's not a problem
to use extract. I've changed it.

At least the ppc and x86 backends do support it and convert it to SAR,
so I didn't notice in my test that it was the backend doing it and not
the optimizer!

Paolo
Zhao Liu April 11, 2024, 2:31 p.m. UTC | #3
Hi Paolo,

I just did some tests,

> +    [0x98] = X86_OP_ENTRY1(CBW,    0,v), /* rAX */
> +    [0x99] = X86_OP_ENTRY3(CWD,    2,v, 0,v, None, None), /* rDX, rAX */
> +    [0x9A] = X86_OP_ENTRYrr(CALLF, I_unsigned,p, I_unsigned,w, chk(i64)),

X86_TYPE_I_unsigned is defined in patch 11, so the related changes
should be move into this patch to avoid compiling failures:

--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -1642,6 +1642,11 @@ static bool decode_op(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
         decode->immediate = op->imm = insn_get_signed(env, s, op->ot);
         break;

+    case X86_TYPE_I_unsigned:  /* Immediate */
+        op->unit = X86_OP_IMM;
+        decode->immediate = op->imm = insn_get(env, s, op->ot);
+        break;
+
     case X86_TYPE_L:  /* The upper 4 bits of the immediate select a 128-bit register */
         op->n = insn_get(env, s, op->ot) >> 4;
         break;
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index ca99a620ce94..790ad5e1d006 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -48,6 +48,7 @@ typedef enum X86OpType {

     /* Custom */
     X86_TYPE_WM, /* modrm byte selects an XMM/YMM memory operand */
+    X86_TYPE_I_unsigned, /* Immediate, zero-extended */
     X86_TYPE_2op, /* 2-operand RMW instruction */
     X86_TYPE_LoBits, /* encoded in bits 0-2 of the operand + REX.B */
     X86_TYPE_0, /* Hard-coded GPRs (RAX..RDI) */

-Zhao
Zhao Liu April 11, 2024, 3:19 p.m. UTC | #4
On Tue, Apr 09, 2024 at 06:43:13PM +0200, Paolo Bonzini wrote:
> Date: Tue,  9 Apr 2024 18:43:13 +0200
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH for-9.1 09/19] target/i386: move 60-BF opcodes to new
>  decoder
> X-Mailer: git-send-email 2.44.0
> 
> Compared to the old decoder, the main differences in translation
> are for the little-used ARPL instruction.  IMUL is adjusted a bit
> to share more code to produce flags, but is otherwise very similar.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  target/i386/tcg/decode-new.h     |   2 +
>  target/i386/tcg/translate.c      |   9 +-
>  target/i386/tcg/decode-new.c.inc | 171 +++++++++++++++++
>  target/i386/tcg/emit.c.inc       | 317 +++++++++++++++++++++++++++++++
>  4 files changed, 497 insertions(+), 2 deletions(-)

HMM, I met Guest boot failure on this patch because of ata unrecognized.
I haven't located the exact error yet, so let me post my log first.
If there are other means I can use to dig further, I'd be happy to try
that too.

# Command (boot a ubuntu Guest via TCG)

./qemu/build/qemu-system-x86_64 \
-smp 1 \
-name ubuntu -m 4G \
-cpu max -accel tcg \
-hda ../img_qemu/test.qcow2 -nographic \
-kernel ../img_qemu/kernel/vmlinuz-6.4.0-rc6+ \
-initrd ../img_qemu/kernel/initrd.img-6.4.0-rc6+ \
-append "root=/dev/sda ro console=ttyS0" \
-qmp unix:/tmp/qmp-sock,server=on,wait=off

# Guest log

...
[    1.567002] scsi host0: ata_piix
[    1.570945] scsi host1: ata_piix
[    1.571446] ata1: PATA max MWDMA2 cmd 0x1f0 ctl 0x3f6 bmdma 0xc040 irq 14
[    1.571592] ata2: PATA max MWDMA2 cmd 0x170 ctl 0x376 bmdma 0xc048 irq 15
[    1.577016] tun: Universal TUN/TAP device driver, 1.6
[    1.579437] PPP generic driver version 2.4.2
[    1.583207] VFIO - User Level meta-driver version: 0.3
[    1.585787] i8042: PNP: PS/2 Controller [PNP0303:KBD,PNP0f13:MOU] at 0x60,0x64 irq 1,12
[    1.593520] serio: i8042 KBD port at 0x60,0x64 irq 1
[    1.593847] serio: i8042 AUX port at 0x60,0x64 irq 12
[    1.599632] mousedev: PS/2 mouse device common for all mice
[    1.602405] input: AT Translated Set 2 keyboard as /devices/platform/i8042/serio0/input/input1
[    1.604135] rtc_cmos 00:05: RTC can wake from S4
[    1.611641] rtc_cmos 00:05: registered as rtc0
[    1.612271] rtc_cmos 00:05: setting system clock to 2024-04-11T14:59:56 UTC (1712847596)
[    1.613380] rtc_cmos 00:05: alarms up to one day, y3k, 242 bytes nvram, hpet irqs
[    1.613669] i2c_dev: i2c /dev entries driver
[    1.614302] device-mapper: core: CONFIG_IMA_DISABLE_HTABLE is disabled. Duplicate IMA measurements will not be recorded in the IMA log.
[    1.614687] device-mapper: uevent: version 1.0.3
[    1.619437] device-mapper: ioctl: 4.48.0-ioctl (2023-03-01) initialised: dm-devel@redhat.com
[    1.620134] platform eisa.0: Probing EISA bus 0
[    1.620388] platform eisa.0: EISA: Cannot allocate resource for mainboard
[    1.620608] platform eisa.0: Cannot allocate resource for EISA slot 1
[    1.620749] platform eisa.0: Cannot allocate resource for EISA slot 2
[    1.621045] platform eisa.0: Cannot allocate resource for EISA slot 3
[    1.621178] platform eisa.0: Cannot allocate resource for EISA slot 4
[    1.621291] platform eisa.0: Cannot allocate resource for EISA slot 5
[    1.621400] platform eisa.0: Cannot allocate resource for EISA slot 6
[    1.621510] platform eisa.0: Cannot allocate resource for EISA slot 7
[    1.621621] platform eisa.0: Cannot allocate resource for EISA slot 8
[    1.621745] platform eisa.0: EISA: Detected 0 cards
[    1.621873] amd_pstate: driver load is disabled, boot with specific mode to enable this
[    1.622309] ledtrig-cpu: registered to indicate activity on CPUs
[    1.628983] drop_monitor: Initializing network drop monitor service
[    1.735074] ata1: found unknown device (class 0)
[    1.748551] ata2: found unknown device (class 0)
[    1.763380] NET: Registered PF_INET6 protocol family
[    1.771260] ata1.00: failed to IDENTIFY (I/O error, err_mask=0x2)
[    2.330777] tsc: Refined TSC clocksource calibration: 2903.979 MHz
[    2.331349] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x29dbf1ca37e, max_idle_ns: 440795233019 ns
[    2.332227] clocksource: Switched to clocksource tsc
[    5.375200] Freeing initrd memory: 72840K
[    5.526057] Segment Routing with IPv6
[    5.526392] In-situ OAM (IOAM) with IPv6
[    5.527164] NET: Registered PF_PACKET protocol family
[    5.527709] Key type dns_resolver registered
[    5.530256] IPI shorthand broadcast: enabled
[    5.545746] sched_clock: Marking stable (5512061965, 30528485)->(5548999115, -6408665)
[    5.548071] registered taskstats version 1
[    5.550142] Loading compiled-in X.509 certificates
[    5.563141] Key type .fscrypt registered
[    5.563275] Key type fscrypt-provisioning registered
[    5.594426] Key type encrypted registered
[    5.594760] AppArmor: AppArmor sha1 policy hashing enabled
[    5.595336] ima: No TPM chip found, activating TPM-bypass!
[    5.595621] Loading compiled-in module X.509 certificates
[    5.595865] ima: Allocated hash algorithm: sha1
[    5.600207] ima: No architecture policies found
[    5.601022] evm: Initialising EVM extended attributes:
[    5.601215] evm: security.selinux
[    5.601329] evm: security.SMACK64
[    5.601408] evm: security.SMACK64EXEC
[    5.601496] evm: security.SMACK64TRANSMUTE
[    5.601608] evm: security.SMACK64MMAP
[    5.601737] evm: security.apparmor
[    5.601854] evm: security.ima
[    5.601927] evm: security.capability
[    5.602031] evm: HMAC attrs: 0x1
[    5.604746] PM:   Magic number: 4:740:990
[    5.605067] memory memory33: hash matches
[    5.606782] RAS: Correctable Errors collector initialized.
[    5.608451] clk: Disabling unused clocks
[    6.885887] ata2.00: failed to IDENTIFY (I/O error, err_mask=0x2)
[    7.035230] ata1: found unknown device (class 0)
[    7.045928] ata1.00: failed to IDENTIFY (I/O error, err_mask=0x2)
[Thread 0x7ffff4c40640 (LWP 695720) exited]
[   12.251691] ata2: found unknown device (class 0)
[   12.262338] ata2.00: failed to IDENTIFY (I/O error, err_mask=0x2)
[   17.371176] ata1: found unknown device (class 0)
[   17.381770] ata1.00: failed to IDENTIFY (I/O error, err_mask=0x2)
[   22.491447] ata2: found unknown device (class 0)
[   22.502088] ata2.00: failed to IDENTIFY (I/O error, err_mask=0x2)
[   27.611394] ata1: found unknown device (class 0)
[   27.771117] ata2: found unknown device (class 0)
[   27.794539] Freeing unused decrypted memory: 2036K
[   27.855183] Freeing unused kernel image (initmem) memory: 4632K
[   27.855513] Write protecting the kernel read-only data: 34816k
[   27.857829] Freeing unused kernel image (rodata/data gap) memory: 1960K
[   27.985357] x86/mm: Checked W+X mappings: passed, no W+X pages found.
[   27.985661] Run /init as init process
Loading, please wait...
Starting version 249.11-0ubuntu3.7
[   29.631358] Floppy drive(s): fd0 is 2.88M AMI BIOS
[   29.652292] FDC 0 is a S82078B
[   29.662353] piix4_smbus 0000:00:01.3: SMBus Host Controller at 0x700, revision 0
[   29.778444] e1000: Intel(R) PRO/1000 Network Driver
[   29.778562] e1000: Copyright (c) 1999-2006 Intel Corporation.
[   30.673420] ACPI: \_SB_.LNKC: Enabled at IRQ 11
[   30.872005] input: ImExPS/2 Generic Explorer Mouse as /devices/platform/i8042/serio1/input/input3
[   31.020279] e1000 0000:00:03.0 eth0: (PCI:33MHz:32-bit) 52:54:00:12:34:56
[   31.020856] e1000 0000:00:03.0 eth0: Intel(R) PRO/1000 Network Connection
[   31.052677] e1000 0000:00:03.0 ens3: renamed from eth0
Begin: Loading essential drivers ... done.
Begin: Running /scripts/init-premount ... done.
Begin: Mounting root file system ... Begin: Running /scripts/local-top ... done.
Begin: Running /scripts/local-premount ... done.
Begin: Waiting for root file system ... Begin: Running /scripts/local-block ... done.
done.
Gave up waiting for root file system device.  Common problems:
 - Boot args (cat /proc/cmdline)
   - Check rootdelay= (did the system wait long enough?)
 - Missing modules (cat /proc/modules; ls /dev)
ALERT!  /dev/sda does not exist.  Dropping to a shell!

Regards,
Zhao
Paolo Bonzini April 11, 2024, 4:43 p.m. UTC | #5
On Thu, Apr 11, 2024 at 5:05 PM Zhao Liu <zhao1.liu@intel.com> wrote:
>
> On Tue, Apr 09, 2024 at 06:43:13PM +0200, Paolo Bonzini wrote:
> > Date: Tue,  9 Apr 2024 18:43:13 +0200
> > From: Paolo Bonzini <pbonzini@redhat.com>
> > Subject: [PATCH for-9.1 09/19] target/i386: move 60-BF opcodes to new
> >  decoder
> > X-Mailer: git-send-email 2.44.0
> >
> > Compared to the old decoder, the main differences in translation
> > are for the little-used ARPL instruction.  IMUL is adjusted a bit
> > to share more code to produce flags, but is otherwise very similar.
> >
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >  target/i386/tcg/decode-new.h     |   2 +
> >  target/i386/tcg/translate.c      |   9 +-
> >  target/i386/tcg/decode-new.c.inc | 171 +++++++++++++++++
> >  target/i386/tcg/emit.c.inc       | 317 +++++++++++++++++++++++++++++++
> >  4 files changed, 497 insertions(+), 2 deletions(-)
>
> HMM, I met Guest boot failure on this patch because of ata unrecognized.
> I haven't located the exact error yet, so let me post my log first.
> If there are other means I can use to dig further, I'd be happy to try
> that too.
>
> # Command (boot a ubuntu Guest via TCG)
>
> ./qemu/build/qemu-system-x86_64 \
> -smp 1 \
> -name ubuntu -m 4G \
> -cpu max -accel tcg \
> -hda ../img_qemu/test.qcow2 -nographic \
> -kernel ../img_qemu/kernel/vmlinuz-6.4.0-rc6+ \
> -initrd ../img_qemu/kernel/initrd.img-6.4.0-rc6+ \
> -append "root=/dev/sda ro console=ttyS0" \
> -qmp unix:/tmp/qmp-sock,server=on,wait=off

I did run a bunch of boot tests but I'll check this one too.

Thanks!

Paolo
Paolo Bonzini April 24, 2024, 11:13 a.m. UTC | #6
On Thu, Apr 11, 2024 at 5:05 PM Zhao Liu <zhao1.liu@intel.com> wrote:
> HMM, I met Guest boot failure on this patch because of ata unrecognized.
> I haven't located the exact error yet, so let me post my log first.
> If there are other means I can use to dig further, I'd be happy to try
> that too.
>
> # Command (boot a ubuntu Guest via TCG)
>
> ./qemu/build/qemu-system-x86_64 \
> -smp 1 \
> -name ubuntu -m 4G \
> -cpu max -accel tcg \
> -hda ../img_qemu/test.qcow2 -nographic \
> -kernel ../img_qemu/kernel/vmlinuz-6.4.0-rc6+ \
> -initrd ../img_qemu/kernel/initrd.img-6.4.0-rc6+ \
> -append "root=/dev/sda ro console=ttyS0" \
> -qmp unix:/tmp/qmp-sock,server=on,wait=off

The issue is that INS and OUTS are using the incorrect operand size.

While at it I also made OUTS a bit more similar to OUT:

diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 0951b042dfa..46682cfe070 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -1544,8 +1544,8 @@ static const X86OpEntry opcodes_root[256] = {
     [0x6B] = X86_OP_ENTRY3(IMUL3, G,v, E,v, I,b, sextT0),
     [0x6C] = X86_OP_ENTRYrr(INS, Y,b, 2,w), /* DX */
     [0x6D] = X86_OP_ENTRYrr(INS, Y,z, 2,w), /* DX */
-    [0x6E] = X86_OP_ENTRYrr(OUTS, 2,w, X,b), /* DX */
-    [0x6F] = X86_OP_ENTRYrr(OUTS, 2,w, X,b), /* DX */
+    [0x6E] = X86_OP_ENTRYrr(OUTS, X,b, 2,w), /* DX */
+    [0x6F] = X86_OP_ENTRYrr(OUTS, X,z, 2,w), /* DX */

     [0x78] = X86_OP_ENTRYr(Jcc, J,b),
     [0x79] = X86_OP_ENTRYr(Jcc, J,b),
@@ -1592,7 +1592,7 @@ static void gen_INC(

 static void gen_INS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
-    MemOp ot = decode->op[0].ot;
+    MemOp ot = decode->op[1].ot;
     TCGv_i32 port = tcg_temp_new_i32();

     tcg_gen_trunc_tl_i32(port, s->T1);
@@ -2310,10 +2310,10 @@ static void gen_OUT(

 static void gen_OUTS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
-    MemOp ot = decode->op[2].ot;
+    MemOp ot = decode->op[1].ot;
     TCGv_i32 port = tcg_temp_new_i32();

-    tcg_gen_trunc_tl_i32(port, s->T0);
+    tcg_gen_trunc_tl_i32(port, s->T1);
     tcg_gen_ext16u_i32(port, port);
     if (!gen_check_io(s, ot, port, SVM_IOIO_STR_MASK)) {
         return;


(sorry about any word breaking)
Zhao Liu April 25, 2024, 3:29 p.m. UTC | #7
On Wed, Apr 24, 2024 at 01:13:01PM +0200, Paolo Bonzini wrote:
> Date: Wed, 24 Apr 2024 13:13:01 +0200
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: Re: [PATCH for-9.1 09/19] target/i386: move 60-BF opcodes to new
>  decoder
> 
> On Thu, Apr 11, 2024 at 5:05 PM Zhao Liu <zhao1.liu@intel.com> wrote:
> > HMM, I met Guest boot failure on this patch because of ata unrecognized.
> > I haven't located the exact error yet, so let me post my log first.
> > If there are other means I can use to dig further, I'd be happy to try
> > that too.
> >
> > # Command (boot a ubuntu Guest via TCG)
> >
> > ./qemu/build/qemu-system-x86_64 \
> > -smp 1 \
> > -name ubuntu -m 4G \
> > -cpu max -accel tcg \
> > -hda ../img_qemu/test.qcow2 -nographic \
> > -kernel ../img_qemu/kernel/vmlinuz-6.4.0-rc6+ \
> > -initrd ../img_qemu/kernel/initrd.img-6.4.0-rc6+ \
> > -append "root=/dev/sda ro console=ttyS0" \
> > -qmp unix:/tmp/qmp-sock,server=on,wait=off
> 
> The issue is that INS and OUTS are using the incorrect operand size.
> 
> While at it I also made OUTS a bit more similar to OUT:
> 
> diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
> index 0951b042dfa..46682cfe070 100644
> --- a/target/i386/tcg/decode-new.c.inc
> +++ b/target/i386/tcg/decode-new.c.inc
> @@ -1544,8 +1544,8 @@ static const X86OpEntry opcodes_root[256] = {
>      [0x6B] = X86_OP_ENTRY3(IMUL3, G,v, E,v, I,b, sextT0),
>      [0x6C] = X86_OP_ENTRYrr(INS, Y,b, 2,w), /* DX */
>      [0x6D] = X86_OP_ENTRYrr(INS, Y,z, 2,w), /* DX */
> -    [0x6E] = X86_OP_ENTRYrr(OUTS, 2,w, X,b), /* DX */
> -    [0x6F] = X86_OP_ENTRYrr(OUTS, 2,w, X,b), /* DX */
> +    [0x6E] = X86_OP_ENTRYrr(OUTS, X,b, 2,w), /* DX */
> +    [0x6F] = X86_OP_ENTRYrr(OUTS, X,z, 2,w), /* DX */
> 
>      [0x78] = X86_OP_ENTRYr(Jcc, J,b),
>      [0x79] = X86_OP_ENTRYr(Jcc, J,b),
> @@ -1592,7 +1592,7 @@ static void gen_INC(
> 
>  static void gen_INS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
>  {
> -    MemOp ot = decode->op[0].ot;
> +    MemOp ot = decode->op[1].ot;
>      TCGv_i32 port = tcg_temp_new_i32();
> 
>      tcg_gen_trunc_tl_i32(port, s->T1);
> @@ -2310,10 +2310,10 @@ static void gen_OUT(
> 
>  static void gen_OUTS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
>  {
> -    MemOp ot = decode->op[2].ot;
> +    MemOp ot = decode->op[1].ot;
>      TCGv_i32 port = tcg_temp_new_i32();
> 
> -    tcg_gen_trunc_tl_i32(port, s->T0);
> +    tcg_gen_trunc_tl_i32(port, s->T1);
>      tcg_gen_ext16u_i32(port, port);
>      if (!gen_check_io(s, ot, port, SVM_IOIO_STR_MASK)) {
>          return;
> 
> 
> (sorry about any word breaking)

Thanks! With the above fix, now I re-test on each patch, and all patches
could boot Guest properly! If there is a v2, I can continue to test it.

Regards,
Zhao
diff mbox series

Patch

diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index 8ffde8d1cd6..ca99a620ce9 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -165,6 +165,8 @@  typedef enum X86InsnSpecial {
     /* Always locked if it has a memory operand (XCHG) */
     X86_SPECIAL_Locked,
 
+    /* Do not apply segment base to effective address */
+    X86_SPECIAL_NoSeg,
     /*
      * Rd/Mb or Rd/Mw in the manual: register operand 0 is treated as 32 bits
      * (and writeback zero-extends it to 64 bits if applicable).  PREFIX_DATA
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index c251fa21e6d..de1ccb6ea7f 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -1296,7 +1296,11 @@  static void gen_cmps(DisasContext *s, MemOp ot)
     gen_string_movl_A0_EDI(s);
     gen_op_ld_v(s, ot, s->T1, s->A0);
     gen_string_movl_A0_ESI(s);
-    gen_op(s, OP_CMPL, ot, OR_TMP0);
+    gen_op_ld_v(s, ot, s->T0, s->A0);
+    tcg_gen_mov_tl(cpu_cc_src, s->T1);
+    tcg_gen_mov_tl(s->cc_srcT, s->T0);
+    tcg_gen_sub_tl(cpu_cc_dst, s->T0, s->T1);
+    set_cc_op(s, CC_OP_SUBB + ot);
 
     dshift = gen_compute_Dshift(s, ot);
     gen_op_add_reg(s, s->aflag, R_ESI, dshift);
@@ -3124,6 +3128,7 @@  static bool disas_insn(DisasContext *s, CPUState *cpu)
 
     s->pc = s->base.pc_next;
     s->override = -1;
+    s->popl_esp_hack = 0;
 #ifdef TARGET_X86_64
     s->rex_r = 0;
     s->rex_x = 0;
@@ -3181,7 +3186,7 @@  static bool disas_insn(DisasContext *s, CPUState *cpu)
 #ifndef CONFIG_USER_ONLY
         use_new &= b <= limit;
 #endif
-        if (use_new && b <= 0x5f) {
+        if (use_new && b <= 0xbf) {
             disas_insn_new(s, cpu, b);
             return true;
         }
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index c6fd7a053bd..f6d6873dd83 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -33,6 +33,13 @@ 
  * ("cannot encode 16-bit or 32-bit size in 64-bit mode") as modifiers of the
  * "v" or "z" sizes.  The decoder simply makes them separate operand sizes.
  *
+ * The manual lists immediate far destinations as Ap (technically an implicit
+ * argument).  The decoder splits them into two immediates, using "Ip" for
+ * the offset part (that comes first in the instruction stream) and "Iw" for
+ * the segment/selector part.  The size of the offset is given by s->dflag
+ * and the instructions are illegal in 64-bit mode, so the choice of "Ip"
+ * is somewhat arbitrary; "Iv" or "Iz" would work just as well.
+ *
  * Vector operands
  * ---------------
  *
@@ -151,6 +158,8 @@ 
  */
 #define X86_OP_ENTRYrr(op, op0, s0, op1, s1, ...)                 \
     X86_OP_ENTRY3(op, None, None, op0, s0, op1, s1, ## __VA_ARGS__)
+#define X86_OP_ENTRYwr(op, op0, s0, op1, s1, ...)                 \
+    X86_OP_ENTRY3(op, op0, s0, None, None, op1, s1, ## __VA_ARGS__)
 #define X86_OP_ENTRY2(op, op0, s0, op1, s1, ...)                  \
     X86_OP_ENTRY3(op, op0, s0, 2op, s0, op1, s1, ## __VA_ARGS__)
 #define X86_OP_ENTRYw(op, op0, s0, ...)                           \
@@ -163,6 +172,7 @@ 
     X86_OP_ENTRY3(op, None, None, None, None, None, None, ## __VA_ARGS__)
 
 #define cpuid(feat) .cpuid = X86_FEAT_##feat,
+#define noseg .special = X86_SPECIAL_NoSeg,
 #define xchg .special = X86_SPECIAL_Locked,
 #define lock .special = X86_SPECIAL_HasLock,
 #define mmx .special = X86_SPECIAL_MMX,
@@ -209,6 +219,8 @@ 
 #define p_66_f3_f2    .valid_prefix = P_66 | P_F3 | P_F2,
 #define p_00_66_f3_f2 .valid_prefix = P_00 | P_66 | P_F3 | P_F2,
 
+#define UNKNOWN_OPCODE ((X86OpEntry) {})
+
 static uint8_t get_modrm(DisasContext *s, CPUX86State *env)
 {
     if (!s->has_modrm) {
@@ -1108,6 +1120,51 @@  static void decode_0F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint
     do_decode_0F(s, env, entry, b);
 }
 
+static void decode_63(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry arpl = X86_OP_ENTRY2(ARPL, E,w, G,w, chk(prot));
+    static const X86OpEntry mov = X86_OP_ENTRY3(MOV, G,v, E,v, None, None);
+    static const X86OpEntry movsxd = X86_OP_ENTRY3(MOV, G,v, E,d, None, None, sextT0);
+    if (!CODE64(s)) {
+        *entry = arpl;
+    } else if (REX_W(s)) {
+        *entry = movsxd;
+    } else {
+        *entry = mov;
+    }
+}
+
+static void decode_group1(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86GenFunc group1_gen[8] = {
+        gen_ADD, gen_OR, gen_ADC, gen_SBB, gen_AND, gen_SUB, gen_XOR, gen_SUB,
+    };
+    int op = (get_modrm(s, env) >> 3) & 7;
+    entry->gen = group1_gen[op];
+
+    if (op == 7) {
+        /* prevent writeback for CMP */
+        entry->op1 = entry->op0;
+        entry->op0 = X86_TYPE_None;
+        entry->s0 = X86_SIZE_None;
+    } else {
+        entry->special = X86_SPECIAL_HasLock;
+    }
+}
+
+static void decode_group1A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    int op = (get_modrm(s, env) >> 3) & 7;
+    if (op != 0) {
+        /* could be XOP prefix too */
+        *entry = UNKNOWN_OPCODE;
+    } else {
+        entry->gen = gen_POP;
+        /* The address must use the value of ESP after the pop.  */
+        s->popl_esp_hack = 1 << mo_pushpop(s, s->dflag);
+    }
+}
+
 static const X86OpEntry opcodes_root[256] = {
     [0x00] = X86_OP_ENTRY2(ADD, E,b, G,b, lock),
     [0x01] = X86_OP_ENTRY2(ADD, E,v, G,v, lock),
@@ -1163,6 +1220,60 @@  static const X86OpEntry opcodes_root[256] = {
     [0x56] = X86_OP_ENTRYr(PUSH, LoBits,d64),
     [0x57] = X86_OP_ENTRYr(PUSH, LoBits,d64),
 
+    [0x60] = X86_OP_ENTRY0(PUSHA, chk(i64)),
+    [0x61] = X86_OP_ENTRY0(POPA, chk(i64)),
+    [0x62] = X86_OP_ENTRYrr(BOUND, G,v, M,a, chk(i64)),
+    [0x63] = X86_OP_GROUP0(63),
+    [0x64] = {},
+    [0x65] = {},
+    [0x66] = {},
+    [0x67] = {},
+
+    [0x70] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x71] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x72] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x73] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x74] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x75] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x76] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x77] = X86_OP_ENTRYr(Jcc, J,b),
+
+    [0x80] = X86_OP_GROUP2(group1, E,b, I,b),
+    [0x81] = X86_OP_GROUP2(group1, E,v, I,z),
+    [0x82] = X86_OP_GROUP2(group1, E,b, I,b, chk(i64)),
+    [0x83] = X86_OP_GROUP2(group1, E,v, I,b),
+    [0x84] = X86_OP_ENTRYrr(AND, E,b, G,b),
+    [0x85] = X86_OP_ENTRYrr(AND, E,v, G,v),
+    [0x86] = X86_OP_ENTRY2(XCHG, E,b, G,b, xchg),
+    [0x87] = X86_OP_ENTRY2(XCHG, E,v, G,v, xchg),
+
+    [0x90] = X86_OP_ENTRY2(XCHG, 0,v, LoBits,v),
+    [0x91] = X86_OP_ENTRY2(XCHG, 0,v, LoBits,v),
+    [0x92] = X86_OP_ENTRY2(XCHG, 0,v, LoBits,v),
+    [0x93] = X86_OP_ENTRY2(XCHG, 0,v, LoBits,v),
+    [0x94] = X86_OP_ENTRY2(XCHG, 0,v, LoBits,v),
+    [0x95] = X86_OP_ENTRY2(XCHG, 0,v, LoBits,v),
+    [0x96] = X86_OP_ENTRY2(XCHG, 0,v, LoBits,v),
+    [0x97] = X86_OP_ENTRY2(XCHG, 0,v, LoBits,v),
+
+    [0xA0] = X86_OP_ENTRY3(MOV, 0,b, O,b, None, None), /* AL, Ob */
+    [0xA1] = X86_OP_ENTRY3(MOV, 0,v, O,v, None, None), /* rAX, Ov */
+    [0xA2] = X86_OP_ENTRY3(MOV, O,b, 0,b, None, None), /* Ob, AL */
+    [0xA3] = X86_OP_ENTRY3(MOV, O,v, 0,v, None, None), /* Ov, rAX */
+    [0xA4] = X86_OP_ENTRYrr(MOVS, Y,b, X,b),
+    [0xA5] = X86_OP_ENTRYrr(MOVS, Y,v, X,v),
+    [0xA6] = X86_OP_ENTRYrr(CMPS, Y,b, X,b),
+    [0xA7] = X86_OP_ENTRYrr(CMPS, Y,v, X,v),
+
+    [0xB0] = X86_OP_ENTRY3(MOV, LoBits,b, I,b, None, None),
+    [0xB1] = X86_OP_ENTRY3(MOV, LoBits,b, I,b, None, None),
+    [0xB2] = X86_OP_ENTRY3(MOV, LoBits,b, I,b, None, None),
+    [0xB3] = X86_OP_ENTRY3(MOV, LoBits,b, I,b, None, None),
+    [0xB4] = X86_OP_ENTRY3(MOV, LoBits,b, I,b, None, None),
+    [0xB5] = X86_OP_ENTRY3(MOV, LoBits,b, I,b, None, None),
+    [0xB6] = X86_OP_ENTRY3(MOV, LoBits,b, I,b, None, None),
+    [0xB7] = X86_OP_ENTRY3(MOV, LoBits,b, I,b, None, None),
+
 
     [0x08] = X86_OP_ENTRY2(OR, E,b, G,b, lock),
     [0x09] = X86_OP_ENTRY2(OR, E,v, G,v, lock),
@@ -1217,6 +1328,61 @@  static const X86OpEntry opcodes_root[256] = {
     [0x5D] = X86_OP_ENTRYw(POP, LoBits,d64),
     [0x5E] = X86_OP_ENTRYw(POP, LoBits,d64),
     [0x5F] = X86_OP_ENTRYw(POP, LoBits,d64),
+
+    [0x68] = X86_OP_ENTRYr(PUSH, I,z),
+    [0x69] = X86_OP_ENTRY3(IMUL3, G,v, E,v, I,z),
+    [0x6A] = X86_OP_ENTRYr(PUSH, I,b),
+    [0x6B] = X86_OP_ENTRY3(IMUL3, G,v, E,v, I,b),
+    [0x6C] = X86_OP_ENTRYrr(INS, Y,b, 2,w), /* DX */
+    [0x6D] = X86_OP_ENTRYrr(INS, Y,z, 2,w), /* DX */
+    [0x6E] = X86_OP_ENTRYrr(OUTS, 2,w, X,b), /* DX */
+    [0x6F] = X86_OP_ENTRYrr(OUTS, 2,w, X,b), /* DX */
+
+    [0x78] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x79] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x7A] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x7B] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x7C] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x7D] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x7E] = X86_OP_ENTRYr(Jcc, J,b),
+    [0x7F] = X86_OP_ENTRYr(Jcc, J,b),
+
+    [0x88] = X86_OP_ENTRY3(MOV, E,b, G,b, None, None),
+    [0x89] = X86_OP_ENTRY3(MOV, E,v, G,v, None, None),
+    [0x8A] = X86_OP_ENTRY3(MOV, G,b, E,b, None, None),
+    [0x8B] = X86_OP_ENTRY3(MOV, G,v, E,v, None, None),
+    [0x8C] = X86_OP_ENTRY3(MOV, E,v, S,w, None, None),
+    [0x8D] = X86_OP_ENTRY3(LEA, G,v, M,v, None, None, noseg),
+    [0x8E] = X86_OP_ENTRY3(MOV, S,w, E,v, None, None),
+    [0x8F] = X86_OP_GROUPw(group1A, E,v),
+
+    [0x98] = X86_OP_ENTRY1(CBW,    0,v), /* rAX */
+    [0x99] = X86_OP_ENTRY3(CWD,    2,v, 0,v, None, None), /* rDX, rAX */
+    [0x9A] = X86_OP_ENTRYrr(CALLF, I_unsigned,p, I_unsigned,w, chk(i64)),
+    [0x9B] = X86_OP_ENTRY0(WAIT),
+    [0x9C] = X86_OP_ENTRY0(PUSHF,  chk(vm86_iopl) svm(PUSHF)),
+    [0x9D] = X86_OP_ENTRY0(POPF,   chk(vm86_iopl) svm(POPF)),
+    [0x9E] = X86_OP_ENTRY0(SAHF),
+    [0x9F] = X86_OP_ENTRY0(LAHF),
+
+    [0xA8] = X86_OP_ENTRYrr(AND, 0,b, I,b),   /* AL, Ib */
+    [0xA9] = X86_OP_ENTRYrr(AND, 0,v, I,z),   /* rAX, Iz */
+    [0xAA] = X86_OP_ENTRY3(STOS, Y,b, 0,b, None, None),
+    [0xAB] = X86_OP_ENTRY3(STOS, Y,v, 0,v, None, None),
+    /* Manual writeback because REP LODS (!) has to write EAX/RAX after every LODS.  */
+    [0xAC] = X86_OP_ENTRYr(LODS, X,b),
+    [0xAD] = X86_OP_ENTRYr(LODS, X,v),
+    [0xAE] = X86_OP_ENTRYrr(SCAS, 0,b, Y,b),
+    [0xAF] = X86_OP_ENTRYrr(SCAS, 0,v, Y,v),
+
+    [0xB8] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
+    [0xB9] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
+    [0xBA] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
+    [0xBB] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
+    [0xBC] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
+    [0xBD] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
+    [0xBE] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
+    [0xBF] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
 };
 
 #undef mmx
@@ -2037,6 +2203,11 @@  static void disas_insn_new(DisasContext *s, CPUState *cpu, int b)
         assert(decode.op[1].unit == X86_OP_INT);
         break;
 
+    case X86_SPECIAL_NoSeg:
+        decode.mem.def_seg = -1;
+        s->override = -1;
+        break;
+
     default:
         break;
     }
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index a27d3040e03..cba7b61f757 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1179,6 +1179,31 @@  static void gen_ANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
 }
 
+static void gen_ARPL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGLabel *label1 = gen_new_label();
+    TCGv rpl_adj = tcg_temp_new();
+    TCGv flags = tcg_temp_new();
+
+    gen_mov_eflags(s, flags);
+    tcg_gen_andi_tl(flags, flags, ~CC_Z);
+
+    /* Compute dest[rpl] - src[rpl], adjust if result <0.  */
+    tcg_gen_andi_tl(rpl_adj, s->T0, 3);
+    tcg_gen_andi_tl(s->T1, s->T1, 3);
+    tcg_gen_sub_tl(rpl_adj, rpl_adj, s->T1);
+
+    tcg_gen_brcondi_tl(TCG_COND_LT, rpl_adj, 0, label1);
+
+    /* Subtract dest[rpl] - src[rpl] to set dest[rpl] = src[rpl].  */
+    tcg_gen_sub_tl(s->T0, s->T0, rpl_adj);
+    tcg_gen_ori_tl(flags, flags, CC_Z);
+    gen_set_label(label1);
+
+    decode->cc_src = flags;
+    decode->cc_op = CC_OP_EFLAGS;
+}
+
 static void gen_BEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -1243,6 +1268,17 @@  static void gen_BLSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     set_cc_op(s, CC_OP_BMILGB + ot);
 }
 
+static void gen_BOUND(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 op = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(op, s->T0);
+    if (decode->op[1].ot == MO_16) {
+        gen_helper_boundw(tcg_env, s->A0, op);
+    } else {
+        gen_helper_boundl(tcg_env, s->A0, op);
+    }
+}
+
 static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -1263,6 +1299,18 @@  static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
 }
 
+static void gen_CALLF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_far_call(s);
+}
+
+static void gen_CBW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp src_ot = decode->op[0].ot - 1;
+
+    tcg_gen_ext_tl(s->T0, s->T0, src_ot | MO_SIGN);
+}
+
 static void gen_CMPccXADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     TCGLabel *label_top = gen_new_label();
@@ -1366,6 +1414,18 @@  static void gen_CMPccXADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *dec
     decode->cc_op = CC_OP_SUBB + ot;
 }
 
+static void gen_CMPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+    if (s->prefix & PREFIX_REPNZ) {
+        gen_repz_cmps(s, ot, 1);
+    } else if (s->prefix & PREFIX_REPZ) {
+        gen_repz_cmps(s, ot, 0);
+    } else {
+        gen_cmps(s, ot);
+    }
+}
+
 static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[2].ot;
@@ -1404,6 +1464,13 @@  static void gen_CVTTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *dec
     }
 }
 
+static void gen_CWD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int shift = 8 << decode->op[0].ot;
+
+    tcg_gen_sextract_tl(s->T0, s->T0, shift - 1, 1);
+}
+
 static void gen_DAA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
@@ -1450,6 +1517,59 @@  static void gen_EXTRQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decod
     gen_helper_extrq_r(tcg_env, OP_PTR0, OP_PTR2);
 }
 
+static void gen_IMUL3(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+    TCGv cc_src_rhs;
+
+    switch (ot) {
+    case MO_16:
+        tcg_gen_ext16s_tl(s->T0, s->T0);
+        tcg_gen_ext16s_tl(s->T1, s->T1);
+        /* XXX: use 32 bit mul which could be faster */
+        tcg_gen_mul_tl(s->T0, s->T0, s->T1);
+        /* Compare the full result to the extension of the truncated result.  */
+        tcg_gen_ext16s_tl(s->T1, s->T0);
+        cc_src_rhs = s->T0;
+        break;
+
+    case MO_32:
+#ifdef TARGET_X86_64
+        /*
+         * This could also use the same algorithm as MO_16.  It produces fewer
+         * TCG ops and better code if flags are needed, but it requires a 64-bit
+         * multiply even if they are not (and thus the high part of the multiply
+         * is dead).
+         */
+        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
+        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
+        tcg_gen_muls2_i32(s->tmp2_i32, s->tmp3_i32,
+                          s->tmp2_i32, s->tmp3_i32);
+        tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
+
+        cc_src_rhs = tcg_temp_new();
+        tcg_gen_extu_i32_tl(cc_src_rhs, s->tmp3_i32);
+        /* Compare the high part to the sign bit of the truncated result */
+        tcg_gen_negsetcondi_i32(TCG_COND_LT, s->tmp2_i32, s->tmp2_i32, 0);
+        tcg_gen_extu_i32_tl(s->T1, s->tmp2_i32);
+        break;
+
+    case MO_64:
+#endif
+        cc_src_rhs = tcg_temp_new();
+        tcg_gen_muls2_tl(s->T0, cc_src_rhs, s->T0, s->T1);
+        /* Compare the high part to the sign bit of the truncated result */
+        tcg_gen_negsetcondi_tl(TCG_COND_LT, s->T1, s->T0, 0);
+        break;
+
+    default:
+        g_assert_not_reached();
+    }
+
+    tcg_gen_sub_tl(s->T1, s->T1, cc_src_rhs);
+    prepare_update2_cc(decode, s, CC_OP_MULB + ot);
+}
+
 static void gen_INC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
@@ -1464,6 +1584,26 @@  static void gen_INC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     prepare_update_cc_incdec(decode, s, CC_OP_INCB + ot);
 }
 
+static void gen_INS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+    TCGv_i32 port = tcg_temp_new_i32();
+
+    tcg_gen_trunc_tl_i32(port, s->T1);
+    tcg_gen_ext16u_i32(port, port);
+    if (!gen_check_io(s, ot, port,
+                      SVM_IOIO_TYPE_MASK | SVM_IOIO_STR_MASK)) {
+        return;
+    }
+
+    translator_io_start(&s->base);
+    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
+        gen_repz_ins(s, ot);
+    } else {
+        gen_ins(s, ot);
+    }
+}
+
 static void gen_INSERTQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
@@ -1477,12 +1617,50 @@  static void gen_INSERTQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *dec
     gen_helper_insertq_r(tcg_env, OP_PTR0, OP_PTR2);
 }
 
+static void gen_Jcc(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_bnd_jmp(s);
+    gen_jcc(s, decode->b & 0xf, decode->immediate);
+}
+
+static void gen_LAHF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) {
+        return gen_illegal_opcode(s);
+    }
+    gen_compute_eflags(s);
+    /* Note: gen_compute_eflags() only gives the condition codes */
+    tcg_gen_ori_tl(s->T0, cpu_cc_src, 0x02);
+    tcg_gen_deposit_tl(cpu_regs[R_EAX], cpu_regs[R_EAX], s->T0, 8, 8);
+}
+
 static void gen_LDMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T1);
     gen_helper_ldmxcsr(tcg_env, s->tmp2_i32);
 }
 
+static void gen_LEA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    tcg_gen_mov_tl(s->T0, s->A0);
+}
+
+static void gen_LODS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
+        gen_repz_lods(s, ot);
+    } else {
+        gen_lods(s, ot);
+    }
+}
+
+static void gen_MOV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    /* nothing to do! */
+}
+#define gen_NOP gen_MOV
+
 static void gen_MASKMOV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     gen_lea_v_seg(s, s->aflag, cpu_regs[R_EDI], R_DS, s->override);
@@ -1590,6 +1768,16 @@  static void gen_MOVq_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decod
     return gen_MOVQ(s, env, decode);
 }
 
+static void gen_MOVS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
+        gen_repz_movs(s, ot);
+    } else {
+        gen_movs(s, ot);
+    }
+}
+
 static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -1629,6 +1817,25 @@  static void gen_OR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
 }
 
+static void gen_OUTS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[1].ot;
+    TCGv_i32 port = tcg_temp_new_i32();
+
+    tcg_gen_trunc_tl_i32(port, s->T0);
+    tcg_gen_ext16u_i32(port, port);
+    if (!gen_check_io(s, ot, port, SVM_IOIO_STR_MASK)) {
+        return;
+    }
+
+    translator_io_start(&s->base);
+    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
+        gen_repz_outs(s, ot);
+    } else {
+        gen_outs(s, ot);
+    }
+}
+
 static void gen_PALIGNR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
@@ -1884,6 +2091,33 @@  static void gen_POP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     gen_pop_update(s, ot);
 }
 
+static void gen_POPA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+   gen_popa(s);
+}
+
+static void gen_POPF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot;
+    int mask = TF_MASK | AC_MASK | ID_MASK | NT_MASK;
+
+    if (CPL(s) == 0) {
+        mask |= IF_MASK | IOPL_MASK;
+    } else if (CPL(s) <= IOPL(s)) {
+        mask |= IF_MASK;
+    }
+    if (s->dflag == MO_16) {
+        mask &= 0xffff;
+    }
+
+    ot = gen_pop_T0(s);
+    gen_helper_write_eflags(tcg_env, s->T0, tcg_constant_i32(mask));
+    gen_pop_update(s, ot);
+    set_cc_op(s, CC_OP_EFLAGS);
+    /* abort translation because TF/AC flag may change */
+    s->base.is_jmp = DISAS_EOB_NEXT;
+}
+
 static void gen_PSHUFW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
@@ -2035,6 +2269,18 @@  static void gen_PUSH(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     gen_push_v(s, s->T1);
 }
 
+static void gen_PUSHA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+   gen_pusha(s);
+}
+
+static void gen_PUSHF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_update_cc_op(s);
+    gen_helper_read_eflags(s->T0, tcg_env);
+    gen_push_v(s, s->T0);
+}
+
 static void gen_RORX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -2059,6 +2305,18 @@  static void gen_RORX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     }
 }
 
+static void gen_SAHF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) {
+        return gen_illegal_opcode(s);
+    }
+    tcg_gen_shri_tl(s->T0, cpu_regs[R_EAX], 8);
+    gen_compute_eflags(s);
+    tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O);
+    tcg_gen_andi_tl(s->T0, s->T0, CC_S | CC_Z | CC_A | CC_P | CC_C);
+    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, s->T0);
+}
+
 static void gen_SARX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -2091,6 +2349,18 @@  static void gen_SBB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     prepare_update3_cc(decode, s, CC_OP_SBBB + ot, c_in);
 }
 
+static void gen_SCAS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+    if (s->prefix & PREFIX_REPNZ) {
+        gen_repz_scas(s, ot, 1);
+    } else if (s->prefix & PREFIX_REPZ) {
+        gen_repz_scas(s, ot, 0);
+    } else {
+        gen_scas(s, ot);
+    }
+}
+
 static void gen_SHA1NEXTE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     gen_helper_sha1nexte(OP_PTR0, OP_PTR1, OP_PTR2);
@@ -2178,6 +2448,16 @@  static void gen_STMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decod
     tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, mxcsr));
 }
 
+static void gen_STOS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[1].ot;
+    if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
+        gen_repz_stos(s, ot);
+    } else {
+        gen_stos(s, ot);
+    }
+}
+
 static void gen_SUB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
@@ -2674,6 +2954,43 @@  static void gen_VZEROUPPER(DisasContext *s, CPUX86State *env, X86DecodedInsn *de
     }
 }
 
+static void gen_WAIT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    if ((s->flags & (HF_MP_MASK | HF_TS_MASK)) == (HF_MP_MASK | HF_TS_MASK)) {
+        gen_NM_exception(s);
+    } else {
+        /* needs to be treated as I/O because of ferr_irq */
+        translator_io_start(&s->base);
+        gen_helper_fwait(tcg_env);
+    }
+}
+
+static void gen_XCHG(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    if (decode->b == 0x90 && !REX_B(s)) {
+        if (s->prefix & PREFIX_REPZ) {
+            gen_update_cc_op(s);
+            gen_update_eip_cur(s);
+            gen_helper_pause(tcg_env, cur_insn_len_i32(s));
+            s->base.is_jmp = DISAS_NORETURN;
+        }
+        /* No writeback.  */
+        decode->op[0].unit = X86_OP_SKIP;
+        return;
+    }
+
+    if (s->prefix & PREFIX_LOCK) {
+        tcg_gen_atomic_xchg_tl(s->T0, s->A0, s->T1,
+                               s->mem_index, decode->op[0].ot | MO_LE);
+        /* now store old value into register operand */
+        gen_op_mov_reg_v(s, decode->op[2].ot, decode->op[2].n, s->T0);
+    } else {
+        /* move destination value into source operand, source preserved in T1 */
+        gen_op_mov_reg_v(s, decode->op[2].ot, decode->op[2].n, s->T0);
+        tcg_gen_mov_tl(s->T0, s->T1);
+    }
+}
+
 static void gen_XOR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     /* special case XOR reg, reg */