Message ID | 20130411001815.GA17544@pd.tnic (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Apr 11, 2013 at 02:18:15AM +0200, Borislav Petkov wrote: > On Wed, Apr 10, 2013 at 03:16:39PM +0300, Gleb Natapov wrote: > > Right, the question is how kernel can tell QEMU that the cpuid bit is > > supported but should not be set unless explicitly asked by an user. > > Actually, this seems to work with the patch below based on whether you > have "+movbe" in the -cpu option or not. > The problem is that -cpu host will have it unconditionally and this is definitely not what we want. > Anyway, here's the second version with hopefully all comments and > suggestions addressed. > Thanks, will review it later. > Thanks. > > -- > >From 612fc75a732ad16332f270b7c52a68c89e3565ca Mon Sep 17 00:00:00 2001 > From: Borislav Petkov <bp@suse.de> > Date: Thu, 11 Apr 2013 02:06:30 +0200 > Subject: [PATCH] kvm: Emulate MOVBE > > This basically came from the need to be able to boot 32-bit Atom SMP > guests on an AMD host, i.e. host which doesn't support MOVBE. As a > matter of fact, qemu has since recently received MOVBE support but we > cannot share that with kvm emulation and thus we have to do this in the > host. > > We piggyback on the #UD path and emulate the MOVBE functionality. With > it, an 8-core SMP guest boots in under 6 seconds. > > Also, requesting MOVBE emulation needs to happen explicitly to work, > i.e. qemu -cpu n270,+movbe... > > Signed-off-by: Andre Przywara <andre@andrep.de> > Signed-off-by: Borislav Petkov <bp@suse.de> > --- > arch/x86/kvm/cpuid.c | 2 +- > arch/x86/kvm/emulate.c | 39 +++++++++++++++++++++++++++++++++++++-- > 2 files changed, 38 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c > index a20ecb5b6cbf..2d44fc4fd855 100644 > --- a/arch/x86/kvm/cpuid.c > +++ b/arch/x86/kvm/cpuid.c > @@ -273,7 +273,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > cpuid_mask(&entry->ecx, 4); > /* we support x2apic emulation even if host does not support > * it since we emulate x2apic in software */ > - entry->ecx |= F(X2APIC); > + entry->ecx |= F(X2APIC) | F(MOVBE); > break; > /* function 2 entries are STATEFUL. That is, repeated cpuid commands > * may return different values. This forces us to get_cpu() before > diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c > index a335cc6cde72..9011c7a656ad 100644 > --- a/arch/x86/kvm/emulate.c > +++ b/arch/x86/kvm/emulate.c > @@ -152,6 +152,7 @@ > #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ > #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ > #define NoWrite ((u64)1 << 45) /* No writeback */ > +#define EmulateOnUD ((u64)1 << 46) /* emulate if unsupported by the host */ > > #define X2(x...) x, x > #define X3(x...) X2(x), x > @@ -3107,6 +3108,30 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) > return X86EMUL_CONTINUE; > } > > +static int em_movbe(struct x86_emulate_ctxt *ctxt) > +{ > + switch (ctxt->op_bytes) { > + case 2: > + *(u16 *)ctxt->dst.valptr = swab16(*(u16 *)ctxt->src.valptr); > + break; > + case 4: > + *(u32 *)ctxt->dst.valptr = swab32(*(u32 *)ctxt->src.valptr); > + > + /* > + * clear upper dword for 32-bit operand size in 64-bit mode. > + */ > + if (ctxt->mode == X86EMUL_MODE_PROT64) > + *((u32 *)ctxt->dst.valptr + 1) = 0x0; > + break; > + case 8: > + *(u64 *)ctxt->dst.valptr = swab64(*(u64 *)ctxt->src.valptr); > + break; > + default: > + return X86EMUL_PROPAGATE_FAULT; > + } > + return X86EMUL_CONTINUE; > +} > + > static int em_cr_write(struct x86_emulate_ctxt *ctxt) > { > if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) > @@ -4033,6 +4058,11 @@ static const struct opcode twobyte_table[256] = { > N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N > }; > > +static const struct opcode threebyte_table[] = { > + [0xf0] = I(DstReg | SrcMem | ModRM | Mov | EmulateOnUD, em_movbe), > + [0xf1] = I(DstMem | SrcReg | ModRM | Mov | EmulateOnUD, em_movbe), > +}; > + > #undef D > #undef N > #undef G > @@ -4320,6 +4350,9 @@ done_prefixes: > ctxt->twobyte = 1; > ctxt->b = insn_fetch(u8, ctxt); > opcode = twobyte_table[ctxt->b]; > + > + if (ctxt->b == 0x38) > + opcode = threebyte_table[insn_fetch(u8, ctxt)]; > } > ctxt->d = opcode.flags; > > @@ -4376,8 +4409,10 @@ done_prefixes: > if (ctxt->d == 0 || (ctxt->d & Undefined)) > return EMULATION_FAILED; > > - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) > - return EMULATION_FAILED; > + if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) { > + if (!(ctxt->d & EmulateOnUD)) > + return EMULATION_FAILED; > + } > > if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) > ctxt->op_bytes = 8; > -- > 1.8.2.135.g7b592fa > > > -- > Regards/Gruss, > Boris. > > Sent from a fat crate under my desk. Formatting is fine. > -- -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Apr 11, 2013 at 05:28:18PM +0300, Gleb Natapov wrote: > On Thu, Apr 11, 2013 at 02:18:15AM +0200, Borislav Petkov wrote: > > On Wed, Apr 10, 2013 at 03:16:39PM +0300, Gleb Natapov wrote: > > > Right, the question is how kernel can tell QEMU that the cpuid bit is > > > supported but should not be set unless explicitly asked by an user. > > > > Actually, this seems to work with the patch below based on whether you > > have "+movbe" in the -cpu option or not. > > > The problem is that -cpu host will have it unconditionally and this is > definitely not what we want. Hmm, ok, I see what you mean. -cpu host boots the atom kernel just fine. Well, with my limited qemu exposure, I'd guess cpu_x86_parse_featurestr() would need to somehow say to x86_cpu_realizefn that it has parsed a "+movbe" on the command line and the latter has to keep that bit enabled when doing the checks against kvm in kvm_check_features_against_host and filter_features_for_kvm(). Unless you have a better idea, that is. Thanks.
On Thu, Apr 11, 2013 at 05:37:33PM +0200, Borislav Petkov wrote: > On Thu, Apr 11, 2013 at 05:28:18PM +0300, Gleb Natapov wrote: > > On Thu, Apr 11, 2013 at 02:18:15AM +0200, Borislav Petkov wrote: > > > On Wed, Apr 10, 2013 at 03:16:39PM +0300, Gleb Natapov wrote: > > > > Right, the question is how kernel can tell QEMU that the cpuid bit is > > > > supported but should not be set unless explicitly asked by an user. > > > > > > Actually, this seems to work with the patch below based on whether you > > > have "+movbe" in the -cpu option or not. > > > > > The problem is that -cpu host will have it unconditionally and this is > > definitely not what we want. > > Hmm, ok, I see what you mean. -cpu host boots the atom kernel just fine. > > Well, with my limited qemu exposure, I'd guess > cpu_x86_parse_featurestr() would need to somehow say to > x86_cpu_realizefn that it has parsed a "+movbe" on the command line and > the latter has to keep that bit enabled when doing the checks against > kvm in kvm_check_features_against_host and filter_features_for_kvm(). > Then you will filter out movbe even if host cpu supports it directly which is also not what we want. Currently userspace assumes that that cpuid configuration returned by KVM_GET_SUPPORTED_CPUID is the optimal one. What we want here is a way for KVM to tell userspace that it can emulate movbe though it is not optimal. Userspace alone cannot figure it out. It can check host's cpuid directly and assume that if cpuid bit is not present on the host cpu, but reported as supported by KVM then it is emulated by KVM and this is not optimal, but sometimes emulation is actually desirable (x2apic), so such assumption is not always correct. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Apr 11, 2013 at 02:18:15AM +0200, Borislav Petkov wrote: > On Wed, Apr 10, 2013 at 03:16:39PM +0300, Gleb Natapov wrote: > > Right, the question is how kernel can tell QEMU that the cpuid bit is > > supported but should not be set unless explicitly asked by an user. > > Actually, this seems to work with the patch below based on whether you > have "+movbe" in the -cpu option or not. > > Anyway, here's the second version with hopefully all comments and > suggestions addressed. > > Thanks. > > -- > >From 612fc75a732ad16332f270b7c52a68c89e3565ca Mon Sep 17 00:00:00 2001 > From: Borislav Petkov <bp@suse.de> > Date: Thu, 11 Apr 2013 02:06:30 +0200 > Subject: [PATCH] kvm: Emulate MOVBE > > This basically came from the need to be able to boot 32-bit Atom SMP > guests on an AMD host, i.e. host which doesn't support MOVBE. As a > matter of fact, qemu has since recently received MOVBE support but we > cannot share that with kvm emulation and thus we have to do this in the > host. > > We piggyback on the #UD path and emulate the MOVBE functionality. With > it, an 8-core SMP guest boots in under 6 seconds. > > Also, requesting MOVBE emulation needs to happen explicitly to work, > i.e. qemu -cpu n270,+movbe... > > Signed-off-by: Andre Przywara <andre@andrep.de> > Signed-off-by: Borislav Petkov <bp@suse.de> > --- > arch/x86/kvm/cpuid.c | 2 +- > arch/x86/kvm/emulate.c | 39 +++++++++++++++++++++++++++++++++++++-- > 2 files changed, 38 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c > index a20ecb5b6cbf..2d44fc4fd855 100644 > --- a/arch/x86/kvm/cpuid.c > +++ b/arch/x86/kvm/cpuid.c > @@ -273,7 +273,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > cpuid_mask(&entry->ecx, 4); > /* we support x2apic emulation even if host does not support > * it since we emulate x2apic in software */ > - entry->ecx |= F(X2APIC); > + entry->ecx |= F(X2APIC) | F(MOVBE); > break; > /* function 2 entries are STATEFUL. That is, repeated cpuid commands > * may return different values. This forces us to get_cpu() before > diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c > index a335cc6cde72..9011c7a656ad 100644 > --- a/arch/x86/kvm/emulate.c > +++ b/arch/x86/kvm/emulate.c > @@ -152,6 +152,7 @@ > #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ > #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ > #define NoWrite ((u64)1 << 45) /* No writeback */ > +#define EmulateOnUD ((u64)1 << 46) /* emulate if unsupported by the host */ > Just rename VendorSpecific to EmulateOnUD. The meaning is the same. > #define X2(x...) x, x > #define X3(x...) X2(x), x > @@ -3107,6 +3108,30 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) > return X86EMUL_CONTINUE; > } > > +static int em_movbe(struct x86_emulate_ctxt *ctxt) > +{ > + switch (ctxt->op_bytes) { > + case 2: > + *(u16 *)ctxt->dst.valptr = swab16(*(u16 *)ctxt->src.valptr); What's wrong with: ctxt->dst.val = swab16((u16)ctxt->src.val)? > + break; > + case 4: > + *(u32 *)ctxt->dst.valptr = swab32(*(u32 *)ctxt->src.valptr); > + > + /* > + * clear upper dword for 32-bit operand size in 64-bit mode. > + */ > + if (ctxt->mode == X86EMUL_MODE_PROT64) > + *((u32 *)ctxt->dst.valptr + 1) = 0x0; It should be clean without this this. > + break; > + case 8: > + *(u64 *)ctxt->dst.valptr = swab64(*(u64 *)ctxt->src.valptr); > + break; > + default: > + return X86EMUL_PROPAGATE_FAULT; > + } > + return X86EMUL_CONTINUE; > +} > + > static int em_cr_write(struct x86_emulate_ctxt *ctxt) > { > if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) > @@ -4033,6 +4058,11 @@ static const struct opcode twobyte_table[256] = { > N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N > }; > > +static const struct opcode threebyte_table[] = { > + [0xf0] = I(DstReg | SrcMem | ModRM | Mov | EmulateOnUD, em_movbe), > + [0xf1] = I(DstMem | SrcReg | ModRM | Mov | EmulateOnUD, em_movbe), > +}; > + Populate the whole table explicitly please. You can use X16(N) to make it short. The encoding is also wrong since 0F 38 F0/F1 instruction decoding depend on a prefix. Same opcode is decoded as CRC32 with f2 prefix. We have Prefix mechanism for such instructions, but it currently assumes that 66 and f2/f3 prefixes are mutually exclusive, but in this case they are not, ugh. > #undef D > #undef N > #undef G > @@ -4320,6 +4350,9 @@ done_prefixes: > ctxt->twobyte = 1; > ctxt->b = insn_fetch(u8, ctxt); > opcode = twobyte_table[ctxt->b]; > + > + if (ctxt->b == 0x38) > + opcode = threebyte_table[insn_fetch(u8, ctxt)]; ctxt->b = insn_fetch(u8, ctxt); opcode = threebyte_table[ctxt->b]; Otherwise OpImplicit type of decoding will not work for three byte instructions. Also should rename twobyte into opbytes and set it to 1, 2 or 3. I prefer that three byte instruction decoding will be sent as a separate patch. > } > ctxt->d = opcode.flags; > > @@ -4376,8 +4409,10 @@ done_prefixes: > if (ctxt->d == 0 || (ctxt->d & Undefined)) > return EMULATION_FAILED; > > - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) > - return EMULATION_FAILED; > + if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) { > + if (!(ctxt->d & EmulateOnUD)) > + return EMULATION_FAILED; > + } > > if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) > ctxt->op_bytes = 8; > -- > 1.8.2.135.g7b592fa > > > -- > Regards/Gruss, > Boris. > > Sent from a fat crate under my desk. Formatting is fine. > -- -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Apr 14, 2013 at 10:41:07AM +0300, Gleb Natapov wrote: > Currently userspace assumes that that cpuid configuration returned by > KVM_GET_SUPPORTED_CPUID is the optimal one. What we want here is a way > for KVM to tell userspace that it can emulate movbe though it is not > optimal. Ok, I don't understand. You want to tell userspace: "yes, we do support a hw feature but we emulate it."? > Userspace alone cannot figure it out. It can check host's cpuid > directly and assume that if cpuid bit is not present on the host cpu, > but reported as supported by KVM then it is emulated by KVM and this is > not optimal, but sometimes emulation is actually desirable (x2apic), so > such assumption is not always correct. Right, and this is what we have, AFAICT. And if userspace does that what you exemplify above, you get exactly that - a feature bit not set in CPUID but KVM reporting it set means, it is emulated. There's no room for other interpretations here. Which probably means also not optimal because it is not done in hw. Or, do you want to have a way to say with KVM_GET_SUPPORTED_CPUID that "the features I'm reporting to you are those - a subset of them are not optimally supported because I'm emulating them." Am I close? Thanks.
Just to interject here... all this is about host kernel vs userspace, correct? I.e. nothing that affects the guest. I assume that the guest sees CPUID and that is what is supported. Borislav Petkov <bp@alien8.de> wrote: >On Sun, Apr 14, 2013 at 10:41:07AM +0300, Gleb Natapov wrote: >> Currently userspace assumes that that cpuid configuration returned by >> KVM_GET_SUPPORTED_CPUID is the optimal one. What we want here is a >way >> for KVM to tell userspace that it can emulate movbe though it is not >> optimal. > >Ok, I don't understand. > >You want to tell userspace: "yes, we do support a hw feature but we >emulate it."? > >> Userspace alone cannot figure it out. It can check host's cpuid >> directly and assume that if cpuid bit is not present on the host cpu, >> but reported as supported by KVM then it is emulated by KVM and this >is >> not optimal, but sometimes emulation is actually desirable (x2apic), >so >> such assumption is not always correct. > >Right, and this is what we have, AFAICT. And if userspace does that >what >you exemplify above, you get exactly that - a feature bit not set in >CPUID but KVM reporting it set means, it is emulated. There's no room >for other interpretations here. Which probably means also not optimal >because it is not done in hw. > >Or, do you want to have a way to say with KVM_GET_SUPPORTED_CPUID that >"the features I'm reporting to you are those - a subset of them are not >optimally supported because I'm emulating them." > >Am I close? > >Thanks.
On Sun, Apr 14, 2013 at 11:36:29AM -0700, H. Peter Anvin wrote: > Just to interject here... all this is about host kernel vs userspace, > correct? I.e. nothing that affects the guest. I assume that the guest > sees CPUID and that is what is supported. Actually, currently the guest sees MOVBE in CPUID set: max base function: 0x00000005 CPUID.EAX=0x00000000 EAX=0x00000005, EBX=0x68747541, ECX=0x444d4163, EDX=0x69746e65 CPUID.EAX=0x00000001 EAX=0x000106c2, EBX=0x01000800, ECX=0x80400201, EDX=0x0789fbff ^ |- bit 22 even though host (AMD) doesn't support it. And I'd say we want it that way because the guest should show it in CPUID so that not only the guest kernel but also guest userspace binaries built with MOVBE can be run.
That is what I mean yes... Borislav Petkov <bp@alien8.de> wrote: >On Sun, Apr 14, 2013 at 11:36:29AM -0700, H. Peter Anvin wrote: >> Just to interject here... all this is about host kernel vs userspace, >> correct? I.e. nothing that affects the guest. I assume that the guest >> sees CPUID and that is what is supported. > >Actually, currently the guest sees MOVBE in CPUID set: > >max base function: 0x00000005 >CPUID.EAX=0x00000000 >EAX=0x00000005, EBX=0x68747541, ECX=0x444d4163, EDX=0x69746e65 >CPUID.EAX=0x00000001 >EAX=0x000106c2, EBX=0x01000800, ECX=0x80400201, EDX=0x0789fbff > ^ > |- bit 22 > >even though host (AMD) doesn't support it. > >And I'd say we want it that way because the guest should show it in >CPUID so that not only the guest kernel but also guest userspace >binaries built with MOVBE can be run.
On Sun, Apr 14, 2013 at 11:43:03AM +0300, Gleb Natapov wrote: > > +#define EmulateOnUD ((u64)1 << 46) /* emulate if unsupported by the host */ > > > Just rename VendorSpecific to EmulateOnUD. The meaning is the same. done. > > +static int em_movbe(struct x86_emulate_ctxt *ctxt) > > +{ > > + switch (ctxt->op_bytes) { > > + case 2: > > + *(u16 *)ctxt->dst.valptr = swab16(*(u16 *)ctxt->src.valptr); > What's wrong with: ctxt->dst.val = swab16((u16)ctxt->src.val)? This doesn't work for the 16-bit swab because of the following functionality of MOVBE: "When the operand size is 16 bits, the upper word of the destination register remains unchanged." Now here's what gcc produces here: movzwl 112(%rdi), %eax # ctxt_5(D)->src.D.27823.val, tmp83 rolw $8, %ax #, tmp83 movzwl %ax, %eax # tmp83, tmp84 movq %rax, 240(%rdi) # tmp84, See the zero-extension happening twice (second one should not be needed, even) and it is actually writing the whole 64-bit value in %rax back which effectively destroys the upper word? However, this, a bit uglier version works: *(u16 *)&ctxt->dst.val = swab16((u16)ctxt->src.val); movzwl 112(%rdi), %eax # ctxt_5(D)->src.D.27823.val, tmp82 rolw $8, %ax #, tmp82 movw %ax, 240(%rdi) # tmp82, MEM[(u16 *)ctxt_5(D) + 240B] Whichever version we take, I'll slap a comment explaining why we're doing it differently for 16-bit. > > + case 4: > > + *(u32 *)ctxt->dst.valptr = swab32(*(u32 *)ctxt->src.valptr); > > + > > + /* > > + * clear upper dword for 32-bit operand size in 64-bit mode. > > + */ > > + if (ctxt->mode == X86EMUL_MODE_PROT64) > > + *((u32 *)ctxt->dst.valptr + 1) = 0x0; > It should be clean without this this. Right, this should probably work because for 32-bit, we don't care about the zero-extended upper dword and in 64-bit mode we want to do it anyway. > > + break; > > + case 8: > > + *(u64 *)ctxt->dst.valptr = swab64(*(u64 *)ctxt->src.valptr); Ditto for this one. > > +static const struct opcode threebyte_table[] = { > > + [0xf0] = I(DstReg | SrcMem | ModRM | Mov | EmulateOnUD, em_movbe), > > + [0xf1] = I(DstMem | SrcReg | ModRM | Mov | EmulateOnUD, em_movbe), > > +}; > > + > Populate the whole table explicitly please. You can use X16(N) to make it > short. The encoding is also wrong since 0F 38 F0/F1 instruction decoding > depend on a prefix. ... on the absence of a prefix, actually. > Same opcode is decoded as CRC32 with f2 prefix. We have Prefix > mechanism for such instructions, but it currently assumes that 66 and > f2/f3 prefixes are mutually exclusive, but in this case they are not, > ugh. Yeah, I'm lazy and wanted to leave the honors to the poor soul who implements the whole 0F_38h opcode map. :-) Ok, I probably could do something similar to pfx_vmovntpx with the GP macro for the F0/F1 opcodes. I'll give it a try when I get a chance soonish. > > + > > + if (ctxt->b == 0x38) > > + opcode = threebyte_table[insn_fetch(u8, ctxt)]; > ctxt->b = insn_fetch(u8, ctxt); > opcode = threebyte_table[ctxt->b]; > > Otherwise OpImplicit type of decoding will not work for three byte > instructions. Also should rename twobyte into opbytes and set it to 1, > 2 or 3. I prefer that three byte instruction decoding will be sent as a > separate patch. Ok. Thanks.
Il 14/04/2013 23:02, Borislav Petkov ha scritto: > *(u16 *)&ctxt->dst.val = swab16((u16)ctxt->src.val); > > movzwl 112(%rdi), %eax # ctxt_5(D)->src.D.27823.val, tmp82 > rolw $8, %ax #, tmp82 > movw %ax, 240(%rdi) # tmp82, MEM[(u16 *)ctxt_5(D) + 240B] I think this breaks the C aliasing rules. Using valptr is okay because it is a char. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Apr 14, 2013 at 07:32:16PM +0200, Borislav Petkov wrote: > On Sun, Apr 14, 2013 at 10:41:07AM +0300, Gleb Natapov wrote: > > Currently userspace assumes that that cpuid configuration returned by > > KVM_GET_SUPPORTED_CPUID is the optimal one. What we want here is a way > > for KVM to tell userspace that it can emulate movbe though it is not > > optimal. > > Ok, I don't understand. > > You want to tell userspace: "yes, we do support a hw feature but we > emulate it."? > I am contemplating this, yes. > > Userspace alone cannot figure it out. It can check host's cpuid > > directly and assume that if cpuid bit is not present on the host cpu, > > but reported as supported by KVM then it is emulated by KVM and this is > > not optimal, but sometimes emulation is actually desirable (x2apic), so > > such assumption is not always correct. > > Right, and this is what we have, AFAICT. And if userspace does that what > you exemplify above, you get exactly that - a feature bit not set in > CPUID but KVM reporting it set means, it is emulated. There's no room > for other interpretations here. Which probably means also not optimal > because it is not done in hw. > This is not true for all emulated CPUID bits. X2APIC is emulated and it is preferable for a guest to use it for example. > Or, do you want to have a way to say with KVM_GET_SUPPORTED_CPUID that > "the features I'm reporting to you are those - a subset of them are not > optimally supported because I'm emulating them." > > Am I close? > Yes, you are. I am considering such interface. Adding new specialized interfaces is last resort though. I am open to other ideas. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Apr 16, 2013 at 08:42:36PM +0300, Gleb Natapov wrote: > > Right, and this is what we have, AFAICT. And if userspace does that what > > you exemplify above, you get exactly that - a feature bit not set in > > CPUID but KVM reporting it set means, it is emulated. There's no room > > for other interpretations here. Which probably means also not optimal > > because it is not done in hw. > > > This is not true for all emulated CPUID bits. X2APIC is emulated and it > is preferable for a guest to use it for example. Right, and on boxes without X2APIC support you will not find CPUID(1).ECX[21] set. kvm will tell you it is supported, though, which means, kvm emulates it. > > Or, do you want to have a way to say with KVM_GET_SUPPORTED_CPUID that > > "the features I'm reporting to you are those - a subset of them are not > > optimally supported because I'm emulating them." > > > > Am I close? > > > Yes, you are. I am considering such interface. Adding new specialized > interfaces is last resort though. I am open to other ideas. Hmm, ok, so basically we want qemu to query the host CPUID and see which features are *really* supported, then do KVM_GET_SUPPORTED_CPUID and see which are emulated. How's that? Too simple? I probably am missing something as a novice qemu user.
On Wed, Apr 17, 2013 at 01:04:34PM +0200, Borislav Petkov wrote: > On Tue, Apr 16, 2013 at 08:42:36PM +0300, Gleb Natapov wrote: > > > Right, and this is what we have, AFAICT. And if userspace does that what > > > you exemplify above, you get exactly that - a feature bit not set in > > > CPUID but KVM reporting it set means, it is emulated. There's no room > > > for other interpretations here. Which probably means also not optimal > > > because it is not done in hw. > > > > > This is not true for all emulated CPUID bits. X2APIC is emulated and it > > is preferable for a guest to use it for example. > > Right, and on boxes without X2APIC support you will not find > CPUID(1).ECX[21] set. kvm will tell you it is supported, though, which > means, kvm emulates it. > Yes, but it is "good" emulation. You want to have it for performance even if host does not have x2apic. x2apic is emulated no matter if host cpu has it or not. movbe is different, you want to emulate it only if your guest requires it. > > > Or, do you want to have a way to say with KVM_GET_SUPPORTED_CPUID that > > > "the features I'm reporting to you are those - a subset of them are not > > > optimally supported because I'm emulating them." > > > > > > Am I close? > > > > > Yes, you are. I am considering such interface. Adding new specialized > > interfaces is last resort though. I am open to other ideas. > > Hmm, ok, so basically we want qemu to query the host CPUID and see which > features are *really* supported, then do KVM_GET_SUPPORTED_CPUID and see > which are emulated. > > How's that? Too simple? As I said this will filter x2apic on machines which host cpu does not support it. This is not what we want. > > I probably am missing something as a novice qemu user. > > -- > Regards/Gruss, > Boris. > > Sent from a fat crate under my desk. Formatting is fine. > -- -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Apr 17, 2013 at 04:38:29PM +0300, Gleb Natapov wrote: > Yes, but it is "good" emulation. You want to have it for performance > even if host does not have x2apic. x2apic is emulated no matter if > host cpu has it or not. movbe is different, you want to emulate it > only if your guest requires it. It sounds to me you want to have feature bits which are *explicitly* enabled on the command line nandled differently in qemu. Let me elaborate: 1. x2apic is always enabled in qemu and will be reported through KVM_GET_SUPPORTED_CPUID, no matter whether emulated or not. 2. movbe, which got enabled on the command line through "-cpu n270,+movbe" will be enabled for this specific guest only. Anything else *without* "+movbe" on the command line, will not get the emulation. Those 2 categories or am I missing a third one? > > How's that? Too simple? > As I said this will filter x2apic on machines which host cpu does not > support it. This is not what we want. Right, so basically we want to handle features which were explicitly enabled only for this guest as private, only relevant to this particular guest run.
On Wed, Apr 17, 2013 at 04:02:00PM +0200, Borislav Petkov wrote: > Right, so basically we want to handle features which were explicitly > enabled only for this guest as private, only relevant to this > particular guest run. Ok, here are two more ideas Joerg and I had today during lunch: * reuse KVM_GET_SUPPORTED_CPUID we hand-in a struct kvm_cpuid_entry2 with ->function and respective bits in e[abcd]x set for each CPUID leaf we want to query kvm. Once in the kernel, we do the following: if ->function is not 0xffffffff, it means userspace wants us to look at the all set bits in the respective e[abcd]x members. For each set bit, we check whether we emulate the respective feature and if so, we leave it untouched before returning it to userspace. Otherwise, we clear it before OR-ing in the host bits and the good-emulated bits like x2apic. Yeah, semantics need to be handled carefully, but it has this knock-on-door aspect where kvm says that it actually emulates a feature only if asked, i.e. with the -cpu ...,+<feature> syntax. * new ioctl KVM_GET_EMULATED_CPUID Might be overkill and might be used only in a limited fashion since we don't want to emulate *every* feature in kvm. Hmmm. I kinda like the first one more while the second one is cleaner. Opinions?
On Fri, Apr 19, 2013 at 12:48:48AM +0200, Borislav Petkov wrote: > On Wed, Apr 17, 2013 at 04:02:00PM +0200, Borislav Petkov wrote: > > Right, so basically we want to handle features which were explicitly > > enabled only for this guest as private, only relevant to this > > particular guest run. > > Ok, > > here are two more ideas Joerg and I had today during lunch: > > * reuse KVM_GET_SUPPORTED_CPUID > > we hand-in a struct kvm_cpuid_entry2 with ->function and respective bits > in e[abcd]x set for each CPUID leaf we want to query kvm. > > Once in the kernel, we do the following: > > if ->function is not 0xffffffff, it means userspace wants us to look at > the all set bits in the respective e[abcd]x members. There may be userspaces that set ->function to 0xffffffff (just because they do not init the buffer before calling into the kernel) and this will break them. > > For each set bit, we check whether we emulate the respective feature > and if so, we leave it untouched before returning it to userspace. > Otherwise, we clear it before OR-ing in the host bits and the > good-emulated bits like x2apic. > > Yeah, semantics need to be handled carefully, but it has this > knock-on-door aspect where kvm says that it actually emulates a feature > only if asked, i.e. with the -cpu ...,+<feature> syntax. > > * new ioctl KVM_GET_EMULATED_CPUID > > Might be overkill and might be used only in a limited fashion since we > don't want to emulate *every* feature in kvm. > > Hmmm. I kinda like the first one more while the second one is cleaner. > The first one needs explicit userspace support to work correctly. This should be other way around: old userspace should do the right thing, but may not support new features, new userspace should be able to support new feature. We may extend KVM_GET_SUPPORTED_CPUID instead of adding new one. There is a padding field in kvm_cpuid2 that we could have reused as flags, but unfortunately current implementation does not error if the field is not zero, so if there is a userspace that does not zero the padding it may break. Another options is to reuse high bits of nent as flags (not very nice, but will work). -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Apr 21, 2013 at 12:46:51PM +0300, Gleb Natapov wrote: > There may be userspaces that set ->function to 0xffffffff (just > because they do not init the buffer before calling into the kernel) > and this will break them. You don't mean 0xffffffff here but rather something random which is not properly initialized and by chance is a valid CPUID leaf. Then, if some of the bits in the register variables e[abcd]x are also set, we return with emulated bits set. > > For each set bit, we check whether we emulate the respective feature > > and if so, we leave it untouched before returning it to userspace. > > Otherwise, we clear it before OR-ing in the host bits and the > > good-emulated bits like x2apic. > > > > Yeah, semantics need to be handled carefully, but it has this > > knock-on-door aspect where kvm says that it actually emulates a feature > > only if asked, i.e. with the -cpu ...,+<feature> syntax. > > > > * new ioctl KVM_GET_EMULATED_CPUID > > > > Might be overkill and might be used only in a limited fashion since we > > don't want to emulate *every* feature in kvm. > > > > Hmmm. I kinda like the first one more while the second one is cleaner. > > > The first one needs explicit userspace support to work correctly. This > should be other way around: old userspace should do the right thing, but may > not support new features, new userspace should be able to support new > feature. Crap, not even qemu is handing in cleared buffers with that g_malloc0() thing, AFAICT. > We may extend KVM_GET_SUPPORTED_CPUID instead of adding new one. There > is a padding field in kvm_cpuid2 that we could have reused as flags, That would've been lovely. > but unfortunately current implementation does not error if the field is > not zero, so if there is a userspace that does not zero the padding it may > break. Another options is to reuse high bits of nent as flags (not very > nice, but will work). Not nice?! It is a very nasty hack - that's what it is. :-) Frankly speaking, I'd rather prefer adding a new ioctl. Since old userspace won't support the new features, then we just as well can simply add the new ioctl. In all the cases we need to touch userspace - be it to OR in the flags into ->nent or to implement the new ioctl. So why not do it in a clean manner from the get-go? Hmmm.
On Tue, Apr 16, 2013 at 01:36:00PM +0200, Paolo Bonzini wrote: > Il 14/04/2013 23:02, Borislav Petkov ha scritto: > > *(u16 *)&ctxt->dst.val = swab16((u16)ctxt->src.val); > > > > movzwl 112(%rdi), %eax # ctxt_5(D)->src.D.27823.val, tmp82 > > rolw $8, %ax #, tmp82 > > movw %ax, 240(%rdi) # tmp82, MEM[(u16 *)ctxt_5(D) + 240B] > > I think this breaks the C aliasing rules. > > Using valptr is okay because it is a char. Yep, good catch. We normally build with -fno-strict-aliasing but when I change that, gcc catches it: arch/x86/kvm/emulate.c: In function ‘em_movbe’: arch/x86/kvm/emulate.c:3121:3: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing] However, it screams even louder about the valptr variant *(u16 *)ctxt->dst.valptr = swab16(*(u16 *)ctxt->src.valptr); too: arch/x86/kvm/emulate.c: In function ‘em_movbe’: arch/x86/kvm/emulate.c:3117:3: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing] arch/x86/kvm/emulate.c:3117:3: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing] arch/x86/kvm/emulate.c:3117:3: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing] arch/x86/kvm/emulate.c:3117:3: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing] arch/x86/kvm/emulate.c:3117:3: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing] We probably need something with copying values to a temp variable or so. Thanks.
On Sun, Apr 21, 2013 at 01:46:50PM +0200, Borislav Petkov wrote:
> We probably need something with copying values to a temp variable or so.
Basically something like that:
case 2:
/*
* From MOVBE definition: "...When the operand size is 16 bits,
* the upper word of the destination register remains unchanged
* ..."
*
* Both casting ->valptr and ->val to u16 breaks strict aliasing
* rules so we have to do the operation almost per hand.
*/
tmp = (u16)ctxt->src.val;
ctxt->dst.val &= ~0xffffUL;
ctxt->dst.val |= (unsigned long)swab16(tmp);
break;
This passes all gcc checks, even the stricter ones when building with W=3.
On Sun, Apr 21, 2013 at 01:30:51PM +0200, Borislav Petkov wrote: > On Sun, Apr 21, 2013 at 12:46:51PM +0300, Gleb Natapov wrote: > > There may be userspaces that set ->function to 0xffffffff (just > > because they do not init the buffer before calling into the kernel) > > and this will break them. > > You don't mean 0xffffffff here but rather something random which is not > properly initialized and by chance is a valid CPUID leaf. Then, if some > of the bits in the register variables e[abcd]x are also set, we return > with emulated bits set. > Yes. > > > For each set bit, we check whether we emulate the respective feature > > > and if so, we leave it untouched before returning it to userspace. > > > Otherwise, we clear it before OR-ing in the host bits and the > > > good-emulated bits like x2apic. > > > > > > Yeah, semantics need to be handled carefully, but it has this > > > knock-on-door aspect where kvm says that it actually emulates a feature > > > only if asked, i.e. with the -cpu ...,+<feature> syntax. > > > > > > * new ioctl KVM_GET_EMULATED_CPUID > > > > > > Might be overkill and might be used only in a limited fashion since we > > > don't want to emulate *every* feature in kvm. > > > > > > Hmmm. I kinda like the first one more while the second one is cleaner. > > > > > The first one needs explicit userspace support to work correctly. This > > should be other way around: old userspace should do the right thing, but may > > not support new features, new userspace should be able to support new > > feature. > > Crap, not even qemu is handing in cleared buffers with that g_malloc0() > thing, AFAICT. > > > We may extend KVM_GET_SUPPORTED_CPUID instead of adding new one. There > > is a padding field in kvm_cpuid2 that we could have reused as flags, > > That would've been lovely. > > > but unfortunately current implementation does not error if the field is > > not zero, so if there is a userspace that does not zero the padding it may > > break. Another options is to reuse high bits of nent as flags (not very > > nice, but will work). > > Not nice?! It is a very nasty hack - that's what it is. :-) > Agree, but not nastier than expecting ->function to have special value :) > Frankly speaking, I'd rather prefer adding a new ioctl. Since old > userspace won't support the new features, then we just as well can > simply add the new ioctl. > > In all the cases we need to touch userspace - be it to OR in the flags > into ->nent or to implement the new ioctl. So why not do it in a clean > manner from the get-go? > Agree here too. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Il 21/04/2013 14:23, Borislav Petkov ha scritto: > On Sun, Apr 21, 2013 at 01:46:50PM +0200, Borislav Petkov wrote: >> We probably need something with copying values to a temp variable or so. > > Basically something like that: > > case 2: > /* > * From MOVBE definition: "...When the operand size is 16 bits, > * the upper word of the destination register remains unchanged > * ..." > * > * Both casting ->valptr and ->val to u16 breaks strict aliasing > * rules so we have to do the operation almost per hand. > */ > tmp = (u16)ctxt->src.val; > ctxt->dst.val &= ~0xffffUL; > ctxt->dst.val |= (unsigned long)swab16(tmp); > break; > > This passes all gcc checks, even the stricter ones when building with W=3. I thought the valptr one was ok. I find this one more readable, too. How does the generated code look like? Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 22, 2013 at 10:53:42AM +0200, Paolo Bonzini wrote: > Il 21/04/2013 14:23, Borislav Petkov ha scritto: > > On Sun, Apr 21, 2013 at 01:46:50PM +0200, Borislav Petkov wrote: > >> We probably need something with copying values to a temp variable or so. > > > > Basically something like that: > > > > case 2: > > /* > > * From MOVBE definition: "...When the operand size is 16 bits, > > * the upper word of the destination register remains unchanged > > * ..." > > * > > * Both casting ->valptr and ->val to u16 breaks strict aliasing > > * rules so we have to do the operation almost per hand. > > */ > > tmp = (u16)ctxt->src.val; > > ctxt->dst.val &= ~0xffffUL; > > ctxt->dst.val |= (unsigned long)swab16(tmp); > > break; > > > > This passes all gcc checks, even the stricter ones when building with W=3. > > I thought the valptr one was ok. Yep, it looked like that too. And, it could actually really be ok and the gcc's warning here is bogus. I'll try to talk to gcc people about it. > I find this one more readable, too. How does the generated code look > like? Well, so so: movzwl 112(%rdi), %eax # ctxt_5(D)->src.D.27823.val, tmp87 movq 240(%rdi), %rdx # ctxt_5(D)->dst.D.27823.val, tmp89 xorw %dx, %dx # tmp89 rolw $8, %ax #, tmp87 movzwl %ax, %eax # tmp87, tmp91 I have hard time understanding why it is adding this insn here - it can simply drop it and continue with the 64-bit OR. It's not like it changes anything... orq %rdx, %rax # tmp89, tmp91 movq %rax, 240(%rdi) # tmp91, ctxt_5(D)->dst.D.27823.val Btw, I wanted to ask: when kvm commits the results, does it look at ctxt->op_bytes to know exactly how many bytes to write to the guest? Because if it does, we can save ourselves the trouble here. Or does it simply write both the full sizeof(unsigned long) bytes of ->src.val and ->dst.val to the guest? Thanks.
On Mon, Apr 22, 2013 at 11:38:10AM +0200, Borislav Petkov wrote: > On Mon, Apr 22, 2013 at 10:53:42AM +0200, Paolo Bonzini wrote: > > Il 21/04/2013 14:23, Borislav Petkov ha scritto: > > > On Sun, Apr 21, 2013 at 01:46:50PM +0200, Borislav Petkov wrote: > > >> We probably need something with copying values to a temp variable or so. > > > > > > Basically something like that: > > > > > > case 2: > > > /* > > > * From MOVBE definition: "...When the operand size is 16 bits, > > > * the upper word of the destination register remains unchanged > > > * ..." > > > * > > > * Both casting ->valptr and ->val to u16 breaks strict aliasing > > > * rules so we have to do the operation almost per hand. > > > */ > > > tmp = (u16)ctxt->src.val; > > > ctxt->dst.val &= ~0xffffUL; > > > ctxt->dst.val |= (unsigned long)swab16(tmp); > > > break; > > > > > > This passes all gcc checks, even the stricter ones when building with W=3. > > > > I thought the valptr one was ok. > > Yep, it looked like that too. And, it could actually really be ok and > the gcc's warning here is bogus. I'll try to talk to gcc people about > it. > > > I find this one more readable, too. How does the generated code look > > like? > > Well, so so: > > movzwl 112(%rdi), %eax # ctxt_5(D)->src.D.27823.val, tmp87 > movq 240(%rdi), %rdx # ctxt_5(D)->dst.D.27823.val, tmp89 > xorw %dx, %dx # tmp89 > rolw $8, %ax #, tmp87 > movzwl %ax, %eax # tmp87, tmp91 > > I have hard time understanding why it is adding this insn here - it can > simply drop it and continue with the 64-bit OR. It's not like it changes > anything... > > orq %rdx, %rax # tmp89, tmp91 > movq %rax, 240(%rdi) # tmp91, ctxt_5(D)->dst.D.27823.val > > Btw, I wanted to ask: when kvm commits the results, does it look at > ctxt->op_bytes to know exactly how many bytes to write to the guest? > Because if it does, we can save ourselves the trouble here. > > Or does it simply write both the full sizeof(unsigned long) bytes of > ->src.val and ->dst.val to the guest? > No, it does this in case of register operand: static void write_register_operand(struct operand *op) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ switch (op->bytes) { case 1: *(u8 *)op->addr.reg = (u8)op->val; break; case 2: *(u16 *)op->addr.reg = (u16)op->val; break; case 4: *op->addr.reg = (u32)op->val; break; /* 64b: zero-extend */ case 8: *op->addr.reg = op->val; break; } } -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 22, 2013 at 12:42:46PM +0300, Gleb Natapov wrote: > > Btw, I wanted to ask: when kvm commits the results, does it look at > > ctxt->op_bytes to know exactly how many bytes to write to the guest? > > Because if it does, we can save ourselves the trouble here. > > > > Or does it simply write both the full sizeof(unsigned long) bytes of > > ->src.val and ->dst.val to the guest? > > > No, it does this in case of register operand: > > static void write_register_operand(struct operand *op) > { > /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ > switch (op->bytes) { > case 1: > *(u8 *)op->addr.reg = (u8)op->val; > break; > case 2: > *(u16 *)op->addr.reg = (u16)op->val; > break; > case 4: > *op->addr.reg = (u32)op->val; > break; /* 64b: zero-extend */ > case 8: > *op->addr.reg = op->val; > break; > } > } Ok, and for OP_MEM it does look at ctxt->dst.bytes in writeback(), AFAICT. And I see other emulated instructions like POPF, for example, do this: ctxt->dst.bytes = ctxt->op_bytes; Which means, we can drop all the bullshit in em_movbe and even destroy some of the bytes in dst.val but only write out the correct ones. Which means, a simpler code and a lot less jumping through hoops. Would that be the more accepted practice? Thanks.
On Mon, Apr 22, 2013 at 11:52:03AM +0200, Borislav Petkov wrote: > On Mon, Apr 22, 2013 at 12:42:46PM +0300, Gleb Natapov wrote: > > > Btw, I wanted to ask: when kvm commits the results, does it look at > > > ctxt->op_bytes to know exactly how many bytes to write to the guest? > > > Because if it does, we can save ourselves the trouble here. > > > > > > Or does it simply write both the full sizeof(unsigned long) bytes of > > > ->src.val and ->dst.val to the guest? > > > > > No, it does this in case of register operand: > > > > static void write_register_operand(struct operand *op) > > { > > /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ > > switch (op->bytes) { > > case 1: > > *(u8 *)op->addr.reg = (u8)op->val; > > break; > > case 2: > > *(u16 *)op->addr.reg = (u16)op->val; > > break; > > case 4: > > *op->addr.reg = (u32)op->val; > > break; /* 64b: zero-extend */ > > case 8: > > *op->addr.reg = op->val; > > break; > > } > > } > > Ok, and for OP_MEM it does look at ctxt->dst.bytes in writeback(), > AFAICT. And I see other emulated instructions like POPF, for example, do > this: > > ctxt->dst.bytes = ctxt->op_bytes; > > Which means, we can drop all the bullshit in em_movbe and even destroy > some of the bytes in dst.val but only write out the correct ones. Which > means, a simpler code and a lot less jumping through hoops. > > Would that be the more accepted practice? > For most instructions the decoder already sets op->bytes to correct value, given that all flags a correctly specified in opcode table. Explicit op->bytes setting should be done only if it cannot be expressed by opcode flags. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 22, 2013 at 12:58:12PM +0300, Gleb Natapov wrote: > For most instructions the decoder already sets op->bytes to correct > value, given that all flags a correctly specified in opcode table. > Explicit op->bytes setting should be done only if it cannot be > expressed by opcode flags. MOVBE encodes operands in ModRM and operand size is determined by the effective operand size. By looking at that switch(mode) thing near the beginning of x86_decode_insn, we make sure ctxt->op_bytes is set accordingly. Then, we have the following definitions for MOVBE: + [0xf0] = I(DstReg | SrcMem | ModRM | Mov | ThreeByte | EmulateOnUD, em_movbe), + [0xf1] = I(DstMem | SrcReg | ModRM | Mov | ThreeByte | EmulateOnUD, em_movbe), and from looking at decode_operand(), it makes sure that op->bytes gets the correct value since we have the proper {Src,Dst}{Reg,Mem} flags in the insn definition. So everything is fine, I'll make sure it works that way too, though, when testing. Thanks.
On Sun, Apr 21, 2013 at 03:51:16PM +0300, Gleb Natapov wrote: > > Not nice?! It is a very nasty hack - that's what it is. :-) > > > Agree, but not nastier than expecting ->function to have special value :) Probably of the same nastiness level :-) > > Frankly speaking, I'd rather prefer adding a new ioctl. Since old > > userspace won't support the new features, then we just as well can > > simply add the new ioctl. > > > > In all the cases we need to touch userspace - be it to OR in the flags > > into ->nent or to implement the new ioctl. So why not do it in a clean > > manner from the get-go? > > > Agree here too. Btw, in thinking about this more, I'm kinda sceptical we want to use the CPUID layout for this new KVM_GET_EMULATED_* ioctl. And the reason why I'm sceptical is that not every instruction is behind a CPUID capability bit and if we want to tell userspace that we do emulate any insn, even one for which there's no CPUID bit, we're going to have to either simulate a kvm-specific CPUID leaf or, maybe even better, come up with our own format for emulated capabilities. Maybe a bit vector with set bits for the respective capability, or something more nifty. In any case, it doesn't really need to be CPUID-like, IMHO. Hmmm...
On 04/23/2013 04:41 PM, Borislav Petkov wrote: > > Btw, in thinking about this more, I'm kinda sceptical we want to use the > CPUID layout for this new KVM_GET_EMULATED_* ioctl. And the reason why > I'm sceptical is that not every instruction is behind a CPUID capability > bit and if we want to tell userspace that we do emulate any insn, even > one for which there's no CPUID bit, we're going to have to either > simulate a kvm-specific CPUID leaf or, maybe even better, come up with > our own format for emulated capabilities. Maybe a bit vector with set > bits for the respective capability, or something more nifty. > > In any case, it doesn't really need to be CPUID-like, IMHO. > Using CPUID has the major advantage that it is well-defined. -hpa -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Apr 23, 2013 at 04:50:43PM -0700, H. Peter Anvin wrote: > On 04/23/2013 04:41 PM, Borislav Petkov wrote: > > > > Btw, in thinking about this more, I'm kinda sceptical we want to use the > > CPUID layout for this new KVM_GET_EMULATED_* ioctl. And the reason why > > I'm sceptical is that not every instruction is behind a CPUID capability > > bit and if we want to tell userspace that we do emulate any insn, even > > one for which there's no CPUID bit, we're going to have to either > > simulate a kvm-specific CPUID leaf or, maybe even better, come up with > > our own format for emulated capabilities. Maybe a bit vector with set > > bits for the respective capability, or something more nifty. > > > > In any case, it doesn't really need to be CPUID-like, IMHO. > > > > Using CPUID has the major advantage that it is well-defined. > This. And I really hope vendors will not add instructions without corespondent CPUID bits nowadays. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Apr 23, 2013 at 04:50:43PM -0700, H. Peter Anvin wrote: > On 04/23/2013 04:41 PM, Borislav Petkov wrote: > > > > Btw, in thinking about this more, I'm kinda sceptical we want to use the > > CPUID layout for this new KVM_GET_EMULATED_* ioctl. And the reason why > > I'm sceptical is that not every instruction is behind a CPUID capability > > bit and if we want to tell userspace that we do emulate any insn, even > > one for which there's no CPUID bit, we're going to have to either > > simulate a kvm-specific CPUID leaf or, maybe even better, come up with > > our own format for emulated capabilities. Maybe a bit vector with set > > bits for the respective capability, or something more nifty. > > > > In any case, it doesn't really need to be CPUID-like, IMHO. > > > Using CPUID has the major advantage that it is well-defined. Right, and it is actually a tree of bitvectors of height of 1. :-) However, in the remote case where we want to state an emulated capability for which there is no CPUID bit, we probably need something. From looking at do_cpuid_ent(), kvm uses the 0x40000000 so we probably could define such a flag in there. If needed, of course. So KVM_GET_EMULATED_CPUID it is. :-)
On Mon, Apr 22, 2013 at 11:38:10AM +0200, Borislav Petkov wrote: > On Mon, Apr 22, 2013 at 10:53:42AM +0200, Paolo Bonzini wrote: > > Il 21/04/2013 14:23, Borislav Petkov ha scritto: > > > On Sun, Apr 21, 2013 at 01:46:50PM +0200, Borislav Petkov wrote: > > >> We probably need something with copying values to a temp variable or so. > > > > > > Basically something like that: > > > > > > case 2: > > > /* > > > * From MOVBE definition: "...When the operand size is 16 bits, > > > * the upper word of the destination register remains unchanged > > > * ..." > > > * > > > * Both casting ->valptr and ->val to u16 breaks strict aliasing > > > * rules so we have to do the operation almost per hand. > > > */ > > > tmp = (u16)ctxt->src.val; > > > ctxt->dst.val &= ~0xffffUL; > > > ctxt->dst.val |= (unsigned long)swab16(tmp); > > > break; > > > > > > This passes all gcc checks, even the stricter ones when building with W=3. > > > > I thought the valptr one was ok. > > Yep, it looked like that too. And, it could actually really be ok and > the gcc's warning here is bogus. I'll try to talk to gcc people about > it. Ok, I did and here's the explanation, as far as I understood it. Micha, please correct me if I'm talking bullsh*t. So basically, gcc screams because there's a type incompatibility according to the ICO C standard. IOW, valptr is declared as char[] and we are casting it to "unsigned short *" and then dereffing it, and both types are not compatible. So, I'm looking at http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf, section 6.2.7 says "Two types have compatible type if their types are the same." Then, section "6.7.2 Type specifiers" talks about the different types and on the next page, in sentence 5 it says: "Each of the comma-separated multisets designates the same type,... " And, no wonder there, char and unsigned short are in different multisets so... So, what gcc actually warns about is, something which has been declared as char[] should not be subsequently accessed through "unsigned short *" because the two types are incompatible. No wonder we're building the kernel with -fno-strict-aliasing :).
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a20ecb5b6cbf..2d44fc4fd855 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -273,7 +273,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, cpuid_mask(&entry->ecx, 4); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ - entry->ecx |= F(X2APIC); + entry->ecx |= F(X2APIC) | F(MOVBE); break; /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a335cc6cde72..9011c7a656ad 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -152,6 +152,7 @@ #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ +#define EmulateOnUD ((u64)1 << 46) /* emulate if unsupported by the host */ #define X2(x...) x, x #define X3(x...) X2(x), x @@ -3107,6 +3108,30 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movbe(struct x86_emulate_ctxt *ctxt) +{ + switch (ctxt->op_bytes) { + case 2: + *(u16 *)ctxt->dst.valptr = swab16(*(u16 *)ctxt->src.valptr); + break; + case 4: + *(u32 *)ctxt->dst.valptr = swab32(*(u32 *)ctxt->src.valptr); + + /* + * clear upper dword for 32-bit operand size in 64-bit mode. + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + *((u32 *)ctxt->dst.valptr + 1) = 0x0; + break; + case 8: + *(u64 *)ctxt->dst.valptr = swab64(*(u64 *)ctxt->src.valptr); + break; + default: + return X86EMUL_PROPAGATE_FAULT; + } + return X86EMUL_CONTINUE; +} + static int em_cr_write(struct x86_emulate_ctxt *ctxt) { if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) @@ -4033,6 +4058,11 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; +static const struct opcode threebyte_table[] = { + [0xf0] = I(DstReg | SrcMem | ModRM | Mov | EmulateOnUD, em_movbe), + [0xf1] = I(DstMem | SrcReg | ModRM | Mov | EmulateOnUD, em_movbe), +}; + #undef D #undef N #undef G @@ -4320,6 +4350,9 @@ done_prefixes: ctxt->twobyte = 1; ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; + + if (ctxt->b == 0x38) + opcode = threebyte_table[insn_fetch(u8, ctxt)]; } ctxt->d = opcode.flags; @@ -4376,8 +4409,10 @@ done_prefixes: if (ctxt->d == 0 || (ctxt->d & Undefined)) return EMULATION_FAILED; - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) - return EMULATION_FAILED; + if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) { + if (!(ctxt->d & EmulateOnUD)) + return EMULATION_FAILED; + } if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) ctxt->op_bytes = 8;