Message ID | 57D1806F020000780010D18F@prv-mh.provo.novell.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 08/09/16 14:14, Jan Beulich wrote: > @@ -89,141 +54,96 @@ static unsigned long svm_nextrip_insn_le > return vmcb->nextrip - vmcb->rip; > } > > -/* First byte: Length. Following bytes: Opcode bytes. */ > -#define MAKE_INSTR(nm, ...) static const u8 OPCODE_##nm[] = { __VA_ARGS__ } > -MAKE_INSTR(INVD, 2, 0x0f, 0x08); > -MAKE_INSTR(WBINVD, 2, 0x0f, 0x09); > -MAKE_INSTR(CPUID, 2, 0x0f, 0xa2); > -MAKE_INSTR(RDMSR, 2, 0x0f, 0x32); > -MAKE_INSTR(WRMSR, 2, 0x0f, 0x30); > -MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9); > -MAKE_INSTR(HLT, 1, 0xf4); > -MAKE_INSTR(INT3, 1, 0xcc); > -MAKE_INSTR(RDTSC, 2, 0x0f, 0x31); > -MAKE_INSTR(PAUSE, 1, 0x90); > -MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1); > -MAKE_INSTR(VMRUN, 3, 0x0f, 0x01, 0xd8); > -MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda); > -MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb); > -MAKE_INSTR(STGI, 3, 0x0f, 0x01, 0xdc); > -MAKE_INSTR(CLGI, 3, 0x0f, 0x01, 0xdd); > -MAKE_INSTR(INVLPGA,3, 0x0f, 0x01, 0xdf); > - > -static const u8 *const opc_bytes[INSTR_MAX_COUNT] = > -{ > - [INSTR_INVD] = OPCODE_INVD, > - [INSTR_WBINVD] = OPCODE_WBINVD, > - [INSTR_CPUID] = OPCODE_CPUID, > - [INSTR_RDMSR] = OPCODE_RDMSR, > - [INSTR_WRMSR] = OPCODE_WRMSR, > - [INSTR_VMCALL] = OPCODE_VMCALL, > - [INSTR_HLT] = OPCODE_HLT, > - [INSTR_INT3] = OPCODE_INT3, > - [INSTR_RDTSC] = OPCODE_RDTSC, > - [INSTR_PAUSE] = OPCODE_PAUSE, > - [INSTR_XSETBV] = OPCODE_XSETBV, > - [INSTR_VMRUN] = OPCODE_VMRUN, > - [INSTR_VMLOAD] = OPCODE_VMLOAD, > - [INSTR_VMSAVE] = OPCODE_VMSAVE, > - [INSTR_STGI] = OPCODE_STGI, > - [INSTR_CLGI] = OPCODE_CLGI, > - [INSTR_INVLPGA] = OPCODE_INVLPGA, > +static const struct { > + unsigned int opcode; > + struct { > + unsigned int rm:3; > + unsigned int reg:3; > + unsigned int mod:2; > +#define MODRM(mod, reg, rm) { rm, reg, mod } > + } modrm; > +} const opc_tab[INSTR_MAX_COUNT] = { > + [INSTR_PAUSE] = { X86EMUL_OPC_F3(0, 0x90) }, > + [INSTR_INT3] = { X86EMUL_OPC( 0, 0xcc) }, > + [INSTR_HLT] = { X86EMUL_OPC( 0, 0xf4) }, > + [INSTR_XSETBV] = { 
X86EMUL_OPC(0x0f, 0x01), MODRM(3, 2, 1) }, > + [INSTR_VMRUN] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 0) }, > + [INSTR_VMCALL] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 1) }, > + [INSTR_VMLOAD] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 2) }, > + [INSTR_VMSAVE] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 3) }, > + [INSTR_STGI] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 4) }, > + [INSTR_CLGI] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 5) }, > + [INSTR_INVLPGA] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 7) }, > + [INSTR_INVD] = { X86EMUL_OPC(0x0f, 0x08) }, > + [INSTR_WBINVD] = { X86EMUL_OPC(0x0f, 0x09) }, > + [INSTR_WRMSR] = { X86EMUL_OPC(0x0f, 0x30) }, > + [INSTR_RDTSC] = { X86EMUL_OPC(0x0f, 0x31) }, > + [INSTR_RDMSR] = { X86EMUL_OPC(0x0f, 0x32) }, > + [INSTR_CPUID] = { X86EMUL_OPC(0x0f, 0xa2) }, > }; > > -static bool_t fetch(const struct vmcb_struct *vmcb, u8 *buf, > - unsigned long addr, unsigned int len) > -{ > - uint32_t pfec = (vmcb_get_cpl(vmcb) == 3) ? PFEC_user_mode : 0; > - > - switch ( hvm_fetch_from_guest_virt(buf, addr, len, pfec) ) > - { > - case HVMCOPY_okay: > - break; > - case HVMCOPY_bad_gva_to_gfn: > - /* OK just to give up; we'll have injected #PF already */ > - return 0; > - default: > - /* Not OK: fetches from non-RAM pages are not supportable. 
*/ > - gdprintk(XENLOG_WARNING, "Bad instruction fetch at %#lx (%#lx)\n", > - vmcb->rip, addr); > - hvm_inject_hw_exception(TRAP_gp_fault, 0); > - return 0; > - } > - return 1; > -} > - > int __get_instruction_length_from_list(struct vcpu *v, > const enum instruction_index *list, unsigned int list_count) > { > struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; > - unsigned int i, j, inst_len = 0; > - enum instruction_index instr = 0; > - u8 buf[MAX_INST_LEN]; > - const u8 *opcode = NULL; > - unsigned long fetch_addr, fetch_limit; > - unsigned int fetch_len, max_len; > + struct hvm_emulate_ctxt ctxt; > + struct x86_emulate_state *state; > + unsigned int inst_len, j, modrm_rm, modrm_reg; > + int modrm_mod; > > +#ifdef NDEBUG Presumably this is just for your testing? > if ( (inst_len = svm_nextrip_insn_length(v)) != 0 ) > return inst_len; > > if ( vmcb->exitcode == VMEXIT_IOIO ) > return vmcb->exitinfo2 - vmcb->rip; > +#endif > > - /* Fetch up to the next page break; we'll fetch from the next page > - * later if we have to. 
*/ > - fetch_addr = svm_rip2pointer(v, &fetch_limit); > - if ( vmcb->rip > fetch_limit ) > - return 0; > - max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL); > - fetch_len = min_t(unsigned int, max_len, > - PAGE_SIZE - (fetch_addr & ~PAGE_MASK)); > - if ( !fetch(vmcb, buf, fetch_addr, fetch_len) ) > + ASSERT(v == current); > + hvm_emulate_prepare(&ctxt, guest_cpu_user_regs()); > + hvm_emulate_init(&ctxt, NULL, 0); > + state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch); > + if ( IS_ERR_OR_NULL(state) ) > return 0; > > - while ( (inst_len < max_len) && is_prefix(buf[inst_len]) ) > - { > - inst_len++; > - if ( inst_len >= fetch_len ) > - { > - if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len, > - max_len - fetch_len) ) > - return 0; > - fetch_len = max_len; > - } > + inst_len = x86_insn_length(state, &ctxt.ctxt); > + modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg); > + x86_emulate_free_state(state); From an API point of view, it is weird to have x86_emulate_free_state() without a matching allocation function. Perhaps that is just me. However, the x86_insn_modrm() API is definitely more weird. Wouldn't it be more natural to take optional pointers for the mod, rm and reg parts individually? > --- a/xen/arch/x86/x86_emulate/x86_emulate.c > +++ b/xen/arch/x86/x86_emulate/x86_emulate.c > @@ -382,7 +382,7 @@ struct operand { > } mem; > }; > #ifdef __x86_64__ > -#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */ > +#define REG_POISON ((void *)0x8086000000008086UL) /* non-canonical */ > #else > #define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */ > #endif Given that these are now used for general pointer poisoning, they should be renamed. There are only 3 instances. > @@ -1631,6 +1631,10 @@ struct x86_emulate_state { > > unsigned long eip; > struct cpu_user_regs *regs; > + > +#ifndef NDEBUG > + void *caller; > +#endif Perhaps worth a comment here? Its purpose is rather opaque. 
> }; > > /* Helper definitions. */ > @@ -1658,6 +1662,11 @@ x86_decode_base( > > switch ( ctxt->opcode ) > { > + case 0x90: /* nop / pause */ > + if ( repe_prefix() ) > + ctxt->opcode |= X86EMUL_OPC_F3(0, 0); > + break; Why is it necessary to special case the rep prefix handling in this case? > +int > +x86_insn_modrm(const struct x86_emulate_state *state, > + unsigned int *rm, unsigned int *reg) > +{ > + check_state(state); > + > + if ( !(state->desc & ModRM) ) > + return -EINVAL; > + > + if ( rm ) > + *rm = state->modrm_rm; > + if ( reg ) > + *reg = state->modrm_reg; > + > + return state->modrm_mod; > +} > + > +unsigned int > +x86_insn_length(const struct x86_emulate_state *state, > + const struct x86_emulate_ctxt *ctxt) > +{ > + check_state(state); > + > + return state->eip - ctxt->regs->eip; Is it worth stashing a starting eip? This calculation will go wrong after the emulated state has been committed. ~Andrew
>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote: > On 08/09/16 14:14, Jan Beulich wrote: >> int __get_instruction_length_from_list(struct vcpu *v, >> const enum instruction_index *list, unsigned int list_count) >> { >> struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; >> - unsigned int i, j, inst_len = 0; >> - enum instruction_index instr = 0; >> - u8 buf[MAX_INST_LEN]; >> - const u8 *opcode = NULL; >> - unsigned long fetch_addr, fetch_limit; >> - unsigned int fetch_len, max_len; >> + struct hvm_emulate_ctxt ctxt; >> + struct x86_emulate_state *state; >> + unsigned int inst_len, j, modrm_rm, modrm_reg; >> + int modrm_mod; >> >> +#ifdef NDEBUG > > Presumably this is just for your testing? No, I actually meant it to stay that way. Along the lines of the extra debugging code we have in map_domain_page(). >> if ( (inst_len = svm_nextrip_insn_length(v)) != 0 ) >> return inst_len; >> >> if ( vmcb->exitcode == VMEXIT_IOIO ) >> return vmcb->exitinfo2 - vmcb->rip; >> +#endif >> >> - /* Fetch up to the next page break; we'll fetch from the next page >> - * later if we have to. 
*/ >> - fetch_addr = svm_rip2pointer(v, &fetch_limit); >> - if ( vmcb->rip > fetch_limit ) >> - return 0; >> - max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL); >> - fetch_len = min_t(unsigned int, max_len, >> - PAGE_SIZE - (fetch_addr & ~PAGE_MASK)); >> - if ( !fetch(vmcb, buf, fetch_addr, fetch_len) ) >> + ASSERT(v == current); >> + hvm_emulate_prepare(&ctxt, guest_cpu_user_regs()); >> + hvm_emulate_init(&ctxt, NULL, 0); >> + state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch); >> + if ( IS_ERR_OR_NULL(state) ) >> return 0; >> >> - while ( (inst_len < max_len) && is_prefix(buf[inst_len]) ) >> - { >> - inst_len++; >> - if ( inst_len >= fetch_len ) >> - { >> - if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len, >> - max_len - fetch_len) ) >> - return 0; >> - fetch_len = max_len; >> - } >> + inst_len = x86_insn_length(state, &ctxt.ctxt); >> + modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg); >> + x86_emulate_free_state(state); > > From an API point of view, it is weird to have x86_emulate_free_state() > without a matching allocation function. Perhaps that is just me. With x86_decode_insn() returning the state, that to me _is_ the allocation function. > However, the x86_insn_modrm() API is definitely more weird. Wouldn't it > be more natural to take optional pointers for the mod, rm and reg parts > individually? I could change it to that, but I did it this way because without mod at least rm is meaningless. Or said differently, I can't really see there being a caller not caring about mod. >> --- a/xen/arch/x86/x86_emulate/x86_emulate.c >> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c >> @@ -382,7 +382,7 @@ struct operand { >> } mem; >> }; >> #ifdef __x86_64__ >> -#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */ >> +#define REG_POISON ((void *)0x8086000000008086UL) /* non-canonical */ >> #else >> #define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. 
*/ >> #endif > > Given that these are now used for general pointer poisoning, they should > be renamed. There are only 3 instances. Okay. I'll make them PTR_POISON then. >> @@ -1658,6 +1662,11 @@ x86_decode_base( >> >> switch ( ctxt->opcode ) >> { >> + case 0x90: /* nop / pause */ >> + if ( repe_prefix() ) >> + ctxt->opcode |= X86EMUL_OPC_F3(0, 0); >> + break; > > Why is it necessary to special case the rep prefix handling in this case? Because SVM's pause intercept should not mistakenly also accept a > plain NOP. >> +unsigned int >> +x86_insn_length(const struct x86_emulate_state *state, >> + const struct x86_emulate_ctxt *ctxt) >> +{ >> + check_state(state); >> + >> + return state->eip - ctxt->regs->eip; > > Is it worth stashing a starting eip? This calculation will go wrong > after the emulated state has been committed. This function (taking a state parameter) can't be called by users of x86_emulate(), and I don't think we need to cater for callers committing state themselves - they should clearly use the result of this function for what to commit in the first place. Jan
On 15/09/16 07:55, Jan Beulich wrote: >>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote: >> On 08/09/16 14:14, Jan Beulich wrote: >>> int __get_instruction_length_from_list(struct vcpu *v, >>> const enum instruction_index *list, unsigned int list_count) >>> { >>> struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; >>> - unsigned int i, j, inst_len = 0; >>> - enum instruction_index instr = 0; >>> - u8 buf[MAX_INST_LEN]; >>> - const u8 *opcode = NULL; >>> - unsigned long fetch_addr, fetch_limit; >>> - unsigned int fetch_len, max_len; >>> + struct hvm_emulate_ctxt ctxt; >>> + struct x86_emulate_state *state; >>> + unsigned int inst_len, j, modrm_rm, modrm_reg; >>> + int modrm_mod; >>> >>> +#ifdef NDEBUG >> Presumably this is just for your testing? > No, I actually meant it to stay that way. Along the lines of the extra > debugging code we have in map_domain_page(). I was never very happy with the older version of this debugging. Surely in a case like this, we should use the intercept information when available, and check it against the emulator in a debug build. That way, we don't entirely change the underlying logic in this function between a debug and non debug build. >>> @@ -1658,6 +1662,11 @@ x86_decode_base( >>> >>> switch ( ctxt->opcode ) >>> { >>> + case 0x90: /* nop / pause */ >>> + if ( repe_prefix() ) >>> + ctxt->opcode |= X86EMUL_OPC_F3(0, 0); >>> + break; >> Why is it necessary to special case the rep prefix handling in this case? > Because SVM's pause intercept should not mistakenly also accept a > plain NOP. How about a comment to this effect: /* Note: Prefixes such as rep/repne are only encoded when semantically meaningful to the instruction, to reduce the complexity of interpreting this opcode representation. */ ~Andrew
>>> On 27.09.16 at 15:42, <andrew.cooper3@citrix.com> wrote: > On 15/09/16 07:55, Jan Beulich wrote: >>>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote: >>> On 08/09/16 14:14, Jan Beulich wrote: >>>> int __get_instruction_length_from_list(struct vcpu *v, >>>> const enum instruction_index *list, unsigned int list_count) >>>> { >>>> struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; >>>> - unsigned int i, j, inst_len = 0; >>>> - enum instruction_index instr = 0; >>>> - u8 buf[MAX_INST_LEN]; >>>> - const u8 *opcode = NULL; >>>> - unsigned long fetch_addr, fetch_limit; >>>> - unsigned int fetch_len, max_len; >>>> + struct hvm_emulate_ctxt ctxt; >>>> + struct x86_emulate_state *state; >>>> + unsigned int inst_len, j, modrm_rm, modrm_reg; >>>> + int modrm_mod; >>>> >>>> +#ifdef NDEBUG >>> Presumably this is just for your testing? >> No, I actually meant it to stay that way. Along the lines of the extra >> debugging code we have in map_domain_page(). > > I was never very happy with the older version of this debugging. Surely > in a case like this, we should use the intercept information when > available, and check it against the emulator in a debug build. > > That way, we don't entirely change the underlying logic in this function > between a debug and non debug build. But that is exactly what the code is doing: #ifndef NDEBUG if ( vmcb->exitcode == VMEXIT_IOIO ) j = vmcb->exitinfo2 - vmcb->rip; else j = svm_nextrip_insn_length(v); if ( j && j != inst_len ) { gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n", ctxt.ctxt.opcode, inst_len, j); return j; } #endif I.e. in case of a mismatch we use the data from hardware, plus a message gets logged. In case of a match we further exercise the opcode lookup logic, which non-debug builds would never hit on capable hardware. 
>>>> @@ -1658,6 +1662,11 @@ x86_decode_base( >>>> >>>> switch ( ctxt->opcode ) >>>> { >>>> + case 0x90: /* nop / pause */ >>>> + if ( repe_prefix() ) >>>> + ctxt->opcode |= X86EMUL_OPC_F3(0, 0); >>>> + break; >>> Why is it necessary to special case the rep prefix handling in this case? >> Because SVM's pause intercept should not mistakenly also accept a >> plain NOP. > > How about a comment to this effect: > > /* Note: Prefixes such as rep/repne are only encoded when semantically > meaningful to the instruction, to reduce the complexity of interpreting > this opcode representation. */ Yes, I've added a slight variation of this to the definition of X86EMUL_OPC_PFX_MASK. Jan
On 27/09/16 14:56, Jan Beulich wrote: >>>> On 27.09.16 at 15:42, <andrew.cooper3@citrix.com> wrote: >> On 15/09/16 07:55, Jan Beulich wrote: >>>>>> On 14.09.16 at 19:56, <andrew.cooper3@citrix.com> wrote: >>>> On 08/09/16 14:14, Jan Beulich wrote: >>>>> int __get_instruction_length_from_list(struct vcpu *v, >>>>> const enum instruction_index *list, unsigned int list_count) >>>>> { >>>>> struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; >>>>> - unsigned int i, j, inst_len = 0; >>>>> - enum instruction_index instr = 0; >>>>> - u8 buf[MAX_INST_LEN]; >>>>> - const u8 *opcode = NULL; >>>>> - unsigned long fetch_addr, fetch_limit; >>>>> - unsigned int fetch_len, max_len; >>>>> + struct hvm_emulate_ctxt ctxt; >>>>> + struct x86_emulate_state *state; >>>>> + unsigned int inst_len, j, modrm_rm, modrm_reg; >>>>> + int modrm_mod; >>>>> >>>>> +#ifdef NDEBUG >>>> Presumably this is just for your testing? >>> No, I actually meant it to stay that way. Along the lines of the extra >>> debugging code we have in map_domain_page(). >> I was never very happy with the older version of this debugging. Surely >> in a case like this, we should use the intercept information when >> available, and check it against the emulator in a debug build. >> >> That way, we don't entirely change the underlying logic in this function >> between a debug and non debug build. > But that is exactly what the code is doing: > > #ifndef NDEBUG > if ( vmcb->exitcode == VMEXIT_IOIO ) > j = vmcb->exitinfo2 - vmcb->rip; > else > j = svm_nextrip_insn_length(v); > if ( j && j != inst_len ) > { > gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n", > ctxt.ctxt.opcode, inst_len, j); > return j; > } > #endif > > I.e. in case of a mismatch we use the data from hardware, plus a > message gets logged. In case of a match we further exercise the > opcode lookup logic, which non-debug builds would never hit on > capable hardware. Ah yes - I see now. 
The split between #ifdef NDEBUG and #ifndef NDEBUG is the confusing factor. ~Andrew
--- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -835,7 +835,7 @@ static int hvmemul_read( container_of(ctxt, struct hvm_emulate_ctxt, ctxt)); } -static int hvmemul_insn_fetch( +int hvmemul_insn_fetch( enum x86_segment seg, unsigned long offset, void *p_data, @@ -1765,15 +1765,14 @@ static const struct x86_emulate_ops hvm_ .vmfunc = hvmemul_vmfunc, }; -static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt, - const struct x86_emulate_ops *ops) +void hvm_emulate_init( + struct hvm_emulate_ctxt *hvmemul_ctxt, + const unsigned char *insn_buf, + unsigned int insn_bytes) { - struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs; struct vcpu *curr = current; - uint32_t new_intr_shadow, pfec = PFEC_page_present; - struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; + unsigned int pfec = PFEC_page_present; unsigned long addr; - int rc; if ( hvm_long_mode_enabled(curr) && hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l ) @@ -1791,14 +1790,14 @@ static int _hvm_emulate_one(struct hvm_e if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 ) pfec |= PFEC_user_mode; - hvmemul_ctxt->insn_buf_eip = regs->eip; - if ( !vio->mmio_insn_bytes ) + hvmemul_ctxt->insn_buf_eip = hvmemul_ctxt->ctxt.regs->eip; + if ( !insn_bytes ) { hvmemul_ctxt->insn_buf_bytes = hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?: (hvm_virtual_to_linear_addr(x86_seg_cs, &hvmemul_ctxt->seg_reg[x86_seg_cs], - regs->eip, + hvmemul_ctxt->insn_buf_eip, sizeof(hvmemul_ctxt->insn_buf), hvm_access_insn_fetch, hvmemul_ctxt->ctxt.addr_size, @@ -1810,11 +1809,24 @@ static int _hvm_emulate_one(struct hvm_e } else { - hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes; - memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes); + hvmemul_ctxt->insn_buf_bytes = insn_bytes; + memcpy(hvmemul_ctxt->insn_buf, insn_buf, insn_bytes); } hvmemul_ctxt->exn_pending = 0; +} + +static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt, + const struct x86_emulate_ops *ops) 
+{ + const struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs; + struct vcpu *curr = current; + uint32_t new_intr_shadow; + struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; + int rc; + + hvm_emulate_init(hvmemul_ctxt, vio->mmio_insn, vio->mmio_insn_bytes); + vio->mmio_retry = 0; if ( cpu_has_vmx ) --- a/xen/arch/x86/hvm/svm/emulate.c +++ b/xen/arch/x86/hvm/svm/emulate.c @@ -15,7 +15,7 @@ * this program; If not, see <http://www.gnu.org/licenses/>. */ -#include <xen/config.h> +#include <xen/err.h> #include <xen/init.h> #include <xen/lib.h> #include <xen/trace.h> @@ -26,41 +26,6 @@ #include <asm/hvm/svm/vmcb.h> #include <asm/hvm/svm/emulate.h> -static unsigned int is_prefix(u8 opc) -{ - switch ( opc ) - { - case 0x66: - case 0x67: - case 0x2E: - case 0x3E: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xF0: - case 0xF3: - case 0xF2: - case 0x40 ... 0x4f: - return 1; - } - return 0; -} - -static unsigned long svm_rip2pointer(struct vcpu *v, unsigned long *limit) -{ - struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - unsigned long p = vmcb->cs.base + vmcb->rip; - - if ( !(vmcb->cs.attr.fields.l && hvm_long_mode_enabled(v)) ) - { - *limit = vmcb->cs.limit; - return (u32)p; /* mask to 32 bits */ - } - *limit = ~0UL; - return p; -} - static unsigned long svm_nextrip_insn_length(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; @@ -89,141 +54,96 @@ static unsigned long svm_nextrip_insn_le return vmcb->nextrip - vmcb->rip; } -/* First byte: Length. Following bytes: Opcode bytes. */ -#define MAKE_INSTR(nm, ...) 
static const u8 OPCODE_##nm[] = { __VA_ARGS__ } -MAKE_INSTR(INVD, 2, 0x0f, 0x08); -MAKE_INSTR(WBINVD, 2, 0x0f, 0x09); -MAKE_INSTR(CPUID, 2, 0x0f, 0xa2); -MAKE_INSTR(RDMSR, 2, 0x0f, 0x32); -MAKE_INSTR(WRMSR, 2, 0x0f, 0x30); -MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9); -MAKE_INSTR(HLT, 1, 0xf4); -MAKE_INSTR(INT3, 1, 0xcc); -MAKE_INSTR(RDTSC, 2, 0x0f, 0x31); -MAKE_INSTR(PAUSE, 1, 0x90); -MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1); -MAKE_INSTR(VMRUN, 3, 0x0f, 0x01, 0xd8); -MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda); -MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb); -MAKE_INSTR(STGI, 3, 0x0f, 0x01, 0xdc); -MAKE_INSTR(CLGI, 3, 0x0f, 0x01, 0xdd); -MAKE_INSTR(INVLPGA,3, 0x0f, 0x01, 0xdf); - -static const u8 *const opc_bytes[INSTR_MAX_COUNT] = -{ - [INSTR_INVD] = OPCODE_INVD, - [INSTR_WBINVD] = OPCODE_WBINVD, - [INSTR_CPUID] = OPCODE_CPUID, - [INSTR_RDMSR] = OPCODE_RDMSR, - [INSTR_WRMSR] = OPCODE_WRMSR, - [INSTR_VMCALL] = OPCODE_VMCALL, - [INSTR_HLT] = OPCODE_HLT, - [INSTR_INT3] = OPCODE_INT3, - [INSTR_RDTSC] = OPCODE_RDTSC, - [INSTR_PAUSE] = OPCODE_PAUSE, - [INSTR_XSETBV] = OPCODE_XSETBV, - [INSTR_VMRUN] = OPCODE_VMRUN, - [INSTR_VMLOAD] = OPCODE_VMLOAD, - [INSTR_VMSAVE] = OPCODE_VMSAVE, - [INSTR_STGI] = OPCODE_STGI, - [INSTR_CLGI] = OPCODE_CLGI, - [INSTR_INVLPGA] = OPCODE_INVLPGA, +static const struct { + unsigned int opcode; + struct { + unsigned int rm:3; + unsigned int reg:3; + unsigned int mod:2; +#define MODRM(mod, reg, rm) { rm, reg, mod } + } modrm; +} const opc_tab[INSTR_MAX_COUNT] = { + [INSTR_PAUSE] = { X86EMUL_OPC_F3(0, 0x90) }, + [INSTR_INT3] = { X86EMUL_OPC( 0, 0xcc) }, + [INSTR_HLT] = { X86EMUL_OPC( 0, 0xf4) }, + [INSTR_XSETBV] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 2, 1) }, + [INSTR_VMRUN] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 0) }, + [INSTR_VMCALL] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 1) }, + [INSTR_VMLOAD] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 2) }, + [INSTR_VMSAVE] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 3) }, + [INSTR_STGI] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 
3, 4) }, + [INSTR_CLGI] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 5) }, + [INSTR_INVLPGA] = { X86EMUL_OPC(0x0f, 0x01), MODRM(3, 3, 7) }, + [INSTR_INVD] = { X86EMUL_OPC(0x0f, 0x08) }, + [INSTR_WBINVD] = { X86EMUL_OPC(0x0f, 0x09) }, + [INSTR_WRMSR] = { X86EMUL_OPC(0x0f, 0x30) }, + [INSTR_RDTSC] = { X86EMUL_OPC(0x0f, 0x31) }, + [INSTR_RDMSR] = { X86EMUL_OPC(0x0f, 0x32) }, + [INSTR_CPUID] = { X86EMUL_OPC(0x0f, 0xa2) }, }; -static bool_t fetch(const struct vmcb_struct *vmcb, u8 *buf, - unsigned long addr, unsigned int len) -{ - uint32_t pfec = (vmcb_get_cpl(vmcb) == 3) ? PFEC_user_mode : 0; - - switch ( hvm_fetch_from_guest_virt(buf, addr, len, pfec) ) - { - case HVMCOPY_okay: - break; - case HVMCOPY_bad_gva_to_gfn: - /* OK just to give up; we'll have injected #PF already */ - return 0; - default: - /* Not OK: fetches from non-RAM pages are not supportable. */ - gdprintk(XENLOG_WARNING, "Bad instruction fetch at %#lx (%#lx)\n", - vmcb->rip, addr); - hvm_inject_hw_exception(TRAP_gp_fault, 0); - return 0; - } - return 1; -} - int __get_instruction_length_from_list(struct vcpu *v, const enum instruction_index *list, unsigned int list_count) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - unsigned int i, j, inst_len = 0; - enum instruction_index instr = 0; - u8 buf[MAX_INST_LEN]; - const u8 *opcode = NULL; - unsigned long fetch_addr, fetch_limit; - unsigned int fetch_len, max_len; + struct hvm_emulate_ctxt ctxt; + struct x86_emulate_state *state; + unsigned int inst_len, j, modrm_rm, modrm_reg; + int modrm_mod; +#ifdef NDEBUG if ( (inst_len = svm_nextrip_insn_length(v)) != 0 ) return inst_len; if ( vmcb->exitcode == VMEXIT_IOIO ) return vmcb->exitinfo2 - vmcb->rip; +#endif - /* Fetch up to the next page break; we'll fetch from the next page - * later if we have to. 
*/ - fetch_addr = svm_rip2pointer(v, &fetch_limit); - if ( vmcb->rip > fetch_limit ) - return 0; - max_len = min(fetch_limit - vmcb->rip + 1, MAX_INST_LEN + 0UL); - fetch_len = min_t(unsigned int, max_len, - PAGE_SIZE - (fetch_addr & ~PAGE_MASK)); - if ( !fetch(vmcb, buf, fetch_addr, fetch_len) ) + ASSERT(v == current); + hvm_emulate_prepare(&ctxt, guest_cpu_user_regs()); + hvm_emulate_init(&ctxt, NULL, 0); + state = x86_decode_insn(&ctxt.ctxt, hvmemul_insn_fetch); + if ( IS_ERR_OR_NULL(state) ) return 0; - while ( (inst_len < max_len) && is_prefix(buf[inst_len]) ) - { - inst_len++; - if ( inst_len >= fetch_len ) - { - if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len, - max_len - fetch_len) ) - return 0; - fetch_len = max_len; - } + inst_len = x86_insn_length(state, &ctxt.ctxt); + modrm_mod = x86_insn_modrm(state, &modrm_rm, &modrm_reg); + x86_emulate_free_state(state); +#ifndef NDEBUG + if ( vmcb->exitcode == VMEXIT_IOIO ) + j = vmcb->exitinfo2 - vmcb->rip; + else + j = svm_nextrip_insn_length(v); + if ( j && j != inst_len ) + { + gprintk(XENLOG_WARNING, "insn-len[%02x]=%u (exp %u)\n", + ctxt.ctxt.opcode, inst_len, j); + return j; } +#endif for ( j = 0; j < list_count; j++ ) { - instr = list[j]; - opcode = opc_bytes[instr]; + enum instruction_index instr = list[j]; - for ( i = 0; (i < opcode[0]) && ((inst_len + i) < max_len); i++ ) + ASSERT(instr >= 0 && instr < ARRAY_SIZE(opc_tab)); + if ( opc_tab[instr].opcode == ctxt.ctxt.opcode ) { - if ( (inst_len + i) >= fetch_len ) - { - if ( !fetch(vmcb, buf + fetch_len, fetch_addr + fetch_len, - max_len - fetch_len) ) - return 0; - fetch_len = max_len; - } + if ( !opc_tab[instr].modrm.mod ) + return inst_len; - if ( buf[inst_len+i] != opcode[i+1] ) - goto mismatch; + if ( modrm_mod == opc_tab[instr].modrm.mod && + (modrm_rm & 7) == opc_tab[instr].modrm.rm && + (modrm_reg & 7) == opc_tab[instr].modrm.reg ) + return inst_len; } - goto done; - mismatch: ; } gdprintk(XENLOG_WARNING, - "%s: Mismatch between expected 
and actual instruction bytes: " + "%s: Mismatch between expected and actual instruction: " "eip = %lx\n", __func__, (unsigned long)vmcb->rip); hvm_inject_hw_exception(TRAP_gp_fault, 0); return 0; - - done: - inst_len += opcode[0]; - ASSERT(inst_len <= max_len); - return inst_len; } /* --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -382,7 +382,7 @@ struct operand { } mem; }; #ifdef __x86_64__ -#define REG_POISON ((unsigned long *) 0x8086000000008086UL) /* non-canonical */ +#define REG_POISON ((void *)0x8086000000008086UL) /* non-canonical */ #else #define REG_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */ #endif @@ -1631,6 +1631,10 @@ struct x86_emulate_state { unsigned long eip; struct cpu_user_regs *regs; + +#ifndef NDEBUG + void *caller; +#endif }; /* Helper definitions. */ @@ -1658,6 +1662,11 @@ x86_decode_base( switch ( ctxt->opcode ) { + case 0x90: /* nop / pause */ + if ( repe_prefix() ) + ctxt->opcode |= X86EMUL_OPC_F3(0, 0); + break; + case 0x9a: /* call (far, absolute) */ case 0xea: /* jmp (far, absolute) */ generate_exception_if(mode_64bit(), EXC_UD, -1); @@ -2852,8 +2861,9 @@ x86_emulate( break; case 0x90: /* nop / xchg %%r8,%%rax */ + case X86EMUL_OPC_F3(0, 0x90): /* pause / xchg %%r8,%%rax */ if ( !(rex_prefix & 1) ) - break; /* nop */ + break; /* nop / pause */ /* fall through */ case 0x91 ... 
0x97: /* xchg reg,%%rax */ @@ -5200,3 +5210,89 @@ x86_emulate( #undef vex #undef override_seg #undef ea + +#ifdef __XEN__ + +#include <xen/err.h> + +struct x86_emulate_state * +x86_decode_insn( + struct x86_emulate_ctxt *ctxt, + int (*insn_fetch)( + enum x86_segment seg, unsigned long offset, + void *p_data, unsigned int bytes, + struct x86_emulate_ctxt *ctxt)) +{ + static DEFINE_PER_CPU(struct x86_emulate_state, state); + struct x86_emulate_state *state = &this_cpu(state); + const struct x86_emulate_ops ops = { + .insn_fetch = insn_fetch, + .read = x86emul_unhandleable_rw, + .write = REG_POISON, + .cmpxchg = REG_POISON, + }; + int rc = x86_decode(state, ctxt, &ops); + + if ( unlikely(rc != X86EMUL_OKAY) ) + return ERR_PTR(-rc); + +#ifndef NDEBUG + /* + * While we avoid memory allocation (by use of per-CPU data) above, + * nevertheless make sure callers properly release the state structure + * for forward compatibility. + */ + if ( state->caller ) + { + printk(XENLOG_ERR "Unreleased emulation state acquired by %ps\n", + state->caller); + dump_execution_state(); + } + state->caller = __builtin_return_address(0); +#endif + + return state; +} + +static inline void check_state(const struct x86_emulate_state *state) +{ +#ifndef NDEBUG + ASSERT(state->caller); +#endif +} + +#ifndef NDEBUG +void x86_emulate_free_state(struct x86_emulate_state *state) +{ + check_state(state); + state->caller = NULL; +} +#endif + +int +x86_insn_modrm(const struct x86_emulate_state *state, + unsigned int *rm, unsigned int *reg) +{ + check_state(state); + + if ( !(state->desc & ModRM) ) + return -EINVAL; + + if ( rm ) + *rm = state->modrm_rm; + if ( reg ) + *reg = state->modrm_reg; + + return state->modrm_mod; +} + +unsigned int +x86_insn_length(const struct x86_emulate_state *state, + const struct x86_emulate_ctxt *ctxt) +{ + check_state(state); + + return state->eip - ctxt->regs->eip; +} + +#endif --- a/xen/arch/x86/x86_emulate/x86_emulate.h +++ b/xen/arch/x86/x86_emulate/x86_emulate.h @@ 
-523,4 +523,29 @@ x86emul_unhandleable_rw( unsigned int bytes, struct x86_emulate_ctxt *ctxt); +#ifdef __XEN__ + +struct x86_emulate_state * +x86_decode_insn( + struct x86_emulate_ctxt *ctxt, + int (*insn_fetch)( + enum x86_segment seg, unsigned long offset, + void *p_data, unsigned int bytes, + struct x86_emulate_ctxt *ctxt)); + +int +x86_insn_modrm(const struct x86_emulate_state *state, + unsigned int *rm, unsigned int *reg); +unsigned int +x86_insn_length(const struct x86_emulate_state *state, + const struct x86_emulate_ctxt *ctxt); + +#ifdef NDEBUG +static inline void x86_emulate_free_state(struct x86_emulate_state *state) {} +#else +void x86_emulate_free_state(struct x86_emulate_state *state); +#endif + +#endif + #endif /* __X86_EMULATE_H__ */ --- a/xen/include/asm-x86/hvm/emulate.h +++ b/xen/include/asm-x86/hvm/emulate.h @@ -53,6 +53,10 @@ void hvm_mem_access_emulate_one(enum emu void hvm_emulate_prepare( struct hvm_emulate_ctxt *hvmemul_ctxt, struct cpu_user_regs *regs); +void hvm_emulate_init( + struct hvm_emulate_ctxt *hvmemul_ctxt, + const unsigned char *insn_buf, + unsigned int insn_bytes); void hvm_emulate_writeback( struct hvm_emulate_ctxt *hvmemul_ctxt); struct segment_register *hvmemul_get_seg_reg( @@ -60,6 +64,11 @@ struct segment_register *hvmemul_get_seg struct hvm_emulate_ctxt *hvmemul_ctxt); int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla); +int hvmemul_insn_fetch(enum x86_segment seg, + unsigned long offset, + void *p_data, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt); int hvmemul_do_pio_buffer(uint16_t port, unsigned int size, uint8_t dir,