[14/15] KVM: ARM: Handle I/O aborts

Message ID 20120915153559.21241.29970.stgit@ubuntu (mailing list archive)
State New, archived

Commit Message

Christoffer Dall Sept. 15, 2012, 3:35 p.m. UTC
When the guest accesses I/O memory, this creates data abort
exceptions; they are handled by decoding the HSR information
(physical address, read/write, length, register) and forwarding reads
and writes to QEMU, which performs the device emulation.
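
In sketch form, the ISS fields we rely on look like this (a simplified
outline of the decode_hsr() logic in this patch, not the complete
function):

	/* Sketch only: field layout as used by decode_hsr() below. */
	bool valid       = hsr & (1U << 24);		/* HSR_ISV */
	unsigned int len = 1U << ((hsr >> 22) & 0x3);	/* SAS: 1/2/4 bytes (0b11 reserved) */
	bool sign_extend = hsr & (1U << 21);		/* HSR_SSE */
	unsigned int rd  = (hsr >> 16) & 0xf;		/* HSR_SRT: transfer register */
	bool is_write    = hsr & (1U << 6);		/* HSR_WNR */

When 'valid' is clear, none of the other fields can be trusted, which
is what forces the manual decoding described below.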

Certain classes of load/store operations do not support the syndrome
information provided in the HSR and we therefore must be able to fetch
the offending instruction from guest memory and decode it manually.

We only support instruction decoding for valid reasonable MMIO operations
where trapping them does not provide sufficient information in the HSR (no
16-bit Thumb instructions provide register writeback that we care about).

The following instruction types are NOT supported for MMIO operations
despite the HSR not containing decode info:
 - any Load/Store multiple
 - any load/store exclusive
 - any load/store dual
 - anything with the PC as the dest register

This requires changing the general flow somewhat, since new calls to run
the VCPU must check if there's a pending MMIO load and perform the
register write after userspace has made the data available.

Rusty Russell fixed a horrible race pointed out by Ben Herrenschmidt:
(1) Guest complicated mmio instruction traps.
(2) The hardware doesn't tell us enough, so we need to read the actual
    instruction which was being executed.
(3) KVM maps the instruction virtual address to a physical address.
(4) The guest (SMP) swaps out that page, and fills it with something else.
(5) We read the physical address, but now that's the wrong thing.
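
In outline, the fix (see copy_current_insn() in this patch) quiesces the
guest around the translation and the read:

	spin_lock(&vcpu->kvm->mmu_lock);	/* don't cross with IPIs */

	kvm_for_each_vcpu(i, v, vcpu->kvm)	/* no more guest entries */
		v->arch.pause = true;
	smp_mb();			/* set ->pause before reading ->mode */

	kvm_for_each_vcpu(i, v, vcpu->kvm)	/* kick out any still running */
		if (kvm_vcpu_exiting_guest_mode(v) == IN_GUEST_MODE)
			smp_call_function_single(v->cpu, do_nothing, NULL, 1);

	/* Guest quiesced: the va->pa map and copy are now effectively atomic. */
	ret = copy_from_guest_va(vcpu, instr, vcpu->arch.regs.pc, instr_len,
				 vcpu_mode_priv(vcpu));

	kvm_for_each_vcpu(i, v, vcpu->kvm)	/* release them all */
		v->arch.pause = false;
	spin_unlock(&vcpu->kvm->mmu_lock);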

Signed-off-by: Rusty Russell <rusty.russell@linaro.org>
Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
---
 arch/arm/include/asm/kvm_arm.h     |    3 
 arch/arm/include/asm/kvm_asm.h     |    2 
 arch/arm/include/asm/kvm_emulate.h |   23 ++
 arch/arm/include/asm/kvm_host.h    |    3 
 arch/arm/include/asm/kvm_mmu.h     |    1 
 arch/arm/kvm/arm.c                 |   14 +
 arch/arm/kvm/emulate.c             |  448 ++++++++++++++++++++++++++++++++++++
 arch/arm/kvm/exports.c             |    1 
 arch/arm/kvm/interrupts.S          |   40 +++
 arch/arm/kvm/mmu.c                 |  266 +++++++++++++++++++++
 arch/arm/kvm/trace.h               |   21 ++
 11 files changed, 819 insertions(+), 3 deletions(-)

Comments

Will Deacon Sept. 27, 2012, 3:11 p.m. UTC | #1
On Sat, Sep 15, 2012 at 04:35:59PM +0100, Christoffer Dall wrote:
> When the guest accesses I/O memory, this creates data abort
> exceptions; they are handled by decoding the HSR information
> (physical address, read/write, length, register) and forwarding reads
> and writes to QEMU, which performs the device emulation.
> 
> Certain classes of load/store operations do not support the syndrome
> information provided in the HSR and we therefore must be able to fetch
> the offending instruction from guest memory and decode it manually.
> 
> We only support instruction decoding for valid reasonable MMIO operations
> where trapping them does not provide sufficient information in the HSR (no
> 16-bit Thumb instructions provide register writeback that we care about).
> 
> The following instruction types are NOT supported for MMIO operations
> despite the HSR not containing decode info:
>  - any Load/Store multiple
>  - any load/store exclusive
>  - any load/store dual
>  - anything with the PC as the dest register

[...]

> +
> +/******************************************************************************
> + * Load-Store instruction emulation
> + *****************************************************************************/
> +
> +/*
> + * Must be ordered with LOADS first and WRITES afterwards
> + * for easy distinction when doing MMIO.
> + */
> +#define NUM_LD_INSTR  9
> +enum INSTR_LS_INDEXES {
> +       INSTR_LS_LDRBT, INSTR_LS_LDRT, INSTR_LS_LDR, INSTR_LS_LDRB,
> +       INSTR_LS_LDRD, INSTR_LS_LDREX, INSTR_LS_LDRH, INSTR_LS_LDRSB,
> +       INSTR_LS_LDRSH,
> +       INSTR_LS_STRBT, INSTR_LS_STRT, INSTR_LS_STR, INSTR_LS_STRB,
> +       INSTR_LS_STRD, INSTR_LS_STREX, INSTR_LS_STRH,
> +       NUM_LS_INSTR
> +};
> +
> +static u32 ls_instr[NUM_LS_INSTR][2] = {
> +       {0x04700000, 0x0d700000}, /* LDRBT */
> +       {0x04300000, 0x0d700000}, /* LDRT  */
> +       {0x04100000, 0x0c500000}, /* LDR   */
> +       {0x04500000, 0x0c500000}, /* LDRB  */
> +       {0x000000d0, 0x0e1000f0}, /* LDRD  */
> +       {0x01900090, 0x0ff000f0}, /* LDREX */
> +       {0x001000b0, 0x0e1000f0}, /* LDRH  */
> +       {0x001000d0, 0x0e1000f0}, /* LDRSB */
> +       {0x001000f0, 0x0e1000f0}, /* LDRSH */
> +       {0x04600000, 0x0d700000}, /* STRBT */
> +       {0x04200000, 0x0d700000}, /* STRT  */
> +       {0x04000000, 0x0c500000}, /* STR   */
> +       {0x04400000, 0x0c500000}, /* STRB  */
> +       {0x000000f0, 0x0e1000f0}, /* STRD  */
> +       {0x01800090, 0x0ff000f0}, /* STREX */
> +       {0x000000b0, 0x0e1000f0}  /* STRH  */
> +};
> +
> +static inline int get_arm_ls_instr_index(u32 instr)
> +{
> +       return kvm_instr_index(instr, ls_instr, NUM_LS_INSTR);
> +}
> +
> +/*
> + * Load-Store instruction decoding
> + */
> +#define INSTR_LS_TYPE_BIT              26
> +#define INSTR_LS_RD_MASK               0x0000f000
> +#define INSTR_LS_RD_SHIFT              12
> +#define INSTR_LS_RN_MASK               0x000f0000
> +#define INSTR_LS_RN_SHIFT              16
> +#define INSTR_LS_RM_MASK               0x0000000f
> +#define INSTR_LS_OFFSET12_MASK         0x00000fff

I'm afraid you're not going to thank me much for this, but it's high time we
unified the various instruction decoding functions we have under arch/arm/
and this seems like a good opportunity for that. For example, look at the
following snippets (there is much more in the files I list) in addition to
what you have:


asm/ptrace.h
-------------
#define PSR_T_BIT	0x00000020
#define PSR_F_BIT	0x00000040
#define PSR_I_BIT	0x00000080
#define PSR_A_BIT	0x00000100
#define PSR_E_BIT	0x00000200
#define PSR_J_BIT	0x01000000
#define PSR_Q_BIT	0x08000000
#define PSR_V_BIT	0x10000000
#define PSR_C_BIT	0x20000000
#define PSR_Z_BIT	0x40000000
#define PSR_N_BIT	0x80000000

mm/alignment.c
--------------
#define LDST_I_BIT(i)	(i & (1 << 26))		/* Immediate constant	*/
#define LDST_P_BIT(i)	(i & (1 << 24))		/* Preindex		*/
#define LDST_U_BIT(i)	(i & (1 << 23))		/* Add offset		*/
#define LDST_W_BIT(i)	(i & (1 << 21))		/* Writeback		*/
#define LDST_L_BIT(i)	(i & (1 << 20))		/* Load			*/

kernel/kprobes*.c
-----------------
static void __kprobes
emulate_ldr(struct kprobe *p, struct pt_regs *regs)
{
	kprobe_opcode_t insn = p->opcode;
	unsigned long pc = (unsigned long)p->addr + 8;
	int rt = (insn >> 12) & 0xf;
	int rn = (insn >> 16) & 0xf;
	int rm = insn & 0xf;

kernel/opcodes.c
----------------
static const unsigned short cc_map[16] = {
	0xF0F0,			/* EQ == Z set            */
	0x0F0F,			/* NE                     */
	0xCCCC,			/* CS == C set            */
	0x3333,			/* CC                     */
	0xFF00,			/* MI == N set            */
	0x00FF,			/* PL                     */
	0xAAAA,			/* VS == V set            */
	0x5555,			/* VC                     */
	0x0C0C,			/* HI == C set && Z clear */
	0xF3F3,			/* LS == C clear || Z set */
	0xAA55,			/* GE == (N==V)           */
	0x55AA,			/* LT == (N!=V)           */
	0x0A05,			/* GT == (!Z && (N==V))   */
	0xF5FA,			/* LE == (Z || (N!=V))    */
	0xFFFF,			/* AL always              */
	0			/* NV                     */
};

kernel/swp_emulate.c
--------------------
#define EXTRACT_REG_NUM(instruction, offset) \
	(((instruction) & (0xf << (offset))) >> (offset))
#define RN_OFFSET  16
#define RT_OFFSET  12
#define RT2_OFFSET  0


There are also bits and pieces with the patching frameworks and module
relocations that could benefit from some code sharing. Now, I think Dave had
some ideas about moving a load of this code into a common disassembler under
arch/arm/ so it would be great to tie that in here and implement that for
load/store instructions. Then other users can augment the number of
supported instruction classes as and when it is required.
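
For reference, a table like cc_map is consulted along these lines (a
minimal sketch of what arm_check_condition() in kernel/opcodes.c does,
not the exact code):

	/* Bit i of cc_map[cond] is set iff condition 'cond' passes when
	 * the NZCV flags (CPSR[31:28]) equal i. */
	static bool condition_passes(u32 instr, u32 cpsr)
	{
		return (cc_map[instr >> 28] >> (cpsr >> 28)) & 1;
	}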

Will
Christoffer Dall Sept. 30, 2012, 9:49 p.m. UTC | #2
On Thu, Sep 27, 2012 at 11:11 AM, Will Deacon <will.deacon@arm.com> wrote:
> On Sat, Sep 15, 2012 at 04:35:59PM +0100, Christoffer Dall wrote:
>> When the guest accesses I/O memory, this creates data abort
>> exceptions; they are handled by decoding the HSR information
>> (physical address, read/write, length, register) and forwarding reads
>> and writes to QEMU, which performs the device emulation.
>>
>> Certain classes of load/store operations do not support the syndrome
>> information provided in the HSR and we therefore must be able to fetch
>> the offending instruction from guest memory and decode it manually.
>>
>> We only support instruction decoding for valid reasonable MMIO operations
>> where trapping them does not provide sufficient information in the HSR (no
>> 16-bit Thumb instructions provide register writeback that we care about).
>>
>> The following instruction types are NOT supported for MMIO operations
>> despite the HSR not containing decode info:
>>  - any Load/Store multiple
>>  - any load/store exclusive
>>  - any load/store dual
>>  - anything with the PC as the dest register
>
> [...]
>
>> +
>> +/******************************************************************************
>> + * Load-Store instruction emulation
>> + *****************************************************************************/
>> +
>> +/*
>> + * Must be ordered with LOADS first and WRITES afterwards
>> + * for easy distinction when doing MMIO.
>> + */
>> +#define NUM_LD_INSTR  9
>> +enum INSTR_LS_INDEXES {
>> +       INSTR_LS_LDRBT, INSTR_LS_LDRT, INSTR_LS_LDR, INSTR_LS_LDRB,
>> +       INSTR_LS_LDRD, INSTR_LS_LDREX, INSTR_LS_LDRH, INSTR_LS_LDRSB,
>> +       INSTR_LS_LDRSH,
>> +       INSTR_LS_STRBT, INSTR_LS_STRT, INSTR_LS_STR, INSTR_LS_STRB,
>> +       INSTR_LS_STRD, INSTR_LS_STREX, INSTR_LS_STRH,
>> +       NUM_LS_INSTR
>> +};
>> +
>> +static u32 ls_instr[NUM_LS_INSTR][2] = {
>> +       {0x04700000, 0x0d700000}, /* LDRBT */
>> +       {0x04300000, 0x0d700000}, /* LDRT  */
>> +       {0x04100000, 0x0c500000}, /* LDR   */
>> +       {0x04500000, 0x0c500000}, /* LDRB  */
>> +       {0x000000d0, 0x0e1000f0}, /* LDRD  */
>> +       {0x01900090, 0x0ff000f0}, /* LDREX */
>> +       {0x001000b0, 0x0e1000f0}, /* LDRH  */
>> +       {0x001000d0, 0x0e1000f0}, /* LDRSB */
>> +       {0x001000f0, 0x0e1000f0}, /* LDRSH */
>> +       {0x04600000, 0x0d700000}, /* STRBT */
>> +       {0x04200000, 0x0d700000}, /* STRT  */
>> +       {0x04000000, 0x0c500000}, /* STR   */
>> +       {0x04400000, 0x0c500000}, /* STRB  */
>> +       {0x000000f0, 0x0e1000f0}, /* STRD  */
>> +       {0x01800090, 0x0ff000f0}, /* STREX */
>> +       {0x000000b0, 0x0e1000f0}  /* STRH  */
>> +};
>> +
>> +static inline int get_arm_ls_instr_index(u32 instr)
>> +{
>> +       return kvm_instr_index(instr, ls_instr, NUM_LS_INSTR);
>> +}
>> +
>> +/*
>> + * Load-Store instruction decoding
>> + */
>> +#define INSTR_LS_TYPE_BIT              26
>> +#define INSTR_LS_RD_MASK               0x0000f000
>> +#define INSTR_LS_RD_SHIFT              12
>> +#define INSTR_LS_RN_MASK               0x000f0000
>> +#define INSTR_LS_RN_SHIFT              16
>> +#define INSTR_LS_RM_MASK               0x0000000f
>> +#define INSTR_LS_OFFSET12_MASK         0x00000fff
>
> I'm afraid you're not going to thank me much for this, but it's high time we
> unified the various instruction decoding functions we have under arch/arm/
> and this seems like a good opportunity for that. For example, look at the
> following snippets (there is much more in the files I list) in addition to
> what you have:
>

I think it would be great if we had a set of unified decoding functions!

However, I think it's a shame if we can't merge KVM because we want to
clean up this code. There would be nothing in the way of breaking API
or anything like that preventing us from cleaning this up after the
code has been merged.

Please consider reviewing the code for correctness and consider if the
code could be merged as is.

On the other hand, if you or Dave or anyone else wants to create a set
of patches that cleans this up in a timely manner, I will be happy to
merge them for my code as well ;)

>
> asm/ptrace.h
> -------------
> #define PSR_T_BIT       0x00000020
> #define PSR_F_BIT       0x00000040
> #define PSR_I_BIT       0x00000080
> #define PSR_A_BIT       0x00000100
> #define PSR_E_BIT       0x00000200
> #define PSR_J_BIT       0x01000000
> #define PSR_Q_BIT       0x08000000
> #define PSR_V_BIT       0x10000000
> #define PSR_C_BIT       0x20000000
> #define PSR_Z_BIT       0x40000000
> #define PSR_N_BIT       0x80000000
>
> mm/alignment.c
> --------------
> #define LDST_I_BIT(i)   (i & (1 << 26))         /* Immediate constant   */
> #define LDST_P_BIT(i)   (i & (1 << 24))         /* Preindex             */
> #define LDST_U_BIT(i)   (i & (1 << 23))         /* Add offset           */
> #define LDST_W_BIT(i)   (i & (1 << 21))         /* Writeback            */
> #define LDST_L_BIT(i)   (i & (1 << 20))         /* Load                 */
>
> kernel/kprobes*.c
> -----------------
> static void __kprobes
> emulate_ldr(struct kprobe *p, struct pt_regs *regs)
> {
>         kprobe_opcode_t insn = p->opcode;
>         unsigned long pc = (unsigned long)p->addr + 8;
>         int rt = (insn >> 12) & 0xf;
>         int rn = (insn >> 16) & 0xf;
>         int rm = insn & 0xf;
>
> kernel/opcodes.c
> ----------------
> static const unsigned short cc_map[16] = {
>         0xF0F0,                 /* EQ == Z set            */
>         0x0F0F,                 /* NE                     */
>         0xCCCC,                 /* CS == C set            */
>         0x3333,                 /* CC                     */
>         0xFF00,                 /* MI == N set            */
>         0x00FF,                 /* PL                     */
>         0xAAAA,                 /* VS == V set            */
>         0x5555,                 /* VC                     */
>         0x0C0C,                 /* HI == C set && Z clear */
>         0xF3F3,                 /* LS == C clear || Z set */
>         0xAA55,                 /* GE == (N==V)           */
>         0x55AA,                 /* LT == (N!=V)           */
>         0x0A05,                 /* GT == (!Z && (N==V))   */
>         0xF5FA,                 /* LE == (Z || (N!=V))    */
>         0xFFFF,                 /* AL always              */
>         0                       /* NV                     */
> };
>
> kernel/swp_emulate.c
> --------------------
> #define EXTRACT_REG_NUM(instruction, offset) \
>         (((instruction) & (0xf << (offset))) >> (offset))
> #define RN_OFFSET  16
> #define RT_OFFSET  12
> #define RT2_OFFSET  0
>
>
> There are also bits and pieces with the patching frameworks and module
> relocations that could benefit from some code sharing. Now, I think Dave had
> some ideas about moving a load of this code into a common disassembler under
> arch/arm/ so it would be great to tie that in here and implement that for
> load/store instructions. Then other users can augment the number of
> supported instruction classes as and when it is required.
>
Dave Martin Oct. 1, 2012, 12:53 p.m. UTC | #3
On Sun, Sep 30, 2012 at 05:49:21PM -0400, Christoffer Dall wrote:
> On Thu, Sep 27, 2012 at 11:11 AM, Will Deacon <will.deacon@arm.com> wrote:
> > On Sat, Sep 15, 2012 at 04:35:59PM +0100, Christoffer Dall wrote:
> >> When the guest accesses I/O memory, this creates data abort
> >> exceptions; they are handled by decoding the HSR information
> >> (physical address, read/write, length, register) and forwarding reads
> >> and writes to QEMU, which performs the device emulation.
> >>
> >> Certain classes of load/store operations do not support the syndrome
> >> information provided in the HSR and we therefore must be able to fetch
> >> the offending instruction from guest memory and decode it manually.
> >>
> >> We only support instruction decoding for valid reasonable MMIO operations
> >> where trapping them does not provide sufficient information in the HSR (no
> >> 16-bit Thumb instructions provide register writeback that we care about).
> >>
> >> The following instruction types are NOT supported for MMIO operations
> >> despite the HSR not containing decode info:
> >>  - any Load/Store multiple
> >>  - any load/store exclusive
> >>  - any load/store dual
> >>  - anything with the PC as the dest register
> >
> > [...]
> >
> >> +
> >> +/******************************************************************************
> >> + * Load-Store instruction emulation
> >> + *****************************************************************************/
> >> +
> >> +/*
> >> + * Must be ordered with LOADS first and WRITES afterwards
> >> + * for easy distinction when doing MMIO.
> >> + */
> >> +#define NUM_LD_INSTR  9
> >> +enum INSTR_LS_INDEXES {
> >> +       INSTR_LS_LDRBT, INSTR_LS_LDRT, INSTR_LS_LDR, INSTR_LS_LDRB,
> >> +       INSTR_LS_LDRD, INSTR_LS_LDREX, INSTR_LS_LDRH, INSTR_LS_LDRSB,
> >> +       INSTR_LS_LDRSH,
> >> +       INSTR_LS_STRBT, INSTR_LS_STRT, INSTR_LS_STR, INSTR_LS_STRB,
> >> +       INSTR_LS_STRD, INSTR_LS_STREX, INSTR_LS_STRH,
> >> +       NUM_LS_INSTR
> >> +};
> >> +
> >> +static u32 ls_instr[NUM_LS_INSTR][2] = {
> >> +       {0x04700000, 0x0d700000}, /* LDRBT */
> >> +       {0x04300000, 0x0d700000}, /* LDRT  */
> >> +       {0x04100000, 0x0c500000}, /* LDR   */
> >> +       {0x04500000, 0x0c500000}, /* LDRB  */
> >> +       {0x000000d0, 0x0e1000f0}, /* LDRD  */
> >> +       {0x01900090, 0x0ff000f0}, /* LDREX */
> >> +       {0x001000b0, 0x0e1000f0}, /* LDRH  */
> >> +       {0x001000d0, 0x0e1000f0}, /* LDRSB */
> >> +       {0x001000f0, 0x0e1000f0}, /* LDRSH */
> >> +       {0x04600000, 0x0d700000}, /* STRBT */
> >> +       {0x04200000, 0x0d700000}, /* STRT  */
> >> +       {0x04000000, 0x0c500000}, /* STR   */
> >> +       {0x04400000, 0x0c500000}, /* STRB  */
> >> +       {0x000000f0, 0x0e1000f0}, /* STRD  */
> >> +       {0x01800090, 0x0ff000f0}, /* STREX */
> >> +       {0x000000b0, 0x0e1000f0}  /* STRH  */
> >> +};
> >> +
> >> +static inline int get_arm_ls_instr_index(u32 instr)
> >> +{
> >> +       return kvm_instr_index(instr, ls_instr, NUM_LS_INSTR);
> >> +}
> >> +
> >> +/*
> >> + * Load-Store instruction decoding
> >> + */
> >> +#define INSTR_LS_TYPE_BIT              26
> >> +#define INSTR_LS_RD_MASK               0x0000f000
> >> +#define INSTR_LS_RD_SHIFT              12
> >> +#define INSTR_LS_RN_MASK               0x000f0000
> >> +#define INSTR_LS_RN_SHIFT              16
> >> +#define INSTR_LS_RM_MASK               0x0000000f
> >> +#define INSTR_LS_OFFSET12_MASK         0x00000fff
> >
> > I'm afraid you're not going to thank me much for this, but it's high time we
> > unified the various instruction decoding functions we have under arch/arm/
> > and this seems like a good opportunity for that. For example, look at the
> > following snippets (there is much more in the files I list) in addition to
> > what you have:
> >
> 
> I think it would be great if we had a set of unified decoding functions!
> 
> However, I think it's a shame if we can't merge KVM because we want to
> clean up this code. There would be nothing in the way of breaking API
> or anything like that preventing us from cleaning this up after the
> code has been merged.
> 
> Please consider reviewing the code for correctness and consider if the
> code could be merged as is.
> 
> On the other hand, if you or Dave or anyone else wants to create a set
> of patches that cleans this up in a timely manner, I will be happy to
> merge them for my code as well ;)

The time I would have available to put into this is rather limited, but
I have some initial ideas, as outlined below.

Tixy (who did the kprobes implementation, which is probably the most
sophisticated opcode handling we have in the kernel right now) may have
ideas too.  I would hope that any common framework could reuse a fair
chunk of his implementation and test coverage.



I think that a common framework would have to start out a lot more
generic than the special-purpose code in current subsystems, at least
initially.  We should try to move all the knowledge about how
instructions are encoded out of subsystems and into the common
framework, so far as possible.


We might end up with an interface like this:

Instruction data in memory <-> __arm_mem_to_opcode*() and friends <-> canonical form

canonical form -> __arm_opcode_decode() -> decoded form

decoded form -> __arm_opcode_encode() -> canonical form


The decoded form might look something like this:

struct arm_insn {
	u32			opcode;
	u32			flags;		/* common flags */
	enum arm_insn_type	type;
	u32			insn_flags;	/* insn specific flags */
	enum arm_reg		Rd;
	enum arm_reg		Rn;
	enum arm_reg		Rm;
	enum arm_reg		Rs;
	enum arm_reg		Rt;
	enum arm_reg		Ra;

	/* ... */
};

... and so on.  This is just a sketch, and deliberately very incomplete.

The basic principle here is that the client gets the architectural,
encoding-independent view of the instruction: as far as possible, client
code should never need to know anything about how the encoding works.
The client code should not even need to know for most purposes whether
the instruction is ARM or Thumb, once decoded.

Identifying instruction classes (i.e., is this a VFP/NEON instruction,
does this access memory, does this change the PC) etc. should all be
abstracted as helper functions / macros.
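
For example, client code might then end up looking something like this
(every name below is hypothetical, purely to illustrate the shape of
the interface):

	struct arm_insn insn;

	/* Hypothetical API: decode one opcode into the canonical struct. */
	if (__arm_opcode_decode(opcode, &insn) != 0)
		return -EINVAL;			/* not decodable */

	/* Hypothetical helpers: query properties, never raw encodings. */
	if (!arm_insn_is_load(&insn) && !arm_insn_is_store(&insn))
		return -EINVAL;			/* not a load/store */

	if (insn.Rt == ARM_REG_PC)
		return -EINVAL;			/* refuse PC as transfer register */

	/* ... emulate the access using insn.Rt, insn.Rn, insn.insn_flags ... */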

All cleverness involving tables and hashes to accelerate decode and
identify instruction classes should move into the framework, but
we can start out with something stupid and simple: if we only need
to distinguish a few instruction types to begin with, a simple switch
statement for decode may be enough to get started.  However, as
things mature, a more sophisticated table-driven approach could be adopted.


A good starting point would be load/store emulation as this seems to be a
common theme, and we would need a credible deployment for any new
framework so that we know it's fit for purpose.

So just refactoring the code that appears in the KVM patches could be a
good way of getting such a framework off the ground.

Subsystems could be migrated to the new framework incrementally after
its initial creation, while extending the framework as needed.  This
means that the initial framework could be pretty simple
-- we don't need all the functionality all at once.


[...]

Cheers
---Dave
Jon Medhurst (Tixy) Oct. 1, 2012, 3:12 p.m. UTC | #4
On Mon, 2012-10-01 at 13:53 +0100, Dave Martin wrote:
> On Sun, Sep 30, 2012 at 05:49:21PM -0400, Christoffer Dall wrote:
> > On Thu, Sep 27, 2012 at 11:11 AM, Will Deacon <will.deacon@arm.com> wrote:
> > > On Sat, Sep 15, 2012 at 04:35:59PM +0100, Christoffer Dall wrote:
> > > I'm afraid you're not going to thank me much for this, but it's high time we
> > > unified the various instruction decoding functions we have under arch/arm/
> > > and this seems like a good opportunity for that. For example, look at the
> > > following snippets (there is much more in the files I list) in addition to
> > > what you have:
> > >
> > 
> > I think it would be great if we had a set of unified decoding functions!
> > 
> > However, I think it's a shame if we can't merge KVM because we want to
> > clean up this code. There would be nothing in the way of breaking API
> > or anything like that preventing us from cleaning this up after the
> > code has been merged.
> > 
> > Please consider reviewing the code for correctness and consider if the
> > code could be merged as is.
> > 
> > On the other hand, if you or Dave or anyone else wants to create a set
> > of patches that cleans this up in a timely manner, I will be happy to
> > merge them for my code as well ;)
> 
> The time I would have available to put into this is rather limited, but
> I have some initial ideas, as outlined below.
> 
> Tixy (who did the kprobes implementation, which is probably the most
> sophisticated opcode handling we have in the kernel right now) may have
> ideas too.  I would hope that any common framework could reuse a fair
> chunk of his implementation and test coverage.

To my thinking, the kprobes code is very tailored to the job it needs to
do, and turning it into something generic is just going to make
everything bigger and more complex - because a generic framework would
be bigger (as it's trying to be generic) and then things like kprobes
will probably end up having an additional framework layered over the top
to bend it to its purposes. Perhaps I'm being too pessimistic.

It would also require an inordinate amount of time to thrash out
requirements, design, prototype, and implement. (I don't think I'm
being overly pessimistic about that ;-)

So, unless someone has serious quantities of spare time lying around...
Dave Martin Oct. 1, 2012, 4:07 p.m. UTC | #5
On Mon, Oct 01, 2012 at 04:12:09PM +0100, Jon Medhurst (Tixy) wrote:
> On Mon, 2012-10-01 at 13:53 +0100, Dave Martin wrote:
> > On Sun, Sep 30, 2012 at 05:49:21PM -0400, Christoffer Dall wrote:
> > > On Thu, Sep 27, 2012 at 11:11 AM, Will Deacon <will.deacon@arm.com> wrote:
> > > > On Sat, Sep 15, 2012 at 04:35:59PM +0100, Christoffer Dall wrote:
> > > > I'm afraid you're not going to thank me much for this, but it's high time we
> > > > unified the various instruction decoding functions we have under arch/arm/
> > > > and this seems like a good opportunity for that. For example, look at the
> > > > following snippets (there is much more in the files I list) in addition to
> > > > what you have:
> > > >
> > > 
> > > I think it would be great if we had a set of unified decoding functions!
> > > 
> > > However, I think it's a shame if we can't merge KVM because we want to
> > > clean up this code. There would be nothing in the way of breaking API
> > > or anything like that preventing us from cleaning this up after the
> > > code has been merged.
> > > 
> > > Please consider reviewing the code for correctness and consider if the
> > > code could be merged as is.
> > > 
> > > On the other hand, if you or Dave or anyone else wants to create a set
> > > of patches that cleans this up in a timely manner, I will be happy to
> > > merge them for my code as well ;)
> > 
> > The time I would have available to put into this is rather limited, but
> > I have some initial ideas, as outlined below.
> > 
> > Tixy (who did the kprobes implementation, which is probably the most
> > sophisticated opcode handling we have in the kernel right now) may have
> > ideas too.  I would hope that any common framework could reuse a fair
> > chunk of his implementation and test coverage.
> 
> To my thinking, the kprobes code is very tailored to the job it needs to
> do, and turning it into something generic is just going to make
> everything bigger and more complex - because a generic framework would
> be bigger (as it's trying to be generic) and then things like kprobes
> will probably end up having an additional framework layered over the top
> to bend it to its purposes. Perhaps I'm being too pessimistic.

Perhaps kprobes is a bit of a double-edged example.  It's an example of
an implementation with some good features, but because it is larger,
the amount of adaptation required to convert to a common framework
would necessarily be larger also.

Yet, kprobes isn't trying to solve radically different problems from
other subsystems in the kernel.  It doesn't just want to decode and
manipulate the properties of instructions, it is actually interested in
many of the same properties (for example, whether an instruction is
a load or store, whether it modifies the PC etc.) as some other
subsystems.

I worry that by default every implementation of this ends up rather
deeply tailored to its corresponding subsystem -- so we gradually
accumulate more incompatible partially-overlapping duplicates of this
functionality over time.  This doesn't feel like a good thing.

> It would also require an inordinate amount of time to thrash out
> requirements, design, prototype, and implement. (I don't think I'm
> being overly pessimistic about that ;-)
> 
> So, unless someone has serious quantities of spare time lying around...

Well, I don't suggest that we should expect to get there in one go:
such an effort won't ever get off the ground, for sure.

If we can consolidate a few simpler subsystems' opcode handling then
that would still be a step in the right direction, even if integrating
kprobes could not happen until much later.

If we do nothing, the situation will just gradually get worse.

Cheers
---Dave
Russell King - ARM Linux Oct. 5, 2012, 9 a.m. UTC | #6
On Mon, Oct 01, 2012 at 01:53:26PM +0100, Dave Martin wrote:
> A good starting point would be load/store emulation as this seems to be a
> common theme, and we would need a credible deployment for any new
> framework so that we know it's fit for purpose.

Probably not actually, that code is written to be fast, because things
like IP stack throughput depend on it - particularly when your network
card can only DMA packets to 32-bit aligned addresses (resulting in
virtually all network data being misaligned.)
Dave Martin Oct. 8, 2012, 10:04 a.m. UTC | #7
On Fri, Oct 05, 2012 at 10:00:25AM +0100, Russell King - ARM Linux wrote:
> On Mon, Oct 01, 2012 at 01:53:26PM +0100, Dave Martin wrote:
> > A good starting point would be load/store emulation as this seems to be a
> > common theme, and we would need a credible deployment for any new
> > framework so that we know it's fit for purpose.
> 
> Probably not actually, that code is written to be fast, because things
> like IP stack throughput depend on it - particularly when your network
> card can only DMA packets to 32-bit aligned addresses (resulting in
> virtually all network data being misaligned.)

A fair point, but surely it would still be worth a try?
 
We might decide that a few particular cases of instruction decode
should not use the generic framework for performance reasons, but in
most cases being critically dependent on fault-driven software
emulation for performance would be a serious mistake in the first place
(discussions about the network code notwithstanding).

This is not an argument for being slower just for the sake of it, but
it can make sense to factor code on paths where performance is not an
issue.

Cheers
---Dave
Christoffer Dall Oct. 8, 2012, 9:52 p.m. UTC | #8
On Mon, Oct 8, 2012 at 6:04 AM, Dave Martin <dave.martin@linaro.org> wrote:
> On Fri, Oct 05, 2012 at 10:00:25AM +0100, Russell King - ARM Linux wrote:
>> On Mon, Oct 01, 2012 at 01:53:26PM +0100, Dave Martin wrote:
>> > A good starting point would be load/store emulation as this seems to be a
>> > common theme, and we would need a credible deployment for any new
>> > framework so that we know it's fit for purpose.
>>
>> Probably not actually, that code is written to be fast, because things
>> like IP stack throughput depend on it - particularly when your network
>> card can only DMA packets to 32-bit aligned addresses (resulting in
>> virtually all network data being misaligned.)
>
> A fair point, but surely it would still be worth a try?
>
> We might decide that a few particular cases of instruction decode
> should not use the generic framework for performance reasons, but in
> most cases being critically dependent on fault-driven software
> emulation for performance would be a serious mistake in the first place
> (discussions about the network code notwithstanding).
>
> This is not an argument for being slower just for the sake of it, but
> it can make sense to factor code on paths where performance is not an
> issue.
>

I'm all for unifying this stuff, but I still think it doesn't qualify
for holding back on merging KVM patches. The ARM mode instruction
decoding can definitely be cleaned up, though, to look more like the
Thumb2 mode decoding, which will be a good step before refactoring to
use a more common framework. Currently we decode too many types of
instructions (not just the ones with cleared HSR.ISV) in ARM mode, so
the whole complexity of that code can be reduced.

I'll give that a go before re-sending the KVM patch series.
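
For reference, the {value, mask} matching in kvm_instr_index() can be
checked by hand; for example, LDR r3, [r2] encodes as 0xe5923000:

	u32 instr = 0xe5923000;	/* LDR r3, [r2] */
	u32 value = 0x04100000;	/* ls_instr[INSTR_LS_LDR][0] */
	u32 mask  = 0x0c500000;	/* ls_instr[INSTR_LS_LDR][1] */

	/*
	 * The mask keeps bits [27:26] (01 = single data transfer), B
	 * (0 = word) and L (1 = load); the condition field, register
	 * numbers and offset are all ignored.
	 */
	BUG_ON((instr & mask) != value);	/* matches: INSTR_LS_LDR */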

-Christoffer

Patch

diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 4cff3b7..21cb240 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -158,8 +158,11 @@ 
 #define HSR_ISS		(HSR_IL - 1)
 #define HSR_ISV_SHIFT	(24)
 #define HSR_ISV		(1U << HSR_ISV_SHIFT)
+#define HSR_SRT_SHIFT	(16)
+#define HSR_SRT_MASK	(0xf << HSR_SRT_SHIFT)
 #define HSR_FSC		(0x3f)
 #define HSR_FSC_TYPE	(0x3c)
+#define HSR_SSE		(1 << 21)
 #define HSR_WNR		(1 << 6)
 #define HSR_CV_SHIFT	(24)
 #define HSR_CV		(1U << HSR_CV_SHIFT)
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 40ee099..5315c69 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -49,6 +49,8 @@  extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+
+extern u64 __kvm_va_to_pa(struct kvm_vcpu *vcpu, u32 va, bool priv);
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 6ddfae2..68489f2 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -22,6 +22,27 @@ 
 #include <linux/kvm_host.h>
 #include <asm/kvm_asm.h>
 
+/*
+ * The in-kernel MMIO emulation code wants to use a copy of run->mmio,
+ * which is an anonymous type. Use our own type instead.
+ */
+struct kvm_exit_mmio {
+	phys_addr_t	phys_addr;
+	u8		data[8];
+	u32		len;
+	bool		is_write;
+};
+
+static inline void kvm_prepare_mmio(struct kvm_run *run,
+				    struct kvm_exit_mmio *mmio)
+{
+	run->mmio.phys_addr	= mmio->phys_addr;
+	run->mmio.len		= mmio->len;
+	run->mmio.is_write	= mmio->is_write;
+	memcpy(run->mmio.data, mmio->data, mmio->len);
+	run->exit_reason	= KVM_EXIT_MMIO;
+}
+
 u32 *vcpu_reg_mode(struct kvm_vcpu *vcpu, u8 reg_num, enum vcpu_mode mode);
 
 static inline u8 __vcpu_mode(u32 cpsr)
@@ -52,6 +73,8 @@  static inline enum vcpu_mode vcpu_mode(struct kvm_vcpu *vcpu)
 }
 
 int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run);
+int kvm_emulate_mmio_ls(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+			unsigned long instr, struct kvm_exit_mmio *mmio);
 void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr);
 void kvm_inject_undefined(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 97608d7..3fec9ad 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -152,6 +152,9 @@  struct kvm_vcpu_arch {
 	int last_pcpu;
 	cpumask_t require_dcache_flush;
 
+	/* Don't run the guest: see copy_current_insn() */
+	bool pause;
+
 	/* IO related fields */
 	struct {
 		bool sign_extend;	/* for byte/halfword loads */
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 11f4c3a..c3f90b0 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -38,6 +38,7 @@  void kvm_free_stage2_pgd(struct kvm *kvm);
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 			  phys_addr_t pa, unsigned long size);
 
+int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 8dc5887..06a3368 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -570,6 +570,12 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	if (unlikely(!vcpu->arch.target))
 		return -ENOEXEC;
 
+	if (run->exit_reason == KVM_EXIT_MMIO) {
+		ret = kvm_handle_mmio_return(vcpu, vcpu->run);
+		if (ret)
+			return ret;
+	}
+
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -607,7 +613,13 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		kvm_guest_enter();
 		vcpu->mode = IN_GUEST_MODE;
 
-		ret = __kvm_vcpu_run(vcpu);
+		smp_mb(); /* set mode before reading vcpu->arch.pause */
+		if (unlikely(vcpu->arch.pause)) {
+			/* This means ignore, try again. */
+			ret = ARM_EXCEPTION_IRQ;
+		} else {
+			ret = __kvm_vcpu_run(vcpu);
+		}
 
 		vcpu->mode = OUTSIDE_GUEST_MODE;
 		vcpu->arch.last_pcpu = smp_processor_id();
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index 9236d10..2670679 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -132,11 +132,459 @@  u32 *vcpu_reg_mode(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode)
 	return reg_array + vcpu_reg_offsets[mode][reg_num];
 }
 
+/******************************************************************************
+ * Utility functions common for all emulation code
+ *****************************************************************************/
+
+/*
+ * This one accepts a matrix where the first element is the
+ * bits as they must be, and the second element is the bitmask.
+ */
+#define INSTR_NONE	-1
+static int kvm_instr_index(u32 instr, u32 table[][2], int table_entries)
+{
+	int i;
+	u32 mask;
+
+	for (i = 0; i < table_entries; i++) {
+		mask = table[i][1];
+		if ((table[i][0] & mask) == (instr & mask))
+			return i;
+	}
+	return INSTR_NONE;
+}
+
 int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	return 0;
 }
 
+
+/******************************************************************************
+ * Load-Store instruction emulation
+ *****************************************************************************/
+
+/*
+ * Must be ordered with LOADS first and WRITES afterwards
+ * for easy distinction when doing MMIO.
+ */
+#define NUM_LD_INSTR  9
+enum INSTR_LS_INDEXES {
+	INSTR_LS_LDRBT, INSTR_LS_LDRT, INSTR_LS_LDR, INSTR_LS_LDRB,
+	INSTR_LS_LDRD, INSTR_LS_LDREX, INSTR_LS_LDRH, INSTR_LS_LDRSB,
+	INSTR_LS_LDRSH,
+	INSTR_LS_STRBT, INSTR_LS_STRT, INSTR_LS_STR, INSTR_LS_STRB,
+	INSTR_LS_STRD, INSTR_LS_STREX, INSTR_LS_STRH,
+	NUM_LS_INSTR
+};
+
+static u32 ls_instr[NUM_LS_INSTR][2] = {
+	{0x04700000, 0x0d700000}, /* LDRBT */
+	{0x04300000, 0x0d700000}, /* LDRT  */
+	{0x04100000, 0x0c500000}, /* LDR   */
+	{0x04500000, 0x0c500000}, /* LDRB  */
+	{0x000000d0, 0x0e1000f0}, /* LDRD  */
+	{0x01900090, 0x0ff000f0}, /* LDREX */
+	{0x001000b0, 0x0e1000f0}, /* LDRH  */
+	{0x001000d0, 0x0e1000f0}, /* LDRSB */
+	{0x001000f0, 0x0e1000f0}, /* LDRSH */
+	{0x04600000, 0x0d700000}, /* STRBT */
+	{0x04200000, 0x0d700000}, /* STRT  */
+	{0x04000000, 0x0c500000}, /* STR   */
+	{0x04400000, 0x0c500000}, /* STRB  */
+	{0x000000f0, 0x0e1000f0}, /* STRD  */
+	{0x01800090, 0x0ff000f0}, /* STREX */
+	{0x000000b0, 0x0e1000f0}  /* STRH  */
+};
+
+static inline int get_arm_ls_instr_index(u32 instr)
+{
+	return kvm_instr_index(instr, ls_instr, NUM_LS_INSTR);
+}
+
+/*
+ * Load-Store instruction decoding
+ */
+#define INSTR_LS_TYPE_BIT		26
+#define INSTR_LS_RD_MASK		0x0000f000
+#define INSTR_LS_RD_SHIFT		12
+#define INSTR_LS_RN_MASK		0x000f0000
+#define INSTR_LS_RN_SHIFT		16
+#define INSTR_LS_RM_MASK		0x0000000f
+#define INSTR_LS_OFFSET12_MASK		0x00000fff
+
+#define INSTR_LS_BIT_P			24
+#define INSTR_LS_BIT_U			23
+#define INSTR_LS_BIT_B			22
+#define INSTR_LS_BIT_W			21
+#define INSTR_LS_BIT_L			20
+#define INSTR_LS_BIT_S			 6
+#define INSTR_LS_BIT_H			 5
+
+/*
+ * ARM addressing mode defines
+ */
+#define OFFSET_IMM_MASK			0x0e000000
+#define OFFSET_IMM_VALUE		0x04000000
+#define OFFSET_REG_MASK			0x0e000ff0
+#define OFFSET_REG_VALUE		0x06000000
+#define OFFSET_SCALE_MASK		0x0e000010
+#define OFFSET_SCALE_VALUE		0x06000000
+
+#define SCALE_SHIFT_MASK		0x000000a0
+#define SCALE_SHIFT_SHIFT		5
+#define SCALE_SHIFT_LSL			0x0
+#define SCALE_SHIFT_LSR			0x1
+#define SCALE_SHIFT_ASR			0x2
+#define SCALE_SHIFT_ROR_RRX		0x3
+#define SCALE_SHIFT_IMM_MASK		0x00000f80
+#define SCALE_SHIFT_IMM_SHIFT		6
+
+#define PSR_BIT_C			29
+
+static unsigned long ls_word_calc_offset(struct kvm_vcpu *vcpu,
+					 unsigned long instr)
+{
+	int offset = 0;
+
+	if ((instr & OFFSET_IMM_MASK) == OFFSET_IMM_VALUE) {
+		/* Immediate offset/index */
+		offset = instr & INSTR_LS_OFFSET12_MASK;
+
+		if (!(instr & (1U << INSTR_LS_BIT_U)))
+			offset = -offset;
+	}
+
+	if ((instr & OFFSET_REG_MASK) == OFFSET_REG_VALUE) {
+		/* Register offset/index */
+		u8 rm = instr & INSTR_LS_RM_MASK;
+		offset = *vcpu_reg(vcpu, rm);
+
+		if (!(instr & (1U << INSTR_LS_BIT_P)))
+			offset = 0;
+	}
+
+	if ((instr & OFFSET_SCALE_MASK) == OFFSET_SCALE_VALUE) {
+		/* Scaled register offset */
+		u8 rm = instr & INSTR_LS_RM_MASK;
+		u8 shift = (instr & SCALE_SHIFT_MASK) >> SCALE_SHIFT_SHIFT;
+		u32 shift_imm = (instr & SCALE_SHIFT_IMM_MASK)
+				>> SCALE_SHIFT_IMM_SHIFT;
+		offset = *vcpu_reg(vcpu, rm);
+
+		switch (shift) {
+		case SCALE_SHIFT_LSL:
+			offset = offset << shift_imm;
+			break;
+		case SCALE_SHIFT_LSR:
+			if (shift_imm == 0)
+				offset = 0;
+			else
+				offset = ((u32)offset) >> shift_imm;
+			break;
+		case SCALE_SHIFT_ASR:
+			if (shift_imm == 0) {
+				if (offset & (1U << 31))
+					offset = 0xffffffff;
+				else
+					offset = 0;
+			} else {
+				/* Ensure arithmetic shift */
+				asm("mov %[r], %[op], ASR %[s]" :
+				    [r] "=r" (offset) :
+				    [op] "r" (offset), [s] "r" (shift_imm));
+			}
+			break;
+		case SCALE_SHIFT_ROR_RRX:
+			if (shift_imm == 0) {
+				u32 C = (vcpu->arch.regs.cpsr &
+						(1U << PSR_BIT_C));
+				offset = (C << 31) | offset >> 1;
+			} else {
+				/* Ensure arithmetic shift */
+				asm("mov %[r], %[op], ASR %[s]" :
+				    [r] "=r" (offset) :
+				    [op] "r" (offset), [s] "r" (shift_imm));
+			}
+			break;
+		}
+
+		if (instr & (1U << INSTR_LS_BIT_U))
+			return offset;
+		else
+			return -offset;
+	}
+
+	if (instr & (1U << INSTR_LS_BIT_U))
+		return offset;
+	else
+		return -offset;
+
+	BUG();
+}
+
+static int kvm_ls_length(struct kvm_vcpu *vcpu, u32 instr)
+{
+	int index;
+
+	index = get_arm_ls_instr_index(instr);
+
+	if (instr & (1U << INSTR_LS_TYPE_BIT)) {
+		/* LS word or unsigned byte */
+		if (instr & (1U << INSTR_LS_BIT_B))
+			return sizeof(unsigned char);
+		else
+			return sizeof(u32);
+	} else {
+		/* LS halfword, doubleword or signed byte */
+		u32 H = (instr & (1U << INSTR_LS_BIT_H));
+		u32 S = (instr & (1U << INSTR_LS_BIT_S));
+		u32 L = (instr & (1U << INSTR_LS_BIT_L));
+
+		if (!L && S) {
+			kvm_err("WARNING: d-word for MMIO\n");
+			return 2 * sizeof(u32);
+		} else if (L && S && !H)
+			return sizeof(char);
+		else
+			return sizeof(u16);
+	}
+
+	BUG();
+}
+
+static bool kvm_decode_arm_ls(struct kvm_vcpu *vcpu, unsigned long instr,
+			      struct kvm_exit_mmio *mmio)
+{
+	int index;
+	bool is_write;
+	unsigned long rd, rn, offset, len;
+
+	index = get_arm_ls_instr_index(instr);
+	if (index == INSTR_NONE)
+		return false;
+
+	is_write = (index < NUM_LD_INSTR) ? false : true;
+	rd = (instr & INSTR_LS_RD_MASK) >> INSTR_LS_RD_SHIFT;
+	len = kvm_ls_length(vcpu, instr);
+
+	mmio->is_write = is_write;
+	mmio->len = len;
+
+	vcpu->arch.mmio.sign_extend = false;
+	vcpu->arch.mmio.rd = rd;
+
+	/* Handle base register writeback */
+	if (!(instr & (1U << INSTR_LS_BIT_P)) ||
+	     (instr & (1U << INSTR_LS_BIT_W))) {
+		rn = (instr & INSTR_LS_RN_MASK) >> INSTR_LS_RN_SHIFT;
+		offset = ls_word_calc_offset(vcpu, instr);
+		*vcpu_reg(vcpu, rn) += offset;
+	}
+
+	return true;
+}
+
+struct thumb_instr {
+	bool is32;
+
+	union {
+		struct {
+			u8 opcode;
+			u8 mask;
+		} t16;
+
+		struct {
+			u8 op1;
+			u8 op2;
+			u8 op2_mask;
+		} t32;
+	};
+
+	bool (*decode)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
+		       unsigned long instr, const struct thumb_instr *ti);
+};
+
+static bool decode_thumb_wb(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
+			    unsigned long instr)
+{
+	bool P = (instr >> 10) & 1;
+	bool U = (instr >> 9) & 1;
+	u8 imm8 = instr & 0xff;
+	u32 offset_addr = vcpu->arch.hdfar;
+	u8 Rn = (instr >> 16) & 0xf;
+
+	vcpu->arch.mmio.rd = (instr >> 12) & 0xf;
+
+	if (Rn == 15)
+		return false;
+
+	/* Handle Writeback */
+	if (!P && U)
+		*vcpu_reg(vcpu, Rn) = offset_addr + imm8;
+	else if (!P && !U)
+		*vcpu_reg(vcpu, Rn) = offset_addr - imm8;
+	return true;
+}
+
+static bool decode_thumb_str(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
+			     unsigned long instr, const struct thumb_instr *ti)
+{
+	u8 op1 = (instr >> (16 + 5)) & 0x7;
+	u8 op2 = (instr >> 6) & 0x3f;
+
+	mmio->is_write = true;
+	vcpu->arch.mmio.sign_extend = false;
+
+	switch (op1) {
+	case 0x0: mmio->len = 1; break;
+	case 0x1: mmio->len = 2; break;
+	case 0x2: mmio->len = 4; break;
+	default:
+		  return false; /* Only register write-back versions! */
+	}
+
+	if ((op2 & 0x24) == 0x24) {
+		/* STRB (immediate, thumb, W=1) */
+		return decode_thumb_wb(vcpu, mmio, instr);
+	}
+
+	return false;
+}
+
+static bool decode_thumb_ldr(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
+			     unsigned long instr, const struct thumb_instr *ti)
+{
+	u8 op1 = (instr >> (16 + 7)) & 0x3;
+	u8 op2 = (instr >> 6) & 0x3f;
+
+	mmio->is_write = false;
+
+	switch (ti->t32.op2 & 0x7) {
+	case 0x1: mmio->len = 1; break;
+	case 0x3: mmio->len = 2; break;
+	case 0x5: mmio->len = 4; break;
+	}
+
+	if (op1 == 0x0)
+		vcpu->arch.mmio.sign_extend = false;
+	else if (op1 == 0x2 && (ti->t32.op2 & 0x7) != 0x5)
+		vcpu->arch.mmio.sign_extend = true;
+	else
+		return false; /* Only register write-back versions! */
+
+	if ((op2 & 0x24) == 0x24) {
+		/* LDR{S}X (immediate, thumb, W=1) */
+		return decode_thumb_wb(vcpu, mmio, instr);
+	}
+
+	return false;
+}
+
+/*
+ * We only support instruction decoding for valid reasonable MMIO operations
+ * where trapping them does not provide sufficient information in the HSR (no
+ * 16-bit Thumb instructions provide register writeback that we care about).
+ *
+ * The following instruction types are NOT supported for MMIO operations
+ * despite the HSR not containing decode info:
+ *  - any Load/Store multiple
+ *  - any load/store exclusive
+ *  - any load/store dual
+ *  - anything with the PC as the dest register
+ */
+static const struct thumb_instr thumb_instr[] = {
+	/**************** 32-bit Thumb instructions **********************/
+	/* Store single data item:	Op1 == 11, Op2 == 000xxx0 */
+	{ .is32 = true,  .t32 = { 3, 0x00, 0x71}, decode_thumb_str	},
+	/* Load byte:			Op1 == 11, Op2 == 00xx001 */
+	{ .is32 = true,  .t32 = { 3, 0x01, 0x67}, decode_thumb_ldr	},
+	/* Load halfword:		Op1 == 11, Op2 == 00xx011 */
+	{ .is32 = true,  .t32 = { 3, 0x03, 0x67}, decode_thumb_ldr	},
+	/* Load word:			Op1 == 11, Op2 == 00xx101 */
+	{ .is32 = true,  .t32 = { 3, 0x05, 0x67}, decode_thumb_ldr	},
+};
+
+
+
+static bool kvm_decode_thumb_ls(struct kvm_vcpu *vcpu, unsigned long instr,
+				struct kvm_exit_mmio *mmio)
+{
+	bool is32 = is_wide_instruction(instr);
+	bool is16 = !is32;
+	struct thumb_instr tinstr; /* re-use to pass on already decoded info */
+	int i;
+
+	if (is16) {
+		tinstr.t16.opcode = (instr >> 10) & 0x3f;
+	} else {
+		tinstr.t32.op1 = (instr >> (16 + 11)) & 0x3;
+		tinstr.t32.op2 = (instr >> (16 + 4)) & 0x7f;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(thumb_instr); i++) {
+		const struct thumb_instr *ti = &thumb_instr[i];
+		if (ti->is32 != is32)
+			continue;
+
+		if (is16) {
+			if ((tinstr.t16.opcode & ti->t16.mask) != ti->t16.opcode)
+				continue;
+		} else {
+			if (ti->t32.op1 != tinstr.t32.op1)
+				continue;
+			if ((ti->t32.op2_mask & tinstr.t32.op2) != ti->t32.op2)
+				continue;
+		}
+
+		return ti->decode(vcpu, mmio, instr, &tinstr);
+	}
+
+	return false;
+}
+
+/**
+ * kvm_emulate_mmio_ls - emulates load/store instructions made to I/O memory
+ * @vcpu:	The vcpu pointer
+ * @fault_ipa:	The IPA that caused the 2nd stage fault
+ * @instr:	The instruction that caused the fault
+ *
+ * Handles emulation of load/store instructions which cannot be emulated through
+ * information found in the HSR on faults. It is necessary in this case to
+ * simply decode the offending instruction in software and determine the
+ * required operands.
+ */
+int kvm_emulate_mmio_ls(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+			unsigned long instr, struct kvm_exit_mmio *mmio)
+{
+	bool is_thumb;
+
+	trace_kvm_mmio_emulate(vcpu->arch.regs.pc, instr, vcpu->arch.regs.cpsr);
+
+	mmio->phys_addr = fault_ipa;
+	is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_T_BIT);
+	if (!is_thumb && !kvm_decode_arm_ls(vcpu, instr, mmio)) {
+		kvm_debug("Unable to decode inst: %#08lx (cpsr: %#08x (T=0)"
+			  "pc: %#08x)\n",
+			  instr, *vcpu_cpsr(vcpu), *vcpu_pc(vcpu));
+		kvm_inject_dabt(vcpu, vcpu->arch.hdfar);
+		return 1;
+	} else if (is_thumb && !kvm_decode_thumb_ls(vcpu, instr, mmio)) {
+		kvm_debug("Unable to decode inst: %#08lx (cpsr: %#08x (T=1)"
+			  "pc: %#08x)\n",
+			  instr, *vcpu_cpsr(vcpu), *vcpu_pc(vcpu));
+		kvm_inject_dabt(vcpu, vcpu->arch.hdfar);
+		return 1;
+	}
+
+	/*
+	 * The MMIO instruction is emulated and should not be re-executed
+	 * in the guest.
+	 */
+	kvm_skip_instr(vcpu, is_wide_instruction(instr));
+	return 0;
+}
+
 /**
  * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
  * @vcpu:	The VCPU pointer
diff --git a/arch/arm/kvm/exports.c b/arch/arm/kvm/exports.c
index f39f823..843c3bc 100644
--- a/arch/arm/kvm/exports.c
+++ b/arch/arm/kvm/exports.c
@@ -34,5 +34,6 @@  EXPORT_SYMBOL_GPL(__kvm_vcpu_run);
 
 EXPORT_SYMBOL_GPL(__kvm_flush_vm_context);
 EXPORT_SYMBOL_GPL(__kvm_tlb_flush_vmid);
+EXPORT_SYMBOL_GPL(__kvm_va_to_pa);
 
 EXPORT_SYMBOL_GPL(smp_send_reschedule);
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index cc9448b..ab78477 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -128,6 +128,7 @@  ENDPROC(__kvm_flush_vm_context)
 	VFPFMXR FPEXC, r2	@ FPEXC	(last, in case !EN)
 .endm
 
+
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 @  Hypervisor world-switch code
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@ -520,6 +521,45 @@  after_vfp_restore:
 	bx	lr			@ return to IOCTL
 
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@  Translate VA to PA
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+@ Arguments:
+@  r0: pointer to vcpu struct
+@  r1: virtual address to map (rounded to page)
+@  r2: 1 = P1 (read) mapping, 0 = P0 (read) mapping.
+@ Returns 64 bit PAR value.
+ENTRY(__kvm_va_to_pa)
+	hvc	#0			@ switch to hyp-mode
+
+	push	{r4-r12}
+
+	@ Fold flag into r1, easier than using stack.
+	cmp	r2, #0
+	movne	r2, #1
+	orr	r1, r1, r2
+
+	@ This swaps too many registers, but we're in the slow path anyway.
+	read_cp15_state
+	write_cp15_state 1, r0
+
+	ands	r2, r1, #1
+	bic	r1, r1, r2
+	mcrne	p15, 0, r1, c7, c8, 0	@ VA to PA, ATS1CPR
+	mcreq	p15, 0, r1, c7, c8, 2	@ VA to PA, ATS1CUR
+	isb
+
+	@ Restore host state.
+	read_cp15_state 1, r0
+	write_cp15_state
+
+	mrrc	p15, 0, r0, r1, c7	@ PAR
+	pop	{r4-r12}
+	hvc	#0			@ Back to SVC
+	bx	lr
+
+
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 @  Hypervisor exception vector and handlers
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 52cc280..dc760bb 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -19,6 +19,7 @@ 
 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
+#include <trace/events/kvm.h>
 #include <asm/idmap.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
@@ -589,6 +590,266 @@  out_put_existing:
 }
 
 /**
+ * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
+ * @vcpu: The VCPU pointer
+ * @run:  The VCPU run struct containing the mmio data
+ *
+ * This should only be called after returning from userspace for MMIO load
+ * emulation.
+ */
+int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	int *dest;
+	unsigned int len;
+	int mask;
+
+	if (!run->mmio.is_write) {
+		dest = vcpu_reg(vcpu, vcpu->arch.mmio.rd);
+		memset(dest, 0, sizeof(int));
+
+		len = run->mmio.len;
+		if (len > 4)
+			return -EINVAL;
+
+		memcpy(dest, run->mmio.data, len);
+
+		trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
+				*((u64 *)run->mmio.data));
+
+		if (vcpu->arch.mmio.sign_extend && len < 4) {
+			mask = 1U << ((len * 8) - 1);
+			*dest = (*dest ^ mask) - mask;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * copy_from_guest_va - copy memory from guest (very slow!)
+ * @vcpu:	vcpu pointer
+ * @dest:	memory to copy into
+ * @gva:	virtual address in guest to copy from
+ * @len:	length to copy
+ * @priv:	use guest PL1 (ie. kernel) mappings
+ *              otherwise use guest PL0 mappings.
+ *
+ * Returns true on success, false on failure (unlikely, but retry).
+ */
+static bool copy_from_guest_va(struct kvm_vcpu *vcpu,
+			       void *dest, unsigned long gva, size_t len,
+			       bool priv)
+{
+	u64 par;
+	phys_addr_t pc_ipa;
+	int err;
+
+	BUG_ON((gva & PAGE_MASK) != ((gva + len) & PAGE_MASK));
+	par = __kvm_va_to_pa(vcpu, gva & PAGE_MASK, priv);
+	if (par & 1) {
+		kvm_err("IO abort from invalid instruction address"
+			" %#lx!\n", gva);
+		return false;
+	}
+
+	BUG_ON(!(par & (1U << 11)));
+	pc_ipa = par & PAGE_MASK & ((1ULL << 32) - 1);
+	pc_ipa += gva & ~PAGE_MASK;
+
+
+	err = kvm_read_guest(vcpu->kvm, pc_ipa, dest, len);
+	if (unlikely(err))
+		return false;
+
+	return true;
+}
+
+/* Just ensure we're not running the guest. */
+static void do_nothing(void *info)
+{
+}
+
+/*
+ * We have to be very careful copying memory from a running (ie. SMP) guest.
+ * Another CPU may remap the page (eg. swap out a userspace text page) as we
+ * read the instruction.  Unlike normal hardware operation, to emulate an
+ * instruction we map the virtual to physical address then read that memory
+ * as separate steps, thus not atomic.
+ *
+ * Fortunately this is so rare (we don't usually need the instruction), we
+ * can go very slowly and no one will mind.
+ */
+static bool copy_current_insn(struct kvm_vcpu *vcpu, unsigned long *instr)
+{
+	int i;
+	bool ret;
+	struct kvm_vcpu *v;
+	bool is_thumb;
+	size_t instr_len;
+
+	/* Don't cross with IPIs in kvm_main.c */
+	spin_lock(&vcpu->kvm->mmu_lock);
+
+	/* Tell them all to pause, so no more will enter guest. */
+	kvm_for_each_vcpu(i, v, vcpu->kvm)
+		v->arch.pause = true;
+
+	/* Set ->pause before we read ->mode */
+	smp_mb();
+
+	/* Kick out any which are still running. */
+	kvm_for_each_vcpu(i, v, vcpu->kvm) {
+		/* Guest could exit now, making cpu wrong. That's OK. */
+		if (kvm_vcpu_exiting_guest_mode(v) == IN_GUEST_MODE)
+			smp_call_function_single(v->cpu, do_nothing, NULL, 1);
+	}
+
+
+	is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_T_BIT);
+	instr_len = (is_thumb) ? 2 : 4;
+
+	BUG_ON(!is_thumb && vcpu->arch.regs.pc & 0x3);
+
+	/* Now guest isn't running, we can va->pa map and copy atomically. */
+	ret = copy_from_guest_va(vcpu, instr, vcpu->arch.regs.pc, instr_len,
+				 vcpu_mode_priv(vcpu));
+	if (!ret)
+		goto out;
+
+	/* A 32-bit thumb2 instruction can actually go over a page boundary! */
+	if (is_thumb && is_wide_instruction(*instr)) {
+		*instr = *instr << 16;
+		ret = copy_from_guest_va(vcpu, instr, vcpu->arch.regs.pc + 2, 2,
+					 vcpu_mode_priv(vcpu));
+	}
+
+out:
+	/* Release them all. */
+	kvm_for_each_vcpu(i, v, vcpu->kvm)
+		v->arch.pause = false;
+
+	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	return ret;
+}
+
+/**
+ * invalid_io_mem_abort -- Handle I/O aborts when the ISV bit is clear
+ *
+ * @vcpu:      The vcpu pointer
+ * @fault_ipa: The IPA that caused the 2nd stage fault
+ * @mmio:      Pointer to struct to hold decode information
+ *
+ * Some load/store instructions cannot be emulated using the information
+ * presented in the HSR, for instance, register write-back instructions are not
+ * supported. We therefore need to fetch the instruction, decode it, and then
+ * emulate its behavior.
+ */
+static int invalid_io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+				struct kvm_exit_mmio *mmio)
+{
+	unsigned long instr = 0;
+
+	/* If it fails (SMP race?), we reenter guest for it to retry. */
+	if (!copy_current_insn(vcpu, &instr))
+		return 1;
+
+	return kvm_emulate_mmio_ls(vcpu, fault_ipa, instr, mmio);
+}
+
+static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+		      struct kvm_exit_mmio *mmio)
+{
+	unsigned long rd, len;
+	bool is_write, sign_extend;
+
+	if ((vcpu->arch.hsr >> 8) & 1) {
+		/* cache operation on I/O addr, tell guest unsupported */
+		kvm_inject_dabt(vcpu, vcpu->arch.hdfar);
+		return 1;
+	}
+
+	if ((vcpu->arch.hsr >> 7) & 1) {
+		/* page table accesses IO mem: tell guest to fix its TTBR */
+		kvm_inject_dabt(vcpu, vcpu->arch.hdfar);
+		return 1;
+	}
+
+	switch ((vcpu->arch.hsr >> 22) & 0x3) {
+	case 0:
+		len = 1;
+		break;
+	case 1:
+		len = 2;
+		break;
+	case 2:
+		len = 4;
+		break;
+	default:
+		kvm_err("Hardware is weird: SAS 0b11 is reserved\n");
+		return -EFAULT;
+	}
+
+	is_write = vcpu->arch.hsr & HSR_WNR;
+	sign_extend = vcpu->arch.hsr & HSR_SSE;
+	rd = (vcpu->arch.hsr & HSR_SRT_MASK) >> HSR_SRT_SHIFT;
+
+	if (rd == 15) {
+		/* IO memory trying to read/write pc */
+		kvm_inject_pabt(vcpu, vcpu->arch.hdfar);
+		return 1;
+	}
+
+	mmio->is_write = is_write;
+	mmio->phys_addr = fault_ipa;
+	mmio->len = len;
+	vcpu->arch.mmio.sign_extend = sign_extend;
+	vcpu->arch.mmio.rd = rd;
+
+	/*
+	 * The MMIO instruction is emulated and should not be re-executed
+	 * in the guest.
+	 */
+	kvm_skip_instr(vcpu, (vcpu->arch.hsr >> 25) & 1);
+	return 0;
+}
+
+static int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
+			phys_addr_t fault_ipa, struct kvm_memory_slot *memslot)
+{
+	struct kvm_exit_mmio mmio;
+	unsigned long rd;
+	int ret;
+
+	/*
+	 * Prepare MMIO operation. First stash it in a private
+	 * structure that we can use for in-kernel emulation. If the
+	 * kernel can't handle it, copy it into run->mmio and let user
+	 * space do its magic.
+	 */
+
+	if (vcpu->arch.hsr & HSR_ISV)
+		ret = decode_hsr(vcpu, fault_ipa, &mmio);
+	else
+		ret = invalid_io_mem_abort(vcpu, fault_ipa, &mmio);
+
+	if (ret != 0)
+		return ret;
+
+	rd = vcpu->arch.mmio.rd;
+	trace_kvm_mmio((mmio.is_write) ? KVM_TRACE_MMIO_WRITE :
+					 KVM_TRACE_MMIO_READ_UNSATISFIED,
+			mmio.len, fault_ipa,
+			(mmio.is_write) ? *vcpu_reg(vcpu, rd) : 0);
+
+	if (mmio.is_write)
+		memcpy(mmio.data, vcpu_reg(vcpu, rd), mmio.len);
+
+	kvm_prepare_mmio(run, &mmio);
+	return 0;
+}
+
+/**
  * kvm_handle_guest_abort - handles all 2nd stage aborts
  * @vcpu:	the VCPU pointer
  * @run:	the kvm_run structure
@@ -633,8 +894,9 @@  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 			return 1;
 		}
 
-		kvm_pr_unimpl("I/O address abort...");
-		return 0;
+		/* Adjust page offset */
+		fault_ipa |= vcpu->arch.hdfar & ~PAGE_MASK;
+		return io_mem_abort(vcpu, run, fault_ipa, memslot);
 	}
 
 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index 40606c9..7199b58 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -92,6 +92,27 @@  TRACE_EVENT(kvm_irq_line,
 		  __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level)
 );
 
+TRACE_EVENT(kvm_mmio_emulate,
+	TP_PROTO(unsigned long vcpu_pc, unsigned long instr,
+		 unsigned long cpsr),
+	TP_ARGS(vcpu_pc, instr, cpsr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_pc		)
+		__field(	unsigned long,	instr		)
+		__field(	unsigned long,	cpsr		)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_pc		= vcpu_pc;
+		__entry->instr			= instr;
+		__entry->cpsr			= cpsr;
+	),
+
+	TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
+		  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
+);
+
 /* Architecturally implementation defined CP15 register access */
 TRACE_EVENT(kvm_emulate_cp15_imp,
 	TP_PROTO(unsigned long Op1, unsigned long Rt1, unsigned long CRn,