diff mbox series

[v5,10/20] RISC-V: KVM: Handle MMIO exits for VCPU

Message ID 20190822084131.114764-11-anup.patel@wdc.com (mailing list archive)
State New, archived
Headers show
Series KVM RISC-V Support | expand

Commit Message

Anup Patel Aug. 22, 2019, 8:44 a.m. UTC
We will get stage2 page faults whenever Guest/VM access SW emulated
MMIO device or unmapped Guest RAM.

This patch implements MMIO read/write emulation by extracting MMIO
details from the trapped load/store instruction and forwarding the
MMIO read/write to user-space. The actual MMIO emulation will happen
in user-space and KVM kernel module will only take care of register
updates before resuming the trapped VCPU.

The handling for stage2 page faults for unmapped Guest RAM will be
implemeted by a separate patch later.

Signed-off-by: Anup Patel <anup.patel@wdc.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/riscv/include/asm/kvm_host.h |  11 +
 arch/riscv/kvm/mmu.c              |   7 +
 arch/riscv/kvm/vcpu_exit.c        | 436 +++++++++++++++++++++++++++++-
 3 files changed, 451 insertions(+), 3 deletions(-)

Comments

Alexander Graf Aug. 22, 2019, 12:10 p.m. UTC | #1
On 22.08.19 10:44, Anup Patel wrote:
> We will get stage2 page faults whenever Guest/VM access SW emulated
> MMIO device or unmapped Guest RAM.
> 
> This patch implements MMIO read/write emulation by extracting MMIO
> details from the trapped load/store instruction and forwarding the
> MMIO read/write to user-space. The actual MMIO emulation will happen
> in user-space and KVM kernel module will only take care of register
> updates before resuming the trapped VCPU.
> 
> The handling for stage2 page faults for unmapped Guest RAM will be
> implemeted by a separate patch later.
> 
> Signed-off-by: Anup Patel <anup.patel@wdc.com>
> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   arch/riscv/include/asm/kvm_host.h |  11 +
>   arch/riscv/kvm/mmu.c              |   7 +
>   arch/riscv/kvm/vcpu_exit.c        | 436 +++++++++++++++++++++++++++++-
>   3 files changed, 451 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> index 18f1097f1d8d..4388bace6d70 100644
> --- a/arch/riscv/include/asm/kvm_host.h
> +++ b/arch/riscv/include/asm/kvm_host.h
> @@ -53,6 +53,12 @@ struct kvm_arch {
>   	phys_addr_t pgd_phys;
>   };
>   
> +struct kvm_mmio_decode {
> +	unsigned long insn;
> +	int len;
> +	int shift;
> +};
> +
>   struct kvm_cpu_context {
>   	unsigned long zero;
>   	unsigned long ra;
> @@ -141,6 +147,9 @@ struct kvm_vcpu_arch {
>   	unsigned long irqs_pending;
>   	unsigned long irqs_pending_mask;
>   
> +	/* MMIO instruction details */
> +	struct kvm_mmio_decode mmio_decode;
> +
>   	/* VCPU power-off state */
>   	bool power_off;
>   
> @@ -160,6 +169,8 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
>   int kvm_riscv_setup_vsip(void);
>   void kvm_riscv_cleanup_vsip(void);
>   
> +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> +			 bool is_write);
>   void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu);
>   int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm);
>   void kvm_riscv_stage2_free_pgd(struct kvm *kvm);
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 04dd089b86ff..2b965f9aac07 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -61,6 +61,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>   	return 0;
>   }
>   
> +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> +			 bool is_write)
> +{
> +	/* TODO: */
> +	return 0;
> +}
> +
>   void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu)
>   {
>   	/* TODO: */
> diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
> index e4d7c8f0807a..efc06198c259 100644
> --- a/arch/riscv/kvm/vcpu_exit.c
> +++ b/arch/riscv/kvm/vcpu_exit.c
> @@ -6,9 +6,371 @@
>    *     Anup Patel <anup.patel@wdc.com>
>    */
>   
> +#include <linux/bitops.h>
>   #include <linux/errno.h>
>   #include <linux/err.h>
>   #include <linux/kvm_host.h>
> +#include <asm/csr.h>
> +
> +#define INSN_MATCH_LB		0x3
> +#define INSN_MASK_LB		0x707f
> +#define INSN_MATCH_LH		0x1003
> +#define INSN_MASK_LH		0x707f
> +#define INSN_MATCH_LW		0x2003
> +#define INSN_MASK_LW		0x707f
> +#define INSN_MATCH_LD		0x3003
> +#define INSN_MASK_LD		0x707f
> +#define INSN_MATCH_LBU		0x4003
> +#define INSN_MASK_LBU		0x707f
> +#define INSN_MATCH_LHU		0x5003
> +#define INSN_MASK_LHU		0x707f
> +#define INSN_MATCH_LWU		0x6003
> +#define INSN_MASK_LWU		0x707f
> +#define INSN_MATCH_SB		0x23
> +#define INSN_MASK_SB		0x707f
> +#define INSN_MATCH_SH		0x1023
> +#define INSN_MASK_SH		0x707f
> +#define INSN_MATCH_SW		0x2023
> +#define INSN_MASK_SW		0x707f
> +#define INSN_MATCH_SD		0x3023
> +#define INSN_MASK_SD		0x707f
> +
> +#define INSN_MATCH_C_LD		0x6000
> +#define INSN_MASK_C_LD		0xe003
> +#define INSN_MATCH_C_SD		0xe000
> +#define INSN_MASK_C_SD		0xe003
> +#define INSN_MATCH_C_LW		0x4000
> +#define INSN_MASK_C_LW		0xe003
> +#define INSN_MATCH_C_SW		0xc000
> +#define INSN_MASK_C_SW		0xe003
> +#define INSN_MATCH_C_LDSP	0x6002
> +#define INSN_MASK_C_LDSP	0xe003
> +#define INSN_MATCH_C_SDSP	0xe002
> +#define INSN_MASK_C_SDSP	0xe003
> +#define INSN_MATCH_C_LWSP	0x4002
> +#define INSN_MASK_C_LWSP	0xe003
> +#define INSN_MATCH_C_SWSP	0xc002
> +#define INSN_MASK_C_SWSP	0xe003
> +
> +#define INSN_LEN(insn)		((((insn) & 0x3) < 0x3) ? 2 : 4)
> +
> +#ifdef CONFIG_64BIT
> +#define LOG_REGBYTES		3
> +#else
> +#define LOG_REGBYTES		2
> +#endif
> +#define REGBYTES		(1 << LOG_REGBYTES)
> +
> +#define SH_RD			7
> +#define SH_RS1			15
> +#define SH_RS2			20
> +#define SH_RS2C			2
> +
> +#define RV_X(x, s, n)		(((x) >> (s)) & ((1 << (n)) - 1))
> +#define RVC_LW_IMM(x)		((RV_X(x, 6, 1) << 2) | \
> +				 (RV_X(x, 10, 3) << 3) | \
> +				 (RV_X(x, 5, 1) << 6))
> +#define RVC_LD_IMM(x)		((RV_X(x, 10, 3) << 3) | \
> +				 (RV_X(x, 5, 2) << 6))
> +#define RVC_LWSP_IMM(x)		((RV_X(x, 4, 3) << 2) | \
> +				 (RV_X(x, 12, 1) << 5) | \
> +				 (RV_X(x, 2, 2) << 6))
> +#define RVC_LDSP_IMM(x)		((RV_X(x, 5, 2) << 3) | \
> +				 (RV_X(x, 12, 1) << 5) | \
> +				 (RV_X(x, 2, 3) << 6))
> +#define RVC_SWSP_IMM(x)		((RV_X(x, 9, 4) << 2) | \
> +				 (RV_X(x, 7, 2) << 6))
> +#define RVC_SDSP_IMM(x)		((RV_X(x, 10, 3) << 3) | \
> +				 (RV_X(x, 7, 3) << 6))
> +#define RVC_RS1S(insn)		(8 + RV_X(insn, SH_RD, 3))
> +#define RVC_RS2S(insn)		(8 + RV_X(insn, SH_RS2C, 3))
> +#define RVC_RS2(insn)		RV_X(insn, SH_RS2C, 5)
> +
> +#define SHIFT_RIGHT(x, y)		\
> +	((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
> +
> +#define REG_MASK			\
> +	((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
> +
> +#define REG_OFFSET(insn, pos)		\
> +	(SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
> +
> +#define REG_PTR(insn, pos, regs)	\
> +	(ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
> +
> +#define GET_RM(insn)		(((insn) >> 12) & 7)
> +
> +#define GET_RS1(insn, regs)	(*REG_PTR(insn, SH_RS1, regs))
> +#define GET_RS2(insn, regs)	(*REG_PTR(insn, SH_RS2, regs))
> +#define GET_RS1S(insn, regs)	(*REG_PTR(RVC_RS1S(insn), 0, regs))
> +#define GET_RS2S(insn, regs)	(*REG_PTR(RVC_RS2S(insn), 0, regs))
> +#define GET_RS2C(insn, regs)	(*REG_PTR(insn, SH_RS2C, regs))
> +#define GET_SP(regs)		(*REG_PTR(2, 0, regs))
> +#define SET_RD(insn, regs, val)	(*REG_PTR(insn, SH_RD, regs) = (val))
> +#define IMM_I(insn)		((s32)(insn) >> 20)
> +#define IMM_S(insn)		(((s32)(insn) >> 25 << 5) | \
> +				 (s32)(((insn) >> 7) & 0x1f))
> +#define MASK_FUNCT3		0x7000
> +
> +#define STR(x)			XSTR(x)
> +#define XSTR(x)			#x
> +
> +/* TODO: Handle traps due to unpriv load and redirect it back to VS-mode */
> +static ulong get_insn(struct kvm_vcpu *vcpu)
> +{
> +	ulong __sepc = vcpu->arch.guest_context.sepc;
> +	ulong __hstatus, __sstatus, __vsstatus;
> +#ifdef CONFIG_RISCV_ISA_C
> +	ulong rvc_mask = 3, tmp;
> +#endif
> +	ulong flags, val;
> +
> +	local_irq_save(flags);
> +
> +	__vsstatus = csr_read(CSR_VSSTATUS);
> +	__sstatus = csr_read(CSR_SSTATUS);
> +	__hstatus = csr_read(CSR_HSTATUS);
> +
> +	csr_write(CSR_VSSTATUS, __vsstatus | SR_MXR);
> +	csr_write(CSR_SSTATUS, vcpu->arch.guest_context.sstatus | SR_MXR);
> +	csr_write(CSR_HSTATUS, vcpu->arch.guest_context.hstatus | HSTATUS_SPRV);
> +
> +#ifndef CONFIG_RISCV_ISA_C
> +	asm ("\n"
> +#ifdef CONFIG_64BIT
> +		STR(LWU) " %[insn], (%[addr])\n"
> +#else
> +		STR(LW) " %[insn], (%[addr])\n"
> +#endif
> +		: [insn] "=&r" (val) : [addr] "r" (__sepc));
> +#else
> +	asm ("and %[tmp], %[addr], 2\n"
> +		"bnez %[tmp], 1f\n"
> +#ifdef CONFIG_64BIT
> +		STR(LWU) " %[insn], (%[addr])\n"
> +#else
> +		STR(LW) " %[insn], (%[addr])\n"
> +#endif
> +		"and %[tmp], %[insn], %[rvc_mask]\n"
> +		"beq %[tmp], %[rvc_mask], 2f\n"
> +		"sll %[insn], %[insn], %[xlen_minus_16]\n"
> +		"srl %[insn], %[insn], %[xlen_minus_16]\n"
> +		"j 2f\n"
> +		"1:\n"
> +		"lhu %[insn], (%[addr])\n"
> +		"and %[tmp], %[insn], %[rvc_mask]\n"
> +		"bne %[tmp], %[rvc_mask], 2f\n"
> +		"lhu %[tmp], 2(%[addr])\n"
> +		"sll %[tmp], %[tmp], 16\n"
> +		"add %[insn], %[insn], %[tmp]\n"
> +		"2:"
> +	: [vsstatus] "+&r" (__vsstatus), [insn] "=&r" (val),
> +	  [tmp] "=&r" (tmp)
> +	: [addr] "r" (__sepc), [rvc_mask] "r" (rvc_mask),
> +	  [xlen_minus_16] "i" (__riscv_xlen - 16));
> +#endif
> +
> +	csr_write(CSR_HSTATUS, __hstatus);
> +	csr_write(CSR_SSTATUS, __sstatus);
> +	csr_write(CSR_VSSTATUS, __vsstatus);
> +
> +	local_irq_restore(flags);
> +
> +	return val;
> +}
> +
> +static int emulate_load(struct kvm_vcpu *vcpu, struct kvm_run *run,
> +			unsigned long fault_addr)
> +{
> +	int shift = 0, len = 0;
> +	ulong insn = get_insn(vcpu);
> +
> +	/* Decode length of MMIO and shift */
> +	if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
> +		len = 4;
> +		shift = 8 * (sizeof(ulong) - len);
> +	} else if ((insn & INSN_MASK_LB) == INSN_MATCH_LB) {
> +		len = 1;
> +		shift = 8 * (sizeof(ulong) - len);
> +	} else if ((insn & INSN_MASK_LBU) == INSN_MATCH_LBU) {
> +		len = 1;
> +		shift = 8 * (sizeof(ulong) - len);
> +#ifdef CONFIG_64BIT
> +	} else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) {
> +		len = 8;
> +		shift = 8 * (sizeof(ulong) - len);
> +	} else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) {
> +		len = 4;
> +#endif
> +	} else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) {
> +		len = 2;
> +		shift = 8 * (sizeof(ulong) - len);
> +	} else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) {
> +		len = 2;
> +#ifdef CONFIG_RISCV_ISA_C
> +#ifdef CONFIG_64BIT
> +	} else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) {
> +		len = 8;
> +		shift = 8 * (sizeof(ulong) - len);
> +		insn = RVC_RS2S(insn) << SH_RD;
> +	} else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP &&
> +		   ((insn >> SH_RD) & 0x1f)) {
> +		len = 8;
> +		shift = 8 * (sizeof(ulong) - len);
> +#endif
> +	} else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) {
> +		len = 4;
> +		shift = 8 * (sizeof(ulong) - len);
> +		insn = RVC_RS2S(insn) << SH_RD;
> +	} else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP &&
> +		   ((insn >> SH_RD) & 0x1f)) {
> +		len = 4;
> +		shift = 8 * (sizeof(ulong) - len);
> +#endif
> +	} else {
> +		return -ENOTSUPP;
> +	}
> +
> +	/* Fault address should be aligned to length of MMIO */
> +	if (fault_addr & (len - 1))
> +		return -EIO;
> +
> +	/* Save instruction decode info */
> +	vcpu->arch.mmio_decode.insn = insn;
> +	vcpu->arch.mmio_decode.shift = shift;
> +	vcpu->arch.mmio_decode.len = len;
> +
> +	/* Exit to userspace for MMIO emulation */
> +	vcpu->stat.mmio_exit_user++;
> +	run->exit_reason = KVM_EXIT_MMIO;
> +	run->mmio.is_write = false;
> +	run->mmio.phys_addr = fault_addr;
> +	run->mmio.len = len;
> +
> +	/* Move to next instruction */
> +	vcpu->arch.guest_context.sepc += INSN_LEN(insn);

Doesn't that make more sense on the reentry path? What if you want to 
inject an MCE on access to unmapped addresses from user space?

> +
> +	return 0;
> +}
> +
> +static int emulate_store(struct kvm_vcpu *vcpu, struct kvm_run *run,
> +			 unsigned long fault_addr)
> +{
> +	u8 data8;
> +	u16 data16;
> +	u32 data32;
> +	u64 data64;
> +	ulong data;
> +	int len = 0;
> +	ulong insn = get_insn(vcpu);
> +
> +	data = GET_RS2(insn, &vcpu->arch.guest_context);
> +	data8 = data16 = data32 = data64 = data;
> +
> +	if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) {
> +		len = 4;
> +	} else if ((insn & INSN_MASK_SB) == INSN_MATCH_SB) {
> +		len = 1;
> +#ifdef CONFIG_64BIT
> +	} else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) {
> +		len = 8;
> +#endif
> +	} else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) {
> +		len = 2;
> +#ifdef CONFIG_RISCV_ISA_C
> +#ifdef CONFIG_64BIT
> +	} else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) {
> +		len = 8;
> +		data64 = GET_RS2S(insn, &vcpu->arch.guest_context);
> +	} else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP &&
> +		   ((insn >> SH_RD) & 0x1f)) {
> +		len = 8;
> +		data64 = GET_RS2C(insn, &vcpu->arch.guest_context);
> +#endif
> +	} else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) {
> +		len = 4;
> +		data32 = GET_RS2S(insn, &vcpu->arch.guest_context);
> +	} else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP &&
> +		   ((insn >> SH_RD) & 0x1f)) {
> +		len = 4;
> +		data32 = GET_RS2C(insn, &vcpu->arch.guest_context);
> +#endif
> +	} else {
> +		return -ENOTSUPP;
> +	}
> +
> +	/* Fault address should be aligned to length of MMIO */
> +	if (fault_addr & (len - 1))
> +		return -EIO;
> +
> +	/* Clear instruction decode info */
> +	vcpu->arch.mmio_decode.insn = 0;
> +	vcpu->arch.mmio_decode.shift = 0;
> +	vcpu->arch.mmio_decode.len = 0;
> +
> +	/* Copy data to kvm_run instance */
> +	switch (len) {
> +	case 1:
> +		*((u8 *)run->mmio.data) = data8;
> +		break;
> +	case 2:
> +		*((u16 *)run->mmio.data) = data16;
> +		break;
> +	case 4:
> +		*((u32 *)run->mmio.data) = data32;
> +		break;
> +	case 8:
> +		*((u64 *)run->mmio.data) = data64;
> +		break;
> +	default:
> +		return -ENOTSUPP;
> +	};
> +
> +	/* Exit to userspace for MMIO emulation */
> +	vcpu->stat.mmio_exit_user++;
> +	run->exit_reason = KVM_EXIT_MMIO;
> +	run->mmio.is_write = true;
> +	run->mmio.phys_addr = fault_addr;
> +	run->mmio.len = len;
> +
> +	/* Move to next instruction */
> +	vcpu->arch.guest_context.sepc += INSN_LEN(insn);

Same comment here.


Alex
Alexander Graf Aug. 22, 2019, 12:14 p.m. UTC | #2
On 22.08.19 10:44, Anup Patel wrote:
> We will get stage2 page faults whenever Guest/VM access SW emulated
> MMIO device or unmapped Guest RAM.
> 
> This patch implements MMIO read/write emulation by extracting MMIO
> details from the trapped load/store instruction and forwarding the
> MMIO read/write to user-space. The actual MMIO emulation will happen
> in user-space and KVM kernel module will only take care of register
> updates before resuming the trapped VCPU.
> 
> The handling for stage2 page faults for unmapped Guest RAM will be
> implemeted by a separate patch later.
> 
> Signed-off-by: Anup Patel <anup.patel@wdc.com>
> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   arch/riscv/include/asm/kvm_host.h |  11 +
>   arch/riscv/kvm/mmu.c              |   7 +
>   arch/riscv/kvm/vcpu_exit.c        | 436 +++++++++++++++++++++++++++++-
>   3 files changed, 451 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> index 18f1097f1d8d..4388bace6d70 100644
> --- a/arch/riscv/include/asm/kvm_host.h
> +++ b/arch/riscv/include/asm/kvm_host.h
> @@ -53,6 +53,12 @@ struct kvm_arch {
>   	phys_addr_t pgd_phys;
>   };
>   
> +struct kvm_mmio_decode {
> +	unsigned long insn;
> +	int len;
> +	int shift;
> +};
> +
>   struct kvm_cpu_context {
>   	unsigned long zero;
>   	unsigned long ra;
> @@ -141,6 +147,9 @@ struct kvm_vcpu_arch {
>   	unsigned long irqs_pending;
>   	unsigned long irqs_pending_mask;
>   
> +	/* MMIO instruction details */
> +	struct kvm_mmio_decode mmio_decode;
> +
>   	/* VCPU power-off state */
>   	bool power_off;
>   
> @@ -160,6 +169,8 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
>   int kvm_riscv_setup_vsip(void);
>   void kvm_riscv_cleanup_vsip(void);
>   
> +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> +			 bool is_write);
>   void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu);
>   int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm);
>   void kvm_riscv_stage2_free_pgd(struct kvm *kvm);
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 04dd089b86ff..2b965f9aac07 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -61,6 +61,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>   	return 0;
>   }
>   
> +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> +			 bool is_write)
> +{
> +	/* TODO: */
> +	return 0;
> +}
> +
>   void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu)
>   {
>   	/* TODO: */
> diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
> index e4d7c8f0807a..efc06198c259 100644
> --- a/arch/riscv/kvm/vcpu_exit.c
> +++ b/arch/riscv/kvm/vcpu_exit.c
> @@ -6,9 +6,371 @@
>    *     Anup Patel <anup.patel@wdc.com>
>    */
>   
> +#include <linux/bitops.h>
>   #include <linux/errno.h>
>   #include <linux/err.h>
>   #include <linux/kvm_host.h>
> +#include <asm/csr.h>
> +
> +#define INSN_MATCH_LB		0x3
> +#define INSN_MASK_LB		0x707f
> +#define INSN_MATCH_LH		0x1003
> +#define INSN_MASK_LH		0x707f
> +#define INSN_MATCH_LW		0x2003
> +#define INSN_MASK_LW		0x707f
> +#define INSN_MATCH_LD		0x3003
> +#define INSN_MASK_LD		0x707f
> +#define INSN_MATCH_LBU		0x4003
> +#define INSN_MASK_LBU		0x707f
> +#define INSN_MATCH_LHU		0x5003
> +#define INSN_MASK_LHU		0x707f
> +#define INSN_MATCH_LWU		0x6003
> +#define INSN_MASK_LWU		0x707f
> +#define INSN_MATCH_SB		0x23
> +#define INSN_MASK_SB		0x707f
> +#define INSN_MATCH_SH		0x1023
> +#define INSN_MASK_SH		0x707f
> +#define INSN_MATCH_SW		0x2023
> +#define INSN_MASK_SW		0x707f
> +#define INSN_MATCH_SD		0x3023
> +#define INSN_MASK_SD		0x707f
> +
> +#define INSN_MATCH_C_LD		0x6000
> +#define INSN_MASK_C_LD		0xe003
> +#define INSN_MATCH_C_SD		0xe000
> +#define INSN_MASK_C_SD		0xe003
> +#define INSN_MATCH_C_LW		0x4000
> +#define INSN_MASK_C_LW		0xe003
> +#define INSN_MATCH_C_SW		0xc000
> +#define INSN_MASK_C_SW		0xe003
> +#define INSN_MATCH_C_LDSP	0x6002
> +#define INSN_MASK_C_LDSP	0xe003
> +#define INSN_MATCH_C_SDSP	0xe002
> +#define INSN_MASK_C_SDSP	0xe003
> +#define INSN_MATCH_C_LWSP	0x4002
> +#define INSN_MASK_C_LWSP	0xe003
> +#define INSN_MATCH_C_SWSP	0xc002
> +#define INSN_MASK_C_SWSP	0xe003
> +
> +#define INSN_LEN(insn)		((((insn) & 0x3) < 0x3) ? 2 : 4)
> +
> +#ifdef CONFIG_64BIT
> +#define LOG_REGBYTES		3
> +#else
> +#define LOG_REGBYTES		2
> +#endif
> +#define REGBYTES		(1 << LOG_REGBYTES)
> +
> +#define SH_RD			7
> +#define SH_RS1			15
> +#define SH_RS2			20
> +#define SH_RS2C			2
> +
> +#define RV_X(x, s, n)		(((x) >> (s)) & ((1 << (n)) - 1))
> +#define RVC_LW_IMM(x)		((RV_X(x, 6, 1) << 2) | \
> +				 (RV_X(x, 10, 3) << 3) | \
> +				 (RV_X(x, 5, 1) << 6))
> +#define RVC_LD_IMM(x)		((RV_X(x, 10, 3) << 3) | \
> +				 (RV_X(x, 5, 2) << 6))
> +#define RVC_LWSP_IMM(x)		((RV_X(x, 4, 3) << 2) | \
> +				 (RV_X(x, 12, 1) << 5) | \
> +				 (RV_X(x, 2, 2) << 6))
> +#define RVC_LDSP_IMM(x)		((RV_X(x, 5, 2) << 3) | \
> +				 (RV_X(x, 12, 1) << 5) | \
> +				 (RV_X(x, 2, 3) << 6))
> +#define RVC_SWSP_IMM(x)		((RV_X(x, 9, 4) << 2) | \
> +				 (RV_X(x, 7, 2) << 6))
> +#define RVC_SDSP_IMM(x)		((RV_X(x, 10, 3) << 3) | \
> +				 (RV_X(x, 7, 3) << 6))
> +#define RVC_RS1S(insn)		(8 + RV_X(insn, SH_RD, 3))
> +#define RVC_RS2S(insn)		(8 + RV_X(insn, SH_RS2C, 3))
> +#define RVC_RS2(insn)		RV_X(insn, SH_RS2C, 5)
> +
> +#define SHIFT_RIGHT(x, y)		\
> +	((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
> +
> +#define REG_MASK			\
> +	((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
> +
> +#define REG_OFFSET(insn, pos)		\
> +	(SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
> +
> +#define REG_PTR(insn, pos, regs)	\
> +	(ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
> +
> +#define GET_RM(insn)		(((insn) >> 12) & 7)
> +
> +#define GET_RS1(insn, regs)	(*REG_PTR(insn, SH_RS1, regs))
> +#define GET_RS2(insn, regs)	(*REG_PTR(insn, SH_RS2, regs))
> +#define GET_RS1S(insn, regs)	(*REG_PTR(RVC_RS1S(insn), 0, regs))
> +#define GET_RS2S(insn, regs)	(*REG_PTR(RVC_RS2S(insn), 0, regs))
> +#define GET_RS2C(insn, regs)	(*REG_PTR(insn, SH_RS2C, regs))
> +#define GET_SP(regs)		(*REG_PTR(2, 0, regs))
> +#define SET_RD(insn, regs, val)	(*REG_PTR(insn, SH_RD, regs) = (val))
> +#define IMM_I(insn)		((s32)(insn) >> 20)
> +#define IMM_S(insn)		(((s32)(insn) >> 25 << 5) | \
> +				 (s32)(((insn) >> 7) & 0x1f))
> +#define MASK_FUNCT3		0x7000
> +
> +#define STR(x)			XSTR(x)
> +#define XSTR(x)			#x
> +
> +/* TODO: Handle traps due to unpriv load and redirect it back to VS-mode */
> +static ulong get_insn(struct kvm_vcpu *vcpu)
> +{
> +	ulong __sepc = vcpu->arch.guest_context.sepc;
> +	ulong __hstatus, __sstatus, __vsstatus;
> +#ifdef CONFIG_RISCV_ISA_C
> +	ulong rvc_mask = 3, tmp;
> +#endif
> +	ulong flags, val;
> +
> +	local_irq_save(flags);
> +
> +	__vsstatus = csr_read(CSR_VSSTATUS);
> +	__sstatus = csr_read(CSR_SSTATUS);
> +	__hstatus = csr_read(CSR_HSTATUS);
> +
> +	csr_write(CSR_VSSTATUS, __vsstatus | SR_MXR);
> +	csr_write(CSR_SSTATUS, vcpu->arch.guest_context.sstatus | SR_MXR);
> +	csr_write(CSR_HSTATUS, vcpu->arch.guest_context.hstatus | HSTATUS_SPRV);

What happens when the insn load triggers a page fault, maybe because the 
guest was malicious and did

   1) Run on page 0x1000
   2) Remove map for 0x1000, do *not* flush TLB
   3) Trigger MMIO

That would DOS the host here, as the host kernel would continue running 
in guest address space, right?


Alex

> +
> +#ifndef CONFIG_RISCV_ISA_C
> +	asm ("\n"
> +#ifdef CONFIG_64BIT
> +		STR(LWU) " %[insn], (%[addr])\n"
> +#else
> +		STR(LW) " %[insn], (%[addr])\n"
> +#endif
> +		: [insn] "=&r" (val) : [addr] "r" (__sepc));
> +#else
> +	asm ("and %[tmp], %[addr], 2\n"
> +		"bnez %[tmp], 1f\n"
> +#ifdef CONFIG_64BIT
> +		STR(LWU) " %[insn], (%[addr])\n"
> +#else
> +		STR(LW) " %[insn], (%[addr])\n"
> +#endif
> +		"and %[tmp], %[insn], %[rvc_mask]\n"
> +		"beq %[tmp], %[rvc_mask], 2f\n"
> +		"sll %[insn], %[insn], %[xlen_minus_16]\n"
> +		"srl %[insn], %[insn], %[xlen_minus_16]\n"
> +		"j 2f\n"
> +		"1:\n"
> +		"lhu %[insn], (%[addr])\n"
> +		"and %[tmp], %[insn], %[rvc_mask]\n"
> +		"bne %[tmp], %[rvc_mask], 2f\n"
> +		"lhu %[tmp], 2(%[addr])\n"
> +		"sll %[tmp], %[tmp], 16\n"
> +		"add %[insn], %[insn], %[tmp]\n"
> +		"2:"
> +	: [vsstatus] "+&r" (__vsstatus), [insn] "=&r" (val),
> +	  [tmp] "=&r" (tmp)
> +	: [addr] "r" (__sepc), [rvc_mask] "r" (rvc_mask),
> +	  [xlen_minus_16] "i" (__riscv_xlen - 16));
> +#endif
> +
> +	csr_write(CSR_HSTATUS, __hstatus);
> +	csr_write(CSR_SSTATUS, __sstatus);
> +	csr_write(CSR_VSSTATUS, __vsstatus);
> +
> +	local_irq_restore(flags);
> +
> +	return val;
> +}
Andrew Jones Aug. 22, 2019, 12:21 p.m. UTC | #3
On Thu, Aug 22, 2019 at 02:10:48PM +0200, Alexander Graf wrote:
> On 22.08.19 10:44, Anup Patel wrote:
...
> > +static int emulate_load(struct kvm_vcpu *vcpu, struct kvm_run *run,
> > +			unsigned long fault_addr)
...
> > +	/* Exit to userspace for MMIO emulation */
> > +	vcpu->stat.mmio_exit_user++;
> > +	run->exit_reason = KVM_EXIT_MMIO;
> > +	run->mmio.is_write = false;
> > +	run->mmio.phys_addr = fault_addr;
> > +	run->mmio.len = len;
> > +
> > +	/* Move to next instruction */
> > +	vcpu->arch.guest_context.sepc += INSN_LEN(insn);
> 
> Doesn't that make more sense on the reentry path? What if you want to inject
> an MCE on access to unmapped addresses from user space?
>

I agree. See commit 0d640732dbeb for arm's justification for moving
the instruction skip. But also see

https://patchwork.kernel.org/patch/11109063/

for a needed fix to avoid skipping the instructions multiple times.
It looks like riscv's KVM_RUN ioctl would be vulnerable to that as
well.

Thanks,
drew
Anup Patel Aug. 22, 2019, 12:27 p.m. UTC | #4
On Thu, Aug 22, 2019 at 5:40 PM Alexander Graf <graf@amazon.com> wrote:
>
> On 22.08.19 10:44, Anup Patel wrote:
> > We will get stage2 page faults whenever Guest/VM access SW emulated
> > MMIO device or unmapped Guest RAM.
> >
> > This patch implements MMIO read/write emulation by extracting MMIO
> > details from the trapped load/store instruction and forwarding the
> > MMIO read/write to user-space. The actual MMIO emulation will happen
> > in user-space and KVM kernel module will only take care of register
> > updates before resuming the trapped VCPU.
> >
> > The handling for stage2 page faults for unmapped Guest RAM will be
> > implemeted by a separate patch later.
> >
> > Signed-off-by: Anup Patel <anup.patel@wdc.com>
> > Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >   arch/riscv/include/asm/kvm_host.h |  11 +
> >   arch/riscv/kvm/mmu.c              |   7 +
> >   arch/riscv/kvm/vcpu_exit.c        | 436 +++++++++++++++++++++++++++++-
> >   3 files changed, 451 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> > index 18f1097f1d8d..4388bace6d70 100644
> > --- a/arch/riscv/include/asm/kvm_host.h
> > +++ b/arch/riscv/include/asm/kvm_host.h
> > @@ -53,6 +53,12 @@ struct kvm_arch {
> >       phys_addr_t pgd_phys;
> >   };
> >
> > +struct kvm_mmio_decode {
> > +     unsigned long insn;
> > +     int len;
> > +     int shift;
> > +};
> > +
> >   struct kvm_cpu_context {
> >       unsigned long zero;
> >       unsigned long ra;
> > @@ -141,6 +147,9 @@ struct kvm_vcpu_arch {
> >       unsigned long irqs_pending;
> >       unsigned long irqs_pending_mask;
> >
> > +     /* MMIO instruction details */
> > +     struct kvm_mmio_decode mmio_decode;
> > +
> >       /* VCPU power-off state */
> >       bool power_off;
> >
> > @@ -160,6 +169,8 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
> >   int kvm_riscv_setup_vsip(void);
> >   void kvm_riscv_cleanup_vsip(void);
> >
> > +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> > +                      bool is_write);
> >   void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu);
> >   int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm);
> >   void kvm_riscv_stage2_free_pgd(struct kvm *kvm);
> > diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> > index 04dd089b86ff..2b965f9aac07 100644
> > --- a/arch/riscv/kvm/mmu.c
> > +++ b/arch/riscv/kvm/mmu.c
> > @@ -61,6 +61,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
> >       return 0;
> >   }
> >
> > +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> > +                      bool is_write)
> > +{
> > +     /* TODO: */
> > +     return 0;
> > +}
> > +
> >   void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu)
> >   {
> >       /* TODO: */
> > diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
> > index e4d7c8f0807a..efc06198c259 100644
> > --- a/arch/riscv/kvm/vcpu_exit.c
> > +++ b/arch/riscv/kvm/vcpu_exit.c
> > @@ -6,9 +6,371 @@
> >    *     Anup Patel <anup.patel@wdc.com>
> >    */
> >
> > +#include <linux/bitops.h>
> >   #include <linux/errno.h>
> >   #include <linux/err.h>
> >   #include <linux/kvm_host.h>
> > +#include <asm/csr.h>
> > +
> > +#define INSN_MATCH_LB                0x3
> > +#define INSN_MASK_LB         0x707f
> > +#define INSN_MATCH_LH                0x1003
> > +#define INSN_MASK_LH         0x707f
> > +#define INSN_MATCH_LW                0x2003
> > +#define INSN_MASK_LW         0x707f
> > +#define INSN_MATCH_LD                0x3003
> > +#define INSN_MASK_LD         0x707f
> > +#define INSN_MATCH_LBU               0x4003
> > +#define INSN_MASK_LBU                0x707f
> > +#define INSN_MATCH_LHU               0x5003
> > +#define INSN_MASK_LHU                0x707f
> > +#define INSN_MATCH_LWU               0x6003
> > +#define INSN_MASK_LWU                0x707f
> > +#define INSN_MATCH_SB                0x23
> > +#define INSN_MASK_SB         0x707f
> > +#define INSN_MATCH_SH                0x1023
> > +#define INSN_MASK_SH         0x707f
> > +#define INSN_MATCH_SW                0x2023
> > +#define INSN_MASK_SW         0x707f
> > +#define INSN_MATCH_SD                0x3023
> > +#define INSN_MASK_SD         0x707f
> > +
> > +#define INSN_MATCH_C_LD              0x6000
> > +#define INSN_MASK_C_LD               0xe003
> > +#define INSN_MATCH_C_SD              0xe000
> > +#define INSN_MASK_C_SD               0xe003
> > +#define INSN_MATCH_C_LW              0x4000
> > +#define INSN_MASK_C_LW               0xe003
> > +#define INSN_MATCH_C_SW              0xc000
> > +#define INSN_MASK_C_SW               0xe003
> > +#define INSN_MATCH_C_LDSP    0x6002
> > +#define INSN_MASK_C_LDSP     0xe003
> > +#define INSN_MATCH_C_SDSP    0xe002
> > +#define INSN_MASK_C_SDSP     0xe003
> > +#define INSN_MATCH_C_LWSP    0x4002
> > +#define INSN_MASK_C_LWSP     0xe003
> > +#define INSN_MATCH_C_SWSP    0xc002
> > +#define INSN_MASK_C_SWSP     0xe003
> > +
> > +#define INSN_LEN(insn)               ((((insn) & 0x3) < 0x3) ? 2 : 4)
> > +
> > +#ifdef CONFIG_64BIT
> > +#define LOG_REGBYTES         3
> > +#else
> > +#define LOG_REGBYTES         2
> > +#endif
> > +#define REGBYTES             (1 << LOG_REGBYTES)
> > +
> > +#define SH_RD                        7
> > +#define SH_RS1                       15
> > +#define SH_RS2                       20
> > +#define SH_RS2C                      2
> > +
> > +#define RV_X(x, s, n)                (((x) >> (s)) & ((1 << (n)) - 1))
> > +#define RVC_LW_IMM(x)                ((RV_X(x, 6, 1) << 2) | \
> > +                              (RV_X(x, 10, 3) << 3) | \
> > +                              (RV_X(x, 5, 1) << 6))
> > +#define RVC_LD_IMM(x)                ((RV_X(x, 10, 3) << 3) | \
> > +                              (RV_X(x, 5, 2) << 6))
> > +#define RVC_LWSP_IMM(x)              ((RV_X(x, 4, 3) << 2) | \
> > +                              (RV_X(x, 12, 1) << 5) | \
> > +                              (RV_X(x, 2, 2) << 6))
> > +#define RVC_LDSP_IMM(x)              ((RV_X(x, 5, 2) << 3) | \
> > +                              (RV_X(x, 12, 1) << 5) | \
> > +                              (RV_X(x, 2, 3) << 6))
> > +#define RVC_SWSP_IMM(x)              ((RV_X(x, 9, 4) << 2) | \
> > +                              (RV_X(x, 7, 2) << 6))
> > +#define RVC_SDSP_IMM(x)              ((RV_X(x, 10, 3) << 3) | \
> > +                              (RV_X(x, 7, 3) << 6))
> > +#define RVC_RS1S(insn)               (8 + RV_X(insn, SH_RD, 3))
> > +#define RVC_RS2S(insn)               (8 + RV_X(insn, SH_RS2C, 3))
> > +#define RVC_RS2(insn)                RV_X(insn, SH_RS2C, 5)
> > +
> > +#define SHIFT_RIGHT(x, y)            \
> > +     ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
> > +
> > +#define REG_MASK                     \
> > +     ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
> > +
> > +#define REG_OFFSET(insn, pos)                \
> > +     (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
> > +
> > +#define REG_PTR(insn, pos, regs)     \
> > +     (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
> > +
> > +#define GET_RM(insn)         (((insn) >> 12) & 7)
> > +
> > +#define GET_RS1(insn, regs)  (*REG_PTR(insn, SH_RS1, regs))
> > +#define GET_RS2(insn, regs)  (*REG_PTR(insn, SH_RS2, regs))
> > +#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs))
> > +#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs))
> > +#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs))
> > +#define GET_SP(regs)         (*REG_PTR(2, 0, regs))
> > +#define SET_RD(insn, regs, val)      (*REG_PTR(insn, SH_RD, regs) = (val))
> > +#define IMM_I(insn)          ((s32)(insn) >> 20)
> > +#define IMM_S(insn)          (((s32)(insn) >> 25 << 5) | \
> > +                              (s32)(((insn) >> 7) & 0x1f))
> > +#define MASK_FUNCT3          0x7000
> > +
> > +#define STR(x)                       XSTR(x)
> > +#define XSTR(x)                      #x
> > +
> > +/* TODO: Handle traps due to unpriv load and redirect it back to VS-mode */
> > +static ulong get_insn(struct kvm_vcpu *vcpu)
> > +{
> > +     ulong __sepc = vcpu->arch.guest_context.sepc;
> > +     ulong __hstatus, __sstatus, __vsstatus;
> > +#ifdef CONFIG_RISCV_ISA_C
> > +     ulong rvc_mask = 3, tmp;
> > +#endif
> > +     ulong flags, val;
> > +
> > +     local_irq_save(flags);
> > +
> > +     __vsstatus = csr_read(CSR_VSSTATUS);
> > +     __sstatus = csr_read(CSR_SSTATUS);
> > +     __hstatus = csr_read(CSR_HSTATUS);
> > +
> > +     csr_write(CSR_VSSTATUS, __vsstatus | SR_MXR);
> > +     csr_write(CSR_SSTATUS, vcpu->arch.guest_context.sstatus | SR_MXR);
> > +     csr_write(CSR_HSTATUS, vcpu->arch.guest_context.hstatus | HSTATUS_SPRV);
> > +
> > +#ifndef CONFIG_RISCV_ISA_C
> > +     asm ("\n"
> > +#ifdef CONFIG_64BIT
> > +             STR(LWU) " %[insn], (%[addr])\n"
> > +#else
> > +             STR(LW) " %[insn], (%[addr])\n"
> > +#endif
> > +             : [insn] "=&r" (val) : [addr] "r" (__sepc));
> > +#else
> > +     asm ("and %[tmp], %[addr], 2\n"
> > +             "bnez %[tmp], 1f\n"
> > +#ifdef CONFIG_64BIT
> > +             STR(LWU) " %[insn], (%[addr])\n"
> > +#else
> > +             STR(LW) " %[insn], (%[addr])\n"
> > +#endif
> > +             "and %[tmp], %[insn], %[rvc_mask]\n"
> > +             "beq %[tmp], %[rvc_mask], 2f\n"
> > +             "sll %[insn], %[insn], %[xlen_minus_16]\n"
> > +             "srl %[insn], %[insn], %[xlen_minus_16]\n"
> > +             "j 2f\n"
> > +             "1:\n"
> > +             "lhu %[insn], (%[addr])\n"
> > +             "and %[tmp], %[insn], %[rvc_mask]\n"
> > +             "bne %[tmp], %[rvc_mask], 2f\n"
> > +             "lhu %[tmp], 2(%[addr])\n"
> > +             "sll %[tmp], %[tmp], 16\n"
> > +             "add %[insn], %[insn], %[tmp]\n"
> > +             "2:"
> > +     : [vsstatus] "+&r" (__vsstatus), [insn] "=&r" (val),
> > +       [tmp] "=&r" (tmp)
> > +     : [addr] "r" (__sepc), [rvc_mask] "r" (rvc_mask),
> > +       [xlen_minus_16] "i" (__riscv_xlen - 16));
> > +#endif
> > +
> > +     csr_write(CSR_HSTATUS, __hstatus);
> > +     csr_write(CSR_SSTATUS, __sstatus);
> > +     csr_write(CSR_VSSTATUS, __vsstatus);
> > +
> > +     local_irq_restore(flags);
> > +
> > +     return val;
> > +}
> > +
> > +static int emulate_load(struct kvm_vcpu *vcpu, struct kvm_run *run,
> > +                     unsigned long fault_addr)
> > +{
> > +     int shift = 0, len = 0;
> > +     ulong insn = get_insn(vcpu);
> > +
> > +     /* Decode length of MMIO and shift */
> > +     if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
> > +             len = 4;
> > +             shift = 8 * (sizeof(ulong) - len);
> > +     } else if ((insn & INSN_MASK_LB) == INSN_MATCH_LB) {
> > +             len = 1;
> > +             shift = 8 * (sizeof(ulong) - len);
> > +     } else if ((insn & INSN_MASK_LBU) == INSN_MATCH_LBU) {
> > +             len = 1;
> > +             shift = 8 * (sizeof(ulong) - len);
> > +#ifdef CONFIG_64BIT
> > +     } else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) {
> > +             len = 8;
> > +             shift = 8 * (sizeof(ulong) - len);
> > +     } else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) {
> > +             len = 4;
> > +#endif
> > +     } else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) {
> > +             len = 2;
> > +             shift = 8 * (sizeof(ulong) - len);
> > +     } else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) {
> > +             len = 2;
> > +#ifdef CONFIG_RISCV_ISA_C
> > +#ifdef CONFIG_64BIT
> > +     } else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) {
> > +             len = 8;
> > +             shift = 8 * (sizeof(ulong) - len);
> > +             insn = RVC_RS2S(insn) << SH_RD;
> > +     } else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP &&
> > +                ((insn >> SH_RD) & 0x1f)) {
> > +             len = 8;
> > +             shift = 8 * (sizeof(ulong) - len);
> > +#endif
> > +     } else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) {
> > +             len = 4;
> > +             shift = 8 * (sizeof(ulong) - len);
> > +             insn = RVC_RS2S(insn) << SH_RD;
> > +     } else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP &&
> > +                ((insn >> SH_RD) & 0x1f)) {
> > +             len = 4;
> > +             shift = 8 * (sizeof(ulong) - len);
> > +#endif
> > +     } else {
> > +             return -ENOTSUPP;
> > +     }
> > +
> > +     /* Fault address should be aligned to length of MMIO */
> > +     if (fault_addr & (len - 1))
> > +             return -EIO;
> > +
> > +     /* Save instruction decode info */
> > +     vcpu->arch.mmio_decode.insn = insn;
> > +     vcpu->arch.mmio_decode.shift = shift;
> > +     vcpu->arch.mmio_decode.len = len;
> > +
> > +     /* Exit to userspace for MMIO emulation */
> > +     vcpu->stat.mmio_exit_user++;
> > +     run->exit_reason = KVM_EXIT_MMIO;
> > +     run->mmio.is_write = false;
> > +     run->mmio.phys_addr = fault_addr;
> > +     run->mmio.len = len;
> > +
> > +     /* Move to next instruction */
> > +     vcpu->arch.guest_context.sepc += INSN_LEN(insn);
>
> Doesn't that make more sense on the reentry path? What if you want to
> inject an MCE on access to unmapped addresses from user space?

This a good suggestion. I did not think about debugging aspect.

I will update this patch accordingly.

>
> > +
> > +     return 0;
> > +}
> > +
> > +static int emulate_store(struct kvm_vcpu *vcpu, struct kvm_run *run,
> > +                      unsigned long fault_addr)
> > +{
> > +     u8 data8;
> > +     u16 data16;
> > +     u32 data32;
> > +     u64 data64;
> > +     ulong data;
> > +     int len = 0;
> > +     ulong insn = get_insn(vcpu);
> > +
> > +     data = GET_RS2(insn, &vcpu->arch.guest_context);
> > +     data8 = data16 = data32 = data64 = data;
> > +
> > +     if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) {
> > +             len = 4;
> > +     } else if ((insn & INSN_MASK_SB) == INSN_MATCH_SB) {
> > +             len = 1;
> > +#ifdef CONFIG_64BIT
> > +     } else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) {
> > +             len = 8;
> > +#endif
> > +     } else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) {
> > +             len = 2;
> > +#ifdef CONFIG_RISCV_ISA_C
> > +#ifdef CONFIG_64BIT
> > +     } else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) {
> > +             len = 8;
> > +             data64 = GET_RS2S(insn, &vcpu->arch.guest_context);
> > +     } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP &&
> > +                ((insn >> SH_RD) & 0x1f)) {
> > +             len = 8;
> > +             data64 = GET_RS2C(insn, &vcpu->arch.guest_context);
> > +#endif
> > +     } else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) {
> > +             len = 4;
> > +             data32 = GET_RS2S(insn, &vcpu->arch.guest_context);
> > +     } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP &&
> > +                ((insn >> SH_RD) & 0x1f)) {
> > +             len = 4;
> > +             data32 = GET_RS2C(insn, &vcpu->arch.guest_context);
> > +#endif
> > +     } else {
> > +             return -ENOTSUPP;
> > +     }
> > +
> > +     /* Fault address should be aligned to length of MMIO */
> > +     if (fault_addr & (len - 1))
> > +             return -EIO;
> > +
> > +     /* Clear instruction decode info */
> > +     vcpu->arch.mmio_decode.insn = 0;
> > +     vcpu->arch.mmio_decode.shift = 0;
> > +     vcpu->arch.mmio_decode.len = 0;
> > +
> > +     /* Copy data to kvm_run instance */
> > +     switch (len) {
> > +     case 1:
> > +             *((u8 *)run->mmio.data) = data8;
> > +             break;
> > +     case 2:
> > +             *((u16 *)run->mmio.data) = data16;
> > +             break;
> > +     case 4:
> > +             *((u32 *)run->mmio.data) = data32;
> > +             break;
> > +     case 8:
> > +             *((u64 *)run->mmio.data) = data64;
> > +             break;
> > +     default:
> > +             return -ENOTSUPP;
> > +     };
> > +
> > +     /* Exit to userspace for MMIO emulation */
> > +     vcpu->stat.mmio_exit_user++;
> > +     run->exit_reason = KVM_EXIT_MMIO;
> > +     run->mmio.is_write = true;
> > +     run->mmio.phys_addr = fault_addr;
> > +     run->mmio.len = len;
> > +
> > +     /* Move to next instruction */
> > +     vcpu->arch.guest_context.sepc += INSN_LEN(insn);
>
> Same comment here.

Sure, I will update.

>
>
> Alex

Regards,
Anup
Anup Patel Aug. 22, 2019, 12:33 p.m. UTC | #5
On Thu, Aug 22, 2019 at 5:44 PM Alexander Graf <graf@amazon.com> wrote:
>
> On 22.08.19 10:44, Anup Patel wrote:
> > We will get stage2 page faults whenever Guest/VM access SW emulated
> > MMIO device or unmapped Guest RAM.
> >
> > This patch implements MMIO read/write emulation by extracting MMIO
> > details from the trapped load/store instruction and forwarding the
> > MMIO read/write to user-space. The actual MMIO emulation will happen
> > in user-space and KVM kernel module will only take care of register
> > updates before resuming the trapped VCPU.
> >
> > The handling for stage2 page faults for unmapped Guest RAM will be
> > implemeted by a separate patch later.
> >
> > Signed-off-by: Anup Patel <anup.patel@wdc.com>
> > Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >   arch/riscv/include/asm/kvm_host.h |  11 +
> >   arch/riscv/kvm/mmu.c              |   7 +
> >   arch/riscv/kvm/vcpu_exit.c        | 436 +++++++++++++++++++++++++++++-
> >   3 files changed, 451 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> > index 18f1097f1d8d..4388bace6d70 100644
> > --- a/arch/riscv/include/asm/kvm_host.h
> > +++ b/arch/riscv/include/asm/kvm_host.h
> > @@ -53,6 +53,12 @@ struct kvm_arch {
> >       phys_addr_t pgd_phys;
> >   };
> >
> > +struct kvm_mmio_decode {
> > +     unsigned long insn;
> > +     int len;
> > +     int shift;
> > +};
> > +
> >   struct kvm_cpu_context {
> >       unsigned long zero;
> >       unsigned long ra;
> > @@ -141,6 +147,9 @@ struct kvm_vcpu_arch {
> >       unsigned long irqs_pending;
> >       unsigned long irqs_pending_mask;
> >
> > +     /* MMIO instruction details */
> > +     struct kvm_mmio_decode mmio_decode;
> > +
> >       /* VCPU power-off state */
> >       bool power_off;
> >
> > @@ -160,6 +169,8 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
> >   int kvm_riscv_setup_vsip(void);
> >   void kvm_riscv_cleanup_vsip(void);
> >
> > +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> > +                      bool is_write);
> >   void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu);
> >   int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm);
> >   void kvm_riscv_stage2_free_pgd(struct kvm *kvm);
> > diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> > index 04dd089b86ff..2b965f9aac07 100644
> > --- a/arch/riscv/kvm/mmu.c
> > +++ b/arch/riscv/kvm/mmu.c
> > @@ -61,6 +61,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
> >       return 0;
> >   }
> >
> > +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> > +                      bool is_write)
> > +{
> > +     /* TODO: */
> > +     return 0;
> > +}
> > +
> >   void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu)
> >   {
> >       /* TODO: */
> > diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
> > index e4d7c8f0807a..efc06198c259 100644
> > --- a/arch/riscv/kvm/vcpu_exit.c
> > +++ b/arch/riscv/kvm/vcpu_exit.c
> > @@ -6,9 +6,371 @@
> >    *     Anup Patel <anup.patel@wdc.com>
> >    */
> >
> > +#include <linux/bitops.h>
> >   #include <linux/errno.h>
> >   #include <linux/err.h>
> >   #include <linux/kvm_host.h>
> > +#include <asm/csr.h>
> > +
> > +#define INSN_MATCH_LB                0x3
> > +#define INSN_MASK_LB         0x707f
> > +#define INSN_MATCH_LH                0x1003
> > +#define INSN_MASK_LH         0x707f
> > +#define INSN_MATCH_LW                0x2003
> > +#define INSN_MASK_LW         0x707f
> > +#define INSN_MATCH_LD                0x3003
> > +#define INSN_MASK_LD         0x707f
> > +#define INSN_MATCH_LBU               0x4003
> > +#define INSN_MASK_LBU                0x707f
> > +#define INSN_MATCH_LHU               0x5003
> > +#define INSN_MASK_LHU                0x707f
> > +#define INSN_MATCH_LWU               0x6003
> > +#define INSN_MASK_LWU                0x707f
> > +#define INSN_MATCH_SB                0x23
> > +#define INSN_MASK_SB         0x707f
> > +#define INSN_MATCH_SH                0x1023
> > +#define INSN_MASK_SH         0x707f
> > +#define INSN_MATCH_SW                0x2023
> > +#define INSN_MASK_SW         0x707f
> > +#define INSN_MATCH_SD                0x3023
> > +#define INSN_MASK_SD         0x707f
> > +
> > +#define INSN_MATCH_C_LD              0x6000
> > +#define INSN_MASK_C_LD               0xe003
> > +#define INSN_MATCH_C_SD              0xe000
> > +#define INSN_MASK_C_SD               0xe003
> > +#define INSN_MATCH_C_LW              0x4000
> > +#define INSN_MASK_C_LW               0xe003
> > +#define INSN_MATCH_C_SW              0xc000
> > +#define INSN_MASK_C_SW               0xe003
> > +#define INSN_MATCH_C_LDSP    0x6002
> > +#define INSN_MASK_C_LDSP     0xe003
> > +#define INSN_MATCH_C_SDSP    0xe002
> > +#define INSN_MASK_C_SDSP     0xe003
> > +#define INSN_MATCH_C_LWSP    0x4002
> > +#define INSN_MASK_C_LWSP     0xe003
> > +#define INSN_MATCH_C_SWSP    0xc002
> > +#define INSN_MASK_C_SWSP     0xe003
> > +
> > +#define INSN_LEN(insn)               ((((insn) & 0x3) < 0x3) ? 2 : 4)
> > +
> > +#ifdef CONFIG_64BIT
> > +#define LOG_REGBYTES         3
> > +#else
> > +#define LOG_REGBYTES         2
> > +#endif
> > +#define REGBYTES             (1 << LOG_REGBYTES)
> > +
> > +#define SH_RD                        7
> > +#define SH_RS1                       15
> > +#define SH_RS2                       20
> > +#define SH_RS2C                      2
> > +
> > +#define RV_X(x, s, n)                (((x) >> (s)) & ((1 << (n)) - 1))
> > +#define RVC_LW_IMM(x)                ((RV_X(x, 6, 1) << 2) | \
> > +                              (RV_X(x, 10, 3) << 3) | \
> > +                              (RV_X(x, 5, 1) << 6))
> > +#define RVC_LD_IMM(x)                ((RV_X(x, 10, 3) << 3) | \
> > +                              (RV_X(x, 5, 2) << 6))
> > +#define RVC_LWSP_IMM(x)              ((RV_X(x, 4, 3) << 2) | \
> > +                              (RV_X(x, 12, 1) << 5) | \
> > +                              (RV_X(x, 2, 2) << 6))
> > +#define RVC_LDSP_IMM(x)              ((RV_X(x, 5, 2) << 3) | \
> > +                              (RV_X(x, 12, 1) << 5) | \
> > +                              (RV_X(x, 2, 3) << 6))
> > +#define RVC_SWSP_IMM(x)              ((RV_X(x, 9, 4) << 2) | \
> > +                              (RV_X(x, 7, 2) << 6))
> > +#define RVC_SDSP_IMM(x)              ((RV_X(x, 10, 3) << 3) | \
> > +                              (RV_X(x, 7, 3) << 6))
> > +#define RVC_RS1S(insn)               (8 + RV_X(insn, SH_RD, 3))
> > +#define RVC_RS2S(insn)               (8 + RV_X(insn, SH_RS2C, 3))
> > +#define RVC_RS2(insn)                RV_X(insn, SH_RS2C, 5)
> > +
> > +#define SHIFT_RIGHT(x, y)            \
> > +     ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
> > +
> > +#define REG_MASK                     \
> > +     ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
> > +
> > +#define REG_OFFSET(insn, pos)                \
> > +     (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
> > +
> > +#define REG_PTR(insn, pos, regs)     \
> > +     (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
> > +
> > +#define GET_RM(insn)         (((insn) >> 12) & 7)
> > +
> > +#define GET_RS1(insn, regs)  (*REG_PTR(insn, SH_RS1, regs))
> > +#define GET_RS2(insn, regs)  (*REG_PTR(insn, SH_RS2, regs))
> > +#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs))
> > +#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs))
> > +#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs))
> > +#define GET_SP(regs)         (*REG_PTR(2, 0, regs))
> > +#define SET_RD(insn, regs, val)      (*REG_PTR(insn, SH_RD, regs) = (val))
> > +#define IMM_I(insn)          ((s32)(insn) >> 20)
> > +#define IMM_S(insn)          (((s32)(insn) >> 25 << 5) | \
> > +                              (s32)(((insn) >> 7) & 0x1f))
> > +#define MASK_FUNCT3          0x7000
> > +
> > +#define STR(x)                       XSTR(x)
> > +#define XSTR(x)                      #x
> > +
> > +/* TODO: Handle traps due to unpriv load and redirect it back to VS-mode */
> > +static ulong get_insn(struct kvm_vcpu *vcpu)
> > +{
> > +     ulong __sepc = vcpu->arch.guest_context.sepc;
> > +     ulong __hstatus, __sstatus, __vsstatus;
> > +#ifdef CONFIG_RISCV_ISA_C
> > +     ulong rvc_mask = 3, tmp;
> > +#endif
> > +     ulong flags, val;
> > +
> > +     local_irq_save(flags);
> > +
> > +     __vsstatus = csr_read(CSR_VSSTATUS);
> > +     __sstatus = csr_read(CSR_SSTATUS);
> > +     __hstatus = csr_read(CSR_HSTATUS);
> > +
> > +     csr_write(CSR_VSSTATUS, __vsstatus | SR_MXR);
> > +     csr_write(CSR_SSTATUS, vcpu->arch.guest_context.sstatus | SR_MXR);
> > +     csr_write(CSR_HSTATUS, vcpu->arch.guest_context.hstatus | HSTATUS_SPRV);
>
> What happens when the insn load triggers a page fault, maybe because the
> guest was malicious and did
>
>    1) Run on page 0x1000
>    2) Remove map for 0x1000, do *not* flush TLB
>    3) Trigger MMIO
>
> That would DOS the host here, as the host kernel would continue running
> in guest address space, right?

Yes, we can certainly fault while accessing Guest instruction. We will
be fixing this issue in a followup series. We have mentioned this in cover
letter as well.

BTW, RISC-V spec is going to further improve to provide easy
access of faulting instruction to Hypervisor.
(Refer, https://github.com/riscv/riscv-isa-manual/issues/431)

Regards,
Anup

>
>
> Alex
>
> > +
> > +#ifndef CONFIG_RISCV_ISA_C
> > +     asm ("\n"
> > +#ifdef CONFIG_64BIT
> > +             STR(LWU) " %[insn], (%[addr])\n"
> > +#else
> > +             STR(LW) " %[insn], (%[addr])\n"
> > +#endif
> > +             : [insn] "=&r" (val) : [addr] "r" (__sepc));
> > +#else
> > +     asm ("and %[tmp], %[addr], 2\n"
> > +             "bnez %[tmp], 1f\n"
> > +#ifdef CONFIG_64BIT
> > +             STR(LWU) " %[insn], (%[addr])\n"
> > +#else
> > +             STR(LW) " %[insn], (%[addr])\n"
> > +#endif
> > +             "and %[tmp], %[insn], %[rvc_mask]\n"
> > +             "beq %[tmp], %[rvc_mask], 2f\n"
> > +             "sll %[insn], %[insn], %[xlen_minus_16]\n"
> > +             "srl %[insn], %[insn], %[xlen_minus_16]\n"
> > +             "j 2f\n"
> > +             "1:\n"
> > +             "lhu %[insn], (%[addr])\n"
> > +             "and %[tmp], %[insn], %[rvc_mask]\n"
> > +             "bne %[tmp], %[rvc_mask], 2f\n"
> > +             "lhu %[tmp], 2(%[addr])\n"
> > +             "sll %[tmp], %[tmp], 16\n"
> > +             "add %[insn], %[insn], %[tmp]\n"
> > +             "2:"
> > +     : [vsstatus] "+&r" (__vsstatus), [insn] "=&r" (val),
> > +       [tmp] "=&r" (tmp)
> > +     : [addr] "r" (__sepc), [rvc_mask] "r" (rvc_mask),
> > +       [xlen_minus_16] "i" (__riscv_xlen - 16));
> > +#endif
> > +
> > +     csr_write(CSR_HSTATUS, __hstatus);
> > +     csr_write(CSR_SSTATUS, __sstatus);
> > +     csr_write(CSR_VSSTATUS, __vsstatus);
> > +
> > +     local_irq_restore(flags);
> > +
> > +     return val;
> > +}
>
Alexander Graf Aug. 22, 2019, 1:25 p.m. UTC | #6
On 22.08.19 14:33, Anup Patel wrote:
> On Thu, Aug 22, 2019 at 5:44 PM Alexander Graf <graf@amazon.com> wrote:
>>
>> On 22.08.19 10:44, Anup Patel wrote:
>>> We will get stage2 page faults whenever Guest/VM access SW emulated
>>> MMIO device or unmapped Guest RAM.
>>>
>>> This patch implements MMIO read/write emulation by extracting MMIO
>>> details from the trapped load/store instruction and forwarding the
>>> MMIO read/write to user-space. The actual MMIO emulation will happen
>>> in user-space and KVM kernel module will only take care of register
>>> updates before resuming the trapped VCPU.
>>>
>>> The handling for stage2 page faults for unmapped Guest RAM will be
>>> implemeted by a separate patch later.
>>>
>>> Signed-off-by: Anup Patel <anup.patel@wdc.com>
>>> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
>>> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
>>> ---
>>>    arch/riscv/include/asm/kvm_host.h |  11 +
>>>    arch/riscv/kvm/mmu.c              |   7 +
>>>    arch/riscv/kvm/vcpu_exit.c        | 436 +++++++++++++++++++++++++++++-
>>>    3 files changed, 451 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
>>> index 18f1097f1d8d..4388bace6d70 100644
>>> --- a/arch/riscv/include/asm/kvm_host.h
>>> +++ b/arch/riscv/include/asm/kvm_host.h
>>> @@ -53,6 +53,12 @@ struct kvm_arch {
>>>        phys_addr_t pgd_phys;
>>>    };
>>>
>>> +struct kvm_mmio_decode {
>>> +     unsigned long insn;
>>> +     int len;
>>> +     int shift;
>>> +};
>>> +
>>>    struct kvm_cpu_context {
>>>        unsigned long zero;
>>>        unsigned long ra;
>>> @@ -141,6 +147,9 @@ struct kvm_vcpu_arch {
>>>        unsigned long irqs_pending;
>>>        unsigned long irqs_pending_mask;
>>>
>>> +     /* MMIO instruction details */
>>> +     struct kvm_mmio_decode mmio_decode;
>>> +
>>>        /* VCPU power-off state */
>>>        bool power_off;
>>>
>>> @@ -160,6 +169,8 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
>>>    int kvm_riscv_setup_vsip(void);
>>>    void kvm_riscv_cleanup_vsip(void);
>>>
>>> +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
>>> +                      bool is_write);
>>>    void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu);
>>>    int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm);
>>>    void kvm_riscv_stage2_free_pgd(struct kvm *kvm);
>>> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
>>> index 04dd089b86ff..2b965f9aac07 100644
>>> --- a/arch/riscv/kvm/mmu.c
>>> +++ b/arch/riscv/kvm/mmu.c
>>> @@ -61,6 +61,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>        return 0;
>>>    }
>>>
>>> +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
>>> +                      bool is_write)
>>> +{
>>> +     /* TODO: */
>>> +     return 0;
>>> +}
>>> +
>>>    void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu)
>>>    {
>>>        /* TODO: */
>>> diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
>>> index e4d7c8f0807a..efc06198c259 100644
>>> --- a/arch/riscv/kvm/vcpu_exit.c
>>> +++ b/arch/riscv/kvm/vcpu_exit.c
>>> @@ -6,9 +6,371 @@
>>>     *     Anup Patel <anup.patel@wdc.com>
>>>     */
>>>
>>> +#include <linux/bitops.h>
>>>    #include <linux/errno.h>
>>>    #include <linux/err.h>
>>>    #include <linux/kvm_host.h>
>>> +#include <asm/csr.h>
>>> +
>>> +#define INSN_MATCH_LB                0x3
>>> +#define INSN_MASK_LB         0x707f
>>> +#define INSN_MATCH_LH                0x1003
>>> +#define INSN_MASK_LH         0x707f
>>> +#define INSN_MATCH_LW                0x2003
>>> +#define INSN_MASK_LW         0x707f
>>> +#define INSN_MATCH_LD                0x3003
>>> +#define INSN_MASK_LD         0x707f
>>> +#define INSN_MATCH_LBU               0x4003
>>> +#define INSN_MASK_LBU                0x707f
>>> +#define INSN_MATCH_LHU               0x5003
>>> +#define INSN_MASK_LHU                0x707f
>>> +#define INSN_MATCH_LWU               0x6003
>>> +#define INSN_MASK_LWU                0x707f
>>> +#define INSN_MATCH_SB                0x23
>>> +#define INSN_MASK_SB         0x707f
>>> +#define INSN_MATCH_SH                0x1023
>>> +#define INSN_MASK_SH         0x707f
>>> +#define INSN_MATCH_SW                0x2023
>>> +#define INSN_MASK_SW         0x707f
>>> +#define INSN_MATCH_SD                0x3023
>>> +#define INSN_MASK_SD         0x707f
>>> +
>>> +#define INSN_MATCH_C_LD              0x6000
>>> +#define INSN_MASK_C_LD               0xe003
>>> +#define INSN_MATCH_C_SD              0xe000
>>> +#define INSN_MASK_C_SD               0xe003
>>> +#define INSN_MATCH_C_LW              0x4000
>>> +#define INSN_MASK_C_LW               0xe003
>>> +#define INSN_MATCH_C_SW              0xc000
>>> +#define INSN_MASK_C_SW               0xe003
>>> +#define INSN_MATCH_C_LDSP    0x6002
>>> +#define INSN_MASK_C_LDSP     0xe003
>>> +#define INSN_MATCH_C_SDSP    0xe002
>>> +#define INSN_MASK_C_SDSP     0xe003
>>> +#define INSN_MATCH_C_LWSP    0x4002
>>> +#define INSN_MASK_C_LWSP     0xe003
>>> +#define INSN_MATCH_C_SWSP    0xc002
>>> +#define INSN_MASK_C_SWSP     0xe003
>>> +
>>> +#define INSN_LEN(insn)               ((((insn) & 0x3) < 0x3) ? 2 : 4)
>>> +
>>> +#ifdef CONFIG_64BIT
>>> +#define LOG_REGBYTES         3
>>> +#else
>>> +#define LOG_REGBYTES         2
>>> +#endif
>>> +#define REGBYTES             (1 << LOG_REGBYTES)
>>> +
>>> +#define SH_RD                        7
>>> +#define SH_RS1                       15
>>> +#define SH_RS2                       20
>>> +#define SH_RS2C                      2
>>> +
>>> +#define RV_X(x, s, n)                (((x) >> (s)) & ((1 << (n)) - 1))
>>> +#define RVC_LW_IMM(x)                ((RV_X(x, 6, 1) << 2) | \
>>> +                              (RV_X(x, 10, 3) << 3) | \
>>> +                              (RV_X(x, 5, 1) << 6))
>>> +#define RVC_LD_IMM(x)                ((RV_X(x, 10, 3) << 3) | \
>>> +                              (RV_X(x, 5, 2) << 6))
>>> +#define RVC_LWSP_IMM(x)              ((RV_X(x, 4, 3) << 2) | \
>>> +                              (RV_X(x, 12, 1) << 5) | \
>>> +                              (RV_X(x, 2, 2) << 6))
>>> +#define RVC_LDSP_IMM(x)              ((RV_X(x, 5, 2) << 3) | \
>>> +                              (RV_X(x, 12, 1) << 5) | \
>>> +                              (RV_X(x, 2, 3) << 6))
>>> +#define RVC_SWSP_IMM(x)              ((RV_X(x, 9, 4) << 2) | \
>>> +                              (RV_X(x, 7, 2) << 6))
>>> +#define RVC_SDSP_IMM(x)              ((RV_X(x, 10, 3) << 3) | \
>>> +                              (RV_X(x, 7, 3) << 6))
>>> +#define RVC_RS1S(insn)               (8 + RV_X(insn, SH_RD, 3))
>>> +#define RVC_RS2S(insn)               (8 + RV_X(insn, SH_RS2C, 3))
>>> +#define RVC_RS2(insn)                RV_X(insn, SH_RS2C, 5)
>>> +
>>> +#define SHIFT_RIGHT(x, y)            \
>>> +     ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
>>> +
>>> +#define REG_MASK                     \
>>> +     ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
>>> +
>>> +#define REG_OFFSET(insn, pos)                \
>>> +     (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
>>> +
>>> +#define REG_PTR(insn, pos, regs)     \
>>> +     (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
>>> +
>>> +#define GET_RM(insn)         (((insn) >> 12) & 7)
>>> +
>>> +#define GET_RS1(insn, regs)  (*REG_PTR(insn, SH_RS1, regs))
>>> +#define GET_RS2(insn, regs)  (*REG_PTR(insn, SH_RS2, regs))
>>> +#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs))
>>> +#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs))
>>> +#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs))
>>> +#define GET_SP(regs)         (*REG_PTR(2, 0, regs))
>>> +#define SET_RD(insn, regs, val)      (*REG_PTR(insn, SH_RD, regs) = (val))
>>> +#define IMM_I(insn)          ((s32)(insn) >> 20)
>>> +#define IMM_S(insn)          (((s32)(insn) >> 25 << 5) | \
>>> +                              (s32)(((insn) >> 7) & 0x1f))
>>> +#define MASK_FUNCT3          0x7000
>>> +
>>> +#define STR(x)                       XSTR(x)
>>> +#define XSTR(x)                      #x
>>> +
>>> +/* TODO: Handle traps due to unpriv load and redirect it back to VS-mode */
>>> +static ulong get_insn(struct kvm_vcpu *vcpu)
>>> +{
>>> +     ulong __sepc = vcpu->arch.guest_context.sepc;
>>> +     ulong __hstatus, __sstatus, __vsstatus;
>>> +#ifdef CONFIG_RISCV_ISA_C
>>> +     ulong rvc_mask = 3, tmp;
>>> +#endif
>>> +     ulong flags, val;
>>> +
>>> +     local_irq_save(flags);
>>> +
>>> +     __vsstatus = csr_read(CSR_VSSTATUS);
>>> +     __sstatus = csr_read(CSR_SSTATUS);
>>> +     __hstatus = csr_read(CSR_HSTATUS);
>>> +
>>> +     csr_write(CSR_VSSTATUS, __vsstatus | SR_MXR);
>>> +     csr_write(CSR_SSTATUS, vcpu->arch.guest_context.sstatus | SR_MXR);
>>> +     csr_write(CSR_HSTATUS, vcpu->arch.guest_context.hstatus | HSTATUS_SPRV);
>>
>> What happens when the insn load triggers a page fault, maybe because the
>> guest was malicious and did
>>
>>     1) Run on page 0x1000
>>     2) Remove map for 0x1000, do *not* flush TLB
>>     3) Trigger MMIO
>>
>> That would DOS the host here, as the host kernel would continue running
>> in guest address space, right?
> 
> Yes, we can certainly fault while accessing Guest instruction. We will
> be fixing this issue in a followup series. We have mentioned this in cover
> letter as well.

I don't think the cover letter is the right place for such a comment. 
Please definitely put it into the code as well, pointing out that this 
is a known bug. Or even better yet: Fix it up properly :).

In fact, with a bug that dramatic, I'm not even sure we can safely 
include the code. We're consciously allowing user space to DOS the kernel.

> 
> BTW, RISC-V spec is going to further improve to provide easy
> access of faulting instruction to Hypervisor.
> (Refer, https://github.com/riscv/riscv-isa-manual/issues/431)

Yes, we have similar extensions on other archs. Is this going to be an 
optional addition or a mandatory bit of the hypervisor spec? If it's not 
mandatory, we can not rely on it, so the current path has to be safe.


Alex
Anup Patel Aug. 22, 2019, 1:55 p.m. UTC | #7
On Thu, Aug 22, 2019 at 6:55 PM Alexander Graf <graf@amazon.com> wrote:
>
>
>
> On 22.08.19 14:33, Anup Patel wrote:
> > On Thu, Aug 22, 2019 at 5:44 PM Alexander Graf <graf@amazon.com> wrote:
> >>
> >> On 22.08.19 10:44, Anup Patel wrote:
> >>> We will get stage2 page faults whenever Guest/VM access SW emulated
> >>> MMIO device or unmapped Guest RAM.
> >>>
> >>> This patch implements MMIO read/write emulation by extracting MMIO
> >>> details from the trapped load/store instruction and forwarding the
> >>> MMIO read/write to user-space. The actual MMIO emulation will happen
> >>> in user-space and KVM kernel module will only take care of register
> >>> updates before resuming the trapped VCPU.
> >>>
> >>> The handling for stage2 page faults for unmapped Guest RAM will be
> >>> implemeted by a separate patch later.
> >>>
> >>> Signed-off-by: Anup Patel <anup.patel@wdc.com>
> >>> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> >>> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> >>> ---
> >>>    arch/riscv/include/asm/kvm_host.h |  11 +
> >>>    arch/riscv/kvm/mmu.c              |   7 +
> >>>    arch/riscv/kvm/vcpu_exit.c        | 436 +++++++++++++++++++++++++++++-
> >>>    3 files changed, 451 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> >>> index 18f1097f1d8d..4388bace6d70 100644
> >>> --- a/arch/riscv/include/asm/kvm_host.h
> >>> +++ b/arch/riscv/include/asm/kvm_host.h
> >>> @@ -53,6 +53,12 @@ struct kvm_arch {
> >>>        phys_addr_t pgd_phys;
> >>>    };
> >>>
> >>> +struct kvm_mmio_decode {
> >>> +     unsigned long insn;
> >>> +     int len;
> >>> +     int shift;
> >>> +};
> >>> +
> >>>    struct kvm_cpu_context {
> >>>        unsigned long zero;
> >>>        unsigned long ra;
> >>> @@ -141,6 +147,9 @@ struct kvm_vcpu_arch {
> >>>        unsigned long irqs_pending;
> >>>        unsigned long irqs_pending_mask;
> >>>
> >>> +     /* MMIO instruction details */
> >>> +     struct kvm_mmio_decode mmio_decode;
> >>> +
> >>>        /* VCPU power-off state */
> >>>        bool power_off;
> >>>
> >>> @@ -160,6 +169,8 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
> >>>    int kvm_riscv_setup_vsip(void);
> >>>    void kvm_riscv_cleanup_vsip(void);
> >>>
> >>> +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> >>> +                      bool is_write);
> >>>    void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu);
> >>>    int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm);
> >>>    void kvm_riscv_stage2_free_pgd(struct kvm *kvm);
> >>> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> >>> index 04dd089b86ff..2b965f9aac07 100644
> >>> --- a/arch/riscv/kvm/mmu.c
> >>> +++ b/arch/riscv/kvm/mmu.c
> >>> @@ -61,6 +61,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
> >>>        return 0;
> >>>    }
> >>>
> >>> +int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
> >>> +                      bool is_write)
> >>> +{
> >>> +     /* TODO: */
> >>> +     return 0;
> >>> +}
> >>> +
> >>>    void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu)
> >>>    {
> >>>        /* TODO: */
> >>> diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
> >>> index e4d7c8f0807a..efc06198c259 100644
> >>> --- a/arch/riscv/kvm/vcpu_exit.c
> >>> +++ b/arch/riscv/kvm/vcpu_exit.c
> >>> @@ -6,9 +6,371 @@
> >>>     *     Anup Patel <anup.patel@wdc.com>
> >>>     */
> >>>
> >>> +#include <linux/bitops.h>
> >>>    #include <linux/errno.h>
> >>>    #include <linux/err.h>
> >>>    #include <linux/kvm_host.h>
> >>> +#include <asm/csr.h>
> >>> +
> >>> +#define INSN_MATCH_LB                0x3
> >>> +#define INSN_MASK_LB         0x707f
> >>> +#define INSN_MATCH_LH                0x1003
> >>> +#define INSN_MASK_LH         0x707f
> >>> +#define INSN_MATCH_LW                0x2003
> >>> +#define INSN_MASK_LW         0x707f
> >>> +#define INSN_MATCH_LD                0x3003
> >>> +#define INSN_MASK_LD         0x707f
> >>> +#define INSN_MATCH_LBU               0x4003
> >>> +#define INSN_MASK_LBU                0x707f
> >>> +#define INSN_MATCH_LHU               0x5003
> >>> +#define INSN_MASK_LHU                0x707f
> >>> +#define INSN_MATCH_LWU               0x6003
> >>> +#define INSN_MASK_LWU                0x707f
> >>> +#define INSN_MATCH_SB                0x23
> >>> +#define INSN_MASK_SB         0x707f
> >>> +#define INSN_MATCH_SH                0x1023
> >>> +#define INSN_MASK_SH         0x707f
> >>> +#define INSN_MATCH_SW                0x2023
> >>> +#define INSN_MASK_SW         0x707f
> >>> +#define INSN_MATCH_SD                0x3023
> >>> +#define INSN_MASK_SD         0x707f
> >>> +
> >>> +#define INSN_MATCH_C_LD              0x6000
> >>> +#define INSN_MASK_C_LD               0xe003
> >>> +#define INSN_MATCH_C_SD              0xe000
> >>> +#define INSN_MASK_C_SD               0xe003
> >>> +#define INSN_MATCH_C_LW              0x4000
> >>> +#define INSN_MASK_C_LW               0xe003
> >>> +#define INSN_MATCH_C_SW              0xc000
> >>> +#define INSN_MASK_C_SW               0xe003
> >>> +#define INSN_MATCH_C_LDSP    0x6002
> >>> +#define INSN_MASK_C_LDSP     0xe003
> >>> +#define INSN_MATCH_C_SDSP    0xe002
> >>> +#define INSN_MASK_C_SDSP     0xe003
> >>> +#define INSN_MATCH_C_LWSP    0x4002
> >>> +#define INSN_MASK_C_LWSP     0xe003
> >>> +#define INSN_MATCH_C_SWSP    0xc002
> >>> +#define INSN_MASK_C_SWSP     0xe003
> >>> +
> >>> +#define INSN_LEN(insn)               ((((insn) & 0x3) < 0x3) ? 2 : 4)
> >>> +
> >>> +#ifdef CONFIG_64BIT
> >>> +#define LOG_REGBYTES         3
> >>> +#else
> >>> +#define LOG_REGBYTES         2
> >>> +#endif
> >>> +#define REGBYTES             (1 << LOG_REGBYTES)
> >>> +
> >>> +#define SH_RD                        7
> >>> +#define SH_RS1                       15
> >>> +#define SH_RS2                       20
> >>> +#define SH_RS2C                      2
> >>> +
> >>> +#define RV_X(x, s, n)                (((x) >> (s)) & ((1 << (n)) - 1))
> >>> +#define RVC_LW_IMM(x)                ((RV_X(x, 6, 1) << 2) | \
> >>> +                              (RV_X(x, 10, 3) << 3) | \
> >>> +                              (RV_X(x, 5, 1) << 6))
> >>> +#define RVC_LD_IMM(x)                ((RV_X(x, 10, 3) << 3) | \
> >>> +                              (RV_X(x, 5, 2) << 6))
> >>> +#define RVC_LWSP_IMM(x)              ((RV_X(x, 4, 3) << 2) | \
> >>> +                              (RV_X(x, 12, 1) << 5) | \
> >>> +                              (RV_X(x, 2, 2) << 6))
> >>> +#define RVC_LDSP_IMM(x)              ((RV_X(x, 5, 2) << 3) | \
> >>> +                              (RV_X(x, 12, 1) << 5) | \
> >>> +                              (RV_X(x, 2, 3) << 6))
> >>> +#define RVC_SWSP_IMM(x)              ((RV_X(x, 9, 4) << 2) | \
> >>> +                              (RV_X(x, 7, 2) << 6))
> >>> +#define RVC_SDSP_IMM(x)              ((RV_X(x, 10, 3) << 3) | \
> >>> +                              (RV_X(x, 7, 3) << 6))
> >>> +#define RVC_RS1S(insn)               (8 + RV_X(insn, SH_RD, 3))
> >>> +#define RVC_RS2S(insn)               (8 + RV_X(insn, SH_RS2C, 3))
> >>> +#define RVC_RS2(insn)                RV_X(insn, SH_RS2C, 5)
> >>> +
> >>> +#define SHIFT_RIGHT(x, y)            \
> >>> +     ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
> >>> +
> >>> +#define REG_MASK                     \
> >>> +     ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
> >>> +
> >>> +#define REG_OFFSET(insn, pos)                \
> >>> +     (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
> >>> +
> >>> +#define REG_PTR(insn, pos, regs)     \
> >>> +     (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
> >>> +
> >>> +#define GET_RM(insn)         (((insn) >> 12) & 7)
> >>> +
> >>> +#define GET_RS1(insn, regs)  (*REG_PTR(insn, SH_RS1, regs))
> >>> +#define GET_RS2(insn, regs)  (*REG_PTR(insn, SH_RS2, regs))
> >>> +#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs))
> >>> +#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs))
> >>> +#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs))
> >>> +#define GET_SP(regs)         (*REG_PTR(2, 0, regs))
> >>> +#define SET_RD(insn, regs, val)      (*REG_PTR(insn, SH_RD, regs) = (val))
> >>> +#define IMM_I(insn)          ((s32)(insn) >> 20)
> >>> +#define IMM_S(insn)          (((s32)(insn) >> 25 << 5) | \
> >>> +                              (s32)(((insn) >> 7) & 0x1f))
> >>> +#define MASK_FUNCT3          0x7000
> >>> +
> >>> +#define STR(x)                       XSTR(x)
> >>> +#define XSTR(x)                      #x
> >>> +
> >>> +/* TODO: Handle traps due to unpriv load and redirect it back to VS-mode */
> >>> +static ulong get_insn(struct kvm_vcpu *vcpu)
> >>> +{
> >>> +     ulong __sepc = vcpu->arch.guest_context.sepc;
> >>> +     ulong __hstatus, __sstatus, __vsstatus;
> >>> +#ifdef CONFIG_RISCV_ISA_C
> >>> +     ulong rvc_mask = 3, tmp;
> >>> +#endif
> >>> +     ulong flags, val;
> >>> +
> >>> +     local_irq_save(flags);
> >>> +
> >>> +     __vsstatus = csr_read(CSR_VSSTATUS);
> >>> +     __sstatus = csr_read(CSR_SSTATUS);
> >>> +     __hstatus = csr_read(CSR_HSTATUS);
> >>> +
> >>> +     csr_write(CSR_VSSTATUS, __vsstatus | SR_MXR);
> >>> +     csr_write(CSR_SSTATUS, vcpu->arch.guest_context.sstatus | SR_MXR);
> >>> +     csr_write(CSR_HSTATUS, vcpu->arch.guest_context.hstatus | HSTATUS_SPRV);
> >>
> >> What happens when the insn load triggers a page fault, maybe because the
> >> guest was malicious and did
> >>
> >>     1) Run on page 0x1000
> >>     2) Remove map for 0x1000, do *not* flush TLB
> >>     3) Trigger MMIO
> >>
> >> That would DOS the host here, as the host kernel would continue running
> >> in guest address space, right?
> >
> > Yes, we can certainly fault while accessing Guest instruction. We will
> > be fixing this issue in a followup series. We have mentioned this in cover
> > letter as well.
>
> I don't think the cover letter is the right place for such a comment.
> Please definitely put it into the code as well, pointing out that this
> is a known bug. Or even better yet: Fix it up properly :).
>
> In fact, with a bug that dramatic, I'm not even sure we can safely
> include the code. We're consciously allowing user space to DOS the kernel.

There is already a TODO comment above get_insn() function.

>
> >
> > BTW, RISC-V spec is going to further improve to provide easy
> > access of faulting instruction to Hypervisor.
> > (Refer, https://github.com/riscv/riscv-isa-manual/issues/431)
>
> Yes, we have similar extensions on other archs. Is this going to be an
> optional addition or a mandatory bit of the hypervisor spec? If it's not
> mandatory, we can not rely on it, so the current path has to be safe.

Yes, it's going to be optional so we are certainly going to fix this issue
here.

This issue discussed in previous patch reviews. We have already
agreed to fix this in next revision.

Regards,
Anup

>
>
> Alex
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 18f1097f1d8d..4388bace6d70 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -53,6 +53,12 @@  struct kvm_arch {
 	phys_addr_t pgd_phys;
 };
 
+struct kvm_mmio_decode {
+	unsigned long insn;
+	int len;
+	int shift;
+};
+
 struct kvm_cpu_context {
 	unsigned long zero;
 	unsigned long ra;
@@ -141,6 +147,9 @@  struct kvm_vcpu_arch {
 	unsigned long irqs_pending;
 	unsigned long irqs_pending_mask;
 
+	/* MMIO instruction details */
+	struct kvm_mmio_decode mmio_decode;
+
 	/* VCPU power-off state */
 	bool power_off;
 
@@ -160,6 +169,8 @@  static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 int kvm_riscv_setup_vsip(void);
 void kvm_riscv_cleanup_vsip(void);
 
+int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
+			 bool is_write);
 void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu);
 int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm);
 void kvm_riscv_stage2_free_pgd(struct kvm *kvm);
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 04dd089b86ff..2b965f9aac07 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -61,6 +61,13 @@  int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	return 0;
 }
 
+int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
+			 bool is_write)
+{
+	/* TODO: */
+	return 0;
+}
+
 void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu)
 {
 	/* TODO: */
diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
index e4d7c8f0807a..efc06198c259 100644
--- a/arch/riscv/kvm/vcpu_exit.c
+++ b/arch/riscv/kvm/vcpu_exit.c
@@ -6,9 +6,371 @@ 
  *     Anup Patel <anup.patel@wdc.com>
  */
 
+#include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
+#include <asm/csr.h>
+
+#define INSN_MATCH_LB		0x3
+#define INSN_MASK_LB		0x707f
+#define INSN_MATCH_LH		0x1003
+#define INSN_MASK_LH		0x707f
+#define INSN_MATCH_LW		0x2003
+#define INSN_MASK_LW		0x707f
+#define INSN_MATCH_LD		0x3003
+#define INSN_MASK_LD		0x707f
+#define INSN_MATCH_LBU		0x4003
+#define INSN_MASK_LBU		0x707f
+#define INSN_MATCH_LHU		0x5003
+#define INSN_MASK_LHU		0x707f
+#define INSN_MATCH_LWU		0x6003
+#define INSN_MASK_LWU		0x707f
+#define INSN_MATCH_SB		0x23
+#define INSN_MASK_SB		0x707f
+#define INSN_MATCH_SH		0x1023
+#define INSN_MASK_SH		0x707f
+#define INSN_MATCH_SW		0x2023
+#define INSN_MASK_SW		0x707f
+#define INSN_MATCH_SD		0x3023
+#define INSN_MASK_SD		0x707f
+
+#define INSN_MATCH_C_LD		0x6000
+#define INSN_MASK_C_LD		0xe003
+#define INSN_MATCH_C_SD		0xe000
+#define INSN_MASK_C_SD		0xe003
+#define INSN_MATCH_C_LW		0x4000
+#define INSN_MASK_C_LW		0xe003
+#define INSN_MATCH_C_SW		0xc000
+#define INSN_MASK_C_SW		0xe003
+#define INSN_MATCH_C_LDSP	0x6002
+#define INSN_MASK_C_LDSP	0xe003
+#define INSN_MATCH_C_SDSP	0xe002
+#define INSN_MASK_C_SDSP	0xe003
+#define INSN_MATCH_C_LWSP	0x4002
+#define INSN_MASK_C_LWSP	0xe003
+#define INSN_MATCH_C_SWSP	0xc002
+#define INSN_MASK_C_SWSP	0xe003
+
+#define INSN_LEN(insn)		((((insn) & 0x3) < 0x3) ? 2 : 4)
+
+#ifdef CONFIG_64BIT
+#define LOG_REGBYTES		3
+#else
+#define LOG_REGBYTES		2
+#endif
+#define REGBYTES		(1 << LOG_REGBYTES)
+
+#define SH_RD			7
+#define SH_RS1			15
+#define SH_RS2			20
+#define SH_RS2C			2
+
+#define RV_X(x, s, n)		(((x) >> (s)) & ((1 << (n)) - 1))
+#define RVC_LW_IMM(x)		((RV_X(x, 6, 1) << 2) | \
+				 (RV_X(x, 10, 3) << 3) | \
+				 (RV_X(x, 5, 1) << 6))
+#define RVC_LD_IMM(x)		((RV_X(x, 10, 3) << 3) | \
+				 (RV_X(x, 5, 2) << 6))
+#define RVC_LWSP_IMM(x)		((RV_X(x, 4, 3) << 2) | \
+				 (RV_X(x, 12, 1) << 5) | \
+				 (RV_X(x, 2, 2) << 6))
+#define RVC_LDSP_IMM(x)		((RV_X(x, 5, 2) << 3) | \
+				 (RV_X(x, 12, 1) << 5) | \
+				 (RV_X(x, 2, 3) << 6))
+#define RVC_SWSP_IMM(x)		((RV_X(x, 9, 4) << 2) | \
+				 (RV_X(x, 7, 2) << 6))
+#define RVC_SDSP_IMM(x)		((RV_X(x, 10, 3) << 3) | \
+				 (RV_X(x, 7, 3) << 6))
+#define RVC_RS1S(insn)		(8 + RV_X(insn, SH_RD, 3))
+#define RVC_RS2S(insn)		(8 + RV_X(insn, SH_RS2C, 3))
+#define RVC_RS2(insn)		RV_X(insn, SH_RS2C, 5)
+
+#define SHIFT_RIGHT(x, y)		\
+	((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
+
+#define REG_MASK			\
+	((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
+
+#define REG_OFFSET(insn, pos)		\
+	(SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
+
+#define REG_PTR(insn, pos, regs)	\
+	(ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
+
+#define GET_RM(insn)		(((insn) >> 12) & 7)
+
+#define GET_RS1(insn, regs)	(*REG_PTR(insn, SH_RS1, regs))
+#define GET_RS2(insn, regs)	(*REG_PTR(insn, SH_RS2, regs))
+#define GET_RS1S(insn, regs)	(*REG_PTR(RVC_RS1S(insn), 0, regs))
+#define GET_RS2S(insn, regs)	(*REG_PTR(RVC_RS2S(insn), 0, regs))
+#define GET_RS2C(insn, regs)	(*REG_PTR(insn, SH_RS2C, regs))
+#define GET_SP(regs)		(*REG_PTR(2, 0, regs))
+#define SET_RD(insn, regs, val)	(*REG_PTR(insn, SH_RD, regs) = (val))
+#define IMM_I(insn)		((s32)(insn) >> 20)
+#define IMM_S(insn)		(((s32)(insn) >> 25 << 5) | \
+				 (s32)(((insn) >> 7) & 0x1f))
+#define MASK_FUNCT3		0x7000
+
+#define STR(x)			XSTR(x)
+#define XSTR(x)			#x
+
+/* TODO: Handle traps due to unpriv load and redirect it back to VS-mode */
+static ulong get_insn(struct kvm_vcpu *vcpu)
+{
+	ulong __sepc = vcpu->arch.guest_context.sepc;
+	ulong __hstatus, __sstatus, __vsstatus;
+#ifdef CONFIG_RISCV_ISA_C
+	ulong rvc_mask = 3, tmp;
+#endif
+	ulong flags, val;
+
+	local_irq_save(flags);
+
+	__vsstatus = csr_read(CSR_VSSTATUS);
+	__sstatus = csr_read(CSR_SSTATUS);
+	__hstatus = csr_read(CSR_HSTATUS);
+
+	csr_write(CSR_VSSTATUS, __vsstatus | SR_MXR);
+	csr_write(CSR_SSTATUS, vcpu->arch.guest_context.sstatus | SR_MXR);
+	csr_write(CSR_HSTATUS, vcpu->arch.guest_context.hstatus | HSTATUS_SPRV);
+
+#ifndef CONFIG_RISCV_ISA_C
+	asm ("\n"
+#ifdef CONFIG_64BIT
+		STR(LWU) " %[insn], (%[addr])\n"
+#else
+		STR(LW) " %[insn], (%[addr])\n"
+#endif
+		: [insn] "=&r" (val) : [addr] "r" (__sepc));
+#else
+	asm ("and %[tmp], %[addr], 2\n"
+		"bnez %[tmp], 1f\n"
+#ifdef CONFIG_64BIT
+		STR(LWU) " %[insn], (%[addr])\n"
+#else
+		STR(LW) " %[insn], (%[addr])\n"
+#endif
+		"and %[tmp], %[insn], %[rvc_mask]\n"
+		"beq %[tmp], %[rvc_mask], 2f\n"
+		"sll %[insn], %[insn], %[xlen_minus_16]\n"
+		"srl %[insn], %[insn], %[xlen_minus_16]\n"
+		"j 2f\n"
+		"1:\n"
+		"lhu %[insn], (%[addr])\n"
+		"and %[tmp], %[insn], %[rvc_mask]\n"
+		"bne %[tmp], %[rvc_mask], 2f\n"
+		"lhu %[tmp], 2(%[addr])\n"
+		"sll %[tmp], %[tmp], 16\n"
+		"add %[insn], %[insn], %[tmp]\n"
+		"2:"
+	: [vsstatus] "+&r" (__vsstatus), [insn] "=&r" (val),
+	  [tmp] "=&r" (tmp)
+	: [addr] "r" (__sepc), [rvc_mask] "r" (rvc_mask),
+	  [xlen_minus_16] "i" (__riscv_xlen - 16));
+#endif
+
+	csr_write(CSR_HSTATUS, __hstatus);
+	csr_write(CSR_SSTATUS, __sstatus);
+	csr_write(CSR_VSSTATUS, __vsstatus);
+
+	local_irq_restore(flags);
+
+	return val;
+}
+
+static int emulate_load(struct kvm_vcpu *vcpu, struct kvm_run *run,
+			unsigned long fault_addr)
+{
+	int shift = 0, len = 0;
+	ulong insn = get_insn(vcpu);
+
+	/* Decode length of MMIO and shift */
+	if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
+		len = 4;
+		shift = 8 * (sizeof(ulong) - len);
+	} else if ((insn & INSN_MASK_LB) == INSN_MATCH_LB) {
+		len = 1;
+		shift = 8 * (sizeof(ulong) - len);
+	} else if ((insn & INSN_MASK_LBU) == INSN_MATCH_LBU) {
+		len = 1;
+		shift = 8 * (sizeof(ulong) - len);
+#ifdef CONFIG_64BIT
+	} else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) {
+		len = 8;
+		shift = 8 * (sizeof(ulong) - len);
+	} else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) {
+		len = 4;
+#endif
+	} else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) {
+		len = 2;
+		shift = 8 * (sizeof(ulong) - len);
+	} else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) {
+		len = 2;
+#ifdef CONFIG_RISCV_ISA_C
+#ifdef CONFIG_64BIT
+	} else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) {
+		len = 8;
+		shift = 8 * (sizeof(ulong) - len);
+		insn = RVC_RS2S(insn) << SH_RD;
+	} else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP &&
+		   ((insn >> SH_RD) & 0x1f)) {
+		len = 8;
+		shift = 8 * (sizeof(ulong) - len);
+#endif
+	} else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) {
+		len = 4;
+		shift = 8 * (sizeof(ulong) - len);
+		insn = RVC_RS2S(insn) << SH_RD;
+	} else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP &&
+		   ((insn >> SH_RD) & 0x1f)) {
+		len = 4;
+		shift = 8 * (sizeof(ulong) - len);
+#endif
+	} else {
+		return -ENOTSUPP;
+	}
+
+	/* Fault address should be aligned to length of MMIO */
+	if (fault_addr & (len - 1))
+		return -EIO;
+
+	/* Save instruction decode info */
+	vcpu->arch.mmio_decode.insn = insn;
+	vcpu->arch.mmio_decode.shift = shift;
+	vcpu->arch.mmio_decode.len = len;
+
+	/* Exit to userspace for MMIO emulation */
+	vcpu->stat.mmio_exit_user++;
+	run->exit_reason = KVM_EXIT_MMIO;
+	run->mmio.is_write = false;
+	run->mmio.phys_addr = fault_addr;
+	run->mmio.len = len;
+
+	/* Move to next instruction */
+	vcpu->arch.guest_context.sepc += INSN_LEN(insn);
+
+	return 0;
+}
+
+static int emulate_store(struct kvm_vcpu *vcpu, struct kvm_run *run,
+			 unsigned long fault_addr)
+{
+	u8 data8;
+	u16 data16;
+	u32 data32;
+	u64 data64;
+	ulong data;
+	int len = 0;
+	ulong insn = get_insn(vcpu);
+
+	data = GET_RS2(insn, &vcpu->arch.guest_context);
+	data8 = data16 = data32 = data64 = data;
+
+	if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) {
+		len = 4;
+	} else if ((insn & INSN_MASK_SB) == INSN_MATCH_SB) {
+		len = 1;
+#ifdef CONFIG_64BIT
+	} else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) {
+		len = 8;
+#endif
+	} else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) {
+		len = 2;
+#ifdef CONFIG_RISCV_ISA_C
+#ifdef CONFIG_64BIT
+	} else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) {
+		len = 8;
+		data64 = GET_RS2S(insn, &vcpu->arch.guest_context);
+	} else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP &&
+		   ((insn >> SH_RD) & 0x1f)) {
+		len = 8;
+		data64 = GET_RS2C(insn, &vcpu->arch.guest_context);
+#endif
+	} else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) {
+		len = 4;
+		data32 = GET_RS2S(insn, &vcpu->arch.guest_context);
+	} else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP &&
+		   ((insn >> SH_RD) & 0x1f)) {
+		len = 4;
+		data32 = GET_RS2C(insn, &vcpu->arch.guest_context);
+#endif
+	} else {
+		return -ENOTSUPP;
+	}
+
+	/* Fault address should be aligned to length of MMIO */
+	if (fault_addr & (len - 1))
+		return -EIO;
+
+	/* Clear instruction decode info */
+	vcpu->arch.mmio_decode.insn = 0;
+	vcpu->arch.mmio_decode.shift = 0;
+	vcpu->arch.mmio_decode.len = 0;
+
+	/* Copy data to kvm_run instance */
+	switch (len) {
+	case 1:
+		*((u8 *)run->mmio.data) = data8;
+		break;
+	case 2:
+		*((u16 *)run->mmio.data) = data16;
+		break;
+	case 4:
+		*((u32 *)run->mmio.data) = data32;
+		break;
+	case 8:
+		*((u64 *)run->mmio.data) = data64;
+		break;
+	default:
+		return -ENOTSUPP;
+	};
+
+	/* Exit to userspace for MMIO emulation */
+	vcpu->stat.mmio_exit_user++;
+	run->exit_reason = KVM_EXIT_MMIO;
+	run->mmio.is_write = true;
+	run->mmio.phys_addr = fault_addr;
+	run->mmio.len = len;
+
+	/* Move to next instruction */
+	vcpu->arch.guest_context.sepc += INSN_LEN(insn);
+
+	return 0;
+}
+
+static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
+			     unsigned long scause, unsigned long stval)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long hva;
+	bool writable;
+	gfn_t gfn;
+	int ret;
+
+	gfn = stval >> PAGE_SHIFT;
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
+
+	if (kvm_is_error_hva(hva) ||
+	    (scause == EXC_STORE_PAGE_FAULT && !writable)) {
+		switch (scause) {
+		case EXC_LOAD_PAGE_FAULT:
+			return emulate_load(vcpu, run, stval);
+		case EXC_STORE_PAGE_FAULT:
+			return emulate_store(vcpu, run, stval);
+		default:
+			return -ENOTSUPP;
+		};
+	}
+
+	ret = kvm_riscv_stage2_map(vcpu, stval, hva,
+			(scause == EXC_STORE_PAGE_FAULT) ? true : false);
+	if (ret < 0)
+		return ret;
+
+	return 1;
+}
 
 /**
  * kvm_riscv_vcpu_mmio_return -- Handle MMIO loads after user space emulation
@@ -19,7 +381,44 @@ 
  */
 int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-	/* TODO: */
+	u8 data8;
+	u16 data16;
+	u32 data32;
+	u64 data64;
+	ulong insn;
+	int len, shift;
+
+	if (run->mmio.is_write)
+		return 0;
+
+	insn = vcpu->arch.mmio_decode.insn;
+	len = vcpu->arch.mmio_decode.len;
+	shift = vcpu->arch.mmio_decode.shift;
+	switch (len) {
+	case 1:
+		data8 = *((u8 *)run->mmio.data);
+		SET_RD(insn, &vcpu->arch.guest_context,
+			(ulong)data8 << shift >> shift);
+		break;
+	case 2:
+		data16 = *((u16 *)run->mmio.data);
+		SET_RD(insn, &vcpu->arch.guest_context,
+			(ulong)data16 << shift >> shift);
+		break;
+	case 4:
+		data32 = *((u32 *)run->mmio.data);
+		SET_RD(insn, &vcpu->arch.guest_context,
+			(ulong)data32 << shift >> shift);
+		break;
+	case 8:
+		data64 = *((u64 *)run->mmio.data);
+		SET_RD(insn, &vcpu->arch.guest_context,
+			(ulong)data64 << shift >> shift);
+		break;
+	default:
+		return -ENOTSUPP;
+	};
+
 	return 0;
 }
 
@@ -30,6 +429,37 @@  int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
 int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 			unsigned long scause, unsigned long stval)
 {
-	/* TODO: */
-	return 0;
+	int ret;
+
+	/* If we got host interrupt then do nothing */
+	if (scause & SCAUSE_IRQ_FLAG)
+		return 1;
+
+	/* Handle guest traps */
+	ret = -EFAULT;
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	switch (scause) {
+	case EXC_INST_PAGE_FAULT:
+	case EXC_LOAD_PAGE_FAULT:
+	case EXC_STORE_PAGE_FAULT:
+		if ((vcpu->arch.guest_context.hstatus & HSTATUS_SPV) &&
+		    (vcpu->arch.guest_context.hstatus & HSTATUS_STL))
+			ret = stage2_page_fault(vcpu, run, scause, stval);
+		break;
+	default:
+		break;
+	};
+
+	/* Print details in-case of error */
+	if (ret < 0) {
+		kvm_err("VCPU exit error %d\n", ret);
+		kvm_err("SEPC=0x%lx SSTATUS=0x%lx HSTATUS=0x%lx\n",
+			vcpu->arch.guest_context.sepc,
+			vcpu->arch.guest_context.sstatus,
+			vcpu->arch.guest_context.hstatus);
+		kvm_err("SCAUSE=0x%lx STVAL=0x%lx\n",
+			scause, stval);
+	}
+
+	return ret;
 }