diff mbox series

[v5,08/20] RISC-V: KVM: Implement KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls

Message ID 20190822084131.114764-9-anup.patel@wdc.com (mailing list archive)
State New, archived
Headers show
Series KVM RISC-V Support | expand

Commit Message

Anup Patel Aug. 22, 2019, 8:44 a.m. UTC
For KVM RISC-V, we use KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls to access
VCPU config and registers from user-space.

We have three types of VCPU registers:
1. CONFIG - these are VCPU config and capabilities
2. CORE   - these are VCPU general purpose registers
3. CSR    - these are VCPU control and status registers

The CONFIG registers available to user-space are ISA and TIMEBASE. Out
of these, TIMEBASE is a read-only register which inform user-space about
VCPU timer base frequency. The ISA register is a read and write register
where user-space can only write the desired VCPU ISA capabilities before
running the VCPU.

The CORE registers available to user-space are PC, RA, SP, GP, TP, A0-A7,
T0-T6, S0-S11 and MODE. Most of these are RISC-V general registers except
PC and MODE. The PC register represents program counter whereas the MODE
register represent VCPU privilege mode (i.e. S/U-mode).

The CSRs available to user-space are SSTATUS, SIE, STVEC, SSCRATCH, SEPC,
SCAUSE, STVAL, SIP, and SATP. All of these are read/write registers.

In future, more VCPU register types will be added (such as FP) for the
KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.

Signed-off-by: Anup Patel <anup.patel@wdc.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/riscv/include/uapi/asm/kvm.h |  40 ++++-
 arch/riscv/kvm/vcpu.c             | 235 +++++++++++++++++++++++++++++-
 2 files changed, 272 insertions(+), 3 deletions(-)

Comments

Alexander Graf Aug. 22, 2019, 12:01 p.m. UTC | #1
On 22.08.19 10:44, Anup Patel wrote:
> For KVM RISC-V, we use KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls to access
> VCPU config and registers from user-space.
> 
> We have three types of VCPU registers:
> 1. CONFIG - these are VCPU config and capabilities
> 2. CORE   - these are VCPU general purpose registers
> 3. CSR    - these are VCPU control and status registers
> 
> The CONFIG registers available to user-space are ISA and TIMEBASE. Out
> of these, TIMEBASE is a read-only register which inform user-space about
> VCPU timer base frequency. The ISA register is a read and write register
> where user-space can only write the desired VCPU ISA capabilities before
> running the VCPU.
> 
> The CORE registers available to user-space are PC, RA, SP, GP, TP, A0-A7,
> T0-T6, S0-S11 and MODE. Most of these are RISC-V general registers except
> PC and MODE. The PC register represents program counter whereas the MODE
> register represent VCPU privilege mode (i.e. S/U-mode).
> 
> The CSRs available to user-space are SSTATUS, SIE, STVEC, SSCRATCH, SEPC,
> SCAUSE, STVAL, SIP, and SATP. All of these are read/write registers.
> 
> In future, more VCPU register types will be added (such as FP) for the
> KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.
> 
> Signed-off-by: Anup Patel <anup.patel@wdc.com>
> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   arch/riscv/include/uapi/asm/kvm.h |  40 ++++-
>   arch/riscv/kvm/vcpu.c             | 235 +++++++++++++++++++++++++++++-
>   2 files changed, 272 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
> index 6dbc056d58ba..024f220eb17e 100644
> --- a/arch/riscv/include/uapi/asm/kvm.h
> +++ b/arch/riscv/include/uapi/asm/kvm.h
> @@ -23,8 +23,15 @@
>   
>   /* for KVM_GET_REGS and KVM_SET_REGS */
>   struct kvm_regs {
> +	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
> +	struct user_regs_struct regs;
> +	unsigned long mode;

Is there any particular reason you're reusing kvm_regs and don't invent 
your own struct? kvm_regs is explicitly meant for the get_regs and 
set_regs ioctls.

>   };
>   
> +/* Possible privilege modes for kvm_regs */
> +#define KVM_RISCV_MODE_S	1
> +#define KVM_RISCV_MODE_U	0
> +
>   /* for KVM_GET_FPU and KVM_SET_FPU */
>   struct kvm_fpu {
>   };
> @@ -41,10 +48,41 @@ struct kvm_guest_debug_arch {
>   struct kvm_sync_regs {
>   };
>   
> -/* dummy definition */
> +/* for KVM_GET_SREGS and KVM_SET_SREGS */
>   struct kvm_sregs {
> +	unsigned long sstatus;
> +	unsigned long sie;
> +	unsigned long stvec;
> +	unsigned long sscratch;
> +	unsigned long sepc;
> +	unsigned long scause;
> +	unsigned long stval;
> +	unsigned long sip;
> +	unsigned long satp;

Same comment here.

>   };
>   
> +#define KVM_REG_SIZE(id)		\
> +	(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
> +
> +/* If you need to interpret the index values, here is the key: */
> +#define KVM_REG_RISCV_TYPE_MASK		0x00000000FF000000
> +#define KVM_REG_RISCV_TYPE_SHIFT	24
> +
> +/* Config registers are mapped as type 1 */
> +#define KVM_REG_RISCV_CONFIG		(0x01 << KVM_REG_RISCV_TYPE_SHIFT)
> +#define KVM_REG_RISCV_CONFIG_ISA	0x0
> +#define KVM_REG_RISCV_CONFIG_TIMEBASE	0x1
> +
> +/* Core registers are mapped as type 2 */
> +#define KVM_REG_RISCV_CORE		(0x02 << KVM_REG_RISCV_TYPE_SHIFT)
> +#define KVM_REG_RISCV_CORE_REG(name)	\
> +		(offsetof(struct kvm_regs, name) / sizeof(unsigned long))

I see, you're trying to implicitly use the struct offsets as index.

I'm not a really big fan of it, but I can't pinpoint exactly why just 
yet. It just seems too magical (read: potentially breaking down the 
road) for me.

> +
> +/* Control and status registers are mapped as type 3 */
> +#define KVM_REG_RISCV_CSR		(0x03 << KVM_REG_RISCV_TYPE_SHIFT)
> +#define KVM_REG_RISCV_CSR_REG(name)	\
> +		(offsetof(struct kvm_sregs, name) / sizeof(unsigned long))
> +
>   #endif
>   
>   #endif /* __LINUX_KVM_RISCV_H */
> diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
> index 7f59e85c6af8..9396a83c0611 100644
> --- a/arch/riscv/kvm/vcpu.c
> +++ b/arch/riscv/kvm/vcpu.c
> @@ -164,6 +164,215 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
>   	return VM_FAULT_SIGBUS;
>   }
>   
> +static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu,
> +					 const struct kvm_one_reg *reg)
> +{
> +	unsigned long __user *uaddr =
> +			(unsigned long __user *)(unsigned long)reg->addr;
> +	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> +					    KVM_REG_SIZE_MASK |
> +					    KVM_REG_RISCV_CONFIG);
> +	unsigned long reg_val;
> +
> +	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> +		return -EINVAL;
> +
> +	switch (reg_num) {
> +	case KVM_REG_RISCV_CONFIG_ISA:
> +		reg_val = vcpu->arch.isa;
> +		break;
> +	case KVM_REG_RISCV_CONFIG_TIMEBASE:
> +		reg_val = riscv_timebase;

What does this reflect? The current guest time hopefully not? An offset? 
Related to what?

All ONE_REG registers should be documented in 
Documentation/virtual/kvm/api.txt. Please add them there.

> +		break;
> +	default:
> +		return -EINVAL;
> +	};
> +
> +	if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu,
> +					 const struct kvm_one_reg *reg)
> +{
> +	unsigned long __user *uaddr =
> +			(unsigned long __user *)(unsigned long)reg->addr;
> +	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> +					    KVM_REG_SIZE_MASK |
> +					    KVM_REG_RISCV_CONFIG);
> +	unsigned long reg_val;
> +
> +	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> +		return -EINVAL;
> +
> +	if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
> +		return -EFAULT;
> +
> +	switch (reg_num) {
> +	case KVM_REG_RISCV_CONFIG_ISA:
> +		if (!vcpu->arch.ran_atleast_once) {
> +			vcpu->arch.isa = reg_val;
> +			vcpu->arch.isa &= riscv_isa_extension_base(NULL);
> +			vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED;

This register definitely needs proper documentation too ;). You may want 
to reconsider to put a few of the helper bits from patch 02/20 into 
uapi, so that user space can directly use them.

> +		} else {
> +			return -ENOTSUPP;
> +		}
> +		break;
> +	case KVM_REG_RISCV_CONFIG_TIMEBASE:
> +		return -ENOTSUPP;
> +	default:
> +		return -EINVAL;
> +	};
> +
> +	return 0;
> +}
> +
> +static int kvm_riscv_vcpu_get_reg_core(struct kvm_vcpu *vcpu,
> +				       const struct kvm_one_reg *reg)
> +{
> +	struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
> +	unsigned long __user *uaddr =
> +			(unsigned long __user *)(unsigned long)reg->addr;
> +	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> +					    KVM_REG_SIZE_MASK |
> +					    KVM_REG_RISCV_CORE);
> +	unsigned long reg_val;
> +
> +	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> +		return -EINVAL;
> +
> +	if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
> +		reg_val = cntx->sepc;
> +	else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
> +		 reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
> +		reg_val = ((unsigned long *)cntx)[reg_num];
> +	else if (reg_num == KVM_REG_RISCV_CORE_REG(mode))
> +		reg_val = (cntx->sstatus & SR_SPP) ?
> +				KVM_RISCV_MODE_S : KVM_RISCV_MODE_U;
> +	else
> +		return -EINVAL;
> +
> +	if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int kvm_riscv_vcpu_set_reg_core(struct kvm_vcpu *vcpu,
> +				       const struct kvm_one_reg *reg)
> +{
> +	struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
> +	unsigned long __user *uaddr =
> +			(unsigned long __user *)(unsigned long)reg->addr;
> +	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> +					    KVM_REG_SIZE_MASK |
> +					    KVM_REG_RISCV_CORE);
> +	unsigned long reg_val;
> +
> +	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> +		return -EINVAL;
> +
> +	if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
> +		return -EFAULT;
> +
> +	if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
> +		cntx->sepc = reg_val;
> +	else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
> +		 reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
> +		((unsigned long *)cntx)[reg_num] = reg_val;
> +	else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) {
> +		if (reg_val == KVM_RISCV_MODE_S)
> +			cntx->sstatus |= SR_SPP;
> +		else
> +			cntx->sstatus &= ~SR_SPP;
> +	} else
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu,
> +				      const struct kvm_one_reg *reg)
> +{
> +	struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
> +	unsigned long __user *uaddr =
> +			(unsigned long __user *)(unsigned long)reg->addr;
> +	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> +					    KVM_REG_SIZE_MASK |
> +					    KVM_REG_RISCV_CSR);
> +	unsigned long reg_val;
> +
> +	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> +		return -EINVAL;
> +	if (reg_num >= sizeof(struct kvm_sregs) / sizeof(unsigned long))
> +		return -EINVAL;
> +
> +	if (reg_num == KVM_REG_RISCV_CSR_REG(sip))
> +		kvm_riscv_vcpu_flush_interrupts(vcpu);
> +
> +	reg_val = ((unsigned long *)csr)[reg_num];
> +
> +	if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu,
> +				      const struct kvm_one_reg *reg)
> +{
> +	struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
> +	unsigned long __user *uaddr =
> +			(unsigned long __user *)(unsigned long)reg->addr;
> +	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> +					    KVM_REG_SIZE_MASK |
> +					    KVM_REG_RISCV_CSR);
> +	unsigned long reg_val;
> +
> +	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> +		return -EINVAL;
> +	if (reg_num >= sizeof(struct kvm_sregs) / sizeof(unsigned long))
> +		return -EINVAL;
> +
> +	if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
> +		return -EFAULT;
> +
> +	((unsigned long *)csr)[reg_num] = reg_val;
> +
> +	if (reg_num == KVM_REG_RISCV_CSR_REG(sip))
> +		WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0);

Why does writing SIP clear all pending interrupts?


Alex
Anup Patel Aug. 22, 2019, 2 p.m. UTC | #2
On Thu, Aug 22, 2019 at 5:31 PM Alexander Graf <graf@amazon.com> wrote:
>
> On 22.08.19 10:44, Anup Patel wrote:
> > For KVM RISC-V, we use KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls to access
> > VCPU config and registers from user-space.
> >
> > We have three types of VCPU registers:
> > 1. CONFIG - these are VCPU config and capabilities
> > 2. CORE   - these are VCPU general purpose registers
> > 3. CSR    - these are VCPU control and status registers
> >
> > The CONFIG registers available to user-space are ISA and TIMEBASE. Out
> > of these, TIMEBASE is a read-only register which inform user-space about
> > VCPU timer base frequency. The ISA register is a read and write register
> > where user-space can only write the desired VCPU ISA capabilities before
> > running the VCPU.
> >
> > The CORE registers available to user-space are PC, RA, SP, GP, TP, A0-A7,
> > T0-T6, S0-S11 and MODE. Most of these are RISC-V general registers except
> > PC and MODE. The PC register represents program counter whereas the MODE
> > register represent VCPU privilege mode (i.e. S/U-mode).
> >
> > The CSRs available to user-space are SSTATUS, SIE, STVEC, SSCRATCH, SEPC,
> > SCAUSE, STVAL, SIP, and SATP. All of these are read/write registers.
> >
> > In future, more VCPU register types will be added (such as FP) for the
> > KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.
> >
> > Signed-off-by: Anup Patel <anup.patel@wdc.com>
> > Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >   arch/riscv/include/uapi/asm/kvm.h |  40 ++++-
> >   arch/riscv/kvm/vcpu.c             | 235 +++++++++++++++++++++++++++++-
> >   2 files changed, 272 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
> > index 6dbc056d58ba..024f220eb17e 100644
> > --- a/arch/riscv/include/uapi/asm/kvm.h
> > +++ b/arch/riscv/include/uapi/asm/kvm.h
> > @@ -23,8 +23,15 @@
> >
> >   /* for KVM_GET_REGS and KVM_SET_REGS */
> >   struct kvm_regs {
> > +     /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
> > +     struct user_regs_struct regs;
> > +     unsigned long mode;
>
> Is there any particular reason you're reusing kvm_regs and don't invent
> your own struct? kvm_regs is explicitly meant for the get_regs and
> set_regs ioctls.

We are implementing only ONE_REG interface so most of these
structs are unused hence we tried to reuse these struct instead
of introducing new structs. (Similar to KVM ARM64)

>
> >   };
> >
> > +/* Possible privilege modes for kvm_regs */
> > +#define KVM_RISCV_MODE_S     1
> > +#define KVM_RISCV_MODE_U     0
> > +
> >   /* for KVM_GET_FPU and KVM_SET_FPU */
> >   struct kvm_fpu {
> >   };
> > @@ -41,10 +48,41 @@ struct kvm_guest_debug_arch {
> >   struct kvm_sync_regs {
> >   };
> >
> > -/* dummy definition */
> > +/* for KVM_GET_SREGS and KVM_SET_SREGS */
> >   struct kvm_sregs {
> > +     unsigned long sstatus;
> > +     unsigned long sie;
> > +     unsigned long stvec;
> > +     unsigned long sscratch;
> > +     unsigned long sepc;
> > +     unsigned long scause;
> > +     unsigned long stval;
> > +     unsigned long sip;
> > +     unsigned long satp;
>
> Same comment here.

Same as above, we are trying to use unused struct.

>
> >   };
> >
> > +#define KVM_REG_SIZE(id)             \
> > +     (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
> > +
> > +/* If you need to interpret the index values, here is the key: */
> > +#define KVM_REG_RISCV_TYPE_MASK              0x00000000FF000000
> > +#define KVM_REG_RISCV_TYPE_SHIFT     24
> > +
> > +/* Config registers are mapped as type 1 */
> > +#define KVM_REG_RISCV_CONFIG         (0x01 << KVM_REG_RISCV_TYPE_SHIFT)
> > +#define KVM_REG_RISCV_CONFIG_ISA     0x0
> > +#define KVM_REG_RISCV_CONFIG_TIMEBASE        0x1
> > +
> > +/* Core registers are mapped as type 2 */
> > +#define KVM_REG_RISCV_CORE           (0x02 << KVM_REG_RISCV_TYPE_SHIFT)
> > +#define KVM_REG_RISCV_CORE_REG(name) \
> > +             (offsetof(struct kvm_regs, name) / sizeof(unsigned long))
>
> I see, you're trying to implicitly use the struct offsets as index.
>
> I'm not a really big fan of it, but I can't pinpoint exactly why just
> yet. It just seems too magical (read: potentially breaking down the
> road) for me.
>
> > +
> > +/* Control and status registers are mapped as type 3 */
> > +#define KVM_REG_RISCV_CSR            (0x03 << KVM_REG_RISCV_TYPE_SHIFT)
> > +#define KVM_REG_RISCV_CSR_REG(name)  \
> > +             (offsetof(struct kvm_sregs, name) / sizeof(unsigned long))
> > +
> >   #endif
> >
> >   #endif /* __LINUX_KVM_RISCV_H */
> > diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
> > index 7f59e85c6af8..9396a83c0611 100644
> > --- a/arch/riscv/kvm/vcpu.c
> > +++ b/arch/riscv/kvm/vcpu.c
> > @@ -164,6 +164,215 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
> >       return VM_FAULT_SIGBUS;
> >   }
> >
> > +static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu,
> > +                                      const struct kvm_one_reg *reg)
> > +{
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CONFIG);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     switch (reg_num) {
> > +     case KVM_REG_RISCV_CONFIG_ISA:
> > +             reg_val = vcpu->arch.isa;
> > +             break;
> > +     case KVM_REG_RISCV_CONFIG_TIMEBASE:
> > +             reg_val = riscv_timebase;
>
> What does this reflect? The current guest time hopefully not? An offset?
> Related to what?

riscv_timebase is the frequency in HZ of the system timer.

The name "timebase" is not appropriate but we have been
carrying it since quite some time now.

>
> All ONE_REG registers should be documented in
> Documentation/virtual/kvm/api.txt. Please add them there.

Sure, I will update in next revision.

>
> > +             break;
> > +     default:
> > +             return -EINVAL;
> > +     };
> > +
> > +     if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu,
> > +                                      const struct kvm_one_reg *reg)
> > +{
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CONFIG);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     switch (reg_num) {
> > +     case KVM_REG_RISCV_CONFIG_ISA:
> > +             if (!vcpu->arch.ran_atleast_once) {
> > +                     vcpu->arch.isa = reg_val;
> > +                     vcpu->arch.isa &= riscv_isa_extension_base(NULL);
> > +                     vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED;
>
> This register definitely needs proper documentation too ;). You may want
> to reconsider to put a few of the helper bits from patch 02/20 into
> uapi, so that user space can directly use them.

Sure, I will add details about ISA register in Documentation/virt/kvm/api.txt

Regards,
Anup


>
> > +             } else {
> > +                     return -ENOTSUPP;
> > +             }
> > +             break;
> > +     case KVM_REG_RISCV_CONFIG_TIMEBASE:
> > +             return -ENOTSUPP;
> > +     default:
> > +             return -EINVAL;
> > +     };
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_get_reg_core(struct kvm_vcpu *vcpu,
> > +                                    const struct kvm_one_reg *reg)
> > +{
> > +     struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CORE);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
> > +             reg_val = cntx->sepc;
> > +     else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
> > +              reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
> > +             reg_val = ((unsigned long *)cntx)[reg_num];
> > +     else if (reg_num == KVM_REG_RISCV_CORE_REG(mode))
> > +             reg_val = (cntx->sstatus & SR_SPP) ?
> > +                             KVM_RISCV_MODE_S : KVM_RISCV_MODE_U;
> > +     else
> > +             return -EINVAL;
> > +
> > +     if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_set_reg_core(struct kvm_vcpu *vcpu,
> > +                                    const struct kvm_one_reg *reg)
> > +{
> > +     struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CORE);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
> > +             cntx->sepc = reg_val;
> > +     else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
> > +              reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
> > +             ((unsigned long *)cntx)[reg_num] = reg_val;
> > +     else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) {
> > +             if (reg_val == KVM_RISCV_MODE_S)
> > +                     cntx->sstatus |= SR_SPP;
> > +             else
> > +                     cntx->sstatus &= ~SR_SPP;
> > +     } else
> > +             return -EINVAL;
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu,
> > +                                   const struct kvm_one_reg *reg)
> > +{
> > +     struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CSR);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +     if (reg_num >= sizeof(struct kvm_sregs) / sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (reg_num == KVM_REG_RISCV_CSR_REG(sip))
> > +             kvm_riscv_vcpu_flush_interrupts(vcpu);
> > +
> > +     reg_val = ((unsigned long *)csr)[reg_num];
> > +
> > +     if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu,
> > +                                   const struct kvm_one_reg *reg)
> > +{
> > +     struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CSR);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +     if (reg_num >= sizeof(struct kvm_sregs) / sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     ((unsigned long *)csr)[reg_num] = reg_val;
> > +
> > +     if (reg_num == KVM_REG_RISCV_CSR_REG(sip))
> > +             WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0);
>
> Why does writing SIP clear all pending interrupts?
>
>
> Alex
Anup Patel Aug. 22, 2019, 2:05 p.m. UTC | #3
On Thu, Aug 22, 2019 at 5:31 PM Alexander Graf <graf@amazon.com> wrote:
>
> On 22.08.19 10:44, Anup Patel wrote:
> > For KVM RISC-V, we use KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls to access
> > VCPU config and registers from user-space.
> >
> > We have three types of VCPU registers:
> > 1. CONFIG - these are VCPU config and capabilities
> > 2. CORE   - these are VCPU general purpose registers
> > 3. CSR    - these are VCPU control and status registers
> >
> > The CONFIG registers available to user-space are ISA and TIMEBASE. Out
> > of these, TIMEBASE is a read-only register which inform user-space about
> > VCPU timer base frequency. The ISA register is a read and write register
> > where user-space can only write the desired VCPU ISA capabilities before
> > running the VCPU.
> >
> > The CORE registers available to user-space are PC, RA, SP, GP, TP, A0-A7,
> > T0-T6, S0-S11 and MODE. Most of these are RISC-V general registers except
> > PC and MODE. The PC register represents program counter whereas the MODE
> > register represent VCPU privilege mode (i.e. S/U-mode).
> >
> > The CSRs available to user-space are SSTATUS, SIE, STVEC, SSCRATCH, SEPC,
> > SCAUSE, STVAL, SIP, and SATP. All of these are read/write registers.
> >
> > In future, more VCPU register types will be added (such as FP) for the
> > KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.
> >
> > Signed-off-by: Anup Patel <anup.patel@wdc.com>
> > Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >   arch/riscv/include/uapi/asm/kvm.h |  40 ++++-
> >   arch/riscv/kvm/vcpu.c             | 235 +++++++++++++++++++++++++++++-
> >   2 files changed, 272 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
> > index 6dbc056d58ba..024f220eb17e 100644
> > --- a/arch/riscv/include/uapi/asm/kvm.h
> > +++ b/arch/riscv/include/uapi/asm/kvm.h
> > @@ -23,8 +23,15 @@
> >
> >   /* for KVM_GET_REGS and KVM_SET_REGS */
> >   struct kvm_regs {
> > +     /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
> > +     struct user_regs_struct regs;
> > +     unsigned long mode;
>
> Is there any particular reason you're reusing kvm_regs and don't invent
> your own struct? kvm_regs is explicitly meant for the get_regs and
> set_regs ioctls.
>
> >   };
> >
> > +/* Possible privilege modes for kvm_regs */
> > +#define KVM_RISCV_MODE_S     1
> > +#define KVM_RISCV_MODE_U     0
> > +
> >   /* for KVM_GET_FPU and KVM_SET_FPU */
> >   struct kvm_fpu {
> >   };
> > @@ -41,10 +48,41 @@ struct kvm_guest_debug_arch {
> >   struct kvm_sync_regs {
> >   };
> >
> > -/* dummy definition */
> > +/* for KVM_GET_SREGS and KVM_SET_SREGS */
> >   struct kvm_sregs {
> > +     unsigned long sstatus;
> > +     unsigned long sie;
> > +     unsigned long stvec;
> > +     unsigned long sscratch;
> > +     unsigned long sepc;
> > +     unsigned long scause;
> > +     unsigned long stval;
> > +     unsigned long sip;
> > +     unsigned long satp;
>
> Same comment here.
>
> >   };
> >
> > +#define KVM_REG_SIZE(id)             \
> > +     (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
> > +
> > +/* If you need to interpret the index values, here is the key: */
> > +#define KVM_REG_RISCV_TYPE_MASK              0x00000000FF000000
> > +#define KVM_REG_RISCV_TYPE_SHIFT     24
> > +
> > +/* Config registers are mapped as type 1 */
> > +#define KVM_REG_RISCV_CONFIG         (0x01 << KVM_REG_RISCV_TYPE_SHIFT)
> > +#define KVM_REG_RISCV_CONFIG_ISA     0x0
> > +#define KVM_REG_RISCV_CONFIG_TIMEBASE        0x1
> > +
> > +/* Core registers are mapped as type 2 */
> > +#define KVM_REG_RISCV_CORE           (0x02 << KVM_REG_RISCV_TYPE_SHIFT)
> > +#define KVM_REG_RISCV_CORE_REG(name) \
> > +             (offsetof(struct kvm_regs, name) / sizeof(unsigned long))
>
> I see, you're trying to implicitly use the struct offsets as index.
>
> I'm not a really big fan of it, but I can't pinpoint exactly why just
> yet. It just seems too magical (read: potentially breaking down the
> road) for me.
>
> > +
> > +/* Control and status registers are mapped as type 3 */
> > +#define KVM_REG_RISCV_CSR            (0x03 << KVM_REG_RISCV_TYPE_SHIFT)
> > +#define KVM_REG_RISCV_CSR_REG(name)  \
> > +             (offsetof(struct kvm_sregs, name) / sizeof(unsigned long))
> > +
> >   #endif
> >
> >   #endif /* __LINUX_KVM_RISCV_H */
> > diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
> > index 7f59e85c6af8..9396a83c0611 100644
> > --- a/arch/riscv/kvm/vcpu.c
> > +++ b/arch/riscv/kvm/vcpu.c
> > @@ -164,6 +164,215 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
> >       return VM_FAULT_SIGBUS;
> >   }
> >
> > +static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu,
> > +                                      const struct kvm_one_reg *reg)
> > +{
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CONFIG);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     switch (reg_num) {
> > +     case KVM_REG_RISCV_CONFIG_ISA:
> > +             reg_val = vcpu->arch.isa;
> > +             break;
> > +     case KVM_REG_RISCV_CONFIG_TIMEBASE:
> > +             reg_val = riscv_timebase;
>
> What does this reflect? The current guest time hopefully not? An offset?
> Related to what?
>
> All ONE_REG registers should be documented in
> Documentation/virtual/kvm/api.txt. Please add them there.
>
> > +             break;
> > +     default:
> > +             return -EINVAL;
> > +     };
> > +
> > +     if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu,
> > +                                      const struct kvm_one_reg *reg)
> > +{
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CONFIG);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     switch (reg_num) {
> > +     case KVM_REG_RISCV_CONFIG_ISA:
> > +             if (!vcpu->arch.ran_atleast_once) {
> > +                     vcpu->arch.isa = reg_val;
> > +                     vcpu->arch.isa &= riscv_isa_extension_base(NULL);
> > +                     vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED;
>
> This register definitely needs proper documentation too ;). You may want
> to reconsider to put a few of the helper bits from patch 02/20 into
> uapi, so that user space can directly use them.
>
> > +             } else {
> > +                     return -ENOTSUPP;
> > +             }
> > +             break;
> > +     case KVM_REG_RISCV_CONFIG_TIMEBASE:
> > +             return -ENOTSUPP;
> > +     default:
> > +             return -EINVAL;
> > +     };
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_get_reg_core(struct kvm_vcpu *vcpu,
> > +                                    const struct kvm_one_reg *reg)
> > +{
> > +     struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CORE);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
> > +             reg_val = cntx->sepc;
> > +     else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
> > +              reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
> > +             reg_val = ((unsigned long *)cntx)[reg_num];
> > +     else if (reg_num == KVM_REG_RISCV_CORE_REG(mode))
> > +             reg_val = (cntx->sstatus & SR_SPP) ?
> > +                             KVM_RISCV_MODE_S : KVM_RISCV_MODE_U;
> > +     else
> > +             return -EINVAL;
> > +
> > +     if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_set_reg_core(struct kvm_vcpu *vcpu,
> > +                                    const struct kvm_one_reg *reg)
> > +{
> > +     struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CORE);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
> > +             cntx->sepc = reg_val;
> > +     else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
> > +              reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
> > +             ((unsigned long *)cntx)[reg_num] = reg_val;
> > +     else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) {
> > +             if (reg_val == KVM_RISCV_MODE_S)
> > +                     cntx->sstatus |= SR_SPP;
> > +             else
> > +                     cntx->sstatus &= ~SR_SPP;
> > +     } else
> > +             return -EINVAL;
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu,
> > +                                   const struct kvm_one_reg *reg)
> > +{
> > +     struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CSR);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +     if (reg_num >= sizeof(struct kvm_sregs) / sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (reg_num == KVM_REG_RISCV_CSR_REG(sip))
> > +             kvm_riscv_vcpu_flush_interrupts(vcpu);
> > +
> > +     reg_val = ((unsigned long *)csr)[reg_num];
> > +
> > +     if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     return 0;
> > +}
> > +
> > +static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu,
> > +                                   const struct kvm_one_reg *reg)
> > +{
> > +     struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
> > +     unsigned long __user *uaddr =
> > +                     (unsigned long __user *)(unsigned long)reg->addr;
> > +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> > +                                         KVM_REG_SIZE_MASK |
> > +                                         KVM_REG_RISCV_CSR);
> > +     unsigned long reg_val;
> > +
> > +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> > +             return -EINVAL;
> > +     if (reg_num >= sizeof(struct kvm_sregs) / sizeof(unsigned long))
> > +             return -EINVAL;
> > +
> > +     if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
> > +             return -EFAULT;
> > +
> > +     ((unsigned long *)csr)[reg_num] = reg_val;
> > +
> > +     if (reg_num == KVM_REG_RISCV_CSR_REG(sip))
> > +             WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0);
>
> Why does writing SIP clear all pending interrupts?

irqs_pending_mask represents bits changes in irqs_pending.

Once the SIP CSR is updated by user-space, the changes to
irqs_pending are no longer valid so we clear irqs_pending_mask.

If we don't clear irqs_pending_mask then value programmed by
user-space can get overwritten if there were interrupts after
we saved SIP CSR and before we restored it.

Regards,
Anup

>
>
> Alex
Alexander Graf Aug. 22, 2019, 2:12 p.m. UTC | #4
On 22.08.19 16:00, Anup Patel wrote:
> On Thu, Aug 22, 2019 at 5:31 PM Alexander Graf <graf@amazon.com> wrote:
>>
>> On 22.08.19 10:44, Anup Patel wrote:
>>> For KVM RISC-V, we use KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls to access
>>> VCPU config and registers from user-space.
>>>
>>> We have three types of VCPU registers:
>>> 1. CONFIG - these are VCPU config and capabilities
>>> 2. CORE   - these are VCPU general purpose registers
>>> 3. CSR    - these are VCPU control and status registers
>>>
>>> The CONFIG registers available to user-space are ISA and TIMEBASE. Out
>>> of these, TIMEBASE is a read-only register which inform user-space about
>>> VCPU timer base frequency. The ISA register is a read and write register
>>> where user-space can only write the desired VCPU ISA capabilities before
>>> running the VCPU.
>>>
>>> The CORE registers available to user-space are PC, RA, SP, GP, TP, A0-A7,
>>> T0-T6, S0-S11 and MODE. Most of these are RISC-V general registers except
>>> PC and MODE. The PC register represents program counter whereas the MODE
>>> register represent VCPU privilege mode (i.e. S/U-mode).
>>>
>>> The CSRs available to user-space are SSTATUS, SIE, STVEC, SSCRATCH, SEPC,
>>> SCAUSE, STVAL, SIP, and SATP. All of these are read/write registers.
>>>
>>> In future, more VCPU register types will be added (such as FP) for the
>>> KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.
>>>
>>> Signed-off-by: Anup Patel <anup.patel@wdc.com>
>>> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
>>> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
>>> ---
>>>    arch/riscv/include/uapi/asm/kvm.h |  40 ++++-
>>>    arch/riscv/kvm/vcpu.c             | 235 +++++++++++++++++++++++++++++-
>>>    2 files changed, 272 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
>>> index 6dbc056d58ba..024f220eb17e 100644
>>> --- a/arch/riscv/include/uapi/asm/kvm.h
>>> +++ b/arch/riscv/include/uapi/asm/kvm.h
>>> @@ -23,8 +23,15 @@
>>>
>>>    /* for KVM_GET_REGS and KVM_SET_REGS */
>>>    struct kvm_regs {
>>> +     /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
>>> +     struct user_regs_struct regs;
>>> +     unsigned long mode;
>>
>> Is there any particular reason you're reusing kvm_regs and don't invent
>> your own struct? kvm_regs is explicitly meant for the get_regs and
>> set_regs ioctls.
> 
> We are implementing only ONE_REG interface so most of these
> structs are unused hence we tried to reuse these struct instead
> of introducing new structs. (Similar to KVM ARM64)
> 
>>
>>>    };
>>>
>>> +/* Possible privilege modes for kvm_regs */
>>> +#define KVM_RISCV_MODE_S     1
>>> +#define KVM_RISCV_MODE_U     0
>>> +
>>>    /* for KVM_GET_FPU and KVM_SET_FPU */
>>>    struct kvm_fpu {
>>>    };
>>> @@ -41,10 +48,41 @@ struct kvm_guest_debug_arch {
>>>    struct kvm_sync_regs {
>>>    };
>>>
>>> -/* dummy definition */
>>> +/* for KVM_GET_SREGS and KVM_SET_SREGS */
>>>    struct kvm_sregs {
>>> +     unsigned long sstatus;
>>> +     unsigned long sie;
>>> +     unsigned long stvec;
>>> +     unsigned long sscratch;
>>> +     unsigned long sepc;
>>> +     unsigned long scause;
>>> +     unsigned long stval;
>>> +     unsigned long sip;
>>> +     unsigned long satp;
>>
>> Same comment here.
> 
> Same as above, we are trying to use unused struct.
> 
>>
>>>    };
>>>
>>> +#define KVM_REG_SIZE(id)             \
>>> +     (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
>>> +
>>> +/* If you need to interpret the index values, here is the key: */
>>> +#define KVM_REG_RISCV_TYPE_MASK              0x00000000FF000000
>>> +#define KVM_REG_RISCV_TYPE_SHIFT     24
>>> +
>>> +/* Config registers are mapped as type 1 */
>>> +#define KVM_REG_RISCV_CONFIG         (0x01 << KVM_REG_RISCV_TYPE_SHIFT)
>>> +#define KVM_REG_RISCV_CONFIG_ISA     0x0
>>> +#define KVM_REG_RISCV_CONFIG_TIMEBASE        0x1
>>> +
>>> +/* Core registers are mapped as type 2 */
>>> +#define KVM_REG_RISCV_CORE           (0x02 << KVM_REG_RISCV_TYPE_SHIFT)
>>> +#define KVM_REG_RISCV_CORE_REG(name) \
>>> +             (offsetof(struct kvm_regs, name) / sizeof(unsigned long))
>>
>> I see, you're trying to implicitly use the struct offsets as index.
>>
>> I'm not a really big fan of it, but I can't pinpoint exactly why just
>> yet. It just seems too magical (read: potentially breaking down the
>> road) for me.
>>
>>> +
>>> +/* Control and status registers are mapped as type 3 */
>>> +#define KVM_REG_RISCV_CSR            (0x03 << KVM_REG_RISCV_TYPE_SHIFT)
>>> +#define KVM_REG_RISCV_CSR_REG(name)  \
>>> +             (offsetof(struct kvm_sregs, name) / sizeof(unsigned long))
>>> +
>>>    #endif
>>>
>>>    #endif /* __LINUX_KVM_RISCV_H */
>>> diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
>>> index 7f59e85c6af8..9396a83c0611 100644
>>> --- a/arch/riscv/kvm/vcpu.c
>>> +++ b/arch/riscv/kvm/vcpu.c
>>> @@ -164,6 +164,215 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
>>>        return VM_FAULT_SIGBUS;
>>>    }
>>>
>>> +static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu,
>>> +                                      const struct kvm_one_reg *reg)
>>> +{
>>> +     unsigned long __user *uaddr =
>>> +                     (unsigned long __user *)(unsigned long)reg->addr;
>>> +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
>>> +                                         KVM_REG_SIZE_MASK |
>>> +                                         KVM_REG_RISCV_CONFIG);
>>> +     unsigned long reg_val;
>>> +
>>> +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
>>> +             return -EINVAL;
>>> +
>>> +     switch (reg_num) {
>>> +     case KVM_REG_RISCV_CONFIG_ISA:
>>> +             reg_val = vcpu->arch.isa;
>>> +             break;
>>> +     case KVM_REG_RISCV_CONFIG_TIMEBASE:
>>> +             reg_val = riscv_timebase;
>>
>> What does this reflect? The current guest time hopefully not? An offset?
>> Related to what?
> 
> riscv_timebase is the frequency in HZ of the system timer.
> 
> The name "timebase" is not appropriate but we have been
> carrying it since quite some time now.

What do you mean by "some time"? So far I only see a kernel internal 
variable named after it. That's dramatically different from something 
exposed via uapi.

Just name it tbfreq.

So if this is the frequency, where is the offset? You will need it on 
save/restore. If you're saying that's out of scope for now, that's fine 
with me too :).


Alex
Anup Patel Aug. 23, 2019, 11:20 a.m. UTC | #5
On Thu, Aug 22, 2019 at 7:42 PM Alexander Graf <graf@amazon.com> wrote:
>
>
>
> On 22.08.19 16:00, Anup Patel wrote:
> > On Thu, Aug 22, 2019 at 5:31 PM Alexander Graf <graf@amazon.com> wrote:
> >>
> >> On 22.08.19 10:44, Anup Patel wrote:
> >>> For KVM RISC-V, we use KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls to access
> >>> VCPU config and registers from user-space.
> >>>
> >>> We have three types of VCPU registers:
> >>> 1. CONFIG - these are VCPU config and capabilities
> >>> 2. CORE   - these are VCPU general purpose registers
> >>> 3. CSR    - these are VCPU control and status registers
> >>>
> >>> The CONFIG registers available to user-space are ISA and TIMEBASE. Out
> >>> of these, TIMEBASE is a read-only register which inform user-space about
> >>> VCPU timer base frequency. The ISA register is a read and write register
> >>> where user-space can only write the desired VCPU ISA capabilities before
> >>> running the VCPU.
> >>>
> >>> The CORE registers available to user-space are PC, RA, SP, GP, TP, A0-A7,
> >>> T0-T6, S0-S11 and MODE. Most of these are RISC-V general registers except
> >>> PC and MODE. The PC register represents program counter whereas the MODE
> >>> register represent VCPU privilege mode (i.e. S/U-mode).
> >>>
> >>> The CSRs available to user-space are SSTATUS, SIE, STVEC, SSCRATCH, SEPC,
> >>> SCAUSE, STVAL, SIP, and SATP. All of these are read/write registers.
> >>>
> >>> In future, more VCPU register types will be added (such as FP) for the
> >>> KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.
> >>>
> >>> Signed-off-by: Anup Patel <anup.patel@wdc.com>
> >>> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> >>> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> >>> ---
> >>>    arch/riscv/include/uapi/asm/kvm.h |  40 ++++-
> >>>    arch/riscv/kvm/vcpu.c             | 235 +++++++++++++++++++++++++++++-
> >>>    2 files changed, 272 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
> >>> index 6dbc056d58ba..024f220eb17e 100644
> >>> --- a/arch/riscv/include/uapi/asm/kvm.h
> >>> +++ b/arch/riscv/include/uapi/asm/kvm.h
> >>> @@ -23,8 +23,15 @@
> >>>
> >>>    /* for KVM_GET_REGS and KVM_SET_REGS */
> >>>    struct kvm_regs {
> >>> +     /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
> >>> +     struct user_regs_struct regs;
> >>> +     unsigned long mode;
> >>
> >> Is there any particular reason you're reusing kvm_regs and don't invent
> >> your own struct? kvm_regs is explicitly meant for the get_regs and
> >> set_regs ioctls.
> >
> > We are implementing only ONE_REG interface so most of these
> > structs are unused hence we tried to reuse these struct instead
> > of introducing new structs. (Similar to KVM ARM64)
> >
> >>
> >>>    };
> >>>
> >>> +/* Possible privilege modes for kvm_regs */
> >>> +#define KVM_RISCV_MODE_S     1
> >>> +#define KVM_RISCV_MODE_U     0
> >>> +
> >>>    /* for KVM_GET_FPU and KVM_SET_FPU */
> >>>    struct kvm_fpu {
> >>>    };
> >>> @@ -41,10 +48,41 @@ struct kvm_guest_debug_arch {
> >>>    struct kvm_sync_regs {
> >>>    };
> >>>
> >>> -/* dummy definition */
> >>> +/* for KVM_GET_SREGS and KVM_SET_SREGS */
> >>>    struct kvm_sregs {
> >>> +     unsigned long sstatus;
> >>> +     unsigned long sie;
> >>> +     unsigned long stvec;
> >>> +     unsigned long sscratch;
> >>> +     unsigned long sepc;
> >>> +     unsigned long scause;
> >>> +     unsigned long stval;
> >>> +     unsigned long sip;
> >>> +     unsigned long satp;
> >>
> >> Same comment here.
> >
> > Same as above, we are trying to use unused struct.
> >
> >>
> >>>    };
> >>>
> >>> +#define KVM_REG_SIZE(id)             \
> >>> +     (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
> >>> +
> >>> +/* If you need to interpret the index values, here is the key: */
> >>> +#define KVM_REG_RISCV_TYPE_MASK              0x00000000FF000000
> >>> +#define KVM_REG_RISCV_TYPE_SHIFT     24
> >>> +
> >>> +/* Config registers are mapped as type 1 */
> >>> +#define KVM_REG_RISCV_CONFIG         (0x01 << KVM_REG_RISCV_TYPE_SHIFT)
> >>> +#define KVM_REG_RISCV_CONFIG_ISA     0x0
> >>> +#define KVM_REG_RISCV_CONFIG_TIMEBASE        0x1
> >>> +
> >>> +/* Core registers are mapped as type 2 */
> >>> +#define KVM_REG_RISCV_CORE           (0x02 << KVM_REG_RISCV_TYPE_SHIFT)
> >>> +#define KVM_REG_RISCV_CORE_REG(name) \
> >>> +             (offsetof(struct kvm_regs, name) / sizeof(unsigned long))
> >>
> >> I see, you're trying to implicitly use the struct offsets as index.
> >>
> >> I'm not a really big fan of it, but I can't pinpoint exactly why just
> >> yet. It just seems too magical (read: potentially breaking down the
> >> road) for me.
> >>
> >>> +
> >>> +/* Control and status registers are mapped as type 3 */
> >>> +#define KVM_REG_RISCV_CSR            (0x03 << KVM_REG_RISCV_TYPE_SHIFT)
> >>> +#define KVM_REG_RISCV_CSR_REG(name)  \
> >>> +             (offsetof(struct kvm_sregs, name) / sizeof(unsigned long))
> >>> +
> >>>    #endif
> >>>
> >>>    #endif /* __LINUX_KVM_RISCV_H */
> >>> diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
> >>> index 7f59e85c6af8..9396a83c0611 100644
> >>> --- a/arch/riscv/kvm/vcpu.c
> >>> +++ b/arch/riscv/kvm/vcpu.c
> >>> @@ -164,6 +164,215 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
> >>>        return VM_FAULT_SIGBUS;
> >>>    }
> >>>
> >>> +static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu,
> >>> +                                      const struct kvm_one_reg *reg)
> >>> +{
> >>> +     unsigned long __user *uaddr =
> >>> +                     (unsigned long __user *)(unsigned long)reg->addr;
> >>> +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
> >>> +                                         KVM_REG_SIZE_MASK |
> >>> +                                         KVM_REG_RISCV_CONFIG);
> >>> +     unsigned long reg_val;
> >>> +
> >>> +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
> >>> +             return -EINVAL;
> >>> +
> >>> +     switch (reg_num) {
> >>> +     case KVM_REG_RISCV_CONFIG_ISA:
> >>> +             reg_val = vcpu->arch.isa;
> >>> +             break;
> >>> +     case KVM_REG_RISCV_CONFIG_TIMEBASE:
> >>> +             reg_val = riscv_timebase;
> >>
> >> What does this reflect? The current guest time hopefully not? An offset?
> >> Related to what?
> >
> > riscv_timebase is the frequency in HZ of the system timer.
> >
> > The name "timebase" is not appropriate but we have been
> > carrying it since quite some time now.
>
> What do you mean by "some time"? So far I only see a kernel internal
> variable named after it. That's dramatically different from something
> exposed via uapi.
>
> Just name it tbfreq.

Sure, I will use TBFREQ name.

>
> So if this is the frequency, where is the offset? You will need it on
> save/restore. If you're saying that's out of scope for now, that's fine
> with me too :).

tbfreq is read-only and fixed.

The Guest tbfreq has to be same as Host tbfreq. This means we
can only migrate Guest from Host A to Host B only if:
1. They have matching ISA capabilities
2. They have matching tbfreq

Regards,
Anup

>
>
> Alex
Alexander Graf Aug. 23, 2019, 11:42 a.m. UTC | #6
> Am 23.08.2019 um 13:21 schrieb Anup Patel <anup@brainfault.org>:
> 
>> On Thu, Aug 22, 2019 at 7:42 PM Alexander Graf <graf@amazon.com> wrote:
>> 
>> 
>> 
>>> On 22.08.19 16:00, Anup Patel wrote:
>>>> On Thu, Aug 22, 2019 at 5:31 PM Alexander Graf <graf@amazon.com> wrote:
>>>> 
>>>>> On 22.08.19 10:44, Anup Patel wrote:
>>>>> For KVM RISC-V, we use KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls to access
>>>>> VCPU config and registers from user-space.
>>>>> 
>>>>> We have three types of VCPU registers:
>>>>> 1. CONFIG - these are VCPU config and capabilities
>>>>> 2. CORE   - these are VCPU general purpose registers
>>>>> 3. CSR    - these are VCPU control and status registers
>>>>> 
>>>>> The CONFIG registers available to user-space are ISA and TIMEBASE. Out
>>>>> of these, TIMEBASE is a read-only register which inform user-space about
>>>>> VCPU timer base frequency. The ISA register is a read and write register
>>>>> where user-space can only write the desired VCPU ISA capabilities before
>>>>> running the VCPU.
>>>>> 
>>>>> The CORE registers available to user-space are PC, RA, SP, GP, TP, A0-A7,
>>>>> T0-T6, S0-S11 and MODE. Most of these are RISC-V general registers except
>>>>> PC and MODE. The PC register represents program counter whereas the MODE
>>>>> register represent VCPU privilege mode (i.e. S/U-mode).
>>>>> 
>>>>> The CSRs available to user-space are SSTATUS, SIE, STVEC, SSCRATCH, SEPC,
>>>>> SCAUSE, STVAL, SIP, and SATP. All of these are read/write registers.
>>>>> 
>>>>> In future, more VCPU register types will be added (such as FP) for the
>>>>> KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.
>>>>> 
>>>>> Signed-off-by: Anup Patel <anup.patel@wdc.com>
>>>>> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
>>>>> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
>>>>> ---
>>>>>   arch/riscv/include/uapi/asm/kvm.h |  40 ++++-
>>>>>   arch/riscv/kvm/vcpu.c             | 235 +++++++++++++++++++++++++++++-
>>>>>   2 files changed, 272 insertions(+), 3 deletions(-)
>>>>> 
>>>>> diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
>>>>> index 6dbc056d58ba..024f220eb17e 100644
>>>>> --- a/arch/riscv/include/uapi/asm/kvm.h
>>>>> +++ b/arch/riscv/include/uapi/asm/kvm.h
>>>>> @@ -23,8 +23,15 @@
>>>>> 
>>>>>   /* for KVM_GET_REGS and KVM_SET_REGS */
>>>>>   struct kvm_regs {
>>>>> +     /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
>>>>> +     struct user_regs_struct regs;
>>>>> +     unsigned long mode;
>>>> 
>>>> Is there any particular reason you're reusing kvm_regs and don't invent
>>>> your own struct? kvm_regs is explicitly meant for the get_regs and
>>>> set_regs ioctls.
>>> 
>>> We are implementing only ONE_REG interface so most of these
>>> structs are unused hence we tried to reuse these struct instead
>>> of introducing new structs. (Similar to KVM ARM64)
>>> 
>>>> 
>>>>>   };
>>>>> 
>>>>> +/* Possible privilege modes for kvm_regs */
>>>>> +#define KVM_RISCV_MODE_S     1
>>>>> +#define KVM_RISCV_MODE_U     0
>>>>> +
>>>>>   /* for KVM_GET_FPU and KVM_SET_FPU */
>>>>>   struct kvm_fpu {
>>>>>   };
>>>>> @@ -41,10 +48,41 @@ struct kvm_guest_debug_arch {
>>>>>   struct kvm_sync_regs {
>>>>>   };
>>>>> 
>>>>> -/* dummy definition */
>>>>> +/* for KVM_GET_SREGS and KVM_SET_SREGS */
>>>>>   struct kvm_sregs {
>>>>> +     unsigned long sstatus;
>>>>> +     unsigned long sie;
>>>>> +     unsigned long stvec;
>>>>> +     unsigned long sscratch;
>>>>> +     unsigned long sepc;
>>>>> +     unsigned long scause;
>>>>> +     unsigned long stval;
>>>>> +     unsigned long sip;
>>>>> +     unsigned long satp;
>>>> 
>>>> Same comment here.
>>> 
>>> Same as above, we are trying to use unused struct.
>>> 
>>>> 
>>>>>   };
>>>>> 
>>>>> +#define KVM_REG_SIZE(id)             \
>>>>> +     (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
>>>>> +
>>>>> +/* If you need to interpret the index values, here is the key: */
>>>>> +#define KVM_REG_RISCV_TYPE_MASK              0x00000000FF000000
>>>>> +#define KVM_REG_RISCV_TYPE_SHIFT     24
>>>>> +
>>>>> +/* Config registers are mapped as type 1 */
>>>>> +#define KVM_REG_RISCV_CONFIG         (0x01 << KVM_REG_RISCV_TYPE_SHIFT)
>>>>> +#define KVM_REG_RISCV_CONFIG_ISA     0x0
>>>>> +#define KVM_REG_RISCV_CONFIG_TIMEBASE        0x1
>>>>> +
>>>>> +/* Core registers are mapped as type 2 */
>>>>> +#define KVM_REG_RISCV_CORE           (0x02 << KVM_REG_RISCV_TYPE_SHIFT)
>>>>> +#define KVM_REG_RISCV_CORE_REG(name) \
>>>>> +             (offsetof(struct kvm_regs, name) / sizeof(unsigned long))
>>>> 
>>>> I see, you're trying to implicitly use the struct offsets as index.
>>>> 
>>>> I'm not a really big fan of it, but I can't pinpoint exactly why just
>>>> yet. It just seems too magical (read: potentially breaking down the
>>>> road) for me.
>>>> 
>>>>> +
>>>>> +/* Control and status registers are mapped as type 3 */
>>>>> +#define KVM_REG_RISCV_CSR            (0x03 << KVM_REG_RISCV_TYPE_SHIFT)
>>>>> +#define KVM_REG_RISCV_CSR_REG(name)  \
>>>>> +             (offsetof(struct kvm_sregs, name) / sizeof(unsigned long))
>>>>> +
>>>>>   #endif
>>>>> 
>>>>>   #endif /* __LINUX_KVM_RISCV_H */
>>>>> diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
>>>>> index 7f59e85c6af8..9396a83c0611 100644
>>>>> --- a/arch/riscv/kvm/vcpu.c
>>>>> +++ b/arch/riscv/kvm/vcpu.c
>>>>> @@ -164,6 +164,215 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
>>>>>       return VM_FAULT_SIGBUS;
>>>>>   }
>>>>> 
>>>>> +static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu,
>>>>> +                                      const struct kvm_one_reg *reg)
>>>>> +{
>>>>> +     unsigned long __user *uaddr =
>>>>> +                     (unsigned long __user *)(unsigned long)reg->addr;
>>>>> +     unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
>>>>> +                                         KVM_REG_SIZE_MASK |
>>>>> +                                         KVM_REG_RISCV_CONFIG);
>>>>> +     unsigned long reg_val;
>>>>> +
>>>>> +     if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
>>>>> +             return -EINVAL;
>>>>> +
>>>>> +     switch (reg_num) {
>>>>> +     case KVM_REG_RISCV_CONFIG_ISA:
>>>>> +             reg_val = vcpu->arch.isa;
>>>>> +             break;
>>>>> +     case KVM_REG_RISCV_CONFIG_TIMEBASE:
>>>>> +             reg_val = riscv_timebase;
>>>> 
>>>> What does this reflect? The current guest time hopefully not? An offset?
>>>> Related to what?
>>> 
>>> riscv_timebase is the frequency in HZ of the system timer.
>>> 
>>> The name "timebase" is not appropriate but we have been
>>> carrying it since quite some time now.
>> 
>> What do you mean by "some time"? So far I only see a kernel internal
>> variable named after it. That's dramatically different from something
>> exposed via uapi.
>> 
>> Just name it tbfreq.
> 
> Sure, I will use TBFREQ name.
> 
>> 
>> So if this is the frequency, where is the offset? You will need it on
>> save/restore. If you're saying that's out of scope for now, that's fine
>> with me too :).
> 
> tbfreq is read-only and fixed.
> 
> The Guest tbfreq has to be same as Host tbfreq. This means we
> can only migrate Guest from Host A to Host B only if:
> 1. They have matching ISA capabilities

That's what we have on almost all archs, it's a fair statement.

> 2. They have matching tbfreq

This was true for most archs in the early virtualization days, but CPU vendors learned since then. It really makes people upset if they can not move their guests to a new CPU.

If you see bits in the spec that are missing (tb freq scaling / trapping on tb reads), please work with the ISA people to resolve them going forward.

Alex

> 
> Regards,
> Anup
> 
>> 
>> 
>> Alex
diff mbox series

Patch

diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
index 6dbc056d58ba..024f220eb17e 100644
--- a/arch/riscv/include/uapi/asm/kvm.h
+++ b/arch/riscv/include/uapi/asm/kvm.h
@@ -23,8 +23,15 @@ 
 
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
+	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+	struct user_regs_struct regs;
+	unsigned long mode;
 };
 
+/* Possible privilege modes for kvm_regs */
+#define KVM_RISCV_MODE_S	1
+#define KVM_RISCV_MODE_U	0
+
 /* for KVM_GET_FPU and KVM_SET_FPU */
 struct kvm_fpu {
 };
@@ -41,10 +48,41 @@  struct kvm_guest_debug_arch {
 struct kvm_sync_regs {
 };
 
-/* dummy definition */
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
 struct kvm_sregs {
+	unsigned long sstatus;
+	unsigned long sie;
+	unsigned long stvec;
+	unsigned long sscratch;
+	unsigned long sepc;
+	unsigned long scause;
+	unsigned long stval;
+	unsigned long sip;
+	unsigned long satp;
 };
 
+#define KVM_REG_SIZE(id)		\
+	(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
+
+/* If you need to interpret the index values, here is the key: */
+#define KVM_REG_RISCV_TYPE_MASK		0x00000000FF000000
+#define KVM_REG_RISCV_TYPE_SHIFT	24
+
+/* Config registers are mapped as type 1 */
+#define KVM_REG_RISCV_CONFIG		(0x01 << KVM_REG_RISCV_TYPE_SHIFT)
+#define KVM_REG_RISCV_CONFIG_ISA	0x0
+#define KVM_REG_RISCV_CONFIG_TIMEBASE	0x1
+
+/* Core registers are mapped as type 2 */
+#define KVM_REG_RISCV_CORE		(0x02 << KVM_REG_RISCV_TYPE_SHIFT)
+#define KVM_REG_RISCV_CORE_REG(name)	\
+		(offsetof(struct kvm_regs, name) / sizeof(unsigned long))
+
+/* Control and status registers are mapped as type 3 */
+#define KVM_REG_RISCV_CSR		(0x03 << KVM_REG_RISCV_TYPE_SHIFT)
+#define KVM_REG_RISCV_CSR_REG(name)	\
+		(offsetof(struct kvm_sregs, name) / sizeof(unsigned long))
+
 #endif
 
 #endif /* __LINUX_KVM_RISCV_H */
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 7f59e85c6af8..9396a83c0611 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -164,6 +164,215 @@  vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 	return VM_FAULT_SIGBUS;
 }
 
+static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu,
+					 const struct kvm_one_reg *reg)
+{
+	unsigned long __user *uaddr =
+			(unsigned long __user *)(unsigned long)reg->addr;
+	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+					    KVM_REG_SIZE_MASK |
+					    KVM_REG_RISCV_CONFIG);
+	unsigned long reg_val;
+
+	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+		return -EINVAL;
+
+	switch (reg_num) {
+	case KVM_REG_RISCV_CONFIG_ISA:
+		reg_val = vcpu->arch.isa;
+		break;
+	case KVM_REG_RISCV_CONFIG_TIMEBASE:
+		reg_val = riscv_timebase;
+		break;
+	default:
+		return -EINVAL;
+	};
+
+	if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu,
+					 const struct kvm_one_reg *reg)
+{
+	unsigned long __user *uaddr =
+			(unsigned long __user *)(unsigned long)reg->addr;
+	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+					    KVM_REG_SIZE_MASK |
+					    KVM_REG_RISCV_CONFIG);
+	unsigned long reg_val;
+
+	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+		return -EINVAL;
+
+	if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+		return -EFAULT;
+
+	switch (reg_num) {
+	case KVM_REG_RISCV_CONFIG_ISA:
+		if (!vcpu->arch.ran_atleast_once) {
+			vcpu->arch.isa = reg_val;
+			vcpu->arch.isa &= riscv_isa_extension_base(NULL);
+			vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED;
+		} else {
+			return -ENOTSUPP;
+		}
+		break;
+	case KVM_REG_RISCV_CONFIG_TIMEBASE:
+		return -ENOTSUPP;
+	default:
+		return -EINVAL;
+	};
+
+	return 0;
+}
+
+static int kvm_riscv_vcpu_get_reg_core(struct kvm_vcpu *vcpu,
+				       const struct kvm_one_reg *reg)
+{
+	struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+	unsigned long __user *uaddr =
+			(unsigned long __user *)(unsigned long)reg->addr;
+	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+					    KVM_REG_SIZE_MASK |
+					    KVM_REG_RISCV_CORE);
+	unsigned long reg_val;
+
+	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+		return -EINVAL;
+
+	if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
+		reg_val = cntx->sepc;
+	else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
+		 reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
+		reg_val = ((unsigned long *)cntx)[reg_num];
+	else if (reg_num == KVM_REG_RISCV_CORE_REG(mode))
+		reg_val = (cntx->sstatus & SR_SPP) ?
+				KVM_RISCV_MODE_S : KVM_RISCV_MODE_U;
+	else
+		return -EINVAL;
+
+	if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int kvm_riscv_vcpu_set_reg_core(struct kvm_vcpu *vcpu,
+				       const struct kvm_one_reg *reg)
+{
+	struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+	unsigned long __user *uaddr =
+			(unsigned long __user *)(unsigned long)reg->addr;
+	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+					    KVM_REG_SIZE_MASK |
+					    KVM_REG_RISCV_CORE);
+	unsigned long reg_val;
+
+	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+		return -EINVAL;
+
+	if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+		return -EFAULT;
+
+	if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
+		cntx->sepc = reg_val;
+	else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
+		 reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
+		((unsigned long *)cntx)[reg_num] = reg_val;
+	else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) {
+		if (reg_val == KVM_RISCV_MODE_S)
+			cntx->sstatus |= SR_SPP;
+		else
+			cntx->sstatus &= ~SR_SPP;
+	} else
+		return -EINVAL;
+
+	return 0;
+}
+
+static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu,
+				      const struct kvm_one_reg *reg)
+{
+	struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+	unsigned long __user *uaddr =
+			(unsigned long __user *)(unsigned long)reg->addr;
+	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+					    KVM_REG_SIZE_MASK |
+					    KVM_REG_RISCV_CSR);
+	unsigned long reg_val;
+
+	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+		return -EINVAL;
+	if (reg_num >= sizeof(struct kvm_sregs) / sizeof(unsigned long))
+		return -EINVAL;
+
+	if (reg_num == KVM_REG_RISCV_CSR_REG(sip))
+		kvm_riscv_vcpu_flush_interrupts(vcpu);
+
+	reg_val = ((unsigned long *)csr)[reg_num];
+
+	if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu,
+				      const struct kvm_one_reg *reg)
+{
+	struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+	unsigned long __user *uaddr =
+			(unsigned long __user *)(unsigned long)reg->addr;
+	unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+					    KVM_REG_SIZE_MASK |
+					    KVM_REG_RISCV_CSR);
+	unsigned long reg_val;
+
+	if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+		return -EINVAL;
+	if (reg_num >= sizeof(struct kvm_sregs) / sizeof(unsigned long))
+		return -EINVAL;
+
+	if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+		return -EFAULT;
+
+	((unsigned long *)csr)[reg_num] = reg_val;
+
+	if (reg_num == KVM_REG_RISCV_CSR_REG(sip))
+		WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0);
+
+	return 0;
+}
+
+static int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu,
+				  const struct kvm_one_reg *reg)
+{
+	if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CONFIG)
+		return kvm_riscv_vcpu_set_reg_config(vcpu, reg);
+	else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CORE)
+		return kvm_riscv_vcpu_set_reg_core(vcpu, reg);
+	else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CSR)
+		return kvm_riscv_vcpu_set_reg_csr(vcpu, reg);
+
+	return -EINVAL;
+}
+
+static int kvm_riscv_vcpu_get_reg(struct kvm_vcpu *vcpu,
+				  const struct kvm_one_reg *reg)
+{
+	if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CONFIG)
+		return kvm_riscv_vcpu_get_reg_config(vcpu, reg);
+	else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CORE)
+		return kvm_riscv_vcpu_get_reg_core(vcpu, reg);
+	else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CSR)
+		return kvm_riscv_vcpu_get_reg_csr(vcpu, reg);
+
+	return -EINVAL;
+}
+
 long kvm_arch_vcpu_async_ioctl(struct file *filp,
 			       unsigned int ioctl, unsigned long arg)
 {
@@ -188,8 +397,30 @@  long kvm_arch_vcpu_async_ioctl(struct file *filp,
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
-	/* TODO: */
-	return -EINVAL;
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	long r = -EINVAL;
+
+	switch (ioctl) {
+	case KVM_SET_ONE_REG:
+	case KVM_GET_ONE_REG: {
+		struct kvm_one_reg reg;
+
+		r = -EFAULT;
+		if (copy_from_user(&reg, argp, sizeof(reg)))
+			break;
+
+		if (ioctl == KVM_SET_ONE_REG)
+			r = kvm_riscv_vcpu_set_reg(vcpu, &reg);
+		else
+			r = kvm_riscv_vcpu_get_reg(vcpu, &reg);
+		break;
+	}
+	default:
+		break;
+	}
+
+	return r;
 }
 
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,