[v5,22/26] KVM: arm64/sve: Add pseudo-register for the guest's vector lengths

Message ID: 1550519559-15915-23-git-send-email-Dave.Martin@arm.com
State: New, archived
Series: KVM: arm64: SVE guest support

Commit Message

Dave Martin Feb. 18, 2019, 7:52 p.m. UTC
This patch adds a new pseudo-register KVM_REG_ARM64_SVE_VLS to
allow userspace to set and query the set of vector lengths visible
to the guest, along with corresponding storage in struct
kvm_vcpu_arch.

In the future, multiple register slices per SVE register may be
visible through the ioctl interface.  Once the set of slices has
been determined we would not be able to allow the vector length set
to be changed any more, in order to avoid userspace seeing
inconsistent sets of registers.  For this reason, this patch adds
support to track vcpu finalization explicitly, and enforce proper
sequencing of ioctls.

The new pseudo-register is not exposed yet.  Subsequent patches
will allow SVE to be turned on for guest vcpus, making it visible.

Signed-off-by: Dave Martin <Dave.Martin@arm.com>

---

Changes since v4:

 * Add a UAPI header comment indicating the pseudo-register status of
   KVM_REG_ARM64_SVE_VLS.

 * Get rid of the sve_vqs[] array from struct kvm_vcpu_arch.  This
   array is pointless, because its contents must match the host's
   internal sve_vq_map anyway, up to vcpu->arch.sve_max_vl.

   The ioctl register accessors are slow-path code, so we can decode
   or reconstruct sve_vqs[] on demand instead, for exchange with
   userspace.

 * For compatibility with potential future architecture extensions,
   enabling vector lengths above 256 bytes for the guest is explicitly
   disallowed now (because the code for exposing additional bits
   through ioctl is currently missing).  This can be addressed later
   if/when needed.

Note:

 * I defensively pass an array by pointer here, to help avoid
   accidentally breaking assumptions during future maintenance.

   Due to (over?)zealous constification, this causes the following
   sparse warning.  I think this is sparse getting confused: I am not
   relying on any kernel-specific semantics here, and GCC generates no
   warning.

+arch/arm64/kvm/guest.c:33: warning: incorrect type in argument 1 (different modifiers)
+arch/arm64/kvm/guest.c:33:    expected unsigned long long const [usertype] ( *const vqs )[8]
+arch/arm64/kvm/guest.c:33:    got unsigned long long [usertype] ( * )[8]
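
As a minimal standalone reduction of the pattern (illustrative only, not
part of the patch; GCC accepts the call silently, while sparse flags the
modifier mismatch on argument 1):

	#include <stdbool.h>

	typedef unsigned long long u64;

	static bool vq_present(const u64 (*const vqs)[8], unsigned int vq)
	{
		return (*vqs)[vq / 64] & ((u64)1 << (vq % 64));
	}

	int main(void)
	{
		u64 vqs[8] = { 0 };

		/* &vqs has type u64 (*)[8]; the parameter wants
		 * const u64 (*const)[8], hence the sparse complaint: */
		return vq_present(&vqs, 0) ? 1 : 0;
	}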

---
 arch/arm64/include/asm/kvm_host.h |   7 ++-
 arch/arm64/include/uapi/asm/kvm.h |   4 ++
 arch/arm64/kvm/guest.c            | 124 ++++++++++++++++++++++++++++++++++++--
 arch/arm64/kvm/reset.c            |   9 +++
 4 files changed, 136 insertions(+), 8 deletions(-)

Comments

Julien Thierry Feb. 21, 2019, 5:48 p.m. UTC | #1
Hi Dave,

On 18/02/2019 19:52, Dave Martin wrote:
> This patch adds a new pseudo-register KVM_REG_ARM64_SVE_VLS to
> allow userspace to set and query the set of vector lengths visible
> to the guest, along with corresponding storage in struct
> kvm_vcpu_arch.
> 
> In the future, multiple register slices per SVE register may be
> visible through the ioctl interface.  Once the set of slices has
> been determined we would not be able to allow the vector length set
> to be changed any more, in order to avoid userspace seeing
> inconsistent sets of registers.  For this reason, this patch adds
> support to track vcpu finalization explicitly, and enforce proper
> sequencing of ioctls.
> 
> The new pseudo-register is not exposed yet.  Subsequent patches
> will allow SVE to be turned on for guest vcpus, making it visible.
> 
> Signed-off-by: Dave Martin <Dave.Martin@arm.com>
> 
> ---
> 
> Changes since v4:
> 
>  * Add a UAPI header comment indicating the pseudo-register status of
>    KVM_REG_ARM64_SVE_VLS.
> 
>  * Get rid of the sve_vqs[] array from struct kvm_vcpu_arch.  This
>    array is pointless, because its contents must match the host's
>    internal sve_vq_map anyway, up to vcpu->arch.sve_max_vl.
> 
>    The ioctl register accessors are slow-path code, so we can decode
>    or reconstruct sve_vqs[] on demand instead, for exchange with
>    userspace.
> 
>  * For compatibility with potential future architecture extensions,
>    enabling vector lengths above 256 bytes for the guest is explicitly
>    disallowed now (because the code for exposing additional bits
>    through ioctl is currently missing).  This can be addressed later
>    if/when needed.
> 
> Note:
> 
>  * I defensively pass an array by pointer here, to help avoid
>    accidentally breaking assumptions during future maintenance.
> 
>    Due to (over?)zealous constification, this causes the following
>    sparse warning.  I think this is sparse getting confused: I am not
>    relying on any kernel-specific semantics here, and GCC generates no
>    warning.
> 
> +arch/arm64/kvm/guest.c:33: warning: incorrect type in argument 1 (different modifiers)
> +arch/arm64/kvm/guest.c:33:    expected unsigned long long const [usertype] ( *const vqs )[8]
> +arch/arm64/kvm/guest.c:33:    got unsigned long long [usertype] ( * )[8]
> 
> ---
>  arch/arm64/include/asm/kvm_host.h |   7 ++-
>  arch/arm64/include/uapi/asm/kvm.h |   4 ++
>  arch/arm64/kvm/guest.c            | 124 ++++++++++++++++++++++++++++++++++++--
>  arch/arm64/kvm/reset.c            |   9 +++
>  4 files changed, 136 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 015c2578..e53119a 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -24,6 +24,7 @@
>  
>  #include <linux/bitmap.h>
>  #include <linux/types.h>
> +#include <linux/kernel.h>
>  #include <linux/kvm_types.h>
>  #include <asm/cpufeature.h>
>  #include <asm/daifflags.h>
> @@ -331,6 +332,7 @@ struct kvm_vcpu_arch {
>  #define KVM_ARM64_HOST_SVE_IN_USE	(1 << 3) /* backup for host TIF_SVE */
>  #define KVM_ARM64_HOST_SVE_ENABLED	(1 << 4) /* SVE enabled for EL0 */
>  #define KVM_ARM64_GUEST_HAS_SVE		(1 << 5) /* SVE exposed to guest */
> +#define KVM_ARM64_VCPU_FINALIZED	(1 << 6) /* vcpu config completed */
>  
>  #define vcpu_has_sve(vcpu) (system_supports_sve() && \
>  			    ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE))
> @@ -554,7 +556,8 @@ void kvm_arch_free_vm(struct kvm *kvm);
>  int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
>  
>  /* Commit to the set of vcpu registers currently configured: */
> -static inline int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu) { return 0; }
> -#define kvm_arm_vcpu_finalized(vcpu) true
> +int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu);
> +#define kvm_arm_vcpu_finalized(vcpu) \
> +	((vcpu)->arch.flags & KVM_ARM64_VCPU_FINALIZED)
>  
>  #endif /* __ARM64_KVM_HOST_H__ */
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index ced760c..7ff1bd4 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -243,6 +243,10 @@ struct kvm_vcpu_events {
>  					 ((n) << 5) | (i))
>  #define KVM_REG_ARM64_SVE_FFR(i)	KVM_REG_ARM64_SVE_PREG(16, i)
>  
> +/* Vector lengths pseudo-register: */
> +#define KVM_REG_ARM64_SVE_VLS		(KVM_REG_ARM64 | KVM_REG_ARM64_SVE | \
> +					 KVM_REG_SIZE_U512 | 0xffff)
> +
>  /* Device Control API: ARM VGIC */
>  #define KVM_DEV_ARM_VGIC_GRP_ADDR	0
>  #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS	1
> diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
> index 4a2ad60..5f48c17 100644
> --- a/arch/arm64/kvm/guest.c
> +++ b/arch/arm64/kvm/guest.c
> @@ -217,6 +217,81 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
>  	return err;
>  }
>  
> +#define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64)
> +#define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64)
> +
> +static bool vq_present(
> +	const u64 (*const vqs)[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)],
> +	unsigned int vq)
> +{
> +	return (*vqs)[vq_word(vq)] & vq_mask(vq);
> +}
> +
> +static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
> +{
> +	unsigned int max_vq, vq;
> +	u64 vqs[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
> +
> +	if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl)))
> +		return -EINVAL;
> +
> +	memset(vqs, 0, sizeof(vqs));
> +
> +	max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
> +	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
> +		if (sve_vq_available(vq))
> +			vqs[vq_word(vq)] |= vq_mask(vq);
> +
> +	BUILD_BUG_ON(sizeof(vqs) != KVM_REG_SIZE(reg->id));

reg->id is not known at build time. From my understanding of
BUILD_BUG_ON(), things actually end up evaluated at runtime, but I'm not
sure what happens when doing sizeof(char[1 - 2*0]) at runtime.

Anyway, I don't think this is intended.

> +	if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
> +{
> +	unsigned int max_vq, vq;
> +	u64 vqs[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
> +
> +	if (kvm_arm_vcpu_finalized(vcpu))
> +		return -EPERM; /* too late! */
> +
> +	if (WARN_ON(vcpu->arch.sve_state))
> +		return -EINVAL;
> +
> +	BUILD_BUG_ON(sizeof(vqs) != KVM_REG_SIZE(reg->id));

Same as above.

> +	if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs)))
> +		return -EFAULT;
> +
> +	max_vq = 0;
> +	for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq)
> +		if (vq_present(&vqs, vq))
> +			max_vq = vq;
> +
> +	/*
> +	 * The get_sve_reg()/set_sve_reg() ioctl interface will need
> +	 * to be extended with multiple register slice support in
> +	 * order to support vector lengths greater than
> +	 * SVE_VL_ARCH_MAX:
> +	 */
> +	if (WARN_ONCE(sve_vl_from_vq(max_vq) > SVE_VL_ARCH_MAX,
> +		      "KVM: Requested vector length not supported yet\n"))
> +		return -EINVAL;
> +
> +	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
> +		if (vq_present(&vqs, vq) != sve_vq_available(vq))
> +			return -EINVAL;
> +
> +	/* Can't run with no vector lengths at all: */
> +	if (max_vq < SVE_VQ_MIN)
> +		return -EINVAL;
> +
> +	vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq);
> +
> +	return 0;
> +}
> +
>  #define SVE_REG_SLICE_SHIFT	0
>  #define SVE_REG_SLICE_BITS	5
>  #define SVE_REG_ID_SHIFT	(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS)
> @@ -297,9 +372,21 @@ static int sve_reg_region(struct sve_state_region *region,
>  static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
>  {
>  	struct sve_state_region region;
> +	int ret;
>  	char __user *uptr = (char __user *)reg->addr;
>  
> -	if (!vcpu_has_sve(vcpu) || sve_reg_region(&region, vcpu, reg))
> +	if (!vcpu_has_sve(vcpu))
> +		return -ENOENT;
> +
> +	if (reg->id == KVM_REG_ARM64_SVE_VLS)
> +		return get_sve_vls(vcpu, reg);
> +
> +	/* Finalize the number of slices per SVE register: */
> +	ret = kvm_arm_vcpu_finalize(vcpu);

Having this here feels a bit random...

I'd suggest considering the pseudo-register outside of the SVE co-proc,
as part of a set of registers that do not finalize a vcpu when accessed.
All other registers (even non-sve ones) would finalize the vcpu when
accessed from userland.

Does that make sense?


> +	if (ret)
> +		return ret;
> +
> +	if (sve_reg_region(&region, vcpu, reg))
>  		return -ENOENT;
>  
>  	if (copy_to_user(uptr, vcpu->arch.sve_state + region.koffset,
> @@ -313,9 +400,21 @@ static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
>  static int set_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
>  {
>  	struct sve_state_region region;
> +	int ret;
>  	const char __user *uptr = (const char __user *)reg->addr;
>  
> -	if (!vcpu_has_sve(vcpu) || sve_reg_region(&region, vcpu, reg))
> +	if (!vcpu_has_sve(vcpu))
> +		return -ENOENT;
> +
> +	if (reg->id == KVM_REG_ARM64_SVE_VLS)
> +		return set_sve_vls(vcpu, reg);
> +
> +	/* Finalize the number of slices per SVE register: */
> +	ret = kvm_arm_vcpu_finalize(vcpu);
> +	if (ret)
> +		return ret;
> +
> +	if (sve_reg_region(&region, vcpu, reg))
>  		return -ENOENT;
>  
>  	if (copy_from_user(vcpu->arch.sve_state + region.koffset, uptr,
> @@ -452,30 +551,43 @@ static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu)
>  	if (!vcpu_has_sve(vcpu))
>  		return 0;
>  
> -	return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */);
> +	return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */)
> +		+ 1; /* KVM_REG_ARM64_SVE_VLS */
>  }
>  
>  static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, u64 __user **uind)
>  {
>  	/* Only the first slice ever exists, for now */
>  	const unsigned int slices = 1;
> +	u64 reg;
>  	unsigned int i, n;
>  
>  	if (!vcpu_has_sve(vcpu))
>  		return 0;
>  
> +	/*
> +	 * Enumerate this first, so that userspace can save/restore in
> +	 * the order reported by KVM_GET_REG_LIST:
> +	 */
> +	reg = KVM_REG_ARM64_SVE_VLS;
> +	if (put_user(reg, (*uind)++))
> +		return -EFAULT;
> +
>  	for (i = 0; i < slices; i++) {
>  		for (n = 0; n < SVE_NUM_ZREGS; n++) {
> -			if (put_user(KVM_REG_ARM64_SVE_ZREG(n, i), (*uind)++))
> +			reg = KVM_REG_ARM64_SVE_ZREG(n, i);
> +			if (put_user(reg, (*uind)++))
>  				return -EFAULT;
>  		}
>  
>  		for (n = 0; n < SVE_NUM_PREGS; n++) {
> -			if (put_user(KVM_REG_ARM64_SVE_PREG(n, i), (*uind)++))
> +			reg = KVM_REG_ARM64_SVE_PREG(n, i);
> +			if (put_user(reg, (*uind)++))
>  				return -EFAULT;
>  		}
>  
> -		if (put_user(KVM_REG_ARM64_SVE_FFR(i), (*uind)++))
> +		reg = KVM_REG_ARM64_SVE_FFR(i);
> +		if (put_user(reg, (*uind)++))
>  			return -EFAULT;
>  	}
>  
> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index b72a3dd..1379fb2 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -98,6 +98,15 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	return r;
>  }
>  
> +int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu)
> +{
> +	if (likely(kvm_arm_vcpu_finalized(vcpu)))
> +		return 0;
> +
> +	vcpu->arch.flags |= KVM_ARM64_VCPU_FINALIZED;
> +	return 0;
> +}
> +

I think that the introduction of this flag and setting it should be part
of the previous patch.

Cheers,
Dave Martin Feb. 26, 2019, 12:13 p.m. UTC | #2
On Thu, Feb 21, 2019 at 05:48:59PM +0000, Julien Thierry wrote:
> Hi Dave,
> 
> On 18/02/2019 19:52, Dave Martin wrote:
> > This patch adds a new pseudo-register KVM_REG_ARM64_SVE_VLS to
> > allow userspace to set and query the set of vector lengths visible
> > to the guest, along with corresponding storage in struct
> > kvm_vcpu_arch.
> > 
> > In the future, multiple register slices per SVE register may be
> > visible through the ioctl interface.  Once the set of slices has
> > been determined we would not be able to allow the vector length set
> > to be changed any more, in order to avoid userspace seeing
> > inconsistent sets of registers.  For this reason, this patch adds
> > support to track vcpu finalization explicitly, and enforce proper
> > sequencing of ioctls.
> > 
> > The new pseudo-register is not exposed yet.  Subsequent patches
> > will allow SVE to be turned on for guest vcpus, making it visible.
> > 
> > Signed-off-by: Dave Martin <Dave.Martin@arm.com>
> > 
> > ---
> > 
> > Changes since v4:
> > 
> >  * Add a UAPI header comment indicating the pseudo-register status of
> >    KVM_REG_ARM64_SVE_VLS.
> > 
> >  * Get rid of the sve_vqs[] array from struct kvm_vcpu_arch.  This
> >    array is pointless, because its contents must match the host's
> >    internal sve_vq_map anyway, up to vcpu->arch.sve_max_vl.
> > 
> >    The ioctl register accessors are slow-path code, so we can decode
> >    or reconstruct sve_vqs[] on demand instead, for exchange with
> >    userspace.
> > 
> >  * For compatibility with potential future architecture extensions,
> >    enabling vector lengths above 256 bytes for the guest is explicitly
> >    disallowed now (because the code for exposing additional bits
> >    through ioctl is currently missing).  This can be addressed later
> >    if/when needed.
> > 
> > Note:
> > 
> >  * I defensively pass an array by pointer here, to help avoid
> >    accidentally breaking assumptions during future maintenance.
> > 
> >    Due to (over?)zealous constification, this causes the following
> >    sparse warning.  I think this is sparse getting confused: I am not
> >    relying on any kernel-specific semantics here, and GCC generates no
> >    warning.
> > 
> > +arch/arm64/kvm/guest.c:33: warning: incorrect type in argument 1 (different modifiers)
> > +arch/arm64/kvm/guest.c:33:    expected unsigned long long const [usertype] ( *const vqs )[8]
> > +arch/arm64/kvm/guest.c:33:    got unsigned long long [usertype] ( * )[8]
> > 
> > ---

[...]

> > diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h

[...]

> > +static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
> > +{
> > +	unsigned int max_vq, vq;
> > +	u64 vqs[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
> > +
> > +	if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl)))
> > +		return -EINVAL;
> > +
> > +	memset(vqs, 0, sizeof(vqs));
> > +
> > +	max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
> > +	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
> > +		if (sve_vq_available(vq))
> > +			vqs[vq_word(vq)] |= vq_mask(vq);
> > +
> > +	BUILD_BUG_ON(sizeof(vqs) != KVM_REG_SIZE(reg->id));
> 
> reg->id is not known at build time. From my understanding of
> BUILD_BUG_ON(), things actually end up evaluated at runtime, but I'm not
> sure what happens when doing sizeof(char[1 - 2*0]) at runtime.
>
> Anyway, I don't think this is intended.

There's no runtime check: BUILD_BUG_ON() will cause compilation to fail
if the required condition doesn't fall out from optimisation.

Because of the way this function is called, reg->id is always
KVM_REG_ARM64_SVE_VLS, so inlining and constant propagation will make
this check pass (and compile to nothing).

We can assume a certain amount of inlining: the kernel officially can't
be built without optimisation.  But the precise build configuration can
sometimes have an effect here -- so it may be better not to rely on this
working for this slow-path code.

I'll convert these to if (WARN_ON()) return -EIO or something, or drop
them if I can get rid of them in other refactoring.
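
(For reference, a simplified sketch of why that is so -- the real
BUILD_BUG_ON() lives in include/linux/build_bug.h and adds some
indirection, but the principle is the same:

	/* Never defined; the error attribute only fires if a call
	 * survives optimisation: */
	extern void __attribute__((error("BUILD_BUG_ON failed")))
	build_bug_on_failed(void);

	#define BUILD_BUG_ON(cond)			\
		do {					\
			if (cond)			\
				build_bug_on_failed();	\
		} while (0)

If the optimiser proves "cond" false -- here, once get_sve_vls() is
inlined with reg->id == KVM_REG_ARM64_SVE_VLS -- the call is dead code
and vanishes; if it can't, the build fails.  Either way, nothing is
evaluated at runtime.)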

The fact that struct kvm_one_reg lacks any "size" field is rather
unfortunate, since it provides no explicit way for userspace to tell us
the size of its buffer.

> > +	if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs)))
> > +		return -EFAULT;
> > +
> > +	return 0;
> > +}
> > +
> > +static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
> > +{
> > +	unsigned int max_vq, vq;
> > +	u64 vqs[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
> > +
> > +	if (kvm_arm_vcpu_finalized(vcpu))
> > +		return -EPERM; /* too late! */
> > +
> > +	if (WARN_ON(vcpu->arch.sve_state))
> > +		return -EINVAL;
> > +
> > +	BUILD_BUG_ON(sizeof(vqs) != KVM_REG_SIZE(reg->id));
> 
> Same as above.
> 
> > +	if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs)))
> > +		return -EFAULT;
> > +
> > +	max_vq = 0;
> > +	for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq)
> > +		if (vq_present(&vqs, vq))
> > +			max_vq = vq;
> > +
> > +	/*
> > +	 * The get_sve_reg()/set_sve_reg() ioctl interface will need
> > +	 * to be extended with multiple register slice support in
> > +	 * order to support vector lengths greater than
> > +	 * SVE_VL_ARCH_MAX:
> > +	 */
> > +	if (WARN_ONCE(sve_vl_from_vq(max_vq) > SVE_VL_ARCH_MAX,
> > +		      "KVM: Requested vector length not supported yet\n"))
> > +		return -EINVAL;
> > +
> > +	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
> > +		if (vq_present(&vqs, vq) != sve_vq_available(vq))
> > +			return -EINVAL;
> > +
> > +	/* Can't run with no vector lengths at all: */
> > +	if (max_vq < SVE_VQ_MIN)
> > +		return -EINVAL;
> > +
> > +	vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq);
> > +
> > +	return 0;
> > +}
> > +
> >  #define SVE_REG_SLICE_SHIFT	0
> >  #define SVE_REG_SLICE_BITS	5
> >  #define SVE_REG_ID_SHIFT	(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS)
> > @@ -297,9 +372,21 @@ static int sve_reg_region(struct sve_state_region *region,
> >  static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
> >  {
> >  	struct sve_state_region region;
> > +	int ret;
> >  	char __user *uptr = (char __user *)reg->addr;
> >  
> > -	if (!vcpu_has_sve(vcpu) || sve_reg_region(&region, vcpu, reg))
> > +	if (!vcpu_has_sve(vcpu))
> > +		return -ENOENT;
> > +
> > +	if (reg->id == KVM_REG_ARM64_SVE_VLS)
> > +		return get_sve_vls(vcpu, reg);
> > +
> > +	/* Finalize the number of slices per SVE register: */
> > +	ret = kvm_arm_vcpu_finalize(vcpu);
> 
> Having this here feels a bit random...

I'll have another think about this.

KVM_REG_ARM64_SVE_VLS certainly has nothing to do with anything except
SVE, and putting it in the SVE coproc space allows us to demux SVE-
related stuff from everything else neatly.

The whole "coprocessor" concept is nonsense for arm64 anyway except
for the sysregs (where the reg ID encodings map directly onto the
architecture and also align with AArch32).  So, the copro number is
pretty meaningless except as a demux key for related groups of
registers, which is how I use it here.

There's also no well-defined distinction between real and fake
registers, and no natural home for miscellaneous fakes -- all existing
blocks are for specific purposes such as recording the state of emulated
firmware APIs etc.  The SVE registers as exposed here already have no
direct equivalent in the architecture either, due to the padding up to
2048 bits, and the presentation of FFR as an addressable P-register, etc.

> I'd suggest considering the pseudo-register outside of the SVE co-proc,
> as part of a set of registers that do not finalize a vcpu when accessed.

(Also, the dependency is not one-way:  _SVE_VLS is not independent
of finalization: quite the reverse -- it can only be written _before_
finalization.)

> All other registers (even non-sve ones) would finalize the vcpu when
> accessed from userland.

So, we could consider registers to be in two classes as you suggest:

 * early registers
 * regular registers (i.e., everything else)

where early registers can only be written until the first access to a
regular register, or the first KVM_GET_REG_LIST or KVM_RUN.

This is a stricter rule than I have today: i.e., _SVE_VLS would really
need to be written before touching anything else.  But that may be a
good thing for avoiding future ABI drift.

This may be premature factoring though.  It's not clear whether many new
registers like KVM_REG_ARM64_SVE_VLS will exist, or that they would have
compatible sequencing requirements.
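
(To make the gating concrete, a rough sketch -- the helpers and the flag
here are hypothetical, nothing like them exists in this series:

	static int kvm_arm_set_reg(struct kvm_vcpu *vcpu,
				   const struct kvm_one_reg *reg)
	{
		if (reg_is_early(reg->id)) {
			/* Refused once any regular register was touched: */
			if (vcpu->arch.flags & KVM_ARM64_REGULAR_REG_ACCESSED)
				return -EPERM;
			return set_early_reg(vcpu, reg);
		}

		vcpu->arch.flags |= KVM_ARM64_REGULAR_REG_ACCESSED;
		return set_regular_reg(vcpu, reg);
	}

KVM_RUN and KVM_GET_REG_LIST would set the same flag.)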

Thoughts?


Independently of this, I'll have a look at whether there's
a straightforward way to simplify the code flow.

Cheers
---Dave
Julien Thierry March 1, 2019, 1:28 p.m. UTC | #3
On 26/02/2019 12:13, Dave Martin wrote:
> On Thu, Feb 21, 2019 at 05:48:59PM +0000, Julien Thierry wrote:
>> Hi Dave,
>>
>> On 18/02/2019 19:52, Dave Martin wrote:
>>> This patch adds a new pseudo-register KVM_REG_ARM64_SVE_VLS to
>>> allow userspace to set and query the set of vector lengths visible
>>> to the guest, along with corresponding storage in struct
>>> kvm_vcpu_arch.
>>>
>>> In the future, multiple register slices per SVE register may be
>>> visible through the ioctl interface.  Once the set of slices has
>>> been determined we would not be able to allow the vector length set
>>> to be changed any more, in order to avoid userspace seeing
>>> inconsistent sets of registers.  For this reason, this patch adds
>>> support to track vcpu finalization explicitly, and enforce proper
>>> sequencing of ioctls.
>>>
>>> The new pseudo-register is not exposed yet.  Subsequent patches
>>> will allow SVE to be turned on for guest vcpus, making it visible.
>>>
>>> Signed-off-by: Dave Martin <Dave.Martin@arm.com>
>>>
>>> ---
>>>
>>> Changes since v4:
>>>
>>>  * Add a UAPI header comment indicating the pseudo-register status of
>>>    KVM_REG_ARM64_SVE_VLS.
>>>
>>>  * Get rid of the sve_vqs[] array from struct kvm_vcpu_arch.  This
>>>    array is pointless, because its contents must match the host's
>>>    internal sve_vq_map anyway, up to vcpu->arch.sve_max_vl.
>>>
>>>    The ioctl register accessors are slow-path code, so we can decode
>>>    or reconstruct sve_vqs[] on demand instead, for exchange with
>>>    userspace.
>>>
>>>  * For compatibility with potential future architecture extensions,
>>>    enabling vector lengths above 256 bytes for the guest is explicitly
>>>    disallowed now (because the code for exposing additional bits
>>>    through ioctl is currently missing).  This can be addressed later
>>>    if/when needed.
>>>
>>> Note:
>>>
>>>  * I defensively pass an array by pointer here, to help avoid
>>>    accidentally breaking assumptions during future maintenance.
>>>
>>>    Due to (over?)zealous constification, this causes the following
>>>    sparse warning.  I think this is sparse getting confused: I am not
>>>    relying on any kernel-specific semantics here, and GCC generates no
>>>    warning.
>>>
>>> +arch/arm64/kvm/guest.c:33: warning: incorrect type in argument 1 (different modifiers)
>>> +arch/arm64/kvm/guest.c:33:    expected unsigned long long const [usertype] ( *const vqs )[8]
>>> +arch/arm64/kvm/guest.c:33:    got unsigned long long [usertype] ( * )[8]
>>>
>>> ---
> 
> [...]
> 
>>> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> 
> [...]
> 
>>> +static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
>>> +{
>>> +	unsigned int max_vq, vq;
>>> +	u64 vqs[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
>>> +
>>> +	if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl)))
>>> +		return -EINVAL;
>>> +
>>> +	memset(vqs, 0, sizeof(vqs));
>>> +
>>> +	max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
>>> +	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
>>> +		if (sve_vq_available(vq))
>>> +			vqs[vq_word(vq)] |= vq_mask(vq);
>>> +
>>> +	BUILD_BUG_ON(sizeof(vqs) != KVM_REG_SIZE(reg->id));
>>
>> reg->id is not known at build time. From my understanding of
>> BUILD_BUG_ON(), things actually end up evaluated at runtime, but I'm not
>> sure what happens when doing sizeof(char[1 - 2*0]) at runtime.
>>
>> Anyway, I don't think this is intended.
> 
> There's no runtime check: BUILD_BUG_ON() will cause compilation to fail
> if the required condition doesn't fall out from optimisation.
> 
> Because of the way this function is called, reg->id is always
> KVM_REG_ARM64_SVE_VLS, so inlining and constant propagation will make
> this check pass (and compile to nothing).
> 
> We can assume a certain amount of inlining: the kernel officially can't
> be built without optimisation.  But the precise build configuration can
> sometimes have an effect here -- so it may be better not to rely on this
> working for this slow-path code.
> 
> I'll convert these to if (WARN_ON()) return -EIO or something, or drop
> them if I can get rid of them in other refactoring.
> 
> The fact that struct kvm_one_reg lacks any "size" field is rather
> unfortunate, since it provides no explicit way for userspace to tell us
> the size of its buffer.
> 
>>> +	if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs)))
>>> +		return -EFAULT;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
>>> +{
>>> +	unsigned int max_vq, vq;
>>> +	u64 vqs[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
>>> +
>>> +	if (kvm_arm_vcpu_finalized(vcpu))
>>> +		return -EPERM; /* too late! */
>>> +
>>> +	if (WARN_ON(vcpu->arch.sve_state))
>>> +		return -EINVAL;
>>> +
>>> +	BUILD_BUG_ON(sizeof(vqs) != KVM_REG_SIZE(reg->id));
>>
>> Same as above.
>>
>>> +	if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs)))
>>> +		return -EFAULT;
>>> +
>>> +	max_vq = 0;
>>> +	for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq)
>>> +		if (vq_present(&vqs, vq))
>>> +			max_vq = vq;
>>> +
>>> +	/*
>>> +	 * The get_sve_reg()/set_sve_reg() ioctl interface will need
>>> +	 * to be extended with multiple register slice support in
>>> +	 * order to support vector lengths greater than
>>> +	 * SVE_VL_ARCH_MAX:
>>> +	 */
>>> +	if (WARN_ONCE(sve_vl_from_vq(max_vq) > SVE_VL_ARCH_MAX,
>>> +		      "KVM: Requested vector length not supported yet\n"))
>>> +		return -EINVAL;
>>> +
>>> +	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
>>> +		if (vq_present(&vqs, vq) != sve_vq_available(vq))
>>> +			return -EINVAL;
>>> +
>>> +	/* Can't run with no vector lengths at all: */
>>> +	if (max_vq < SVE_VQ_MIN)
>>> +		return -EINVAL;
>>> +
>>> +	vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  #define SVE_REG_SLICE_SHIFT	0
>>>  #define SVE_REG_SLICE_BITS	5
>>>  #define SVE_REG_ID_SHIFT	(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS)
>>> @@ -297,9 +372,21 @@ static int sve_reg_region(struct sve_state_region *region,
>>>  static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
>>>  {
>>>  	struct sve_state_region region;
>>> +	int ret;
>>>  	char __user *uptr = (char __user *)reg->addr;
>>>  
>>> -	if (!vcpu_has_sve(vcpu) || sve_reg_region(&region, vcpu, reg))
>>> +	if (!vcpu_has_sve(vcpu))
>>> +		return -ENOENT;
>>> +
>>> +	if (reg->id == KVM_REG_ARM64_SVE_VLS)
>>> +		return get_sve_vls(vcpu, reg);
>>> +
>>> +	/* Finalize the number of slices per SVE register: */
>>> +	ret = kvm_arm_vcpu_finalize(vcpu);
>>
>> Having this here feels a bit random...
> 
> I'll have another think about this.
> 
> KVM_REG_ARM64_SVE_VLS certainly has nothing to do with anything except
> SVE, and putting it in the SVE coproc space allows us to demux SVE-
> related stuff from everything else neatly.
> 
> The whole "coprocessor" concept is nonsense for arm64 anyway except
> for the sysregs (where the reg ID encodings map directly onto the
> architecture and also align with AArch32).  So, the copro number is
> pretty meaningless except as a demux key for related groups of
> registers, which is how I use it here.
> 
> There's also no well-defined distinction between real and fake
> registers, and no natural home for miscellaneous fakes -- all existing
> blocks are for specific purposes such as recording the state of emulated
> firmware APIs etc.  The SVE registers as exposed here already have no
> direct equivalent in the architecture either, due to the padding up to
> 2048 bits, and the presentation of FFR as an addressable P-register, etc.
> 
>> I'd suggest considering the pseudo-register outside of the SVE co-proc,
>> as part of a set of registers that do not finalize a vcpu when accessed.
> 
> (Also, the dependency is not one-way:  _SVE_VLS is not independent
> of finalization: quite the reverse -- it can only be written _before_
> finalization.)
> 
>> All other registers (even non-sve ones) would finalize the vcpu when
>> accessed from userland.
> 
> So, we could consider registers to be in two classes as you suggest:
> 
>  * early registers
>  * regular registers (i.e., everything else)
> 
> where early registers can only be written until the first access to a
> regular register, or the first KVM_GET_REG_LIST or KVM_RUN.
> 

That would be my personal preference, yes.

> This is a stricter rule than I have today: i.e., _SVE_VLS would really
> need to be written before touching anything else.  But that may be a
> good thing for avoiding future ABI drift.
> 

I did notice this would be more restrictive, but I don't believe this
should introduce shortcomings in ways to set up a VCPU. I might have
missed some case, of course.

> This may be premature factoring though.  It's not clear whether many new
> registers like KVM_REG_ARM64_SVE_VLS will exist, or that they would have
> compatible sequencing requirements.
> 
> Thoughts?
> 

It might be premature, but I really don't like seeing this
"vcpu_finalize" in the middle of SVE code. I think it is very easy to
miss that touching some "sve" register suddenly finalizes the vcpu.

I fear that some other feature/extension might need early vcpu
finalization and that it might somehow conflict with sve depending on
which triggers finalization first.

> 
> Independently of this, I'll have a look at whether there's
> a straightforward way to simplify the code flow.
> 

I might be over-thinking it, but if there is a way to move that
finalization call I'd prefer that.

Thanks,
Dave Martin March 1, 2019, 2:55 p.m. UTC | #4
[FYI Peter, your views on the proposal outlined toward the end of the
mail would be appreciated.]

On Fri, Mar 01, 2019 at 01:28:19PM +0000, Julien Thierry wrote:
> 
> 
> On 26/02/2019 12:13, Dave Martin wrote:
> > On Thu, Feb 21, 2019 at 05:48:59PM +0000, Julien Thierry wrote:
> >> Hi Dave,
> >>
> >> On 18/02/2019 19:52, Dave Martin wrote:
> >>> This patch adds a new pseudo-register KVM_REG_ARM64_SVE_VLS to
> >>> allow userspace to set and query the set of vector lengths visible
> >>> to the guest, along with corresponding storage in struct
> >>> kvm_vcpu_arch.
> >>>
> >>> In the future, multiple register slices per SVE register may be
> >>> visible through the ioctl interface.  Once the set of slices has
> >>> been determined we would not be able to allow the vector length set
> >>> to be changed any more, in order to avoid userspace seeing
> >>> inconsistent sets of registers.  For this reason, this patch adds
> >>> support to track vcpu finalization explicitly, and enforce proper
> >>> sequencing of ioctls.
> >>>
> >>> The new pseudo-register is not exposed yet.  Subsequent patches
> >>> will allow SVE to be turned on for guest vcpus, making it visible.
> >>>
> >>> Signed-off-by: Dave Martin <Dave.Martin@arm.com>
> >>>
> >>> ---
> >>>
> >>> Changes since v4:
> >>>
> >>>  * Add a UAPI header comment indicating the pseudo-register status of
> >>>    KVM_REG_ARM64_SVE_VLS.
> >>>
> >>>  * Get rid of the sve_vqs[] array from struct kvm_vcpu_arch.  This
> >>>    array is pointless, because its contents must match the host's
> >>>    internal sve_vq_map anyway, up to vcpu->arch.sve_max_vl.
> >>>
> >>>    The ioctl register accessors are slow-path code, so we can decode
> >>>    or reconstruct sve_vqs[] on demand instead, for exchange with
> >>>    userspace.
> >>>
> >>>  * For compatibility with potential future architecture extensions,
> >>>    enabling vector lengths above 256 bytes for the guest is explicitly
> >>>    disallowed now (because the code for exposing additional bits
> >>>    through ioctl is currently missing).  This can be addressed later
> >>>    if/when needed.
> >>>
> >>> Note:
> >>>
> >>>  * I defensively pass an array by pointer here, to help avoid
> >>>    accidentally breaking assumptions during future maintenance.
> >>>
> >>>    Due to (over?)zealous constification, this causes the following
> >>>    sparse warning.  I think this is sparse getting confused: I am not
> >>>    relying on any kernel-specific semantics here, and GCC generates no
> >>>    warning.
> >>>
> >>> +arch/arm64/kvm/guest.c:33: warning: incorrect type in argument 1 (different modifiers)
> >>> +arch/arm64/kvm/guest.c:33:    expected unsigned long long const [usertype] ( *const vqs )[8]
> >>> +arch/arm64/kvm/guest.c:33:    got unsigned long long [usertype] ( * )[8]
> >>>
> >>> ---
> > 
> > [...]
> > 
> >>> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> > 
> > [...]
> > 
> >>> +static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
> >>> +{
> >>> +	unsigned int max_vq, vq;
> >>> +	u64 vqs[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
> >>> +
> >>> +	if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl)))
> >>> +		return -EINVAL;
> >>> +
> >>> +	memset(vqs, 0, sizeof(vqs));
> >>> +
> >>> +	max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
> >>> +	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
> >>> +		if (sve_vq_available(vq))
> >>> +			vqs[vq_word(vq)] |= vq_mask(vq);
> >>> +
> >>> +	BUILD_BUG_ON(sizeof(vqs) != KVM_REG_SIZE(reg->id));
> >>
> >> reg->id is not known at build time. From my understanding of
> >> BUILD_BUG_ON(), things actually end up evaluated at runtime, but I'm not
> >> sure what happens when doing sizeof(char[1 - 2*0]) at runtime.
> >>
> >> Anyway, I don't think this is intended.
> > 
> > There's no runtime check: BUILD_BUG_ON() will cause compilation to fail
> > if the required condition doesn't fall out from optimisation.
> > 
> > Because of the way this function is called, reg->id is always
> > KVM_REG_ARM64_SVE_VLS, so inlining and constant propagation will make
> > this check pass (and compile to nothing).
> > 
> > We can assume a certain amount of inlining: the kernel officially can't
> > be built without optimisation.  But the precise build configuration can
> > sometimes have an effect here -- so it may be better not to rely on this
> > working for this slow-path code.
> > 
> > I'll convert these to if (WARN_ON()) return -EIO or something, or drop
> > them if I can get rid of them in other refactoring.
> > 
> > The fact that struct kvm_one_reg lacks any "size" field is rather
> > unfortunate, since it provides no explicit way for userspace to tell us
> > the size of its buffer.

I dropped the checks completely now, since they seem rather
unnecessary.

[...]

> >>>  #define SVE_REG_SLICE_SHIFT	0
> >>>  #define SVE_REG_SLICE_BITS	5
> >>>  #define SVE_REG_ID_SHIFT	(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS)
> >>> @@ -297,9 +372,21 @@ static int sve_reg_region(struct sve_state_region *region,
> >>>  static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
> >>>  {
> >>>  	struct sve_state_region region;
> >>> +	int ret;
> >>>  	char __user *uptr = (char __user *)reg->addr;
> >>>  
> >>> -	if (!vcpu_has_sve(vcpu) || sve_reg_region(&region, vcpu, reg))
> >>> +	if (!vcpu_has_sve(vcpu))
> >>> +		return -ENOENT;
> >>> +
> >>> +	if (reg->id == KVM_REG_ARM64_SVE_VLS)
> >>> +		return get_sve_vls(vcpu, reg);
> >>> +
> >>> +	/* Finalize the number of slices per SVE register: */
> >>> +	ret = kvm_arm_vcpu_finalize(vcpu);
> >>
> >> Having this here feels a bit random...
> > 
> > I'll have another think about this.
> > 
> > KVM_REG_ARM64_SVE_VLS certainly has nothing to do with anything except
> > SVE, and putting it in the SVE coproc space allows us to demux SVE-
> > related stuff from everything else neatly.
> > 
> > The whole "coprocessor" concept is nonsense for arm64 anyway except
> > for the sysregs (where the reg ID encodings map directly onto the
> > architecture and also align with AArch32).  So, the copro number is
> > pretty meaningless except as a demux key for related groups of
> > registers, which is how I use it here.
> > 
> > There's also no well-defined distinction between real and fake
> > registers, and no natural home for miscellaneous fakes -- all existing
> > blocks are for specific purposes such as recording the state of emulated
> > firmware APIs etc.  The SVE registers as exposed here already have no
> > direct equivalent in the architecture either, due to the padding up to
> > 2048 bits, and the presentation of FFR as an addressable P-register, etc.
> > 
> >> I'd suggest considering the pseudo-register outside of the SVE co-proc,
> >> as part of a set of registers that do not finalize a vcpu when accessed.

(Without a compelling alternative, I've kept KVM_REG_ARM64_SVE_VLS in the
SVE copro for now, though this could still be changed if people feel
strongly about it.)

> > (Also, the dependency is not one-way:  _SVE_VLS is not independent
> > of finalization: quite the reverse -- it can only be written _before_
> > finalization.)
> > 
> >> All other registers (even non-sve ones) would finalize the vcpu when
> >> accessed from userland.
> > 
> > So, we could consider registers to be in two classes as you suggest:
> > 
> >  * early registers
> >  * regular registers (i.e., everything else)
> > 
> > where early registers can only be written until the first access to a
> > regular register, or the first KVM_GET_REG_LIST or KVM_RUN.
> > 
> 
> That would be my personal preference, yes.

For comparison, we currently have

 * early registers (currently KVM_REG_ARM64_SVE_VLS)
 * late registers (currently KVM_REG_ARM64_SVE_{Z,P,FFR}*)
 * relaxed registers (everything else)

plus a couple of "late ioctls" that implcitly finalize (KVM_RUN,
KVM_GET_REG_LIST).

A write to an early register cannot follow a late ioctl or any access to
a late register.

I'm not convinced that's worse, but it's different.

> > This is a stricter rule than I have today: i.e., _SVE_VLS would really
> > need to be written before touching anything else.  But that may be a
> > good thing for avoiding future ABI drift.
> > 
> 
> I did notice this would be more restrictive, but I don't believe this
> should introduce shortcomings in ways to set up a VCPU. I might have
> missed some case, of course.
> 
> > This may be premature factoring though.  It's not clear whether many new
> > registers like KVM_REG_ARM64_SVE_VLS will exist, or that they would have
> > compatible sequencing requirements.
> > 
> > Thoughts?
> > 
> 
> It might be premature, but I really don't like seeing this
> "vcpu_finalize" in the middle of SVE code. I think it is very easy to
> miss that touching some "sve" register suddenly finalizes the vcpu.
> I fear that some other feature/extension might need early vcpu
> finalization and that it might somehow conflict with sve depending on
> which triggers finalization first.
> 
> > 
> > Independently of this, I'll have a look at whether there's
> > a straightforward way to simplify the code flow.
> > 
> 
> I might be over-thinking it, but if there is a way to move that
> finalization call I'd prefer that.

I know what you mean, but there's not really another clear place to put
it either, right now.  Making finalization a side-effect of only KVM_RUN
and KVM_GET_REG_LIST would mean that if userspace is squirting in the
state of a migrated vcpu, a dummy KVM_GET_REG_LIST call would need to be
inserted between setting KVM_REG_ARM64_SVE_VLS and setting the other SVE
regs.

One thing we could do is get rid of the implicit finalization behaviour
completely, and require an explicit ioctl KVM_ARM_VCPU_FINALIZE to do
this.  This makes a clear barrier between reg writes that manipulate the
"physical" configuration of the vcpu, and reg reads/writes that merely
manipulate the vcpu's runtime state.  Say:

	KVM_ARM_VCPU_INIT(features[] = KVM_ARM_VCPU_SVE) -> ok
	KVM_RUN -> -EPERM (too early)
	KVM_GET_REG_LIST -> -EPERM (too early)
	...
	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_VLS) -> ok
	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_ZREG(0, 0)) -> -EPERM (too early)
	...
	KVM_RUN -> -EPERM (too early)
	...
	KVM_ARM_VCPU_FINALIZE -> ok
	KVM_ARM_VCPU_FINALIZE -> -EPERM (too late)
	...
	KVM_GET_REG_LIST -> ok
	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_VLS) -> -EPERM (too late)
	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_ZREG(0, 0)) -> ok
	KVM_RUN -> ok
	KVM_ARM_VCPU_FINALIZE -> -EPERM (too late)

This would change all existing kvm_arm_vcpu_finalize() calls to

	if (!kvm_arm_vcpu_finalized())
		return -EPERM;

(or similar).
	
Without an affected vcpu feature enabled, for backwards compatibility
we would not require the KVM_ARM_VCPU_FINALIZE call (or perhaps even
forbid it, since userspace that wants to be backwards compatible cannot
use the new ioctl anyway.  I'm in two minds about this.  Either way,
calling _FINALIZE on an old kvm is harmless, so long as userspace knows
to ignore this failure case.)

The backwards-compatible flow would be:

	KVM_ARM_VCPU_INIT(features[] = 0) -> ok
	...
	KVM_ARM_VCPU_FINALIZE -> ok (or -EPERM)
	...
	KVM_RUN -> ok
	KVM_ARM_VCPU_FINALIZE -> -EPERM (too late)
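
(As a concrete illustration of the SVE flow above, a hypothetical
userspace sequence -- the KVM_ARM_VCPU_FINALIZE request number is made
up, and KVM_REG_ARM64_SVE_VLS is only added by this patch, so both need
local definitions until the UAPI headers catch up:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	#define KVM_ARM_VCPU_FINALIZE	_IO(KVMIO, 0xff)  /* made-up number */

	static int setup_sve_vcpu(int vcpu_fd, struct kvm_vcpu_init *init,
				  const __u64 *vqs)  /* 8 x u64 VL bitmap */
	{
		struct kvm_one_reg reg = {
			.id   = KVM_REG_ARM64_SVE_VLS,
			.addr = (__u64)(unsigned long)vqs,
		};

		if (ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, init))  /* features[] has SVE */
			return -1;
		if (ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg))    /* early reg: ok here */
			return -1;
		if (ioctl(vcpu_fd, KVM_ARM_VCPU_FINALIZE))    /* the new barrier */
			return -1;
		/*
		 * KVM_GET_REG_LIST, the SVE Z/P/FFR regs and KVM_RUN are ok
		 * from here on; further writes to KVM_REG_ARM64_SVE_VLS get
		 * -EPERM.
		 */
		return 0;
	}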


Thoughts?

I may not have time to rework this for -rc1, and being a late ABI
change I'd like to get others' views on it before heading too far into
the tunnel.

Cheers
---Dave
Marc Zyngier March 7, 2019, 1:47 p.m. UTC | #5
Hi Dave,

On 01/03/2019 14:55, Dave Martin wrote:
> [FYI Peter, your views on the proposal outlined toward the end of the
> mail would be appreciated.]
> 
> On Fri, Mar 01, 2019 at 01:28:19PM +0000, Julien Thierry wrote:

[...]

>> I might be over-thinking it, but if there is a way to move that
>> finalization call I'd prefer that.
> 
> I know what you mean, but there's not really another clear place to put
> it either, right now.  Making finalization a side-effect of only KVM_RUN
> and KVM_GET_REG_LIST would mean that if userspace is squirting in the
> state of a migrated vcpu, a dummy KVM_GET_REG_LIST call would need to be
> inserted between setting KVM_REG_ARM64_SVE_VLS and setting the other SVE
> regs.
> 
> One thing we could do is get rid of the implicit finalization behaviour
> completely, and require an explicit ioctl KVM_ARM_VCPU_FINALIZE to do

+1. In the past, implicit finalization has been a major pain, and the
"implicit GICv2 configuration" has been an absolute disaster.

> this.  This makes a clear barrier between reg writes that manipulate the
> "physical" configuration of the vcpu, and reg reads/writes that merely
> manipulate the vcpu's runtime state.  Say:
> 
> 	KVM_ARM_VCPU_INIT(features[] = KVM_ARM_VCPU_SVE) -> ok
> 	KVM_RUN -> -EPERM (too early)
> 	KVM_GET_REG_LIST -> -EPERM (too early)
> 	...
> 	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_VLS) -> ok
> 	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_ZREG(0, 0)) -> -EPERM (too early)
> 	...
> 	KVM_RUN -> -EPERM (too early)
> 	...
> 	KVM_ARM_VCPU_FINALIZE -> ok
> 	KVM_ARM_VCPU_FINALIZE -> -EPERM (too late)
> 	...
> 	KVM_REG_REG_LIST -> ok
> 	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_VLS) -> -EPERM (too late)
> 	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_ZREG(0, 0)) -> ok
> 	KVM_RUN -> ok
> 	KVM_ARM_VCPU_FINALIZE -> -EPERM (too late)
> 
> This would change all existing kvm_arm_vcpu_finalize() calls to
> 
> 	if (!kvm_arm_vcpu_finalized())
> 		return -EPERM;
> 
> (or similar).

I find this rather compelling, assuming we can easily map features that
are associated with finalization.

> 	
> Without an affected vcpu feature enabled, for backwards compatibility
> we would not require the KVM_ARM_VCPU_FINALIZE call (or perhaps even
> forbid it, since userspace that wants to be backwards compatible cannot
> use the new ioctl anyway.  I'm in two minds about this.  Either way,
> calling _FINALIZE on an old kvm is harmless, so long as userspace knows
> to ignore this failure case.)
> 
> The backwards-compatible flow would be:
> 
> 	KVM_ARM_VCPU_INIT(features[] = 0) -> ok
> 	...
> 	KVM_ARM_VCPU_FINALIZE -> ok (or -EPERM)
> 	...
> 	KVM_RUN -> ok
> 	KVM_ARM_VCPU_FINALIZE -> -EPERM (too late)
> 
> 
> Thoughts?

This goes back to the above question: how do we ensure that userspace
knows which features are subject to being finalized? As it is currently
described, KVM_ARM_VCPU_FINALIZE isn't qualified by a feature set, nor
can the kernel report which feature flags require being finalized. It is
also unclear to me whether all "finalizable" features would have the
same init cycle.

So either we take the simplest approach and make KVM_ARM_VCPU_FINALIZE
strictly SVE specific (renaming it in the process), or it takes some
argument that allows specifying the feature set it applies to. Maybe we
can get away with the kernel not reporting which features need to be
finalized, as long as it is clearly documented.

> I may not have time to rework this for -rc1, and being a late ABI
> change I'd like to get others' views on it before heading too far into
> the tunnel.

Please do your best to have something as close as possible to the final
version for -rc1. From that point on, I'd expect changes to be mostly
cosmetic.

Thanks,

	M.
Dave Martin March 7, 2019, 3:30 p.m. UTC | #6
On Thu, Mar 07, 2019 at 01:47:09PM +0000, Marc Zyngier wrote:
> Hi Dave,
> 
> On 01/03/2019 14:55, Dave Martin wrote:
> > [FYI Peter, your views on the proposal outlined toward the end of the
> > mail would be appreciated.]
> > 
> > On Fri, Mar 01, 2019 at 01:28:19PM +0000, Julien Thierry wrote:
> 
> [...]
> 
> >> I might be over-thinking it, but if there is a way to move that
> >> finalization call I'd prefer that.
> > 
> > I know what you mean, but there's not really another clear place to put
> > it either, right now.  Making finalization a side-effect of only KVM_RUN
> > and KVM_GET_REG_LIST would mean that if userspace is squirting in the
> > state of a migrated vcpu, a dummy KVM_GET_REG_LIST call would need to be
> > inserted between setting KVM_REG_ARM64_SVE_VLS and setting the other SVE
> > regs.
> > 
> > One thing we could do is get rid of the implicit finalization behaviour
> > completely, and require an explicit ioctl KVM_ARM_VCPU_FINALIZE to do
> 
> +1. In the past, implicit finalization has been a major pain, and the
> "implicit GICv2 configuration" has been an absolute disaster.
> 
> > this.  This makes a clear barrier between reg writes that manipulate the
> > "physical" configuration of the vcpu, and reg reads/writes that merely
> > manipulate the vcpu's runtime state.  Say:
> > 
> > 	KVM_ARM_VCPU_INIT(features[] = KVM_ARM_VCPU_SVE) -> ok
> > 	KVM_RUN -> -EPERM (too early)
> > 	KVM_GET_REG_LIST -> -EPERM (too early)
> > 	...
> > 	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_VLS) -> ok
> > 	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_ZREG(0, 0)) -> -EPERM (too early)
> > 	...
> > 	KVM_RUN -> -EPERM (too early)
> > 	...
> > 	KVM_ARM_VCPU_FINALIZE -> ok
> > 	KVM_ARM_VCPU_FINALIZE -> -EPERM (too late)
> > 	...
> > 	KVM_GET_REG_LIST -> ok
> > 	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_VLS) -> -EPERM (too late)
> > 	KVM_SET_ONE_REG(KVM_REG_ARM64_SVE_ZREG(0, 0)) -> ok
> > 	KVM_RUN -> ok
> > 	KVM_ARM_VCPU_FINALIZE -> -EPERM (too late)
> > 
> > This would change all existing kvm_arm_vcpu_finalize() calls to
> > 
> > 	if (!kvm_arm_vcpu_finalized())
> > 		return -EPERM;
> > 
> > (or similar).
> 
> I find this rather compelling, assuming we can easily map features that
> are associated with finalization.

OK ... thanks for taking a look.

> > Without an affected vcpu feature enabled, for backwards compatibility
> > we would not require the KVM_ARM_VCPU_FINALIZE call (or perhaps even
> > forbid it, since userspace that wants to be backwards compatible cannot
> > use the new ioctl anyway.  I'm in two minds about this.  Either way,
> > calling _FINALIZE on an old kvm is harmless, so long as userspace knows
> > to ignore this failure case.)
> > 
> > The backwards-compatible flow would be:
> > 
> > 	KVM_ARM_VCPU_INIT(features[] = 0) -> ok
> > 	...
> > 	KVM_ARM_VCPU_FINALIZE -> ok (or -EPERM)
> > 	...
> > 	KVM_RUN -> ok
> > 	KVM_ARM_VCPU_FINALIZE -> -EPERM (too late)
> > 
> > 
> > Thoughts?
> 
> This goes back to the above question: how do we ensure that userspace
> knows which features are subject to being finalized? As it is currently
> described, KVM_ARM_VCPU_FINALIZE isn't qualified by a feature set, nor
> can the kernel report which feature flags require being finalized. It is
> also unclear to me whether all "finalizable" features would have the
> same init cycle.

So, it's not clear whether any other features will ever need
finalization, and even if they do there's a fair chance they won't be
interdependent with SVE in such a way as to require multiple
finalization steps.

For now, it seems reasonable to make the finalization call a no-op
when no finalization is required.

Where finalization is required, KVM_ARM_VCPU_FINALIZE becomes mandatory,
but the requirement is a) strictly opt-in, and b) userspace will quickly
discover if it gets this wrong.

For this reason I'd rather not have any explicit reporting of whether
finalization is needed or not (or why): we just document and enforce at
run-time.

> So either we take the simplest approach and make KVM_ARM_VCPU_FINALIZE
> strictly SVE specific (renaming it in the process), or it takes some
> argument that allows specifying the feature set it applies to. Maybe we
> can get away with the kernel not reporting which features need to be
> finalized, as long as it is clearly documented.

OK, what about:

 * Userspace treats KVM_ARM_VCPU_FINALIZE() -> EINVAL as no error
   (so that userspace can simply insert this into its init sequence,
   even on old kernels).

 * We add an argument, so that KVM_ARM_VCPU_FINALIZE(0) means
   "finalize default stuff, including SVE".  We can add explicit flags
   later if needed to finalize individual features separately.

   I don't know whether any features ever will have interdependent
   finalization requirements, but this should help get us off the
   hook if so.
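
(Sketching that shape -- the request number and argument encoding below
are made up, the point is just the "flags word, 0 = default" idea plus
tolerating EINVAL from old kernels:

	#include <errno.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	#define KVM_ARM_VCPU_FINALIZE	_IOW(KVMIO, 0xff, int)  /* made up */

	static int vcpu_finalize_default(int vcpu_fd)
	{
		int what = 0;	/* 0: finalize default stuff, including SVE */
		int ret = ioctl(vcpu_fd, KVM_ARM_VCPU_FINALIZE, &what);

		/* Old kernel with nothing to finalize: treat as success. */
		if (ret && errno == EINVAL)
			ret = 0;

		return ret;
	}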


Question:

 * KVM_REG_ARM64_SVE_VLS is a weird register because of its special
   sequencing requirements.

   The main reason for this is to make it easier to detect migration
   to a mismatched destination node.  But userspace needs some explicit
   code to make all this work anyway, so should we just have a couple
   of ioctls to get/set it instead?

   If there's no strong view either way, I'll leave it as-is, to
   minimise churn.
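
(For reference, a sketch of the explicit code userspace needs with the
register as-is.  The encoding comes straight from this patch: bit
(vq - SVE_VQ_MIN) of a 512-bit bitmap, with SVE_VQ_MIN == 1 and vq
counting 128-bit granules; KVM_REG_ARM64_SVE_VLS is only defined by this
patch, so it would need a local definition for now:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int set_guest_vls(int vcpu_fd)
	{
		__u64 vqs[8];
		struct kvm_one_reg reg = {
			.id   = KVM_REG_ARM64_SVE_VLS,
			.addr = (__u64)(unsigned long)vqs,
		};

		memset(vqs, 0, sizeof(vqs));
		vqs[0] |= 1ULL << (1 - 1);	/* vq 1: VL = 128 bits */
		vqs[0] |= 1ULL << (2 - 1);	/* vq 2: VL = 256 bits */

		return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
	}

Since set_sve_vls() insists that the set matches what the host supports
up to the chosen maximum, only the maximum VL is really negotiable:
userspace would normally read the register back and clear the top bits
rather than invent a set from scratch.)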

[...]

> Please do your best to have something as close as possible to the final
> version for -rc1. From that point on, I'd expect changes to be mostly
> cosmetic.

This ought to be feasible.  The changes being discussed so far are non-
invasive.

Cheers
---Dave
Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 015c2578..e53119a 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -24,6 +24,7 @@ 
 
 #include <linux/bitmap.h>
 #include <linux/types.h>
+#include <linux/kernel.h>
 #include <linux/kvm_types.h>
 #include <asm/cpufeature.h>
 #include <asm/daifflags.h>
@@ -331,6 +332,7 @@  struct kvm_vcpu_arch {
 #define KVM_ARM64_HOST_SVE_IN_USE	(1 << 3) /* backup for host TIF_SVE */
 #define KVM_ARM64_HOST_SVE_ENABLED	(1 << 4) /* SVE enabled for EL0 */
 #define KVM_ARM64_GUEST_HAS_SVE		(1 << 5) /* SVE exposed to guest */
+#define KVM_ARM64_VCPU_FINALIZED	(1 << 6) /* vcpu config completed */
 
 #define vcpu_has_sve(vcpu) (system_supports_sve() && \
 			    ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE))
@@ -554,7 +556,8 @@  void kvm_arch_free_vm(struct kvm *kvm);
 int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
 
 /* Commit to the set of vcpu registers currently configured: */
-static inline int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu) { return 0; }
-#define kvm_arm_vcpu_finalized(vcpu) true
+int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu);
+#define kvm_arm_vcpu_finalized(vcpu) \
+	((vcpu)->arch.flags & KVM_ARM64_VCPU_FINALIZED)
 
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index ced760c..7ff1bd4 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -243,6 +243,10 @@  struct kvm_vcpu_events {
 					 ((n) << 5) | (i))
 #define KVM_REG_ARM64_SVE_FFR(i)	KVM_REG_ARM64_SVE_PREG(16, i)
 
+/* Vector lengths pseudo-register: */
+#define KVM_REG_ARM64_SVE_VLS		(KVM_REG_ARM64 | KVM_REG_ARM64_SVE | \
+					 KVM_REG_SIZE_U512 | 0xffff)
+
 /* Device Control API: ARM VGIC */
 #define KVM_DEV_ARM_VGIC_GRP_ADDR	0
 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS	1
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 4a2ad60..5f48c17 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -217,6 +217,81 @@  static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 	return err;
 }
 
+#define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64)
+#define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64)
+
+static bool vq_present(
+	const u64 (*const vqs)[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)],
+	unsigned int vq)
+{
+	return (*vqs)[vq_word(vq)] & vq_mask(vq);
+}
+
+static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+	unsigned int max_vq, vq;
+	u64 vqs[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
+
+	if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl)))
+		return -EINVAL;
+
+	memset(vqs, 0, sizeof(vqs));
+
+	max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
+		if (sve_vq_available(vq))
+			vqs[vq_word(vq)] |= vq_mask(vq);
+
+	BUILD_BUG_ON(sizeof(vqs) != KVM_REG_SIZE(reg->id));
+	if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+	unsigned int max_vq, vq;
+	u64 vqs[DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
+
+	if (kvm_arm_vcpu_finalized(vcpu))
+		return -EPERM; /* too late! */
+
+	if (WARN_ON(vcpu->arch.sve_state))
+		return -EINVAL;
+
+	BUILD_BUG_ON(sizeof(vqs) != KVM_REG_SIZE(reg->id));
+	if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs)))
+		return -EFAULT;
+
+	max_vq = 0;
+	for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq)
+		if (vq_present(&vqs, vq))
+			max_vq = vq;
+
+	/*
+	 * The get_sve_reg()/set_sve_reg() ioctl interface will need
+	 * to be extended with multiple register slice support in
+	 * order to support vector lengths greater than
+	 * SVE_VL_ARCH_MAX:
+	 */
+	if (WARN_ONCE(sve_vl_from_vq(max_vq) > SVE_VL_ARCH_MAX,
+		      "KVM: Requested vector length not supported yet\n"))
+		return -EINVAL;
+
+	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
+		if (vq_present(&vqs, vq) != sve_vq_available(vq))
+			return -EINVAL;
+
+	/* Can't run with no vector lengths at all: */
+	if (max_vq < SVE_VQ_MIN)
+		return -EINVAL;
+
+	vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq);
+
+	return 0;
+}
+
 #define SVE_REG_SLICE_SHIFT	0
 #define SVE_REG_SLICE_BITS	5
 #define SVE_REG_ID_SHIFT	(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS)
@@ -297,9 +372,21 @@  static int sve_reg_region(struct sve_state_region *region,
 static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 {
 	struct sve_state_region region;
+	int ret;
 	char __user *uptr = (char __user *)reg->addr;
 
-	if (!vcpu_has_sve(vcpu) || sve_reg_region(&region, vcpu, reg))
+	if (!vcpu_has_sve(vcpu))
+		return -ENOENT;
+
+	if (reg->id == KVM_REG_ARM64_SVE_VLS)
+		return get_sve_vls(vcpu, reg);
+
+	/* Finalize the number of slices per SVE register: */
+	ret = kvm_arm_vcpu_finalize(vcpu);
+	if (ret)
+		return ret;
+
+	if (sve_reg_region(&region, vcpu, reg))
 		return -ENOENT;
 
 	if (copy_to_user(uptr, vcpu->arch.sve_state + region.koffset,
@@ -313,9 +400,21 @@  static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 static int set_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 {
 	struct sve_state_region region;
+	int ret;
 	const char __user *uptr = (const char __user *)reg->addr;
 
-	if (!vcpu_has_sve(vcpu) || sve_reg_region(&region, vcpu, reg))
+	if (!vcpu_has_sve(vcpu))
+		return -ENOENT;
+
+	if (reg->id == KVM_REG_ARM64_SVE_VLS)
+		return set_sve_vls(vcpu, reg);
+
+	/* Finalize the number of slices per SVE register: */
+	ret = kvm_arm_vcpu_finalize(vcpu);
+	if (ret)
+		return ret;
+
+	if (sve_reg_region(&region, vcpu, reg))
 		return -ENOENT;
 
 	if (copy_from_user(vcpu->arch.sve_state + region.koffset, uptr,
@@ -452,30 +551,43 @@  static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu)
 	if (!vcpu_has_sve(vcpu))
 		return 0;
 
-	return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */);
+	return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */)
+		+ 1; /* KVM_REG_ARM64_SVE_VLS */
 }
 
 static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, u64 __user **uind)
 {
 	/* Only the first slice ever exists, for now */
 	const unsigned int slices = 1;
+	u64 reg;
 	unsigned int i, n;
 
 	if (!vcpu_has_sve(vcpu))
 		return 0;
 
+	/*
+	 * Enumerate this first, so that userspace can save/restore in
+	 * the order reported by KVM_GET_REG_LIST:
+	 */
+	reg = KVM_REG_ARM64_SVE_VLS;
+	if (put_user(reg, (*uind)++))
+		return -EFAULT;
+
 	for (i = 0; i < slices; i++) {
 		for (n = 0; n < SVE_NUM_ZREGS; n++) {
-			if (put_user(KVM_REG_ARM64_SVE_ZREG(n, i), (*uind)++))
+			reg = KVM_REG_ARM64_SVE_ZREG(n, i);
+			if (put_user(reg, (*uind)++))
 				return -EFAULT;
 		}
 
 		for (n = 0; n < SVE_NUM_PREGS; n++) {
-			if (put_user(KVM_REG_ARM64_SVE_PREG(n, i), (*uind)++))
+			reg = KVM_REG_ARM64_SVE_PREG(n, i);
+			if (put_user(reg, (*uind)++))
 				return -EFAULT;
 		}
 
-		if (put_user(KVM_REG_ARM64_SVE_FFR(i), (*uind)++))
+		reg = KVM_REG_ARM64_SVE_FFR(i);
+		if (put_user(reg, (*uind)++))
 			return -EFAULT;
 	}
 
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index b72a3dd..1379fb2 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -98,6 +98,15 @@  int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	return r;
 }
 
+int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu)
+{
+	if (likely(kvm_arm_vcpu_finalized(vcpu)))
+		return 0;
+
+	vcpu->arch.flags |= KVM_ARM64_VCPU_FINALIZED;
+	return 0;
+}
+
 /**
  * kvm_reset_vcpu - sets core registers and sys_regs to reset value
  * @vcpu: The VCPU pointer