[RFC,v2,19/23] KVM: arm64/sve: Report and enable SVE API extensions for userspace

Message ID 1538141967-15375-20-git-send-email-Dave.Martin@arm.com (mailing list archive)
State New, archived
Series KVM: arm64: Initial support for SVE guests

Commit Message

Dave Martin Sept. 28, 2018, 1:39 p.m. UTC
This patch adds the necessary API extensions to allow userspace to
detect SVE support for guests and enable it.

A new capability KVM_CAP_ARM_SVE is defined to allow userspace to
detect the availability of the KVM SVE API extensions in the usual
way.

Userspace needs to enable SVE explicitly per vcpu and configure the
set of SVE vector lengths available to the guest before the vcpu is
allowed to run.  For these purposes, a new arm64-specific vcpu
ioctl KVM_ARM_SVE_CONFIG is added, with the following subcommands
(in rough order of expected use):

KVM_ARM_SVE_CONFIG_QUERY: report the set of vector lengths
    supported by this host.

    The resulting set can be supplied directly to
    KVM_ARM_SVE_CONFIG_SET in order to obtain the maximal possible
    set, or used to inform userspace's decision on the appropriate
    set of vector lengths (possibly taking into account the
    configuration of other nodes in the cluster so that the VM can
    migrate freely).

KVM_ARM_SVE_CONFIG_SET: enable SVE for this vcpu and configure the
    set of vector lengths it offers to the guest.

    This can only be done once, before the vcpu is run.

KVM_ARM_SVE_CONFIG_GET: report the set of vector lengths available
    to the guest on this vcpu (for use when snapshotting or
    migrating a VM).
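
For illustration, the expected "maximal set" flow from userspace might
look like the sketch below (not part of the patch; configure_sve() and
vcpu_fd are placeholder names, and error handling is elided):

#include <sys/ioctl.h>
#include <linux/kvm.h>

int configure_sve(int vcpu_fd)
{
	struct kvm_sve_vls vls = { .cmd = KVM_ARM_SVE_CONFIG_QUERY };

	/* Ask the host which vector lengths it can offer to guests. */
	if (ioctl(vcpu_fd, KVM_ARM_SVE_CONFIG, &vls))
		return -1;

	/* Pass the host's set straight back to enable the maximal set. */
	vls.cmd = KVM_ARM_SVE_CONFIG_SET;
	return ioctl(vcpu_fd, KVM_ARM_SVE_CONFIG, &vls);
}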

Signed-off-by: Dave Martin <Dave.Martin@arm.com>
---

Changes since RFCv1:

 * The new feature bit for PREFERRED_TARGET / VCPU_INIT is gone in
   favour of a capability and a new ioctl to enable/configure SVE.

   Perhaps the SVE configuration could be done via device attributes,
   but it still has to be done early, so crowbarring support for this
   behind a generic API may cause more trouble than it solves.

   This is still up for discussion if anybody feels strongly about it.

 * An ioctl KVM_ARM_SVE_CONFIG has been added to report the set of
   vector lengths available and configure SVE for a vcpu.

   To reduce ioctl namespace pollution the new operations are grouped
   as subcommands under a single ioctl, since they use the same
   argument format anyway.
---
 arch/arm64/include/asm/kvm_host.h |   8 +-
 arch/arm64/include/uapi/asm/kvm.h |  14 ++++
 arch/arm64/kvm/guest.c            | 164 +++++++++++++++++++++++++++++++++++++-
 arch/arm64/kvm/reset.c            |  50 ++++++++++++
 include/uapi/linux/kvm.h          |   4 +
 5 files changed, 238 insertions(+), 2 deletions(-)

Comments

Alex Bennée Nov. 22, 2018, 3:23 p.m. UTC | #1
Dave Martin <Dave.Martin@arm.com> writes:

> This patch adds the necessary API extensions to allow userspace to
> detect SVE support for guests and enable it.
>
> A new capability KVM_CAP_ARM_SVE is defined to allow userspace to
> detect the availability of the KVM SVE API extensions in the usual
> way.
>
> Userspace needs to enable SVE explicitly per vcpu and configure the
> set of SVE vector lengths available to the guest before the vcpu is
> allowed to run.  For these purposes, a new arm64-specific vcpu
> ioctl KVM_ARM_SVE_CONFIG is added, with the following subcommands
> (in rough order of expected use):
>
> KVM_ARM_SVE_CONFIG_QUERY: report the set of vector lengths
>     supported by this host.
>
>     The resulting set can be supplied directly to
>     KVM_ARM_SVE_CONFIG_SET in order to obtain the maximal possible
>     set, or used to inform userspace's decision on the appropriate
>     set of vector lengths (possibly taking into account the
>     configuration of other nodes in the cluster so that the VM can
>     migrate freely).
>
> KVM_ARM_SVE_CONFIG_SET: enable SVE for this vcpu and configure the
>     set of vector lengths it offers to the guest.
>
>     This can only be done once, before the vcpu is run.
>
> KVM_ARM_SVE_CONFIG_GET: report the set of vector lengths available
>     to the guest on this vcpu (for use when snapshotting or
>     migrating a VM).
>
> Signed-off-by: Dave Martin <Dave.Martin@arm.com>
> ---
>
> Changes since RFCv1:
>
>  * The new feature bit for PREFERRED_TARGET / VCPU_INIT is gone in
>    favour of a capability and a new ioctl to enable/configure SVE.
>
>    Perhaps the SVE configuration could be done via device attributes,
>    but it still has to be done early, so crowbarring support for this
>    behind a generic API may cause more trouble than it solves.
>
>    This is still up for discussion if anybody feels strongly about it.
>
>  * An ioctl KVM_ARM_SVE_CONFIG has been added to report the set of
>    vector lengths available and configure SVE for a vcpu.
>
>    To reduce ioctl namespace pollution the new operations are grouped
>    as subcommands under a single ioctl, since they use the same
>    argument format anyway.
> ---
>  arch/arm64/include/asm/kvm_host.h |   8 +-
>  arch/arm64/include/uapi/asm/kvm.h |  14 ++++
>  arch/arm64/kvm/guest.c            | 164 +++++++++++++++++++++++++++++++++++++-
>  arch/arm64/kvm/reset.c            |  50 ++++++++++++
>  include/uapi/linux/kvm.h          |   4 +
>  5 files changed, 238 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index bbde597..5225485 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -52,6 +52,12 @@
>
>  DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
>
> +#ifdef CONFIG_ARM64_SVE
> +bool kvm_sve_supported(void);
> +#else
> +static inline bool kvm_sve_supported(void) { return false; }
> +#endif
> +
>  int __attribute_const__ kvm_target_cpu(void);
>  int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
>  int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
> @@ -441,7 +447,7 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
>  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
>  static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
>
> -static inline void kvm_arm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
> +void kvm_arm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
>
>  void kvm_arm_init_debug(void);
>  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index 1ff68fa..94f6932 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -32,6 +32,7 @@
>  #define KVM_NR_SPSR	5
>
>  #ifndef __ASSEMBLY__
> +#include <linux/kernel.h>
>  #include <linux/psci.h>
>  #include <linux/types.h>
>  #include <asm/ptrace.h>
> @@ -108,6 +109,19 @@ struct kvm_vcpu_init {
>  	__u32 features[7];
>  };
>
> +/* Vector length set for KVM_ARM_SVE_CONFIG */
> +struct kvm_sve_vls {
> +	__u16 cmd;
> +	__u16 max_vq;
> +	__u16 _reserved[2];
> +	__u64 required_vqs[__KERNEL_DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
> +};
> +
> +/* values for cmd: */
> +#define KVM_ARM_SVE_CONFIG_QUERY	0 /* query what the host can support */
> +#define KVM_ARM_SVE_CONFIG_SET		1 /* enable SVE for vcpu and set VLs */
> +#define KVM_ARM_SVE_CONFIG_GET		2 /* read the set of VLs for a vcpu */
> +
>  struct kvm_sregs {
>  };
>
> diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
> index 331b85e..d96145a 100644
> --- a/arch/arm64/kvm/guest.c
> +++ b/arch/arm64/kvm/guest.c
> @@ -26,6 +26,9 @@
>  #include <linux/module.h>
>  #include <linux/vmalloc.h>
>  #include <linux/fs.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +#include <linux/types.h>
>  #include <kvm/arm_psci.h>
>  #include <asm/cputype.h>
>  #include <linux/uaccess.h>
> @@ -56,6 +59,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
>  	return 0;
>  }
>
> +void kvm_arm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> +{
> +	kfree(vcpu->arch.sve_state);
> +}
> +
>  static u64 core_reg_offset_from_id(u64 id)
>  {
>  	return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE);
> @@ -546,10 +554,164 @@ int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
>  	return 0;
>  }
>
> +#define VQS_PER_U64 64
> +#define vq_word(vqs, vq) (&(vqs)[((vq) - SVE_VQ_MIN) / VQS_PER_U64])
> +#define vq_mask(vq) ((u64)1 << (((vq) - SVE_VQ_MIN) % VQS_PER_U64))
> +
> +static void set_vq(u64 *vqs, unsigned int vq)
> +{
> +	*vq_word(vqs, vq) |= vq_mask(vq);
> +}
> +
> +static bool vq_set(const u64 *vqs, unsigned int vq)
> +{
> +	return *vq_word(vqs, vq) & vq_mask(vq);
> +}
> +
> +static int kvm_vcpu_set_sve_vls(struct kvm_vcpu *vcpu, struct kvm_sve_vls *vls,
> +		struct kvm_sve_vls __user *userp)
> +{
> +	unsigned int vq, max_vq;
> +	int ret;
> +
> +	if (vcpu->arch.has_run_once || vcpu_has_sve(vcpu))
> +		return -EBADFD; /* too late, or already configured */
> +
> +	BUG_ON(vcpu->arch.sve_max_vl || vcpu->arch.sve_state);
> +
> +	if (vls->max_vq < SVE_VQ_MIN || vls->max_vq > SVE_VQ_MAX)
> +		return -EINVAL;
> +
> +	max_vq = 0;
> +	for (vq = SVE_VQ_MIN; vq <= vls->max_vq; ++vq) {
> +		bool available = sve_vq_available(vq);
> +		bool required = vq_set(vls->required_vqs, vq);
> +
> +		if (required != available)
> +			break;
> +
> +		if (required)
> +			max_vq = vq;
> +	}
> +
> +	if (max_vq < SVE_VQ_MIN)
> +		return -EINVAL;
> +
> +	vls->max_vq = max_vq;
> +	ret = put_user(vls->max_vq, &userp->max_vq);
> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * kvm_reset_vcpu() may already have run in KVM_VCPU_INIT, so we
> +	 * rely on kzalloc() being sufficient to reset the guest SVE
> +	 * state here for a new vcpu.
> +	 *
> +	 * Subsequent resets after vcpu initialisation are handled by
> +	 * kvm_reset_sve().
> +	 */
> +	vcpu->arch.sve_state = kzalloc(SVE_SIG_REGS_SIZE(vls->max_vq),
> +				       GFP_KERNEL);
> +	if (!vcpu->arch.sve_state)
> +		return -ENOMEM;
> +
> +	vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE;
> +	vcpu->arch.sve_max_vl = sve_vl_from_vq(vls->max_vq);
> +
> +	return 0;
> +}
> +
> +static int __kvm_vcpu_query_sve_vls(struct kvm_sve_vls *vls,
> +		unsigned int max_vq, struct kvm_sve_vls __user *userp)
> +{
> +	unsigned int vq, max_available_vq;
> +
> +	memset(&vls->required_vqs, 0, sizeof(vls->required_vqs));
> +
> +	BUG_ON(max_vq < SVE_VQ_MIN || max_vq > SVE_VQ_MAX);
> +
> +	max_available_vq = 0;
> +	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
> +		if (sve_vq_available(vq)) {
> +			set_vq(vls->required_vqs, vq);
> +			max_available_vq = vq;
> +		}
> +
> +	if (WARN_ON(max_available_vq < SVE_VQ_MIN))
> +		return -EIO;
> +
> +	vls->max_vq = max_available_vq;
> +	if (copy_to_user(userp, vls, sizeof(*vls)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int kvm_vcpu_query_sve_vls(struct kvm_vcpu *vcpu, struct kvm_sve_vls *vls,
> +		struct kvm_sve_vls __user *userp)
> +{
> +	BUG_ON(!sve_vl_valid(sve_max_vl));
> +
> +	return __kvm_vcpu_query_sve_vls(vls,
> +			sve_vq_from_vl(sve_max_vl), userp);
> +}
> +
> +static int kvm_vcpu_get_sve_vls(struct kvm_vcpu *vcpu, struct kvm_sve_vls *vls,
> +		struct kvm_sve_vls __user *userp)
> +{
> +	if (!vcpu_has_sve(vcpu))
> +		return -EBADFD; /* not configured yet */
> +
> +	BUG_ON(!sve_vl_valid(vcpu->arch.sve_max_vl));
> +
> +	return __kvm_vcpu_query_sve_vls(vls,
> +			sve_vq_from_vl(vcpu->arch.sve_max_vl), userp);
> +}
> +
> +static int kvm_vcpu_sve_config(struct kvm_vcpu *vcpu,
> +			       struct kvm_sve_vls __user *userp)
> +{
> +	struct kvm_sve_vls vls;
> +
> +	if (!kvm_sve_supported())
> +		return -EINVAL;
> +
> +	if (copy_from_user(&vls, userp, sizeof(vls)))
> +		return -EFAULT;
> +
> +	/*
> +	 * For forwards compatibility, flush any set bits in _reserved[]
> +	 * to tell userspace that we didn't look at them:
> +	 */
> +	memset(&vls._reserved, 0, sizeof vls._reserved);
> +
> +	switch (vls.cmd) {
> +	case KVM_ARM_SVE_CONFIG_QUERY:
> +		return kvm_vcpu_query_sve_vls(vcpu, &vls, userp);
> +
> +	case KVM_ARM_SVE_CONFIG_SET:
> +		return kvm_vcpu_set_sve_vls(vcpu, &vls, userp);
> +
> +	case KVM_ARM_SVE_CONFIG_GET:
> +		return kvm_vcpu_get_sve_vls(vcpu, &vls, userp);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  int kvm_arm_arch_vcpu_ioctl(struct kvm_vcpu *vcpu,
>  			    unsigned int ioctl, unsigned long arg)
>  {
> -	return -EINVAL;
> +	void __user *userp = (void __user *)arg;
> +
> +	switch (ioctl) {
> +	case KVM_ARM_SVE_CONFIG:
> +		return kvm_vcpu_sve_config(vcpu, userp);
> +
> +	default:
> +		return -EINVAL;
> +	}
>  }
>
>  int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index e37c78b..c2edcde 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -19,10 +19,12 @@
>   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>   */
>
> +#include <linux/atomic.h>
>  #include <linux/errno.h>
>  #include <linux/kvm_host.h>
>  #include <linux/kvm.h>
>  #include <linux/hw_breakpoint.h>
> +#include <linux/string.h>
>
>  #include <kvm/arm_arch_timer.h>
>
> @@ -54,6 +56,31 @@ static bool cpu_has_32bit_el1(void)
>  	return !!(pfr0 & 0x20);
>  }
>
> +#ifdef CONFIG_ARM64_SVE
> +bool kvm_sve_supported(void)
> +{
> +	static bool warn_printed = false;
> +
> +	if (!system_supports_sve())
> +		return false;
> +
> +	/*
> +	 * For now, consider the hardware broken if implementation
> +	 * differences between CPUs in the system result in the set of
> +	 * vector lengths safely virtualisable for guests being less
> +	 * than the set provided to userspace:
> +	 */
> +	if (sve_max_virtualisable_vl != sve_max_vl) {
> +		if (!xchg(&warn_printed, true))
> +			kvm_err("Hardware SVE implementations mismatched: suppressing SVE for guests.");

This seems like you are re-inventing WARN_ONCE for the sake of having
"kvm [%i]: " in your printk string.

> +
> +		return false;
> +	}
> +
> +	return true;
> +}
> +#endif
> +
>  /**
>   * kvm_arch_dev_ioctl_check_extension
>   *
> @@ -85,6 +112,9 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_VCPU_EVENTS:
>  		r = 1;
>  		break;
> +	case KVM_CAP_ARM_SVE:
> +		r = kvm_sve_supported();
> +		break;

For debugging we actually use the return value to indicate how many
WP/BPs we have. We could do the same here for max number of VQs but I
guess KVM_ARM_SVE_CONFIG_QUERY reports a much richer set of information.
However, this does beg the question of how useful all this extra
information is to the program running the guest.

A dumber implementation would be:

QEMU                 |        Kernel

KVM_CAP_ARM_SVE  --------->
                              Max VQ=n
           VQ/0  <---------

We want n < max VQ

KVM_ARM_SVE_CONFIG(n) ---->
                              Unsupported VQ
           EINVAL <--------

Weird HW can't support our choice of n.
Give up or try another value.

KVM_ARM_SVE_CONFIG(n-1) --->
                              That's OK
           0 (OK) <---------

It puts more of the heavy lifting on the userspace side of things, but I
would expect the "normal" case to be sane hardware that supports all VLs
from max VQ down to 1. And for cases where it doesn't, iterating through
several KVM_ARM_SVE_CONFIG steps is a start-up cost, not a runtime one.

This would mean one capability and one SVE_CONFIG sub-command with a single
parameter. We could always extend the interface later, but I wonder if we
are gold plating the API too early here?
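
To make that concrete, here is a sketch of the probing loop this scheme
would imply. Note that KVM_ARM_SVE_CONFIG taking a bare VQ count is
hypothetical here; in the patch as posted it takes a struct kvm_sve_vls:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns the max VQ the host accepted, or -1 to give up. */
int probe_sve_vq(int vm_fd, int vcpu_fd)
{
	/* Under this scheme the capability check would return max VQ. */
	int n = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_SVE);

	for (; n >= 1; n--)
		if (ioctl(vcpu_fd, KVM_ARM_SVE_CONFIG, n) == 0)
			return n;	/* this VQ is supported */

	return -1;	/* weird HW rejected every value */
}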

What do the maintainers think?


>  	default:
>  		r = 0;
>  	}
> @@ -92,6 +122,21 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
>  	return r;
>  }
>
> +int kvm_reset_sve(struct kvm_vcpu *vcpu)
> +{
> +	if (!vcpu_has_sve(vcpu))
> +		return 0;
> +
> +	if (WARN_ON(!vcpu->arch.sve_state ||
> +		    !sve_vl_valid(vcpu->arch.sve_max_vl)))
> +		return -EIO;

For some reason using WARN_ON for its side effects seems sketchy, but while
BUG_ON can compile away to nothing, WARN_ON seems to have been designed to
always give you the result of the condition, so never mind...

> +
> +	memset(vcpu->arch.sve_state, 0,
> +	       SVE_SIG_REGS_SIZE(sve_vq_from_vl(vcpu->arch.sve_max_vl)));
> +
> +	return 0;
> +}
> +
>  /**
>   * kvm_reset_vcpu - sets core registers and sys_regs to reset value
>   * @vcpu: The VCPU pointer
> @@ -103,6 +148,7 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
>  int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
>  {
>  	const struct kvm_regs *cpu_reset;
> +	int ret;
>
>  	switch (vcpu->arch.target) {
>  	default:
> @@ -120,6 +166,10 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
>  	/* Reset core registers */
>  	memcpy(vcpu_gp_regs(vcpu), cpu_reset, sizeof(*cpu_reset));
>
> +	ret = kvm_reset_sve(vcpu);
> +	if (ret)
> +		return ret;
> +
>  	/* Reset system registers */
>  	kvm_reset_sys_regs(vcpu);
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 7c3c5cc..488ca56 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -953,6 +953,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_NESTED_STATE 157
>  #define KVM_CAP_ARM_INJECT_SERROR_ESR 158
>  #define KVM_CAP_MSR_PLATFORM_INFO 159
> +#define KVM_CAP_ARM_SVE 160
>
>  #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -1400,6 +1401,9 @@ struct kvm_enc_region {
>  #define KVM_GET_NESTED_STATE         _IOWR(KVMIO, 0xbe, struct kvm_nested_state)
>  #define KVM_SET_NESTED_STATE         _IOW(KVMIO,  0xbf, struct kvm_nested_state)
>
> +/* Available with KVM_CAP_ARM_SVE */
> +#define KVM_ARM_SVE_CONFIG	  _IOWR(KVMIO,  0xc0, struct kvm_sve_vls)
> +
>  /* Secure Encrypted Virtualization command */
>  enum sev_cmd_id {
>  	/* Guest initialization commands */


--
Alex Bennée
Dave Martin Dec. 5, 2018, 6:22 p.m. UTC | #2
On Thu, Nov 22, 2018 at 03:23:13PM +0000, Alex Bennée wrote:
> 
> Dave Martin <Dave.Martin@arm.com> writes:
> 
> > This patch adds the necessary API extensions to allow userspace to
> > detect SVE support for guests and enable it.
> >
> > A new capability KVM_CAP_ARM_SVE is defined to allow userspace to
> > detect the availability of the KVM SVE API extensions in the usual
> > way.
> >
> > Userspace needs to enable SVE explicitly per vcpu and configure the
> > set of SVE vector lengths available to the guest before the vcpu is
> > allowed to run.  For these purposes, a new arm64-specific vcpu
> > ioctl KVM_ARM_SVE_CONFIG is added, with the following subcommands
> > (in rough order of expected use):
> >
> > KVM_ARM_SVE_CONFIG_QUERY: report the set of vector lengths
> >     supported by this host.
> >
> >     The resulting set can be supplied directly to
> >     KVM_ARM_SVE_CONFIG_SET in order to obtain the maximal possible
> >     set, or used to inform userspace's decision on the appropriate
> >     set of vector lengths (possibly taking into account the
> >     configuration of other nodes in the cluster so that the VM can
> >     migrate freely).
> >
> > KVM_ARM_SVE_CONFIG_SET: enable SVE for this vcpu and configure the
> >     set of vector lengths it offers to the guest.
> >
> >     This can only be done once, before the vcpu is run.
> >
> > KVM_ARM_SVE_CONFIG_GET: report the set of vector lengths available
> >     to the guest on this vcpu (for use when snapshotting or
> >     migrating a VM).
> >
> > Signed-off-by: Dave Martin <Dave.Martin@arm.com>
> > ---
> >
> > Changes since RFCv1:
> >
> >  * The new feature bit for PREFERRED_TARGET / VCPU_INIT is gone in
> >    favour of a capability and a new ioctl to enable/configure SVE.
> >
> >    Perhaps the SVE configuration could be done via device attributes,
> >    but it still has to be done early, so crowbarring support for this
> >    behind a generic API may cause more trouble than it solves.
> >
> >    This is still up for discussion if anybody feels strongly about it.
> >
> >  * An ioctl KVM_ARM_SVE_CONFIG has been added to report the set of
> >    vector lengths available and configure SVE for a vcpu.
> >
> >    To reduce ioctl namespace pollution the new operations are grouped
> >    as subcommands under a single ioctl, since they use the same
> >    argument format anyway.
> > ---
> >  arch/arm64/include/asm/kvm_host.h |   8 +-
> >  arch/arm64/include/uapi/asm/kvm.h |  14 ++++
> >  arch/arm64/kvm/guest.c            | 164 +++++++++++++++++++++++++++++++++++++-
> >  arch/arm64/kvm/reset.c            |  50 ++++++++++++
> >  include/uapi/linux/kvm.h          |   4 +
> >  5 files changed, 238 insertions(+), 2 deletions(-)
> >

[...]

> > diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> > index e37c78b..c2edcde 100644
> > --- a/arch/arm64/kvm/reset.c
> > +++ b/arch/arm64/kvm/reset.c
> > @@ -19,10 +19,12 @@
> >   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> >   */
> >
> > +#include <linux/atomic.h>
> >  #include <linux/errno.h>
> >  #include <linux/kvm_host.h>
> >  #include <linux/kvm.h>
> >  #include <linux/hw_breakpoint.h>
> > +#include <linux/string.h>
> >
> >  #include <kvm/arm_arch_timer.h>
> >
> > @@ -54,6 +56,31 @@ static bool cpu_has_32bit_el1(void)
> >  	return !!(pfr0 & 0x20);
> >  }
> >
> > +#ifdef CONFIG_ARM64_SVE
> > +bool kvm_sve_supported(void)
> > +{
> > +	static bool warn_printed = false;
> > +
> > +	if (!system_supports_sve())
> > +		return false;
> > +
> > +	/*
> > +	 * For now, consider the hardware broken if implementation
> > +	 * differences between CPUs in the system result in the set of
> > +	 * vector lengths safely virtualisable for guests being less
> > +	 * than the set provided to userspace:
> > +	 */
> > +	if (sve_max_virtualisable_vl != sve_max_vl) {
> > +		if (!xchg(&warn_printed, true))
> > +			kvm_err("Hardware SVE implementations mismatched: suppressing SVE for guests.");
> 
> This seems like you are re-inventing WARN_ONCE for the sake of having
> "kvm [%i]: " in your printk string.

Yes... adding a kvm_err_once() just for this seemed overkill.

Perhaps we don't really need to print the PID here anyway: this is a
system issue, not something that specific KVM instance did wrong, so
it's misleading to confine the warning to a specific PID.

So maybe I should just do a bare printk_once(KERN_ERR "kvm: " ...)
instead?
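
Concretely, something like this (a sketch, not what the patch currently
does):

	printk_once(KERN_ERR "kvm: Hardware SVE implementations mismatched: suppressing SVE for guests\n");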

> 
> > +
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +#endif
> > +
> >  /**
> >   * kvm_arch_dev_ioctl_check_extension
> >   *
> > @@ -85,6 +112,9 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
> >  	case KVM_CAP_VCPU_EVENTS:
> >  		r = 1;
> >  		break;
> > +	case KVM_CAP_ARM_SVE:
> > +		r = kvm_sve_supported();
> > +		break;
> 
> For debugging we actually use the return value to indicate how many
> WP/BPs we have. We could do the same here for max number of VQs but I
> guess KVM_ARM_SVE_CONFIG_QUERY reports a much richer set of information.
> However, this does beg the question of how useful all this extra
> information is to the program running the guest.

As elaborated below, while I agree that the max VQ is potentially useful
to return here, I don't think it's enough.  There's a risk people will
get lazy and just guess which VQs <= the max are supported from the
return here.

So I prefer to return nothing, to avoid giving a false comfort to the
caller.

> 
> A dumber implementation would be:
> 
> QEMU                 |        Kernel
> 
> KVM_CAP_ARM_SVE  --------->
>                               Max VQ=n
>            VQ/0  <---------
> 
> We want n < max VQ
> 
> KVM_ARM_SVE_CONFIG(n) ---->
>                               Unsupported VQ
>            EINVAL <--------
> 
> Weird HW can't support our choice of n.
> Give up or try another value.
> 
> KVM_ARM_SVE_CONFIG(n-1) --->
>                               That's OK
>            0 (OK) <---------
> 
> It puts more of the heavy lifting on the userspace side of things, but I
> would expect the "normal" case to be sane hardware that supports all VLs
> from max VQ down to 1. And for cases where it doesn't, iterating through
> several KVM_ARM_SVE_CONFIG steps is a start-up cost, not a runtime one.

The architecture only mandates power-of-two vector lengths, and I would
not be surprised to see hardware that takes advantage of that.

> This would mean one capability and one SVE_CONFIG sub-command with a single
> parameter. We could always extend the interface later, but I wonder if we
> are gold plating the API too early here?

I agree this is simpler and would typically work, but I'm not confident
that max VQ is sufficient alone to avoid silently letting
incompatibilities through.

That's why I designed the interface to specify the set of VQs explicitly
rather than just specifying the maximum and accepting whatever other VQs
come along with it.
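
For reference, the encoding is the same one the patch uses internally in
its vq_word()/vq_mask() macros: bit 0 of required_vqs[0] corresponds to
SVE_VQ_MIN. A userspace helper to mark a VQ as required might look like
this (set_required_vq() is illustrative, not part of the patch):

static void set_required_vq(struct kvm_sve_vls *vls, unsigned int vq)
{
	vls->required_vqs[(vq - SVE_VQ_MIN) / 64] |=
		(__u64)1 << ((vq - SVE_VQ_MIN) % 64);
}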

The problem is that if you created a VM on a node that only supports
power-of-two vector lengths (say), and then migrate to a node that
support the same or greater max VQ but supports additional
non-power-of-two vector lengths below that, then weird things are going
to happen in the guest -- yet we would silently allow that here.

Of course, we could throw this into the "you are supposed to know what
you are doing and build a sane compute cluster" bucket, but I'd prefer
to be able to flag up obvious incompatibilities explicitly.

> What do the maintainers think?
> 
> 
> >  	default:
> >  		r = 0;
> >  	}
> > @@ -92,6 +122,21 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
> >  	return r;
> >  }
> >
> > +int kvm_reset_sve(struct kvm_vcpu *vcpu)
> > +{
> > +	if (!vcpu_has_sve(vcpu))
> > +		return 0;
> > +
> > +	if (WARN_ON(!vcpu->arch.sve_state ||
> > +		    !sve_vl_valid(vcpu->arch.sve_max_vl)))
> > +		return -EIO;
> 
> For some reason using WARN_ON for its side effects seems sketchy, but while
> BUG_ON can compile away to nothing, WARN_ON seems to have been designed to
> always give you the result of the condition, so never mind...

I think this is a common idiom in the kernel.

I would definitely agree that WARN_ON(expr) (and especially
BUG_ON(expr)) is poor practice if expr has side-effects, but as you say,
WARN_ON() seems to have been designed explicitly to let the check
expression show through.

If nothing else, this avoids the possibility of typoing the expression
between the WARN_ON() and the accompanying if ().

That's just my opinion, but I'll probably stick with it in its current
form unless somebody shouts.
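
For illustration, the two forms side by side (based on the check in
kvm_reset_sve()):

	/* Idiom used in the patch: test and warn in one expression. */
	if (WARN_ON(!vcpu->arch.sve_state))
		return -EIO;

	/* Split form: the WARN_ON() condition and the if() condition
	 * can be typoed independently and drift apart. */
	WARN_ON(!vcpu->arch.sve_state);
	if (!vcpu->arch.sve_state)
		return -EIO;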

Cheers
---Dave

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index bbde597..5225485 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -52,6 +52,12 @@ 
 
 DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
+#ifdef CONFIG_ARM64_SVE
+bool kvm_sve_supported(void);
+#else
+static inline bool kvm_sve_supported(void) { return false; }
+#endif
+
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
@@ -441,7 +447,7 @@  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
-static inline void kvm_arm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
+void kvm_arm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
 
 void kvm_arm_init_debug(void);
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 1ff68fa..94f6932 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -32,6 +32,7 @@ 
 #define KVM_NR_SPSR	5
 
 #ifndef __ASSEMBLY__
+#include <linux/kernel.h>
 #include <linux/psci.h>
 #include <linux/types.h>
 #include <asm/ptrace.h>
@@ -108,6 +109,19 @@  struct kvm_vcpu_init {
 	__u32 features[7];
 };
 
+/* Vector length set for KVM_ARM_SVE_CONFIG */
+struct kvm_sve_vls {
+	__u16 cmd;
+	__u16 max_vq;
+	__u16 _reserved[2];
+	__u64 required_vqs[__KERNEL_DIV_ROUND_UP(SVE_VQ_MAX - SVE_VQ_MIN + 1, 64)];
+};
+
+/* values for cmd: */
+#define KVM_ARM_SVE_CONFIG_QUERY	0 /* query what the host can support */
+#define KVM_ARM_SVE_CONFIG_SET		1 /* enable SVE for vcpu and set VLs */
+#define KVM_ARM_SVE_CONFIG_GET		2 /* read the set of VLs for a vcpu */
+
 struct kvm_sregs {
 };
 
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 331b85e..d96145a 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -26,6 +26,9 @@ 
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/types.h>
 #include <kvm/arm_psci.h>
 #include <asm/cputype.h>
 #include <linux/uaccess.h>
@@ -56,6 +59,11 @@  int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+void kvm_arm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	kfree(vcpu->arch.sve_state);
+}
+
 static u64 core_reg_offset_from_id(u64 id)
 {
 	return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE);
@@ -546,10 +554,164 @@  int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
 	return 0;
 }
 
+#define VQS_PER_U64 64
+#define vq_word(vqs, vq) (&(vqs)[((vq) - SVE_VQ_MIN) / VQS_PER_U64])
+#define vq_mask(vq) ((u64)1 << (((vq) - SVE_VQ_MIN) % VQS_PER_U64))
+
+static void set_vq(u64 *vqs, unsigned int vq)
+{
+	*vq_word(vqs, vq) |= vq_mask(vq);
+}
+
+static bool vq_set(const u64 *vqs, unsigned int vq)
+{
+	return *vq_word(vqs, vq) & vq_mask(vq);
+}
+
+static int kvm_vcpu_set_sve_vls(struct kvm_vcpu *vcpu, struct kvm_sve_vls *vls,
+		struct kvm_sve_vls __user *userp)
+{
+	unsigned int vq, max_vq;
+	int ret;
+
+	if (vcpu->arch.has_run_once || vcpu_has_sve(vcpu))
+		return -EBADFD; /* too late, or already configured */
+
+	BUG_ON(vcpu->arch.sve_max_vl || vcpu->arch.sve_state);
+
+	if (vls->max_vq < SVE_VQ_MIN || vls->max_vq > SVE_VQ_MAX)
+		return -EINVAL;
+
+	max_vq = 0;
+	for (vq = SVE_VQ_MIN; vq <= vls->max_vq; ++vq) {
+		bool available = sve_vq_available(vq);
+		bool required = vq_set(vls->required_vqs, vq);
+
+		if (required != available)
+			break;
+
+		if (required)
+			max_vq = vq;
+	}
+
+	if (max_vq < SVE_VQ_MIN)
+		return -EINVAL;
+
+	vls->max_vq = max_vq;
+	ret = put_user(vls->max_vq, &userp->max_vq);
+	if (ret)
+		return ret;
+
+	/*
+	 * kvm_reset_vcpu() may already have run in KVM_VCPU_INIT, so we
+	 * rely on kzalloc() being sufficient to reset the guest SVE
+	 * state here for a new vcpu.
+	 *
+	 * Subsequent resets after vcpu initialisation are handled by
+	 * kvm_reset_sve().
+	 */
+	vcpu->arch.sve_state = kzalloc(SVE_SIG_REGS_SIZE(vls->max_vq),
+				       GFP_KERNEL);
+	if (!vcpu->arch.sve_state)
+		return -ENOMEM;
+
+	vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE;
+	vcpu->arch.sve_max_vl = sve_vl_from_vq(vls->max_vq);
+
+	return 0;
+}
+
+static int __kvm_vcpu_query_sve_vls(struct kvm_sve_vls *vls,
+		unsigned int max_vq, struct kvm_sve_vls __user *userp)
+{
+	unsigned int vq, max_available_vq;
+
+	memset(&vls->required_vqs, 0, sizeof(vls->required_vqs));
+
+	BUG_ON(max_vq < SVE_VQ_MIN || max_vq > SVE_VQ_MAX);
+
+	max_available_vq = 0;
+	for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
+		if (sve_vq_available(vq)) {
+			set_vq(vls->required_vqs, vq);
+			max_available_vq = vq;
+		}
+
+	if (WARN_ON(max_available_vq < SVE_VQ_MIN))
+		return -EIO;
+
+	vls->max_vq = max_available_vq;
+	if (copy_to_user(userp, vls, sizeof(*vls)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int kvm_vcpu_query_sve_vls(struct kvm_vcpu *vcpu, struct kvm_sve_vls *vls,
+		struct kvm_sve_vls __user *userp)
+{
+	BUG_ON(!sve_vl_valid(sve_max_vl));
+
+	return __kvm_vcpu_query_sve_vls(vls,
+			sve_vq_from_vl(sve_max_vl), userp);
+}
+
+static int kvm_vcpu_get_sve_vls(struct kvm_vcpu *vcpu, struct kvm_sve_vls *vls,
+		struct kvm_sve_vls __user *userp)
+{
+	if (!vcpu_has_sve(vcpu))
+		return -EBADFD; /* not configured yet */
+
+	BUG_ON(!sve_vl_valid(vcpu->arch.sve_max_vl));
+
+	return __kvm_vcpu_query_sve_vls(vls,
+			sve_vq_from_vl(vcpu->arch.sve_max_vl), userp);
+}
+
+static int kvm_vcpu_sve_config(struct kvm_vcpu *vcpu,
+			       struct kvm_sve_vls __user *userp)
+{
+	struct kvm_sve_vls vls;
+
+	if (!kvm_sve_supported())
+		return -EINVAL;
+
+	if (copy_from_user(&vls, userp, sizeof(vls)))
+		return -EFAULT;
+
+	/*
+	 * For forwards compatibility, flush any set bits in _reserved[]
+	 * to tell userspace that we didn't look at them:
+	 */
+	memset(&vls._reserved, 0, sizeof vls._reserved);
+
+	switch (vls.cmd) {
+	case KVM_ARM_SVE_CONFIG_QUERY:
+		return kvm_vcpu_query_sve_vls(vcpu, &vls, userp);
+
+	case KVM_ARM_SVE_CONFIG_SET:
+		return kvm_vcpu_set_sve_vls(vcpu, &vls, userp);
+
+	case KVM_ARM_SVE_CONFIG_GET:
+		return kvm_vcpu_get_sve_vls(vcpu, &vls, userp);
+
+	default:
+		return -EINVAL;
+	}
+}
+
 int kvm_arm_arch_vcpu_ioctl(struct kvm_vcpu *vcpu,
 			    unsigned int ioctl, unsigned long arg)
 {
-	return -EINVAL;
+	void __user *userp = (void __user *)arg;
+
+	switch (ioctl) {
+	case KVM_ARM_SVE_CONFIG:
+		return kvm_vcpu_sve_config(vcpu, userp);
+
+	default:
+		return -EINVAL;
+	}
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e37c78b..c2edcde 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -19,10 +19,12 @@ 
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/atomic.h>
 #include <linux/errno.h>
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/string.h>
 
 #include <kvm/arm_arch_timer.h>
 
@@ -54,6 +56,31 @@  static bool cpu_has_32bit_el1(void)
 	return !!(pfr0 & 0x20);
 }
 
+#ifdef CONFIG_ARM64_SVE
+bool kvm_sve_supported(void)
+{
+	static bool warn_printed = false;
+
+	if (!system_supports_sve())
+		return false;
+
+	/*
+	 * For now, consider the hardware broken if implementation
+	 * differences between CPUs in the system result in the set of
+	 * vector lengths safely virtualisable for guests being less
+	 * than the set provided to userspace:
+	 */
+	if (sve_max_virtualisable_vl != sve_max_vl) {
+		if (!xchg(&warn_printed, true))
+			kvm_err("Hardware SVE implementations mismatched: suppressing SVE for guests.");
+
+		return false;
+	}
+
+	return true;
+}
+#endif
+
 /**
  * kvm_arch_dev_ioctl_check_extension
  *
@@ -85,6 +112,9 @@  int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VCPU_EVENTS:
 		r = 1;
 		break;
+	case KVM_CAP_ARM_SVE:
+		r = kvm_sve_supported();
+		break;
 	default:
 		r = 0;
 	}
@@ -92,6 +122,21 @@  int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 	return r;
 }
 
+int kvm_reset_sve(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu_has_sve(vcpu))
+		return 0;
+
+	if (WARN_ON(!vcpu->arch.sve_state ||
+		    !sve_vl_valid(vcpu->arch.sve_max_vl)))
+		return -EIO;
+
+	memset(vcpu->arch.sve_state, 0,
+	       SVE_SIG_REGS_SIZE(sve_vq_from_vl(vcpu->arch.sve_max_vl)));
+
+	return 0;
+}
+
 /**
  * kvm_reset_vcpu - sets core registers and sys_regs to reset value
  * @vcpu: The VCPU pointer
@@ -103,6 +148,7 @@  int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
 	const struct kvm_regs *cpu_reset;
+	int ret;
 
 	switch (vcpu->arch.target) {
 	default:
@@ -120,6 +166,10 @@  int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	/* Reset core registers */
 	memcpy(vcpu_gp_regs(vcpu), cpu_reset, sizeof(*cpu_reset));
 
+	ret = kvm_reset_sve(vcpu);
+	if (ret)
+		return ret;
+
 	/* Reset system registers */
 	kvm_reset_sys_regs(vcpu);
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7c3c5cc..488ca56 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -953,6 +953,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_NESTED_STATE 157
 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158
 #define KVM_CAP_MSR_PLATFORM_INFO 159
+#define KVM_CAP_ARM_SVE 160
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1400,6 +1401,9 @@  struct kvm_enc_region {
 #define KVM_GET_NESTED_STATE         _IOWR(KVMIO, 0xbe, struct kvm_nested_state)
 #define KVM_SET_NESTED_STATE         _IOW(KVMIO,  0xbf, struct kvm_nested_state)
 
+/* Available with KVM_CAP_ARM_SVE */
+#define KVM_ARM_SVE_CONFIG	  _IOWR(KVMIO,  0xc0, struct kvm_sve_vls)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */