
[v2,4/5] KVM: add KVM_USER_EXIT vcpu ioctl for userspace exit

Message ID 1438792381-19453-5-git-send-email-rkrcmar@redhat.com (mailing list archive)
State New, archived

Commit Message

Radim Krčmář Aug. 5, 2015, 4:33 p.m. UTC
Userspace can use KVM_USER_EXIT instead of a signal to make the VCPU exit
to userspace.  Availability depends on KVM_CAP_USER_EXIT.
Only x86 is implemented so far.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 v2:
  * use vcpu ioctl instead of vm one [4/5]
  * shrink kvm_user_exit from 64 to 32 bytes [4/5]

 Documentation/virtual/kvm/api.txt | 30 ++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c                | 24 ++++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  7 +++++++
 virt/kvm/kvm_main.c               |  5 +++--
 4 files changed, 64 insertions(+), 2 deletions(-)
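
Not part of the patch itself: a minimal sketch of how a VMM thread other than
the VCPU thread might use the new ioctl, going by the API documentation added
below.  vcpu_fd is assumed to come from KVM_CREATE_VCPU; the KVM_CAP_USER_EXIT
check and error handling are omitted.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Ask the VCPU behind vcpu_fd to return to userspace as soon as possible. */
static int kick_vcpu_to_userspace(int vcpu_fd)
{
	struct kvm_user_exit info = {};	/* 'reserved' must be zeroed */

	return ioctl(vcpu_fd, KVM_USER_EXIT, &info);	/* 0 on success */
}

/* Per this patch's documentation, the VCPU thread then sees KVM_RUN return
 * with kvm_run->exit_reason == KVM_EXIT_REQUEST. */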

Comments

Paolo Bonzini Aug. 5, 2015, 4:36 p.m. UTC | #1
On 05/08/2015 18:33, Radim Krčmář wrote:
> The guest can use KVM_USER_EXIT instead of a signal-based exiting to
> userspace.  Availability depends on KVM_CAP_USER_EXIT.
> Only x86 is implemented so far.
> 
> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
> ---
>  v2:
>   * use vcpu ioctl instead of vm one [4/5]
>   * shrink kvm_user_exit from 64 to 32 bytes [4/5]
> 
>  Documentation/virtual/kvm/api.txt | 30 ++++++++++++++++++++++++++++++
>  arch/x86/kvm/x86.c                | 24 ++++++++++++++++++++++++
>  include/uapi/linux/kvm.h          |  7 +++++++
>  virt/kvm/kvm_main.c               |  5 +++--
>  4 files changed, 64 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 3c714d43a717..c5844f0b8e7c 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3020,6 +3020,36 @@ Returns: 0 on success, -1 on error
>  
>  Queues an SMI on the thread's vcpu.
>  
> +
> +4.97 KVM_USER_EXIT
> +
> +Capability: KVM_CAP_USER_EXIT
> +Architectures: x86
> +Type: vcpu ioctl
> +Parameters: struct kvm_user_exit (in)
> +Returns: 0 on success,
> +         -EFAULT if the parameter couldn't be read,
> +         -EINVAL if 'reserved' is not zeroed,
> +
> +struct kvm_user_exit {
> +	__u8 reserved[32];
> +};
> +
> +The ioctl is asynchronous to VCPU execution and can be issued from all threads.
> +
> +Make vcpu_id exit to userspace as soon as possible.  If the VCPU is not running
> +in kernel at the time, it will exit early on the next call to KVM_RUN.
> +If the VCPU was going to exit because of other reasons when KVM_USER_EXIT was
> +issued, it will keep the original exit reason and not exit early on next
> +KVM_RUN.
> +If VCPU exited because of KVM_USER_EXIT, the exit reason is KVM_EXIT_REQUEST.
> +
> +This ioctl has very similar effect (same sans some races on userspace exit) as
> +sending a signal (that is blocked in userspace and set in KVM_SET_SIGNAL_MASK)
> +to the VCPU thread.

Can we just return EINVAL if the parameter is not NULL?

Paolo

> +
> +
> +
>  5. The kvm_run structure
>  ------------------------
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 3493457ad0a1..27d777eb34e4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2466,6 +2466,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_ASSIGN_DEV_IRQ:
>  	case KVM_CAP_PCI_2_3:
>  #endif
> +	case KVM_CAP_USER_EXIT:
>  		r = 1;
>  		break;
>  	case KVM_CAP_X86_SMM:
> @@ -3077,6 +3078,20 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
>  	return 0;
>  }
>  
> +int kvm_vcpu_ioctl_user_exit(struct kvm_vcpu *vcpu, struct kvm_user_exit *info)
> +{
> +	struct kvm_user_exit valid = {};
> +       BUILD_BUG_ON(sizeof(valid) != 32);
> +
> +	if (memcmp(info, &valid, sizeof(valid)))
> +		return -EINVAL;
> +
> +	kvm_make_request(KVM_REQ_EXIT, vcpu);
> +	kvm_vcpu_kick(vcpu);
> +
> +	return 0;
> +}
> +
>  long kvm_arch_vcpu_ioctl(struct file *filp,
>  			 unsigned int ioctl, unsigned long arg)
>  {
> @@ -3341,6 +3356,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		r = kvm_set_guest_paused(vcpu);
>  		goto out;
>  	}
> +	case KVM_USER_EXIT: {
> +		struct kvm_user_exit info;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&info, argp, sizeof(info)))
> +			goto out;
> +		r = kvm_vcpu_ioctl_user_exit(vcpu, &info);
> +		break;
> +	}
>  	default:
>  		r = -EINVAL;
>  	}
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index d996a7cdb4d2..bc5a1abe9626 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -826,6 +826,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_X86_SMM 117
>  #define KVM_CAP_MULTI_ADDRESS_SPACE 118
>  #define KVM_CAP_SPLIT_IRQCHIP 119
> +#define KVM_CAP_USER_EXIT 120
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1008,6 +1009,10 @@ struct kvm_device_attr {
>  	__u64	addr;		/* userspace address of attr data */
>  };
>  
> +struct kvm_user_exit {
> +	__u8 reserved[32];
> +};
> +
>  #define  KVM_DEV_VFIO_GROUP			1
>  #define   KVM_DEV_VFIO_GROUP_ADD			1
>  #define   KVM_DEV_VFIO_GROUP_DEL			2
> @@ -1119,6 +1124,8 @@ struct kvm_s390_ucas_mapping {
>  #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
>  /* Available with KVM_CAP_PPC_RTAS */
>  #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
> +/* Available with KVM_CAP_USER_EXIT */
> +#define KVM_USER_EXIT             _IOW(KVMIO,  0xad, struct kvm_user_exit)
>  
>  /* ioctl for vm fd */
>  #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b34a328fdac1..d7ffe6090520 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2248,15 +2248,16 @@ static long kvm_vcpu_ioctl(struct file *filp,
>  	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
>  		return -EINVAL;
>  
> -#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
>  	/*
>  	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
>  	 * so vcpu_load() would break it.
>  	 */
> +#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
>  	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
>  		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
>  #endif
> -
> +	if (ioctl == KVM_USER_EXIT)
> +		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
>  
>  	r = vcpu_load(vcpu);
>  	if (r)
> 
Radim Krčmář Aug. 6, 2015, 1:44 p.m. UTC | #2
2015-08-05 18:36+0200, Paolo Bonzini:
> On 05/08/2015 18:33, Radim Krčmář wrote:
>> +4.97 KVM_USER_EXIT
>> +
>> +Capability: KVM_CAP_USER_EXIT
>> +Architectures: x86
>> +Type: vcpu ioctl
>> +Parameters: struct kvm_user_exit (in)
>> +Returns: 0 on success,
>> +         -EFAULT if the parameter couldn't be read,
>> +         -EINVAL if 'reserved' is not zeroed,
>> +
>> +struct kvm_user_exit {
>> +	__u8 reserved[32];
>> +};
> 
> Can we just return EINVAL if the parameter is not NULL?

It complicates handling if we extend the ioctl, but removes the useless
clearing/copying/checking now ...

The two obvious extensions are flags to skip kvm_make_request() or
kvm_vcpu_kick(), both of dubious use.  Another possibility is setting up
conditional exits, but that would be better as a separate control, like
most other sophisticated extensions.

I think that u32 flags would be sufficient -- is casting the 'unsigned
long arg' (data pointer) to a value still an accepted solution?
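
Purely to illustrate that option (the flags-in-arg interpretation is
hypothetical; only KVM_REQ_EXIT and kvm_vcpu_kick() come from this patch),
the KVM_USER_EXIT case in kvm_arch_vcpu_ioctl() could shrink to something
like:

	case KVM_USER_EXIT: {
		/* hypothetical: 'arg' carries flags directly, so there is no
		 * struct kvm_user_exit and no copy_from_user() */
		u32 flags = (u32)arg;

		r = -EINVAL;
		if (flags)	/* no flags defined yet */
			break;

		kvm_make_request(KVM_REQ_EXIT, vcpu);
		kvm_vcpu_kick(vcpu);
		r = 0;
		break;
	}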
Paolo Bonzini Aug. 6, 2015, 1:52 p.m. UTC | #3
On 06/08/2015 15:44, Radim Krčmář wrote:
>> > Can we just return EINVAL if the parameter is not NULL?
> It complicates handling if we extend the ioctl, but removes the useless
> clearing/copying/checking now ...

Yes.

> The two obvious extensions are flags to skip kvm_make_request() or
> kvm_vcpu_kick(), both of dubious use.

Skipping kvm_make_request() would make some sense if you can set
vcpu->run->request_interrupt_window asynchronously.  So you could do

	vcpu->run->request_interrupt_window = 1;
	ioctl(vcpu_fd, KVM_USER_EXIT, KVM_USER_EXIT_LAZY);

and only cause a lightweight vmexit if the interrupt window is currently
closed.  I haven't thought of any races that could happen, but it looks
like it could work.

Skipping kvm_vcpu_kick() makes much less sense.

> Another possibility is setting up
> conditional exits, but that would be better as a separate control, like
> most other sophisticated extensions.
> 
> I think that u32 flags would be sufficient -- is casting the 'unsigned
> long arg' (data pointer) to a value still an accepted solution?

Yeah, that would work for me as well.  Also because, for now, you'd
return EINVAL if the unsigned long is not zero, which boils down to
"return EINVAL if the parameter is not NULL". :)

Paolo
Radim Krčmář Aug. 6, 2015, 5:21 p.m. UTC | #4
2015-08-06 15:52+0200, Paolo Bonzini:
> On 06/08/2015 15:44, Radim Krčmář wrote:
>> The two obvious extensions are flags to skip kvm_make_request() or
>> kvm_vcpu_kick(), both of dubious use.
> 
> Skipping kvm_make_request() would make some sense if you can set
> vcpu->run->request_interrupt_window asynchronously.  So you could do
> 
> 	vcpu->run->request_interrupt_window = 1;
> 	ioctl(vcpu_fd, KVM_USER_EXIT, KVM_USER_EXIT_LAZY);
> 
> and only cause a lightweight vmexit if the interrupt window is currently
> closed.  I haven't thought of any races that could happen, but it looks
> like it could work.

Seems doable, kvm_run should have been better protected :)

> Skipping kvm_vcpu_kick() makes much less sense.

Could save some cycles when queuing an early exit from the VCPU thread.

>> Another possibility is setting up
>> conditional exits, but that would be better as a separate control, like
>> most other sophisticated extensions.
>> 
>> I think that u32 flags would be sufficient -- is casting the 'unsigned
>> long arg' (data pointer) to a value still an accepted solution?
> 
> Yeah, that would work for me as well.  Also because, for now, you'd
> return EINVAL if the unsigned long is not zero, which boils down to
> "return EINVAL if the parameter is not NULL". :)

Exactly, only the ioctl number will change.
Avi Kivity Aug. 16, 2015, 8:27 p.m. UTC | #5
On 08/05/2015 07:33 PM, Radim Krčmář wrote:
> The guest can use KVM_USER_EXIT instead of a signal-based exiting to
> userspace.  Availability depends on KVM_CAP_USER_EXIT.
> Only x86 is implemented so far.
>
> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
> ---
>   v2:
>    * use vcpu ioctl instead of vm one [4/5]
>    * shrink kvm_user_exit from 64 to 32 bytes [4/5]
>
>   Documentation/virtual/kvm/api.txt | 30 ++++++++++++++++++++++++++++++
>   arch/x86/kvm/x86.c                | 24 ++++++++++++++++++++++++
>   include/uapi/linux/kvm.h          |  7 +++++++
>   virt/kvm/kvm_main.c               |  5 +++--
>   4 files changed, 64 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 3c714d43a717..c5844f0b8e7c 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3020,6 +3020,36 @@ Returns: 0 on success, -1 on error
>   
>   Queues an SMI on the thread's vcpu.
>   
> +
> +4.97 KVM_USER_EXIT
> +
> +Capability: KVM_CAP_USER_EXIT
> +Architectures: x86
> +Type: vcpu ioctl
> +Parameters: struct kvm_user_exit (in)
> +Returns: 0 on success,
> +         -EFAULT if the parameter couldn't be read,
> +         -EINVAL if 'reserved' is not zeroed,
> +
> +struct kvm_user_exit {
> +	__u8 reserved[32];
> +};
> +
> +The ioctl is asynchronous to VCPU execution and can be issued from all threads.

This breaks an invariant of vcpu ioctls, and also forces a cacheline 
bounce when we fget() the vcpu fd.

We should really try to avoid this.  One option is to have a 
KVM_VCPU_MAKE_EXITFD vcpu ioctl, which returns an eventfd that you then 
write into.  You can make as many exitfds as you like, one for each 
waking thread, so they never cause cacheline conflicts.

Edit: I see the invariant was already broken.  But the other comment stands.
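
A rough sketch of the userspace side of that idea; KVM_VCPU_MAKE_EXITFD and
its semantics are hypothetical here (this patch does not provide them), and
error handling is omitted:

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* hypothetical vcpu ioctl: returns an eventfd bound to this vcpu */
static int make_exitfd(int vcpu_fd)
{
	return ioctl(vcpu_fd, KVM_VCPU_MAKE_EXITFD, 0);
}

/* Any waking thread kicks the vcpu out to userspace through its own exitfd,
 * so the vcpu fd (and its cacheline) is never touched on the wakeup path. */
static void kick_via_exitfd(int exitfd)
{
	uint64_t val = 1;

	write(exitfd, &val, sizeof(val));
}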

> +Make vcpu_id exit to userspace as soon as possible.  If the VCPU is not running
> +in kernel at the time, it will exit early on the next call to KVM_RUN.
> +If the VCPU was going to exit because of other reasons when KVM_USER_EXIT was
> +issued, it will keep the original exit reason and not exit early on next
> +KVM_RUN.
> +If VCPU exited because of KVM_USER_EXIT, the exit reason is KVM_EXIT_REQUEST.
> +
> +This ioctl has very similar effect (same sans some races on userspace exit) as
> +sending a signal (that is blocked in userspace and set in KVM_SET_SIGNAL_MASK)
> +to the VCPU thread.
> +
> +
> +
>   5. The kvm_run structure
>   ------------------------
>   
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 3493457ad0a1..27d777eb34e4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2466,6 +2466,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   	case KVM_CAP_ASSIGN_DEV_IRQ:
>   	case KVM_CAP_PCI_2_3:
>   #endif
> +	case KVM_CAP_USER_EXIT:
>   		r = 1;
>   		break;
>   	case KVM_CAP_X86_SMM:
> @@ -3077,6 +3078,20 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
>   	return 0;
>   }
>   
> +int kvm_vcpu_ioctl_user_exit(struct kvm_vcpu *vcpu, struct kvm_user_exit *info)
> +{
> +	struct kvm_user_exit valid = {};
> +       BUILD_BUG_ON(sizeof(valid) != 32);
> +
> +	if (memcmp(info, &valid, sizeof(valid)))
> +		return -EINVAL;
> +
> +	kvm_make_request(KVM_REQ_EXIT, vcpu);
> +	kvm_vcpu_kick(vcpu);
> +
> +	return 0;
> +}
> +
>   long kvm_arch_vcpu_ioctl(struct file *filp,
>   			 unsigned int ioctl, unsigned long arg)
>   {
> @@ -3341,6 +3356,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>   		r = kvm_set_guest_paused(vcpu);
>   		goto out;
>   	}
> +	case KVM_USER_EXIT: {
> +		struct kvm_user_exit info;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&info, argp, sizeof(info)))
> +			goto out;
> +		r = kvm_vcpu_ioctl_user_exit(vcpu, &info);
> +		break;
> +	}
>   	default:
>   		r = -EINVAL;
>   	}
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index d996a7cdb4d2..bc5a1abe9626 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -826,6 +826,7 @@ struct kvm_ppc_smmu_info {
>   #define KVM_CAP_X86_SMM 117
>   #define KVM_CAP_MULTI_ADDRESS_SPACE 118
>   #define KVM_CAP_SPLIT_IRQCHIP 119
> +#define KVM_CAP_USER_EXIT 120
>   
>   #ifdef KVM_CAP_IRQ_ROUTING
>   
> @@ -1008,6 +1009,10 @@ struct kvm_device_attr {
>   	__u64	addr;		/* userspace address of attr data */
>   };
>   
> +struct kvm_user_exit {
> +	__u8 reserved[32];
> +};
> +
>   #define  KVM_DEV_VFIO_GROUP			1
>   #define   KVM_DEV_VFIO_GROUP_ADD			1
>   #define   KVM_DEV_VFIO_GROUP_DEL			2
> @@ -1119,6 +1124,8 @@ struct kvm_s390_ucas_mapping {
>   #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
>   /* Available with KVM_CAP_PPC_RTAS */
>   #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
> +/* Available with KVM_CAP_USER_EXIT */
> +#define KVM_USER_EXIT             _IOW(KVMIO,  0xad, struct kvm_user_exit)
>   
>   /* ioctl for vm fd */
>   #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b34a328fdac1..d7ffe6090520 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2248,15 +2248,16 @@ static long kvm_vcpu_ioctl(struct file *filp,
>   	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
>   		return -EINVAL;
>   
> -#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
>   	/*
>   	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
>   	 * so vcpu_load() would break it.
>   	 */
> +#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
>   	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
>   		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
>   #endif
> -
> +	if (ioctl == KVM_USER_EXIT)
> +		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
>   
>   	r = vcpu_load(vcpu);
>   	if (r)

Paolo Bonzini Aug. 17, 2015, 1:15 p.m. UTC | #6
On 16/08/2015 13:27, Avi Kivity wrote:
> On 08/05/2015 07:33 PM, Radim Krčmář wrote:
>> The guest can use KVM_USER_EXIT instead of a signal-based exiting to
>> userspace.  Availability depends on KVM_CAP_USER_EXIT.
>> Only x86 is implemented so far.
>>
>> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
>> ---
>>   v2:
>>    * use vcpu ioctl instead of vm one [4/5]
>>    * shrink kvm_user_exit from 64 to 32 bytes [4/5]
>>
>>   Documentation/virtual/kvm/api.txt | 30 ++++++++++++++++++++++++++++++
>>   arch/x86/kvm/x86.c                | 24 ++++++++++++++++++++++++
>>   include/uapi/linux/kvm.h          |  7 +++++++
>>   virt/kvm/kvm_main.c               |  5 +++--
>>   4 files changed, 64 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt
>> b/Documentation/virtual/kvm/api.txt
>> index 3c714d43a717..c5844f0b8e7c 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -3020,6 +3020,36 @@ Returns: 0 on success, -1 on error
>>     Queues an SMI on the thread's vcpu.
>>   +
>> +4.97 KVM_USER_EXIT
>> +
>> +Capability: KVM_CAP_USER_EXIT
>> +Architectures: x86
>> +Type: vcpu ioctl
>> +Parameters: struct kvm_user_exit (in)
>> +Returns: 0 on success,
>> +         -EFAULT if the parameter couldn't be read,
>> +         -EINVAL if 'reserved' is not zeroed,
>> +
>> +struct kvm_user_exit {
>> +    __u8 reserved[32];
>> +};
>> +
>> +The ioctl is asynchronous to VCPU execution and can be issued from
>> all threads.
> 
> This breaks an invariant of vcpu ioctls, and also forces a cacheline
> bounce when we fget() the vcpu fd.

KVM_USER_EXIT in practice should be so rare (at least with in-kernel
LAPIC) that I don't think this matters.  KVM_USER_EXIT is relatively
uninteresting, it only exists to provide an alternative to signals that
doesn't require expensive atomics on each and every KVM_RUN. :(

In addition, when you do it you have to transfer information anyway from
the signaling thread to the VCPU thread, which causes cacheline bounces
too.  For example in QEMU both the iothread mutex and
cpu->interrupt_request cachelines bounce.

> We should really try to avoid this.  One options is to have a
> KVM_VCPU_MAKE_EXITFD vcpu ioctl, which returns an eventfd that you then
> write into.  You can make as many exitfds as you like, one for each
> waking thread, so they never cause cacheline conflicts.

> Edit: I see the invariant was already broken.

Yeah, that was a long time ago.  This is when it became apparent in
kvm_vcpu_ioctl, but it was broken even before that for these 2-3
special asynchronous vcpu ioctls:

    commit 2122ff5eab8faec853e43f6de886e8dc8f31e317
    Author: Avi Kivity <avi@redhat.com>
    Date:   Thu May 13 11:25:04 2010 +0300

    KVM: move vcpu locking to dispatcher for generic vcpu ioctls

    All vcpu ioctls need to be locked, so instead of locking each one
    specifically we lock at the generic dispatcher.

    This patch only updates generic ioctls and leaves arch specific
    ioctls alone.

    Signed-off-by: Avi Kivity <avi@redhat.com>

Paolo
Avi Kivity Aug. 18, 2015, 6:30 p.m. UTC | #7
On 08/17/2015 04:15 PM, Paolo Bonzini wrote:
>
> On 16/08/2015 13:27, Avi Kivity wrote:
>> On 08/05/2015 07:33 PM, Radim Krčmář wrote:
>>> The guest can use KVM_USER_EXIT instead of a signal-based exiting to
>>> userspace.  Availability depends on KVM_CAP_USER_EXIT.
>>> Only x86 is implemented so far.
>>>
>>> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
>>> ---
>>>    v2:
>>>     * use vcpu ioctl instead of vm one [4/5]
>>>     * shrink kvm_user_exit from 64 to 32 bytes [4/5]
>>>
>>>    Documentation/virtual/kvm/api.txt | 30 ++++++++++++++++++++++++++++++
>>>    arch/x86/kvm/x86.c                | 24 ++++++++++++++++++++++++
>>>    include/uapi/linux/kvm.h          |  7 +++++++
>>>    virt/kvm/kvm_main.c               |  5 +++--
>>>    4 files changed, 64 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/Documentation/virtual/kvm/api.txt
>>> b/Documentation/virtual/kvm/api.txt
>>> index 3c714d43a717..c5844f0b8e7c 100644
>>> --- a/Documentation/virtual/kvm/api.txt
>>> +++ b/Documentation/virtual/kvm/api.txt
>>> @@ -3020,6 +3020,36 @@ Returns: 0 on success, -1 on error
>>>      Queues an SMI on the thread's vcpu.
>>>    +
>>> +4.97 KVM_USER_EXIT
>>> +
>>> +Capability: KVM_CAP_USER_EXIT
>>> +Architectures: x86
>>> +Type: vcpu ioctl
>>> +Parameters: struct kvm_user_exit (in)
>>> +Returns: 0 on success,
>>> +         -EFAULT if the parameter couldn't be read,
>>> +         -EINVAL if 'reserved' is not zeroed,
>>> +
>>> +struct kvm_user_exit {
>>> +    __u8 reserved[32];
>>> +};
>>> +
>>> +The ioctl is asynchronous to VCPU execution and can be issued from
>>> all threads.
>> This breaks an invariant of vcpu ioctls, and also forces a cacheline
>> bounce when we fget() the vcpu fd.
> KVM_USER_EXIT in practice should be so rare (at least with in-kernel
> LAPIC) that I don't think this matters.  KVM_USER_EXIT is relatively
> uninteresting, it only exists to provide an alternative to signals that
> doesn't require expensive atomics on each and every KVM_RUN. :(

Ah, so the idea is to remove the cost of changing the signal mask?

Yes, although it looks like a thread-local operation, it takes a 
process-wide lock.

I expect most user wakeups are via irqfd, so indeed the performance of 
KVM_USER_EXIT is uninteresting.
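
For reference, a rough sketch of the signal-mask dance being discussed, based
on the existing KVM_SET_SIGNAL_MASK interface; the choice of SIG_IPI and the
8-byte kernel sigset length are assumptions here, and error handling is
omitted.  Once this mask is installed, every KVM_RUN swaps the signal mask on
entry and exit, which is the sigprocmask cost mentioned above.

#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define SIG_IPI	(SIGRTMIN + 4)	/* whichever signal the VMM reserves */

/* VCPU thread, once at setup: keep SIG_IPI blocked in the thread, but hand
 * KVM_RUN a mask with SIG_IPI unblocked, so a pending SIG_IPI interrupts the
 * guest and KVM_RUN returns -EINTR. */
static void setup_signal_exit(int vcpu_fd)
{
	sigset_t blocked, run_set;
	struct kvm_signal_mask *kmask;

	sigemptyset(&blocked);
	sigaddset(&blocked, SIG_IPI);
	pthread_sigmask(SIG_BLOCK, &blocked, NULL);

	pthread_sigmask(SIG_SETMASK, NULL, &run_set);
	sigdelset(&run_set, SIG_IPI);

	kmask = malloc(sizeof(*kmask) + sizeof(run_set));
	kmask->len = 8;		/* kernel sigset size, _NSIG / 8 */
	memcpy(kmask->sigset, &run_set, kmask->len);
	ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, kmask);
	free(kmask);
}

/* Waking thread: pthread_kill(vcpu_thread, SIG_IPI); KVM_RUN in the VCPU
 * thread then returns -EINTR while the signal stays pending but blocked. */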


Paolo Bonzini Aug. 18, 2015, 7:57 p.m. UTC | #8
On 18/08/2015 11:30, Avi Kivity wrote:
>> KVM_USER_EXIT in practice should be so rare (at least with in-kernel
>> LAPIC) that I don't think this matters.  KVM_USER_EXIT is relatively
>> uninteresting, it only exists to provide an alternative to signals that
>> doesn't require expensive atomics on each and every KVM_RUN. :(
> 
> Ah, so the idea is to remove the cost of changing the signal mask?

Yes, it's explained in the cover letter.

> Yes, although it looks like a thread-local operation, it takes a
> process-wide lock.

IIRC the lock was only task-wide and uncontended.  Problem is, it's on
the node that created the thread rather than the node that is running
it, and inter-node atomics are really, really slow.

For guests spanning >1 host NUMA nodes it's not really practical to
ensure that the thread is created on the right node.  Even for guests
that fit into 1 host node, if you rely on AutoNUMA the VCPUs are created
too early for AutoNUMA to have any effect.  And newer machines have
frighteningly small nodes (two nodes per socket, so it's something like
7 pCPUs if you don't have hyper-threading enabled).  True, the NUMA
penalty within the same socket is not huge, but it still costs a few
thousand clock cycles on vmexit.flat and this feature sweeps it away
completely.

> I expect most user wakeups are via irqfd, so indeed the performance of
> KVM_USER_EXIT is uninteresting.

Yup, either irqfd or KVM_SET_SIGNAL_MSI.

Paolo
Avi Kivity Aug. 19, 2015, 6:43 a.m. UTC | #9
On 08/18/2015 10:57 PM, Paolo Bonzini wrote:
>
> On 18/08/2015 11:30, Avi Kivity wrote:
>>> KVM_USER_EXIT in practice should be so rare (at least with in-kernel
>>> LAPIC) that I don't think this matters.  KVM_USER_EXIT is relatively
>>> uninteresting, it only exists to provide an alternative to signals that
>>> doesn't require expensive atomics on each and every KVM_RUN. :(
>> Ah, so the idea is to remove the cost of changing the signal mask?
> Yes, it's explained in the cover letter.
>
>> Yes, although it looks like a thread-local operation, it takes a
>> process-wide lock.
> IIRC the lock was only task-wide and uncontended.  Problem is, it's on
> the node that created the thread rather than the node that is running
> it, and inter-node atomics are really, really slow.

Cached inter-node atomics are (relatively) fast, but I think it really 
is a process-wide lock:

sigprocmask calls:

void __set_current_blocked(const sigset_t *newset)
{
     struct task_struct *tsk = current;

     spin_lock_irq(&tsk->sighand->siglock);
     __set_task_blocked(tsk, newset);
     spin_unlock_irq(&tsk->sighand->siglock);
}

struct sighand_struct {
     atomic_t        count;
     struct k_sigaction    action[_NSIG];
     spinlock_t        siglock;
     wait_queue_head_t    signalfd_wqh;
};

Since sigaction is usually process-wide, I conclude that tsk->sighand
will be too.



>
> For guests spanning >1 host NUMA nodes it's not really practical to
> ensure that the thread is created on the right node.  Even for guests
> that fit into 1 host node, if you rely on AutoNUMA the VCPUs are created
> too early for AutoNUMA to have any effect.  And newer machines have
> frighteningly small nodes (two nodes per socket, so it's something like
> 7 pCPUs if you don't have hyper-threading enabled).  True, the NUMA
> penalty within the same socket is not huge, but it still costs a few
> thousand clock cycles on vmexit.flat and this feature sweeps it away
> completely.
>
>> I expect most user wakeups are via irqfd, so indeed the performance of
>> KVM_USER_EXIT is uninteresting.
> Yup, either irqfd or KVM_SET_SIGNAL_MSI.
>
> Paolo


Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 3c714d43a717..c5844f0b8e7c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3020,6 +3020,36 @@  Returns: 0 on success, -1 on error
 
 Queues an SMI on the thread's vcpu.
 
+
+4.97 KVM_USER_EXIT
+
+Capability: KVM_CAP_USER_EXIT
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_user_exit (in)
+Returns: 0 on success,
+         -EFAULT if the parameter couldn't be read,
+         -EINVAL if 'reserved' is not zeroed.
+
+struct kvm_user_exit {
+	__u8 reserved[32];
+};
+
+The ioctl is asynchronous to VCPU execution and can be issued from all threads.
+
+Make the VCPU exit to userspace as soon as possible.  If the VCPU is not running
+in the kernel at the time, it will exit early on the next call to KVM_RUN.
+If the VCPU was going to exit for other reasons when KVM_USER_EXIT was
+issued, it will keep the original exit reason and not exit early on the next
+KVM_RUN.
+If the VCPU exited because of KVM_USER_EXIT, the exit reason is KVM_EXIT_REQUEST.
+
+This ioctl has very similar effect (same sans some races on userspace exit) as
+sending a signal (that is blocked in userspace and set in KVM_SET_SIGNAL_MASK)
+to the VCPU thread.
+
+
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3493457ad0a1..27d777eb34e4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2466,6 +2466,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_PCI_2_3:
 #endif
+	case KVM_CAP_USER_EXIT:
 		r = 1;
 		break;
 	case KVM_CAP_X86_SMM:
@@ -3077,6 +3078,20 @@  static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int kvm_vcpu_ioctl_user_exit(struct kvm_vcpu *vcpu, struct kvm_user_exit *info)
+{
+	struct kvm_user_exit valid = {};
+	BUILD_BUG_ON(sizeof(valid) != 32);
+
+	if (memcmp(info, &valid, sizeof(valid)))
+		return -EINVAL;
+
+	kvm_make_request(KVM_REQ_EXIT, vcpu);
+	kvm_vcpu_kick(vcpu);
+
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -3341,6 +3356,15 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_set_guest_paused(vcpu);
 		goto out;
 	}
+	case KVM_USER_EXIT: {
+		struct kvm_user_exit info;
+
+		r = -EFAULT;
+		if (copy_from_user(&info, argp, sizeof(info)))
+			goto out;
+		r = kvm_vcpu_ioctl_user_exit(vcpu, &info);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d996a7cdb4d2..bc5a1abe9626 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -826,6 +826,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_X86_SMM 117
 #define KVM_CAP_MULTI_ADDRESS_SPACE 118
 #define KVM_CAP_SPLIT_IRQCHIP 119
+#define KVM_CAP_USER_EXIT 120
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1008,6 +1009,10 @@  struct kvm_device_attr {
 	__u64	addr;		/* userspace address of attr data */
 };
 
+struct kvm_user_exit {
+	__u8 reserved[32];
+};
+
 #define  KVM_DEV_VFIO_GROUP			1
 #define   KVM_DEV_VFIO_GROUP_ADD			1
 #define   KVM_DEV_VFIO_GROUP_DEL			2
@@ -1119,6 +1124,8 @@  struct kvm_s390_ucas_mapping {
 #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
 /* Available with KVM_CAP_PPC_RTAS */
 #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
+/* Available with KVM_CAP_USER_EXIT */
+#define KVM_USER_EXIT             _IOW(KVMIO,  0xad, struct kvm_user_exit)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b34a328fdac1..d7ffe6090520 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2248,15 +2248,16 @@  static long kvm_vcpu_ioctl(struct file *filp,
 	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
 		return -EINVAL;
 
-#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
 	/*
 	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
 	 * so vcpu_load() would break it.
 	 */
+#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
 	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
 		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 #endif
-
+	if (ioctl == KVM_USER_EXIT)
+		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 
 	r = vcpu_load(vcpu);
 	if (r)