Patchworkβ [03/11] Handle asynchronous page fault in a PV guest.

login
register
about
Submitter Gleb Natapov
Date 2009-11-01 11:56:22
Message ID <1257076590-29559-4-git-send-email-gleb@redhat.com>
Download mbox | patch
Permalink /patch/56844/
State New
Headers show

Comments

Gleb Natapov - 2009-11-01 11:56:22
Asynchronous page fault notifies vcpu that page it is trying to access
is swapped out by a host. In response guest puts a task that caused the
fault to sleep until page is swapped in again. When missing page is
brought back into the memory guest is notified and task resumes execution.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/include/asm/kvm_para.h |    3 +
 arch/x86/kernel/kvm.c           |  120 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 120 insertions(+), 3 deletions(-)
Avi Kivity - 2009-11-02 12:38:30
On 11/01/2009 01:56 PM, Gleb Natapov wrote:
> Asynchronous page fault notifies vcpu that page it is trying to access
> is swapped out by a host. In response guest puts a task that caused the
> fault to sleep until page is swapped in again. When missing page is
> brought back into the memory guest is notified and task resumes execution.
>
> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> index 90708b7..61e2aa3 100644
> --- a/arch/x86/include/asm/kvm_para.h
> +++ b/arch/x86/include/asm/kvm_para.h
> @@ -52,6 +52,9 @@ struct kvm_mmu_op_release_pt {
>
>   #define KVM_PV_SHM_FEATURES_ASYNC_PF		(1<<  0)
>
> +#define KVM_PV_REASON_PAGE_NP 1
> +#define KVM_PV_REASON_PAGE_READY 2
>    

_NOT_PRESENT would improve readability.

> +static void apf_task_wait(struct task_struct *tsk, u64 token)
>   {
> +	u64 key = hash_64(token, KVM_TASK_SLEEP_HASHBITS);
> +	struct kvm_task_sleep_head *b =&async_pf_sleepers[key];
> +	struct kvm_task_sleep_node n, *e;
> +	DEFINE_WAIT(wait);
> +
> +	spin_lock(&b->lock);
> +	e = _find_apf_task(b, token);
> +	if (e) {
> +		/* dummy entry exist ->  wake up was delivered ahead of PF */
> +		hlist_del(&e->link);
> +		kfree(e);
> +		spin_unlock(&b->lock);
> +		return;
> +	}
> +
> +	n.token = token;
> +	init_waitqueue_head(&n.wq);
> +	hlist_add_head(&n.link,&b->list);
> +	spin_unlock(&b->lock);
> +
> +	for (;;) {
> +		prepare_to_wait(&n.wq,&wait, TASK_UNINTERRUPTIBLE);
> +		if (hlist_unhashed(&n.link))
> +			break;
>    

Don't you need locking here?  At least for the implied memory barriers.

> +		schedule();
> +	}
> +	finish_wait(&n.wq,&wait);
> +
> +	return;
> +}
> +
> +static void apf_task_wake(u64 token)
> +{
> +	u64 key = hash_64(token, KVM_TASK_SLEEP_HASHBITS);
> +	struct kvm_task_sleep_head *b =&async_pf_sleepers[key];
> +	struct kvm_task_sleep_node *n;
> +
> +	spin_lock(&b->lock);
> +	n = _find_apf_task(b, token);
> +	if (!n) {
> +		/* PF was not yet handled. Add dummy entry for the token */
> +		n = kmalloc(sizeof(*n), GFP_ATOMIC);
> +		if (!n) {
> +			printk(KERN_EMERG"async PF can't allocate memory\n");
>    

Worrying.  We could have an emergency pool of one node per cpu, and 
disable apf if we use it until it's returned.  But that's a lot of 
complexity for an edge case, so a simpler solution would be welcome.

> +int kvm_handle_pf(struct pt_regs *regs, unsigned long error_code)
> +{
> +	u64 reason, token;
>   	struct kvm_vcpu_pv_shm *pv_shm =
>   		per_cpu(kvm_vcpu_pv_shm, smp_processor_id());
>
>   	if (!pv_shm)
> -		return;
> +		return 0;
> +
> +	reason = pv_shm->reason;
> +	pv_shm->reason = 0;
> +
> +	token = pv_shm->param;
> +
> +	switch (reason) {
> +	default:
> +		return 0;
> +	case KVM_PV_REASON_PAGE_NP:
> +		/* real page is missing. */
> +		apf_task_wait(current, token);
> +		break;
> +	case KVM_PV_REASON_PAGE_READY:
> +		apf_task_wake(token);
> +		break;
> +	}
>    

Ah, reason is not a bitmask but an enumerator.  __u32 is more friendly 
to i386 in that case.

Much of the code here is arch independent and would work well on non-x86 
kvm ports.  But we can always lay the burden of moving it on the non-x86 
maintainers.
Gleb Natapov - 2009-11-02 15:54:10
On Mon, Nov 02, 2009 at 02:38:30PM +0200, Avi Kivity wrote:
> On 11/01/2009 01:56 PM, Gleb Natapov wrote:
> >Asynchronous page fault notifies vcpu that page it is trying to access
> >is swapped out by a host. In response guest puts a task that caused the
> >fault to sleep until page is swapped in again. When missing page is
> >brought back into the memory guest is notified and task resumes execution.
> >
> >diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> >index 90708b7..61e2aa3 100644
> >--- a/arch/x86/include/asm/kvm_para.h
> >+++ b/arch/x86/include/asm/kvm_para.h
> >@@ -52,6 +52,9 @@ struct kvm_mmu_op_release_pt {
> >
> >  #define KVM_PV_SHM_FEATURES_ASYNC_PF		(1<<  0)
> >
> >+#define KVM_PV_REASON_PAGE_NP 1
> >+#define KVM_PV_REASON_PAGE_READY 2
> 
> _NOT_PRESENT would improve readability.
> 
> >+static void apf_task_wait(struct task_struct *tsk, u64 token)
> >  {
> >+	u64 key = hash_64(token, KVM_TASK_SLEEP_HASHBITS);
> >+	struct kvm_task_sleep_head *b =&async_pf_sleepers[key];
> >+	struct kvm_task_sleep_node n, *e;
> >+	DEFINE_WAIT(wait);
> >+
> >+	spin_lock(&b->lock);
> >+	e = _find_apf_task(b, token);
> >+	if (e) {
> >+		/* dummy entry exist ->  wake up was delivered ahead of PF */
> >+		hlist_del(&e->link);
> >+		kfree(e);
> >+		spin_unlock(&b->lock);
> >+		return;
> >+	}
> >+
> >+	n.token = token;
> >+	init_waitqueue_head(&n.wq);
> >+	hlist_add_head(&n.link,&b->list);
> >+	spin_unlock(&b->lock);
> >+
> >+	for (;;) {
> >+		prepare_to_wait(&n.wq,&wait, TASK_UNINTERRUPTIBLE);
> >+		if (hlist_unhashed(&n.link))
> >+			break;
> 
> Don't you need locking here?  At least for the implied memory barriers.
> 
May be memory barrier will be enough. Will look at it.

> >+		schedule();
> >+	}
> >+	finish_wait(&n.wq,&wait);
> >+
> >+	return;
> >+}
> >+
> >+static void apf_task_wake(u64 token)
> >+{
> >+	u64 key = hash_64(token, KVM_TASK_SLEEP_HASHBITS);
> >+	struct kvm_task_sleep_head *b =&async_pf_sleepers[key];
> >+	struct kvm_task_sleep_node *n;
> >+
> >+	spin_lock(&b->lock);
> >+	n = _find_apf_task(b, token);
> >+	if (!n) {
> >+		/* PF was not yet handled. Add dummy entry for the token */
> >+		n = kmalloc(sizeof(*n), GFP_ATOMIC);
> >+		if (!n) {
> >+			printk(KERN_EMERG"async PF can't allocate memory\n");
> 
> Worrying.  We could have an emergency pool of one node per cpu, and
> disable apf if we use it until it's returned.  But that's a lot of
> complexity for an edge case, so a simpler solution would be welcome.
> 
Currently this code can't trigger since "wake up" is always sent on the
same vcpu as "not present", but I don't want this implementation detail
to be part of guest/host interface. Idea you've described can be easy
to implement. Will look into it.

> >+int kvm_handle_pf(struct pt_regs *regs, unsigned long error_code)
> >+{
> >+	u64 reason, token;
> >  	struct kvm_vcpu_pv_shm *pv_shm =
> >  		per_cpu(kvm_vcpu_pv_shm, smp_processor_id());
> >
> >  	if (!pv_shm)
> >-		return;
> >+		return 0;
> >+
> >+	reason = pv_shm->reason;
> >+	pv_shm->reason = 0;
> >+
> >+	token = pv_shm->param;
> >+
> >+	switch (reason) {
> >+	default:
> >+		return 0;
> >+	case KVM_PV_REASON_PAGE_NP:
> >+		/* real page is missing. */
> >+		apf_task_wait(current, token);
> >+		break;
> >+	case KVM_PV_REASON_PAGE_READY:
> >+		apf_task_wake(token);
> >+		break;
> >+	}
> 
> Ah, reason is not a bitmask but an enumerator.  __u32 is more
> friendly to i386 in that case.
> 
OK. Will need padding for 64 bit host case.

> Much of the code here is arch independent and would work well on
> non-x86 kvm ports.  But we can always lay the burden of moving it on
> the non-x86 maintainers.
> 
> -- 
> error compiling committee.c: too many arguments to function

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti - 2009-11-03 14:14:23
On Sun, Nov 01, 2009 at 01:56:22PM +0200, Gleb Natapov wrote:
> Asynchronous page fault notifies vcpu that page it is trying to access
> is swapped out by a host. In response guest puts a task that caused the
> fault to sleep until page is swapped in again. When missing page is
> brought back into the memory guest is notified and task resumes execution.

Can't you apply this to non-paravirt guests, and continue to deliver
interrupts while waiting for the swapin? 

It should allow the guest to schedule a different task.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov - 2009-11-03 14:25:33
On Tue, Nov 03, 2009 at 12:14:23PM -0200, Marcelo Tosatti wrote:
> On Sun, Nov 01, 2009 at 01:56:22PM +0200, Gleb Natapov wrote:
> > Asynchronous page fault notifies vcpu that page it is trying to access
> > is swapped out by a host. In response guest puts a task that caused the
> > fault to sleep until page is swapped in again. When missing page is
> > brought back into the memory guest is notified and task resumes execution.
> 
> Can't you apply this to non-paravirt guests, and continue to deliver
> interrupts while waiting for the swapin? 
> 
> It should allow the guest to schedule a different task.
But how can I make the guest to not run the task that caused the fault?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti - 2009-11-03 14:32:50
On Tue, Nov 03, 2009 at 04:25:33PM +0200, Gleb Natapov wrote:
> On Tue, Nov 03, 2009 at 12:14:23PM -0200, Marcelo Tosatti wrote:
> > On Sun, Nov 01, 2009 at 01:56:22PM +0200, Gleb Natapov wrote:
> > > Asynchronous page fault notifies vcpu that page it is trying to access
> > > is swapped out by a host. In response guest puts a task that caused the
> > > fault to sleep until page is swapped in again. When missing page is
> > > brought back into the memory guest is notified and task resumes execution.
> > 
> > Can't you apply this to non-paravirt guests, and continue to deliver
> > interrupts while waiting for the swapin? 
> > 
> > It should allow the guest to schedule a different task.
> But how can I make the guest to not run the task that caused the fault?

Any attempt to access the swapped out data will cause a #PF vmexit,
since the translation is marked as not present. If there's swapin in
progress, you wait for that swapin, otherwise start swapin and wait.

Its not as efficient as paravirt because you have to wait for a timer
interrupt and the guest scheduler to decide to taskswitch, but OTOH its
transparent.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity - 2009-11-03 14:38:46
On 11/03/2009 04:32 PM, Marcelo Tosatti wrote:
> Any attempt to access the swapped out data will cause a #PF vmexit,
> since the translation is marked as not present. If there's swapin in
> progress, you wait for that swapin, otherwise start swapin and wait.
>
> Its not as efficient as paravirt because you have to wait for a timer
> interrupt and the guest scheduler to decide to taskswitch, but OTOH its
> transparent.
>    

With a dyntick guest the timer interrupt will come at the end of the 
time slice, likely after the page has been swapped in.  That leaves smp 
reschedule interrupts and non-dyntick guests.

An advantage is that there is one code path for apf and non-apf.  
Another is that interrupts are processed, improving timekeeping and 
maybe responsiveness.

Patch

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 90708b7..61e2aa3 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -52,6 +52,9 @@  struct kvm_mmu_op_release_pt {
 
 #define KVM_PV_SHM_FEATURES_ASYNC_PF		(1 << 0)
 
+#define KVM_PV_REASON_PAGE_NP 1
+#define KVM_PV_REASON_PAGE_READY 2
+
 struct kvm_vcpu_pv_shm {
 	__u64 features;
 	__u64 reason;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d03f33c..79d291f 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -30,6 +30,8 @@ 
 #include <linux/bootmem.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 
@@ -55,15 +57,121 @@  static void kvm_io_delay(void)
 {
 }
 
-static void kvm_end_context_switch(struct task_struct *next)
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+	struct hlist_node link;
+	wait_queue_head_t wq;
+	u64 token;
+};
+
+static struct kvm_task_sleep_head {
+	spinlock_t lock;
+	struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+						  u64 token)
+{
+	struct hlist_node *p;
+
+	hlist_for_each(p, &b->list) {
+		struct kvm_task_sleep_node *n =
+			hlist_entry(p, typeof(*n), link);
+		if (n->token == token)
+			return n;
+	}
+
+	return NULL;
+}
+
+static void apf_task_wait(struct task_struct *tsk, u64 token)
 {
+	u64 key = hash_64(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node n, *e;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&b->lock);
+	e = _find_apf_task(b, token);
+	if (e) {
+		/* dummy entry exist -> wake up was delivered ahead of PF */
+		hlist_del(&e->link);
+		kfree(e);
+		spin_unlock(&b->lock);
+		return;
+	}
+
+	n.token = token;
+	init_waitqueue_head(&n.wq);
+	hlist_add_head(&n.link, &b->list);
+	spin_unlock(&b->lock);
+
+	for (;;) {
+		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (hlist_unhashed(&n.link))
+			break;
+		schedule();
+	}
+	finish_wait(&n.wq, &wait);
+
+	return;
+}
+
+static void apf_task_wake(u64 token)
+{
+	u64 key = hash_64(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node *n;
+
+	spin_lock(&b->lock);
+	n = _find_apf_task(b, token);
+	if (!n) {
+		/* PF was not yet handled. Add dummy entry for the token */
+		n = kmalloc(sizeof(*n), GFP_ATOMIC);
+		if (!n) {
+			printk(KERN_EMERG"async PF can't allocate memory\n");
+		} else {
+			n->token = token;
+			hlist_add_head(&n->link, &b->list);
+		}
+	} else {
+		hlist_del_init(&n->link);
+		if (waitqueue_active(&n->wq))
+			wake_up(&n->wq);
+	}
+	spin_unlock(&b->lock);
+	return;
+}
+
+int kvm_handle_pf(struct pt_regs *regs, unsigned long error_code)
+{
+	u64 reason, token;
 	struct kvm_vcpu_pv_shm *pv_shm =
 		per_cpu(kvm_vcpu_pv_shm, smp_processor_id());
 
 	if (!pv_shm)
-		return;
+		return 0;
+
+	reason = pv_shm->reason;
+	pv_shm->reason = 0;
+
+	token = pv_shm->param;
+
+	switch (reason) {
+	default:
+		return 0;
+	case KVM_PV_REASON_PAGE_NP:
+		/* real page is missing. */
+		apf_task_wait(current, token);
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		apf_task_wake(token);
+		break;
+	}
 
-	pv_shm->current_task = (u64)next;
+	return 1;
 }
 
 static void kvm_mmu_op(void *buffer, unsigned len)
@@ -219,6 +327,9 @@  static void __init paravirt_ops_setup(void)
 	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
 		pv_cpu_ops.io_delay = kvm_io_delay;
 
+	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+		pv_cpu_ops.handle_pf = kvm_handle_pf;
+
 	if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
 		pv_mmu_ops.set_pte = kvm_set_pte;
 		pv_mmu_ops.set_pte_at = kvm_set_pte_at;
@@ -272,11 +383,14 @@  static struct notifier_block kvm_pv_reboot_nb = {
 
 void __init kvm_guest_init(void)
 {
+	int i;
 	if (!kvm_para_available())
 		return;
 
 	paravirt_ops_setup();
 	register_reboot_notifier(&kvm_pv_reboot_nb);
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+		spin_lock_init(&async_pf_sleepers[i].lock);
 }
 
 void __cpuinit kvm_guest_cpu_init(void)