[RFCv2,9/9] arm64: Support async page fault
diff mbox series

Message ID 20200508032919.52147-10-gshan@redhat.com
State New
Headers show
Series
  • kvm/arm64: Support Async Page Fault
Related show

Commit Message

Gavin Shan May 8, 2020, 3:29 a.m. UTC
This supports asynchronous page fault for the guest. The design is
similar to what x86 has: on receiving a PAGE_NOT_PRESENT signal from
the host, the current task is either rescheduled or put into power
saving mode. The task will be waken up when PAGE_READY signal is
received. The PAGE_READY signal might be received in the context
of the suspended process, to be waken up. That means the suspended
process has to wake up itself, but it's not safe and prone to cause
dead-lock on CPU runqueue lock. So the wakeup is delayed on returning
from kernel space to user space or idle process is picked for running.

The signals are conveyed through the async page fault control block,
which was passed to host on enabling the functionality. On each page
fault, the control block is checked and switch to the async page fault
handling flow if any signals exist.

The feature is put into the CONFIG_KVM_GUEST umbrella, which is added
by this patch. So we have inline functions implemented in kvm_para.h,
like other architectures do, to check if async page fault (one of the
KVM para-virtualized features) is available. Also, the kernel boot
parameter "no-kvmapf" can be specified to disable the feature.

Signed-off-by: Gavin Shan <gshan@redhat.com>
---
 arch/arm64/Kconfig                 |  11 +
 arch/arm64/include/asm/exception.h |   3 +
 arch/arm64/include/asm/kvm_para.h  |  27 +-
 arch/arm64/kernel/entry.S          |  33 +++
 arch/arm64/kernel/process.c        |   4 +
 arch/arm64/mm/fault.c              | 434 +++++++++++++++++++++++++++++
 6 files changed, 505 insertions(+), 7 deletions(-)

Comments

Mark Rutland May 26, 2020, 12:56 p.m. UTC | #1
On Fri, May 08, 2020 at 01:29:19PM +1000, Gavin Shan wrote:
> This supports asynchronous page fault for the guest. The design is
> similar to what x86 has: on receiving a PAGE_NOT_PRESENT signal from
> the host, the current task is either rescheduled or put into power
> saving mode. The task will be waken up when PAGE_READY signal is
> received. The PAGE_READY signal might be received in the context
> of the suspended process, to be waken up. That means the suspended
> process has to wake up itself, but it's not safe and prone to cause
> dead-lock on CPU runqueue lock. So the wakeup is delayed on returning
> from kernel space to user space or idle process is picked for running.
> 
> The signals are conveyed through the async page fault control block,
> which was passed to host on enabling the functionality. On each page
> fault, the control block is checked and switch to the async page fault
> handling flow if any signals exist.
> 
> The feature is put into the CONFIG_KVM_GUEST umbrella, which is added
> by this patch. So we have inline functions implemented in kvm_para.h,
> like other architectures do, to check if async page fault (one of the
> KVM para-virtualized features) is available. Also, the kernel boot
> parameter "no-kvmapf" can be specified to disable the feature.
> 
> Signed-off-by: Gavin Shan <gshan@redhat.com>
> ---
>  arch/arm64/Kconfig                 |  11 +
>  arch/arm64/include/asm/exception.h |   3 +
>  arch/arm64/include/asm/kvm_para.h  |  27 +-
>  arch/arm64/kernel/entry.S          |  33 +++
>  arch/arm64/kernel/process.c        |   4 +
>  arch/arm64/mm/fault.c              | 434 +++++++++++++++++++++++++++++
>  6 files changed, 505 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 40fb05d96c60..2d5e5ee62d6d 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1045,6 +1045,17 @@ config PARAVIRT
>  	  under a hypervisor, potentially improving performance significantly
>  	  over full virtualization.
>  
> +config KVM_GUEST
> +	bool "KVM Guest Support"
> +	depends on PARAVIRT
> +	default y
> +	help
> +	  This option enables various optimizations for running under the KVM
> +	  hypervisor. Overhead for the kernel when not running inside KVM should
> +	  be minimal.
> +
> +	  In case of doubt, say Y
> +
>  config PARAVIRT_TIME_ACCOUNTING
>  	bool "Paravirtual steal time accounting"
>  	select PARAVIRT
> diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> index 7a6e81ca23a8..d878afa42746 100644
> --- a/arch/arm64/include/asm/exception.h
> +++ b/arch/arm64/include/asm/exception.h
> @@ -46,4 +46,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr);
>  void do_cp15instr(unsigned int esr, struct pt_regs *regs);
>  void do_el0_svc(struct pt_regs *regs);
>  void do_el0_svc_compat(struct pt_regs *regs);
> +#ifdef CONFIG_KVM_GUEST
> +void kvm_async_pf_delayed_wake(void);
> +#endif
>  #endif	/* __ASM_EXCEPTION_H */
> diff --git a/arch/arm64/include/asm/kvm_para.h b/arch/arm64/include/asm/kvm_para.h
> index 0ea481dd1c7a..b2f8ef243df7 100644
> --- a/arch/arm64/include/asm/kvm_para.h
> +++ b/arch/arm64/include/asm/kvm_para.h
> @@ -3,6 +3,20 @@
>  #define _ASM_ARM_KVM_PARA_H
>  
>  #include <uapi/asm/kvm_para.h>
> +#include <linux/of.h>
> +#include <asm/hypervisor.h>
> +
> +#ifdef CONFIG_KVM_GUEST
> +static inline int kvm_para_available(void)
> +{
> +	return 1;
> +}
> +#else
> +static inline int kvm_para_available(void)
> +{
> +	return 0;
> +}
> +#endif /* CONFIG_KVM_GUEST */

Please make these bool, and return true/false, as was the case with the
existing stub.

>  
>  static inline bool kvm_check_and_clear_guest_paused(void)
>  {
> @@ -11,17 +25,16 @@ static inline bool kvm_check_and_clear_guest_paused(void)
>  
>  static inline unsigned int kvm_arch_para_features(void)
>  {
> -	return 0;
> +	unsigned int features = 0;
> +
> +	if (kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_APF))
> +		features |= (1 << KVM_FEATURE_ASYNC_PF);
> +
> +	return features;
>  }
>  
>  static inline unsigned int kvm_arch_para_hints(void)
>  {
>  	return 0;
>  }
> -
> -static inline bool kvm_para_available(void)
> -{
> -	return false;
> -}
> -
>  #endif /* _ASM_ARM_KVM_PARA_H */
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index ddcde093c433..15efd57129ff 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -751,12 +751,45 @@ finish_ret_to_user:
>  	enable_step_tsk x1, x2
>  #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>  	bl	stackleak_erase
> +#endif
> +#ifdef CONFIG_KVM_GUEST
> +	bl	kvm_async_pf_delayed_wake
>  #endif

Yuck. I am very much not keen on this living in the entry assembly.

What precisely is this needed for?

>  	kernel_exit 0
>  ENDPROC(ret_to_user)
>  
>  	.popsection				// .entry.text
>  
> +#ifdef CONFIG_KVM_GUEST
> +	.pushsection ".rodata", "a"
> +SYM_DATA_START(__exception_handlers_offset)
> +	.quad	0
> +	.quad	0
> +	.quad	0
> +	.quad	0
> +	.quad	el1_sync - vectors
> +	.quad	el1_irq - vectors
> +	.quad	0
> +	.quad	el1_error - vectors
> +	.quad	el0_sync - vectors
> +	.quad	el0_irq - vectors
> +	.quad	0
> +	.quad	el0_error - vectors
> +#ifdef CONFIG_COMPAT
> +	.quad	el0_sync_compat - vectors
> +	.quad	el0_irq_compat - vectors
> +	.quad	0
> +	.quad	el0_error_compat - vectors
> +#else
> +	.quad	0
> +	.quad	0
> +	.quad	0
> +	.quad	0
> +#endif
> +SYM_DATA_END(__exception_handlers_offset)
> +	.popsection				// .rodata
> +#endif /* CONFIG_KVM_GUEST */

This looks scary, and needs an introduction in the commit message.

> +
>  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
>  /*
>   * Exception vectors trampoline.
> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> index 56be4cbf771f..5e7ee553566d 100644
> --- a/arch/arm64/kernel/process.c
> +++ b/arch/arm64/kernel/process.c
> @@ -53,6 +53,7 @@
>  #include <asm/processor.h>
>  #include <asm/pointer_auth.h>
>  #include <asm/stacktrace.h>
> +#include <asm/exception.h>
>  
>  #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
>  #include <linux/stackprotector.h>
> @@ -70,6 +71,9 @@ void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
>  
>  static void __cpu_do_idle(void)
>  {
> +#ifdef CONFIG_KVM_GUEST
> +	kvm_async_pf_delayed_wake();
> +#endif
>  	dsb(sy);
>  	wfi();
>  }

This is meant to be a very low-level helper, so I don't think this
should live here.

If nothing else, this needs to have no overhead when async page faults
are not in use, so this probably needs an inline with a static key.


> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index c9cedc0432d2..cbf8b52135c9 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -19,10 +19,12 @@
>  #include <linux/page-flags.h>
>  #include <linux/sched/signal.h>
>  #include <linux/sched/debug.h>
> +#include <linux/swait.h>
>  #include <linux/highmem.h>
>  #include <linux/perf_event.h>
>  #include <linux/preempt.h>
>  #include <linux/hugetlb.h>
> +#include <linux/kvm_para.h>
>  
>  #include <asm/acpi.h>
>  #include <asm/bug.h>
> @@ -48,8 +50,31 @@ struct fault_info {
>  	const char *name;
>  };
>  
> +#ifdef CONFIG_KVM_GUEST
> +struct kvm_task_sleep_node {
> +	struct hlist_node	link;
> +	struct swait_queue_head	wq;
> +	u32			token;
> +	struct task_struct	*task;
> +	int			cpu;
> +	bool			halted;
> +	bool			delayed;
> +};
> +
> +struct kvm_task_sleep_head {
> +	raw_spinlock_t		lock;
> +	struct hlist_head	list;
> +};
> +#endif /* CONFIG_KVM_GUEST */
> +
>  static const struct fault_info fault_info[];
>  static struct fault_info debug_fault_info[];
> +#ifdef CONFIG_KVM_GUEST
> +extern char __exception_handlers_offset[];
> +static bool async_pf_available = true;
> +static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64);
> +static DEFINE_PER_CPU(struct kvm_task_sleep_head, apf_head);
> +#endif
>  
>  static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
>  {
> @@ -717,10 +742,281 @@ static const struct fault_info fault_info[] = {
>  	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
>  };
>  
> +#ifdef CONFIG_KVM_GUEST
> +static inline unsigned int kvm_async_pf_read_enabled(void)
> +{
> +	return __this_cpu_read(apf_data.enabled);
> +}
> +
> +static inline void kvm_async_pf_write_enabled(unsigned int val)
> +{
> +	__this_cpu_write(apf_data.enabled, val);
> +}
> +
> +static inline unsigned int kvm_async_pf_read_reason(void)
> +{
> +	return __this_cpu_read(apf_data.reason);
> +}
> +
> +static inline void kvm_async_pf_write_reason(unsigned int val)
> +{
> +	__this_cpu_write(apf_data.reason, val);
> +}
> +
> +#define kvm_async_pf_lock(b, flags)					\
> +	raw_spin_lock_irqsave(&(b)->lock, (flags))
> +#define kvm_async_pf_trylock(b, flags)					\
> +	raw_spin_trylock_irqsave(&(b)->lock, (flags))
> +#define kvm_async_pf_unlock(b, flags)					\
> +	raw_spin_unlock_irqrestore(&(b)->lock, (flags))
> +#define kvm_async_pf_unlock_and_clear(b, flags)				\
> +	do {								\
> +		raw_spin_unlock_irqrestore(&(b)->lock, (flags));	\
> +		kvm_async_pf_write_reason(0);				\
> +	} while (0)
> +
> +static struct kvm_task_sleep_node *kvm_async_pf_find(
> +		struct kvm_task_sleep_head *b, u32 token)
> +{
> +	struct kvm_task_sleep_node *n;
> +	struct hlist_node *p;
> +
> +	hlist_for_each(p, &b->list) {
> +		n = hlist_entry(p, typeof(*n), link);
> +		if (n->token == token)
> +			return n;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void kvm_async_pf_wait(u32 token, int in_kernel)
> +{
> +	struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head);
> +	struct kvm_task_sleep_node n, *e;
> +	DECLARE_SWAITQUEUE(wait);
> +	unsigned long flags;
> +
> +	kvm_async_pf_lock(b, flags);
> +	e = kvm_async_pf_find(b, token);
> +	if (e) {
> +		/* dummy entry exist -> wake up was delivered ahead of PF */
> +		hlist_del(&e->link);
> +		kfree(e);
> +		kvm_async_pf_unlock_and_clear(b, flags);
> +
> +		return;
> +	}
> +
> +	n.token = token;
> +	n.task = current;
> +	n.cpu = smp_processor_id();
> +	n.halted = is_idle_task(current) ||
> +		   (IS_ENABLED(CONFIG_PREEMPT_COUNT) ?
> +		    preempt_count() > 1 || rcu_preempt_depth() : in_kernel);
> +	n.delayed = false;
> +	init_swait_queue_head(&n.wq);
> +	hlist_add_head(&n.link, &b->list);
> +	kvm_async_pf_unlock_and_clear(b, flags);
> +
> +	for (;;) {
> +		if (!n.halted) {
> +			prepare_to_swait_exclusive(&n.wq, &wait,
> +						   TASK_UNINTERRUPTIBLE);
> +		}
> +
> +		if (hlist_unhashed(&n.link))
> +			break;
> +
> +		if (!n.halted) {
> +			schedule();
> +		} else {
> +			dsb(sy);
> +			wfi();
> +		}

Please don't open-code idle sequences. I strongly suspect this won't
work with pseudo-nmi, and regardless we don't want to duplicate this.

> +	}
> +
> +	if (!n.halted)
> +		finish_swait(&n.wq, &wait);
> +}
> +
> +/*
> + * There are two cases the suspended processed can't be waken up
> + * immediately: The waker is exactly the suspended process, or
> + * the current CPU runqueue has been locked. Otherwise, we might
> + * run into dead-lock.
> + */
> +static inline void kvm_async_pf_wake_one(struct kvm_task_sleep_node *n)
> +{
> +	if (n->task == current ||
> +	    cpu_rq_is_locked(smp_processor_id())) {
> +		n->delayed = true;
> +		return;
> +	}
> +
> +	hlist_del_init(&n->link);
> +	if (n->halted)
> +		smp_send_reschedule(n->cpu);
> +	else
> +		swake_up_one(&n->wq);
> +}
> +
> +void kvm_async_pf_delayed_wake(void)
> +{
> +	struct kvm_task_sleep_head *b;
> +	struct kvm_task_sleep_node *n;
> +	struct hlist_node *p, *next;
> +	unsigned int reason;
> +	unsigned long flags;
> +
> +	if (!kvm_async_pf_read_enabled())
> +		return;
> +
> +	/*
> +	 * We're running in the edging context, we need to complete
> +	 * the work as quick as possible. So we have a preliminary
> +	 * check without holding the lock.
> +	 */

What is 'the edging context'?

> +	b = this_cpu_ptr(&apf_head);
> +	if (hlist_empty(&b->list))
> +		return;
> +
> +	/*
> +	 * Set the async page fault reason to something to avoid
> +	 * receiving the signals, which might cause lock contention
> +	 * and possibly dead-lock. As we're in guest context, it's
> +	 * safe to set the reason here.
> +	 *
> +	 * There might be pending signals. For that case, we needn't
> +	 * do anything. Otherwise, the pending signal will be lost.
> +	 */
> +	reason = kvm_async_pf_read_reason();
> +	if (!reason) {
> +		kvm_async_pf_write_reason(KVM_PV_REASON_PAGE_NOT_PRESENT +
> +					  KVM_PV_REASON_PAGE_READY);
> +	}

Huh? Are we doing this to prevent the host from writing tho this area?

> +
> +	if (!kvm_async_pf_trylock(b, flags))
> +		goto done;
> +
> +	hlist_for_each_safe(p, next, &b->list) {
> +		n = hlist_entry(p, typeof(*n), link);
> +		if (n->cpu != smp_processor_id())
> +			continue;
> +		if (!n->delayed)
> +			continue;
> +
> +		kvm_async_pf_wake_one(n);
> +	}
> +
> +	kvm_async_pf_unlock(b, flags);
> +
> +done:
> +	if (!reason)
> +		kvm_async_pf_write_reason(0);
> +}
> +NOKPROBE_SYMBOL(kvm_async_pf_delayed_wake);
> +
> +static void kvm_async_pf_wake_all(void)
> +{
> +	struct kvm_task_sleep_head *b;
> +	struct kvm_task_sleep_node *n;
> +	struct hlist_node *p, *next;
> +	unsigned long flags;
> +
> +	b = this_cpu_ptr(&apf_head);
> +	kvm_async_pf_lock(b, flags);
> +
> +	hlist_for_each_safe(p, next, &b->list) {
> +		n = hlist_entry(p, typeof(*n), link);
> +		kvm_async_pf_wake_one(n);
> +	}
> +
> +	kvm_async_pf_unlock(b, flags);
> +
> +	kvm_async_pf_write_reason(0);
> +}
> +
> +static void kvm_async_pf_wake(u32 token)
> +{
> +	struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head);
> +	struct kvm_task_sleep_node *n;
> +	unsigned long flags;
> +
> +	if (token == ~0) {
> +		kvm_async_pf_wake_all();
> +		return;
> +	}
> +
> +again:
> +	kvm_async_pf_lock(b, flags);
> +
> +	n = kvm_async_pf_find(b, token);
> +	if (!n) {
> +		/*
> +		 * Async PF was not yet handled. Add dummy entry
> +		 * for the token. Busy wait until other CPU handles
> +		 * the async PF on allocation failure.
> +		 */
> +		n = kzalloc(sizeof(*n), GFP_ATOMIC);
> +		if (!n) {
> +			kvm_async_pf_unlock(b, flags);
> +			cpu_relax();
> +			goto again;
> +		}
> +		n->token = token;
> +		n->task = current;
> +		n->cpu = smp_processor_id();
> +		n->halted = false;
> +		n->delayed = false;
> +		init_swait_queue_head(&n->wq);
> +		hlist_add_head(&n->link, &b->list);
> +	} else {
> +		kvm_async_pf_wake_one(n);
> +	}
> +
> +	kvm_async_pf_unlock_and_clear(b, flags);
> +}
> +
> +static bool do_async_pf(unsigned long addr, unsigned int esr,
> +		       struct pt_regs *regs)
> +{
> +	u32 reason;
> +
> +	if (!kvm_async_pf_read_enabled())
> +		return false;
> +
> +	reason = kvm_async_pf_read_reason();
> +	if (!reason)
> +		return false;
> +
> +	switch (reason) {
> +	case KVM_PV_REASON_PAGE_NOT_PRESENT:
> +		kvm_async_pf_wait((u32)addr, !user_mode(regs));
> +		break;
> +	case KVM_PV_REASON_PAGE_READY:
> +		kvm_async_pf_wake((u32)addr);
> +		break;
> +	default:
> +		if (reason) {
> +			pr_warn("%s: Illegal reason %d\n", __func__, reason);
> +			kvm_async_pf_write_reason(0);
> +		}
> +	}
> +
> +	return true;
> +}
> +#endif /* CONFIG_KVM_GUEST */
> +
>  void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
>  {
>  	const struct fault_info *inf = esr_to_fault_info(esr);
>  
> +#ifdef CONFIG_KVM_GUEST
> +	if (do_async_pf(addr, esr, regs))
> +		return;
> +#endif
> +
>  	if (!inf->fn(addr, esr, regs))
>  		return;
>  
> @@ -878,3 +1174,141 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
>  	debug_exception_exit(regs);
>  }
>  NOKPROBE_SYMBOL(do_debug_exception);
> +
> +#ifdef CONFIG_KVM_GUEST
> +static int __init kvm_async_pf_available(char *arg)
> +{
> +	async_pf_available = false;
> +	return 0;
> +}
> +early_param("no-kvmapf", kvm_async_pf_available);
> +
> +static void kvm_async_pf_enable(bool enable)
> +{
> +	struct arm_smccc_res res;
> +	unsigned long *offsets = (unsigned long *)__exception_handlers_offset;
> +	u32 enabled = kvm_async_pf_read_enabled();
> +	u64 val;
> +	int i;
> +
> +	if (enable == enabled)
> +		return;
> +
> +	if (enable) {
> +		/*
> +		 * Asychonous page faults will be prohibited when CPU runs
> +		 * instructions between the vector base and the maximal
> +		 * offset, plus 4096. The 4096 is the assumped maximal
> +		 * length for individual handler. The hardware registers
> +		 * should be saved to stack at the beginning of the handlers,
> +		 * so 4096 shuld be safe enough.
> +		 */
> +		val = 0;
> +		for (i = 0; i < 16; i++) {
> +			if (offsets[i] > val)
> +				val = offsets[i];
> +		}
> +
> +		val += 4096;

NAK. This assumption is not true, and regardless we should not make any
assumptions of this sort; we should derive this from the code
explicitly. Guessing is not ok.

Given that non-reentrant exception handling code is scattered across at
least:

* kernel/debug-monitors.c
* kernel/entry.S
* kernel/entry-common.S
* kernel/traps.c
* mm/fault.c

... we *cannot* assume that fault handling code is virtually contiguous,
and certainly cannot assume where this falls w.r.t. the architectural
vectors that VBAR_ELx points to.

How does x86 handle this?

Thanks,
Mark.
Paolo Bonzini May 27, 2020, 6:48 a.m. UTC | #2
Hi Gavin,

I definitely appreciate the work, but this is repeating most of the
mistakes done in the x86 implementation.  In particular:

- the page ready signal can be done as an interrupt, rather than an
exception.  This is because "page ready" can be handled asynchronously,
in contrast to "page not present" which must be done on the same
instruction that triggers it.  You can refer to the recent series from
Vitaly Kuznetsov that switched "page ready" to an interrupt.

- the page not present is reusing the memory abort exception, and
there's really no reason to do so.  I think it would be best if ARM
could reserve one ESR exception code for the hypervisor.  Mark, any
ideas how to proceed here?

- for x86 we're also thinking of initiating the page fault from the
exception handler, rather than doing so from the hypervisor before
injecting the exception.  If ARM leads the way here, we would do our
best to share code when x86 does the same.

- do not bother with using KVM_ASYNC_PF_SEND_ALWAYS, it's a fringe case
that adds a lot of complexity.

Also, please include me on further iterations of the series.

Thanks!

Paolo

On 08/05/20 05:29, Gavin Shan wrote:
> This supports asynchronous page fault for the guest. The design is
> similar to what x86 has: on receiving a PAGE_NOT_PRESENT signal from
> the host, the current task is either rescheduled or put into power
> saving mode. The task will be waken up when PAGE_READY signal is
> received. The PAGE_READY signal might be received in the context
> of the suspended process, to be waken up. That means the suspended
> process has to wake up itself, but it's not safe and prone to cause
> dead-lock on CPU runqueue lock. So the wakeup is delayed on returning
> from kernel space to user space or idle process is picked for running.
> 
> The signals are conveyed through the async page fault control block,
> which was passed to host on enabling the functionality. On each page
> fault, the control block is checked and switch to the async page fault
> handling flow if any signals exist.
> 
> The feature is put into the CONFIG_KVM_GUEST umbrella, which is added
> by this patch. So we have inline functions implemented in kvm_para.h,
> like other architectures do, to check if async page fault (one of the
> KVM para-virtualized features) is available. Also, the kernel boot
> parameter "no-kvmapf" can be specified to disable the feature.
> 
> Signed-off-by: Gavin Shan <gshan@redhat.com>
> ---
>  arch/arm64/Kconfig                 |  11 +
>  arch/arm64/include/asm/exception.h |   3 +
>  arch/arm64/include/asm/kvm_para.h  |  27 +-
>  arch/arm64/kernel/entry.S          |  33 +++
>  arch/arm64/kernel/process.c        |   4 +
>  arch/arm64/mm/fault.c              | 434 +++++++++++++++++++++++++++++
>  6 files changed, 505 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 40fb05d96c60..2d5e5ee62d6d 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1045,6 +1045,17 @@ config PARAVIRT
>  	  under a hypervisor, potentially improving performance significantly
>  	  over full virtualization.
>  
> +config KVM_GUEST
> +	bool "KVM Guest Support"
> +	depends on PARAVIRT
> +	default y
> +	help
> +	  This option enables various optimizations for running under the KVM
> +	  hypervisor. Overhead for the kernel when not running inside KVM should
> +	  be minimal.
> +
> +	  In case of doubt, say Y
> +
>  config PARAVIRT_TIME_ACCOUNTING
>  	bool "Paravirtual steal time accounting"
>  	select PARAVIRT
> diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> index 7a6e81ca23a8..d878afa42746 100644
> --- a/arch/arm64/include/asm/exception.h
> +++ b/arch/arm64/include/asm/exception.h
> @@ -46,4 +46,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr);
>  void do_cp15instr(unsigned int esr, struct pt_regs *regs);
>  void do_el0_svc(struct pt_regs *regs);
>  void do_el0_svc_compat(struct pt_regs *regs);
> +#ifdef CONFIG_KVM_GUEST
> +void kvm_async_pf_delayed_wake(void);
> +#endif
>  #endif	/* __ASM_EXCEPTION_H */
> diff --git a/arch/arm64/include/asm/kvm_para.h b/arch/arm64/include/asm/kvm_para.h
> index 0ea481dd1c7a..b2f8ef243df7 100644
> --- a/arch/arm64/include/asm/kvm_para.h
> +++ b/arch/arm64/include/asm/kvm_para.h
> @@ -3,6 +3,20 @@
>  #define _ASM_ARM_KVM_PARA_H
>  
>  #include <uapi/asm/kvm_para.h>
> +#include <linux/of.h>
> +#include <asm/hypervisor.h>
> +
> +#ifdef CONFIG_KVM_GUEST
> +static inline int kvm_para_available(void)
> +{
> +	return 1;
> +}
> +#else
> +static inline int kvm_para_available(void)
> +{
> +	return 0;
> +}
> +#endif /* CONFIG_KVM_GUEST */
>  
>  static inline bool kvm_check_and_clear_guest_paused(void)
>  {
> @@ -11,17 +25,16 @@ static inline bool kvm_check_and_clear_guest_paused(void)
>  
>  static inline unsigned int kvm_arch_para_features(void)
>  {
> -	return 0;
> +	unsigned int features = 0;
> +
> +	if (kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_APF))
> +		features |= (1 << KVM_FEATURE_ASYNC_PF);
> +
> +	return features;
>  }
>  
>  static inline unsigned int kvm_arch_para_hints(void)
>  {
>  	return 0;
>  }
> -
> -static inline bool kvm_para_available(void)
> -{
> -	return false;
> -}
> -
>  #endif /* _ASM_ARM_KVM_PARA_H */
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index ddcde093c433..15efd57129ff 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -751,12 +751,45 @@ finish_ret_to_user:
>  	enable_step_tsk x1, x2
>  #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>  	bl	stackleak_erase
> +#endif
> +#ifdef CONFIG_KVM_GUEST
> +	bl	kvm_async_pf_delayed_wake
>  #endif
>  	kernel_exit 0
>  ENDPROC(ret_to_user)
>  
>  	.popsection				// .entry.text
>  
> +#ifdef CONFIG_KVM_GUEST
> +	.pushsection ".rodata", "a"
> +SYM_DATA_START(__exception_handlers_offset)
> +	.quad	0
> +	.quad	0
> +	.quad	0
> +	.quad	0
> +	.quad	el1_sync - vectors
> +	.quad	el1_irq - vectors
> +	.quad	0
> +	.quad	el1_error - vectors
> +	.quad	el0_sync - vectors
> +	.quad	el0_irq - vectors
> +	.quad	0
> +	.quad	el0_error - vectors
> +#ifdef CONFIG_COMPAT
> +	.quad	el0_sync_compat - vectors
> +	.quad	el0_irq_compat - vectors
> +	.quad	0
> +	.quad	el0_error_compat - vectors
> +#else
> +	.quad	0
> +	.quad	0
> +	.quad	0
> +	.quad	0
> +#endif
> +SYM_DATA_END(__exception_handlers_offset)
> +	.popsection				// .rodata
> +#endif /* CONFIG_KVM_GUEST */
> +
>  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
>  /*
>   * Exception vectors trampoline.
> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> index 56be4cbf771f..5e7ee553566d 100644
> --- a/arch/arm64/kernel/process.c
> +++ b/arch/arm64/kernel/process.c
> @@ -53,6 +53,7 @@
>  #include <asm/processor.h>
>  #include <asm/pointer_auth.h>
>  #include <asm/stacktrace.h>
> +#include <asm/exception.h>
>  
>  #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
>  #include <linux/stackprotector.h>
> @@ -70,6 +71,9 @@ void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
>  
>  static void __cpu_do_idle(void)
>  {
> +#ifdef CONFIG_KVM_GUEST
> +	kvm_async_pf_delayed_wake();
> +#endif
>  	dsb(sy);
>  	wfi();
>  }
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index c9cedc0432d2..cbf8b52135c9 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -19,10 +19,12 @@
>  #include <linux/page-flags.h>
>  #include <linux/sched/signal.h>
>  #include <linux/sched/debug.h>
> +#include <linux/swait.h>
>  #include <linux/highmem.h>
>  #include <linux/perf_event.h>
>  #include <linux/preempt.h>
>  #include <linux/hugetlb.h>
> +#include <linux/kvm_para.h>
>  
>  #include <asm/acpi.h>
>  #include <asm/bug.h>
> @@ -48,8 +50,31 @@ struct fault_info {
>  	const char *name;
>  };
>  
> +#ifdef CONFIG_KVM_GUEST
> +struct kvm_task_sleep_node {
> +	struct hlist_node	link;
> +	struct swait_queue_head	wq;
> +	u32			token;
> +	struct task_struct	*task;
> +	int			cpu;
> +	bool			halted;
> +	bool			delayed;
> +};
> +
> +struct kvm_task_sleep_head {
> +	raw_spinlock_t		lock;
> +	struct hlist_head	list;
> +};
> +#endif /* CONFIG_KVM_GUEST */
> +
>  static const struct fault_info fault_info[];
>  static struct fault_info debug_fault_info[];
> +#ifdef CONFIG_KVM_GUEST
> +extern char __exception_handlers_offset[];
> +static bool async_pf_available = true;
> +static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64);
> +static DEFINE_PER_CPU(struct kvm_task_sleep_head, apf_head);
> +#endif
>  
>  static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
>  {
> @@ -717,10 +742,281 @@ static const struct fault_info fault_info[] = {
>  	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
>  };
>  
> +#ifdef CONFIG_KVM_GUEST
> +static inline unsigned int kvm_async_pf_read_enabled(void)
> +{
> +	return __this_cpu_read(apf_data.enabled);
> +}
> +
> +static inline void kvm_async_pf_write_enabled(unsigned int val)
> +{
> +	__this_cpu_write(apf_data.enabled, val);
> +}
> +
> +static inline unsigned int kvm_async_pf_read_reason(void)
> +{
> +	return __this_cpu_read(apf_data.reason);
> +}
> +
> +static inline void kvm_async_pf_write_reason(unsigned int val)
> +{
> +	__this_cpu_write(apf_data.reason, val);
> +}
> +
> +#define kvm_async_pf_lock(b, flags)					\
> +	raw_spin_lock_irqsave(&(b)->lock, (flags))
> +#define kvm_async_pf_trylock(b, flags)					\
> +	raw_spin_trylock_irqsave(&(b)->lock, (flags))
> +#define kvm_async_pf_unlock(b, flags)					\
> +	raw_spin_unlock_irqrestore(&(b)->lock, (flags))
> +#define kvm_async_pf_unlock_and_clear(b, flags)				\
> +	do {								\
> +		raw_spin_unlock_irqrestore(&(b)->lock, (flags));	\
> +		kvm_async_pf_write_reason(0);				\
> +	} while (0)
> +
> +static struct kvm_task_sleep_node *kvm_async_pf_find(
> +		struct kvm_task_sleep_head *b, u32 token)
> +{
> +	struct kvm_task_sleep_node *n;
> +	struct hlist_node *p;
> +
> +	hlist_for_each(p, &b->list) {
> +		n = hlist_entry(p, typeof(*n), link);
> +		if (n->token == token)
> +			return n;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void kvm_async_pf_wait(u32 token, int in_kernel)
> +{
> +	struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head);
> +	struct kvm_task_sleep_node n, *e;
> +	DECLARE_SWAITQUEUE(wait);
> +	unsigned long flags;
> +
> +	kvm_async_pf_lock(b, flags);
> +	e = kvm_async_pf_find(b, token);
> +	if (e) {
> +		/* dummy entry exist -> wake up was delivered ahead of PF */
> +		hlist_del(&e->link);
> +		kfree(e);
> +		kvm_async_pf_unlock_and_clear(b, flags);
> +
> +		return;
> +	}
> +
> +	n.token = token;
> +	n.task = current;
> +	n.cpu = smp_processor_id();
> +	n.halted = is_idle_task(current) ||
> +		   (IS_ENABLED(CONFIG_PREEMPT_COUNT) ?
> +		    preempt_count() > 1 || rcu_preempt_depth() : in_kernel);
> +	n.delayed = false;
> +	init_swait_queue_head(&n.wq);
> +	hlist_add_head(&n.link, &b->list);
> +	kvm_async_pf_unlock_and_clear(b, flags);
> +
> +	for (;;) {
> +		if (!n.halted) {
> +			prepare_to_swait_exclusive(&n.wq, &wait,
> +						   TASK_UNINTERRUPTIBLE);
> +		}
> +
> +		if (hlist_unhashed(&n.link))
> +			break;
> +
> +		if (!n.halted) {
> +			schedule();
> +		} else {
> +			dsb(sy);
> +			wfi();
> +		}
> +	}
> +
> +	if (!n.halted)
> +		finish_swait(&n.wq, &wait);
> +}
> +
> +/*
> + * There are two cases the suspended processed can't be waken up
> + * immediately: The waker is exactly the suspended process, or
> + * the current CPU runqueue has been locked. Otherwise, we might
> + * run into dead-lock.
> + */
> +static inline void kvm_async_pf_wake_one(struct kvm_task_sleep_node *n)
> +{
> +	if (n->task == current ||
> +	    cpu_rq_is_locked(smp_processor_id())) {
> +		n->delayed = true;
> +		return;
> +	}
> +
> +	hlist_del_init(&n->link);
> +	if (n->halted)
> +		smp_send_reschedule(n->cpu);
> +	else
> +		swake_up_one(&n->wq);
> +}
> +
> +void kvm_async_pf_delayed_wake(void)
> +{
> +	struct kvm_task_sleep_head *b;
> +	struct kvm_task_sleep_node *n;
> +	struct hlist_node *p, *next;
> +	unsigned int reason;
> +	unsigned long flags;
> +
> +	if (!kvm_async_pf_read_enabled())
> +		return;
> +
> +	/*
> +	 * We're running in the edging context, we need to complete
> +	 * the work as quick as possible. So we have a preliminary
> +	 * check without holding the lock.
> +	 */
> +	b = this_cpu_ptr(&apf_head);
> +	if (hlist_empty(&b->list))
> +		return;
> +
> +	/*
> +	 * Set the async page fault reason to something to avoid
> +	 * receiving the signals, which might cause lock contention
> +	 * and possibly dead-lock. As we're in guest context, it's
> +	 * safe to set the reason here.
> +	 *
> +	 * There might be pending signals. For that case, we needn't
> +	 * do anything. Otherwise, the pending signal will be lost.
> +	 */
> +	reason = kvm_async_pf_read_reason();
> +	if (!reason) {
> +		kvm_async_pf_write_reason(KVM_PV_REASON_PAGE_NOT_PRESENT +
> +					  KVM_PV_REASON_PAGE_READY);
> +	}
> +
> +	if (!kvm_async_pf_trylock(b, flags))
> +		goto done;
> +
> +	hlist_for_each_safe(p, next, &b->list) {
> +		n = hlist_entry(p, typeof(*n), link);
> +		if (n->cpu != smp_processor_id())
> +			continue;
> +		if (!n->delayed)
> +			continue;
> +
> +		kvm_async_pf_wake_one(n);
> +	}
> +
> +	kvm_async_pf_unlock(b, flags);
> +
> +done:
> +	if (!reason)
> +		kvm_async_pf_write_reason(0);
> +}
> +NOKPROBE_SYMBOL(kvm_async_pf_delayed_wake);
> +
> +static void kvm_async_pf_wake_all(void)
> +{
> +	struct kvm_task_sleep_head *b;
> +	struct kvm_task_sleep_node *n;
> +	struct hlist_node *p, *next;
> +	unsigned long flags;
> +
> +	b = this_cpu_ptr(&apf_head);
> +	kvm_async_pf_lock(b, flags);
> +
> +	hlist_for_each_safe(p, next, &b->list) {
> +		n = hlist_entry(p, typeof(*n), link);
> +		kvm_async_pf_wake_one(n);
> +	}
> +
> +	kvm_async_pf_unlock(b, flags);
> +
> +	kvm_async_pf_write_reason(0);
> +}
> +
> +static void kvm_async_pf_wake(u32 token)
> +{
> +	struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head);
> +	struct kvm_task_sleep_node *n;
> +	unsigned long flags;
> +
> +	if (token == ~0) {
> +		kvm_async_pf_wake_all();
> +		return;
> +	}
> +
> +again:
> +	kvm_async_pf_lock(b, flags);
> +
> +	n = kvm_async_pf_find(b, token);
> +	if (!n) {
> +		/*
> +		 * Async PF was not yet handled. Add dummy entry
> +		 * for the token. Busy wait until other CPU handles
> +		 * the async PF on allocation failure.
> +		 */
> +		n = kzalloc(sizeof(*n), GFP_ATOMIC);
> +		if (!n) {
> +			kvm_async_pf_unlock(b, flags);
> +			cpu_relax();
> +			goto again;
> +		}
> +		n->token = token;
> +		n->task = current;
> +		n->cpu = smp_processor_id();
> +		n->halted = false;
> +		n->delayed = false;
> +		init_swait_queue_head(&n->wq);
> +		hlist_add_head(&n->link, &b->list);
> +	} else {
> +		kvm_async_pf_wake_one(n);
> +	}
> +
> +	kvm_async_pf_unlock_and_clear(b, flags);
> +}
> +
> +static bool do_async_pf(unsigned long addr, unsigned int esr,
> +		       struct pt_regs *regs)
> +{
> +	u32 reason;
> +
> +	if (!kvm_async_pf_read_enabled())
> +		return false;
> +
> +	reason = kvm_async_pf_read_reason();
> +	if (!reason)
> +		return false;
> +
> +	switch (reason) {
> +	case KVM_PV_REASON_PAGE_NOT_PRESENT:
> +		kvm_async_pf_wait((u32)addr, !user_mode(regs));
> +		break;
> +	case KVM_PV_REASON_PAGE_READY:
> +		kvm_async_pf_wake((u32)addr);
> +		break;
> +	default:
> +		if (reason) {
> +			pr_warn("%s: Illegal reason %d\n", __func__, reason);
> +			kvm_async_pf_write_reason(0);
> +		}
> +	}
> +
> +	return true;
> +}
> +#endif /* CONFIG_KVM_GUEST */
> +
>  void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
>  {
>  	const struct fault_info *inf = esr_to_fault_info(esr);
>  
> +#ifdef CONFIG_KVM_GUEST
> +	if (do_async_pf(addr, esr, regs))
> +		return;
> +#endif
> +
>  	if (!inf->fn(addr, esr, regs))
>  		return;
>  
> @@ -878,3 +1174,141 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
>  	debug_exception_exit(regs);
>  }
>  NOKPROBE_SYMBOL(do_debug_exception);
> +
> +#ifdef CONFIG_KVM_GUEST
> +static int __init kvm_async_pf_available(char *arg)
> +{
> +	async_pf_available = false;
> +	return 0;
> +}
> +early_param("no-kvmapf", kvm_async_pf_available);
> +
> +static void kvm_async_pf_enable(bool enable)
> +{
> +	struct arm_smccc_res res;
> +	unsigned long *offsets = (unsigned long *)__exception_handlers_offset;
> +	u32 enabled = kvm_async_pf_read_enabled();
> +	u64 val;
> +	int i;
> +
> +	if (enable == enabled)
> +		return;
> +
> +	if (enable) {
> +		/*
> +		 * Asychonous page faults will be prohibited when CPU runs
> +		 * instructions between the vector base and the maximal
> +		 * offset, plus 4096. The 4096 is the assumped maximal
> +		 * length for individual handler. The hardware registers
> +		 * should be saved to stack at the beginning of the handlers,
> +		 * so 4096 shuld be safe enough.
> +		 */
> +		val = 0;
> +		for (i = 0; i < 16; i++) {
> +			if (offsets[i] > val)
> +				val = offsets[i];
> +		}
> +
> +		val += 4096;
> +		val |= BIT(62);
> +
> +		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_APF_FUNC_ID,
> +				     (u32)val, (u32)(val >> 32), &res);
> +		if (res.a0 != SMCCC_RET_SUCCESS) {
> +			pr_warn("Async PF configuration error %ld on CPU %d\n",
> +				res.a0, smp_processor_id());
> +			return;
> +		}
> +
> +		/* FIXME: Enable KVM_ASYNC_PF_SEND_ALWAYS */
> +		val = BIT(63);
> +		val |= virt_to_phys(this_cpu_ptr(&apf_data));
> +		val |= KVM_ASYNC_PF_ENABLED;
> +
> +		kvm_async_pf_write_enabled(1);
> +		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_APF_FUNC_ID,
> +				     (u32)val, (u32)(val >> 32), &res);
> +		if (res.a0 != SMCCC_RET_SUCCESS) {
> +			pr_warn("Async PF enable error %ld on CPU %d\n",
> +				res.a0, smp_processor_id());
> +			kvm_async_pf_write_enabled(0);
> +			return;
> +		}
> +	} else {
> +		val = BIT(63);
> +		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_APF_FUNC_ID,
> +				     (u32)val, (u32)(val >> 32), &res);
> +		if (res.a0 != SMCCC_RET_SUCCESS) {
> +			pr_warn("Async PF disable error %ld on CPU %d\n",
> +				res.a0, smp_processor_id());
> +			return;
> +		}
> +
> +		kvm_async_pf_write_enabled(0);
> +	}
> +
> +	pr_info("Async PF %s on CPU %d\n",
> +		enable ? "enabled" : "disabled", smp_processor_id());
> +}
> +
> +static void kvm_async_pf_cpu_reboot(void *unused)
> +{
> +	kvm_async_pf_enable(false);
> +}
> +
> +static int kvm_async_pf_cpu_reboot_notify(struct notifier_block *nb,
> +					  unsigned long code, void *unused)
> +{
> +	if (code == SYS_RESTART)
> +		on_each_cpu(kvm_async_pf_cpu_reboot, NULL, 1);
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block kvm_async_pf_cpu_reboot_nb = {
> +	.notifier_call = kvm_async_pf_cpu_reboot_notify,
> +};
> +
> +static int kvm_async_pf_cpu_online(unsigned int cpu)
> +{
> +	struct kvm_task_sleep_head *b;
> +
> +	b = this_cpu_ptr(&apf_head);
> +	raw_spin_lock_init(&b->lock);
> +	kvm_async_pf_enable(true);
> +	return 0;
> +}
> +
> +static int kvm_async_pf_cpu_offline(unsigned int cpu)
> +{
> +	kvm_async_pf_enable(false);
> +	return 0;
> +}
> +
> +static int __init kvm_async_pf_cpu_init(void)
> +{
> +	struct kvm_task_sleep_head *b;
> +	int ret;
> +
> +	if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) ||
> +	    !async_pf_available)
> +		return -EPERM;
> +
> +	register_reboot_notifier(&kvm_async_pf_cpu_reboot_nb);
> +	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
> +			"arm/kvm:online", kvm_async_pf_cpu_online,
> +			kvm_async_pf_cpu_offline);
> +	if (ret < 0) {
> +		pr_warn("%s: Error %d to install cpu hotplug callbacks\n",
> +			__func__, ret);
> +		return ret;
> +	}
> +
> +	b = this_cpu_ptr(&apf_head);
> +	raw_spin_lock_init(&b->lock);
> +	kvm_async_pf_enable(true);
> +
> +	return 0;
> +}
> +early_initcall(kvm_async_pf_cpu_init);
> +#endif /* CONFIG_KVM_GUEST */
>
Gavin Shan May 28, 2020, 6:14 a.m. UTC | #3
Hi Paolo,

On 5/27/20 4:48 PM, Paolo Bonzini wrote:
> I definitely appreciate the work, but this is repeating most of the
> mistakes done in the x86 implementation.  In particular:
> 
> - the page ready signal can be done as an interrupt, rather than an
> exception.  This is because "page ready" can be handled asynchronously,
> in contrast to "page not present" which must be done on the same
> instruction that triggers it.  You can refer to the recent series from
> Vitaly Kuznetsov that switched "page ready" to an interrupt.
> 

Yeah, page ready can be handled asynchronously. I think it would be
nice for x86/arm64 to share same design. x86 has 256 vectors and it
seems 0xec is picked for the purpose. However, arm64 doesn't have so
many (interrupt/exception) vectors and PPI might be appropriate for
the purpose if I'm correct, because it has same INTD for all CPUs.
 From this point, it's similar to x86's vector. There are 16 PPIs, which
are in range of 16 to 31, and we might reserve one for this. According
to GICv3/v4 spec, 22 - 30 have been assigned.

> - the page not present is reusing the memory abort exception, and
> there's really no reason to do so.  I think it would be best if ARM
> could reserve one ESR exception code for the hypervisor.  Mark, any
> ideas how to proceed here?
> 

Well, a subclass of ESR exception code, whose DFSC (Data Fault Status
Code) is 0x34, was taken for the purpose in RFCv1. The code is IMPDEF
one and Mark suggested not to do so. I agree the page not present needs a
separately subclass of exception. With that, there will be less conflicts
and complexity. However, the question is which subclass or DFSC code I should
used for the purpose.

> - for x86 we're also thinking of initiating the page fault from the
> exception handler, rather than doing so from the hypervisor before
> injecting the exception.  If ARM leads the way here, we would do our
> best to share code when x86 does the same.
> 

Sorry, Paolo, I don't follow your idea here. Could you please provide
more details?

> - do not bother with using KVM_ASYNC_PF_SEND_ALWAYS, it's a fringe case
> that adds a lot of complexity.
> 

Yeah, I don't consider it so far.

> Also, please include me on further iterations of the series.
> 

Sure.

Thanks,
Gavin

[...]
Marc Zyngier May 28, 2020, 7:03 a.m. UTC | #4
On 2020-05-28 07:14, Gavin Shan wrote:
> Hi Paolo,
> 
> On 5/27/20 4:48 PM, Paolo Bonzini wrote:
>> I definitely appreciate the work, but this is repeating most of the
>> mistakes done in the x86 implementation.  In particular:
>> 
>> - the page ready signal can be done as an interrupt, rather than an
>> exception.  This is because "page ready" can be handled 
>> asynchronously,
>> in contrast to "page not present" which must be done on the same
>> instruction that triggers it.  You can refer to the recent series from
>> Vitaly Kuznetsov that switched "page ready" to an interrupt.
>> 
> 
> Yeah, page ready can be handled asynchronously. I think it would be
> nice for x86/arm64 to share same design. x86 has 256 vectors and it
> seems 0xec is picked for the purpose. However, arm64 doesn't have so
> many (interrupt/exception) vectors and PPI might be appropriate for
> the purpose if I'm correct, because it has same INTD for all CPUs.
> From this point, it's similar to x86's vector. There are 16 PPIs, which
> are in range of 16 to 31, and we might reserve one for this. According
> to GICv3/v4 spec, 22 - 30 have been assigned.

The assignment of the PPIs is completely implementation defined,
and is not part of the architecture (and certainly not in the
GICv3/v4 spec). SBSA makes some statements as to the way they *could*
be assigned, but that's in no way enforced. This allocation is entirely
controlled by userspace, which would need to configure tell KVM
which PPI to use on a per-VM basis.

You would then need to describe the PPI assignment through firmware
(both DT and ACPI) so that the guest kernel can know what PPI the
hypervisor would be signalling on.

It is also not very future proof should we move to a different
interrupt architecture.

> 
>> - the page not present is reusing the memory abort exception, and
>> there's really no reason to do so.  I think it would be best if ARM
>> could reserve one ESR exception code for the hypervisor.  Mark, any
>> ideas how to proceed here?
>> 
> 
> Well, a subclass of ESR exception code, whose DFSC (Data Fault Status
> Code) is 0x34, was taken for the purpose in RFCv1. The code is IMPDEF
> one and Mark suggested not to do so. I agree the page not present needs 
> a
> separately subclass of exception. With that, there will be less 
> conflicts
> and complexity. However, the question is which subclass or DFSC code I 
> should
> used for the purpose.

The current state of the architecture doesn't seem to leave much leeway 
in
terms of SW creativity here. You just can't allocate your own ISS 
encoding
without risking a clash with future revisions of the architecture.
It isn't even clear whether the value you put would stick in ESR_EL1
if it isn't a valid value for this CPU (see the definition of 'Reserved'
in the ARM ARM).

Allocating such a syndrome would require from ARM:

- the guarantee that no existing implementation, irrespective of the
   implementer, can cope with the ISS encoding of your choice,

- the written promise in the architecture that some EC/ISS values
   are reserved for SW, and that promise to apply retrospectively.

This is somewhat unlikely to happen.

         M.
Paolo Bonzini May 28, 2020, 10:48 a.m. UTC | #5
On 28/05/20 08:14, Gavin Shan wrote:
>> - for x86 we're also thinking of initiating the page fault from the
>> exception handler, rather than doing so from the hypervisor before
>> injecting the exception.  If ARM leads the way here, we would do our
>> best to share code when x86 does the same.
> 
> Sorry, Paolo, I don't follow your idea here. Could you please provide
> more details?

The idea is to inject stage2 page faults into the guest even before the
host starts populates the page.  The guest then invokes a hypercall,
telling the host to populate the page table and inject the 'page ready'
event (interrupt) when it's done.

For x86 the advantage is that the processor can take care of raising the
stage2 page fault in the guest, so it's faster.

Paolo
Paolo Bonzini May 28, 2020, 10:53 a.m. UTC | #6
On 28/05/20 09:03, Marc Zyngier wrote:
> The current state of the architecture doesn't seem to leave much leeway in
> terms of SW creativity here. You just can't allocate your own ISS encoding
> without risking a clash with future revisions of the architecture.
> It isn't even clear whether the value you put would stick in ESR_EL1
> if it isn't a valid value for this CPU (see the definition of 'Reserved'
> in the ARM ARM).
> 
> Allocating such a syndrome would require from ARM:
> 
> - the guarantee that no existing implementation, irrespective of the
>   implementer, can cope with the ISS encoding of your choice,
> 
> - the written promise in the architecture that some EC/ISS values
>   are reserved for SW, and that promise to apply retrospectively.
> 
> This is somewhat unlikely to happen.

Well, that's a euphemism probably.  On x86 we're somewhat lucky that
there's an architectural way for injecting hypervisor vmexit directly in
the guest, and we can piggyback on that for async page faults (which are
essentially stage2 page faults that are processed by the guest).

Is it possible to reuse EL2 exception codes / syndromes somehow?  (I
haven't checked in the ARM ARM the differences between the EL1 and EL2
syndrome registers).

Paolo
Gavin Shan May 28, 2020, 11:02 p.m. UTC | #7
Hi Paolo,

On 5/28/20 8:48 PM, Paolo Bonzini wrote:
> On 28/05/20 08:14, Gavin Shan wrote:
>>> - for x86 we're also thinking of initiating the page fault from the
>>> exception handler, rather than doing so from the hypervisor before
>>> injecting the exception.  If ARM leads the way here, we would do our
>>> best to share code when x86 does the same.
>>
>> Sorry, Paolo, I don't follow your idea here. Could you please provide
>> more details?
> 
> The idea is to inject stage2 page faults into the guest even before the
> host starts populates the page.  The guest then invokes a hypercall,
> telling the host to populate the page table and inject the 'page ready'
> event (interrupt) when it's done.
> 
> For x86 the advantage is that the processor can take care of raising the
> stage2 page fault in the guest, so it's faster.
> 
I think there might be too much overhead if the page can be populated
quickly by host. For example, it's fast to populate the pages if swapin
isn't involved.

If I'm correct enough, it seems arm64 doesn't have similar mechanism,
routing stage2 page fault to guest.

Thanks,
Gavin
Marc Zyngier May 29, 2020, 9:41 a.m. UTC | #8
On 2020-05-29 00:02, Gavin Shan wrote:
> Hi Paolo,
> 
> On 5/28/20 8:48 PM, Paolo Bonzini wrote:
>> On 28/05/20 08:14, Gavin Shan wrote:
>>>> - for x86 we're also thinking of initiating the page fault from the
>>>> exception handler, rather than doing so from the hypervisor before
>>>> injecting the exception.  If ARM leads the way here, we would do our
>>>> best to share code when x86 does the same.
>>> 
>>> Sorry, Paolo, I don't follow your idea here. Could you please provide
>>> more details?
>> 
>> The idea is to inject stage2 page faults into the guest even before 
>> the
>> host starts populates the page.  The guest then invokes a hypercall,
>> telling the host to populate the page table and inject the 'page 
>> ready'
>> event (interrupt) when it's done.
>> 
>> For x86 the advantage is that the processor can take care of raising 
>> the
>> stage2 page fault in the guest, so it's faster.
>> 
> I think there might be too much overhead if the page can be populated
> quickly by host. For example, it's fast to populate the pages if swapin
> isn't involved.
> 
> If I'm correct enough, it seems arm64 doesn't have similar mechanism,
> routing stage2 page fault to guest.

Indeed, this isn't a thing on arm64. Exception caused by a S2 fault are
always routed to EL2.

         M.
Paolo Bonzini May 29, 2020, 11:11 a.m. UTC | #9
On 29/05/20 11:41, Marc Zyngier wrote:
>>>
>>>
>>> For x86 the advantage is that the processor can take care of raising the
>>> stage2 page fault in the guest, so it's faster.
>>>
>> I think there might be too much overhead if the page can be populated
>> quickly by host. For example, it's fast to populate the pages if swapin
>> isn't involved.

Those would still be handled by the host.  Only those that are not
present in the host (which you can see through the MMU notifier) would
be routed to the guest.  You can do things differently between "not
present fault because the page table does not exist" and "not present
fault because the page is missing in the host".

>> If I'm correct enough, it seems arm64 doesn't have similar mechanism,
>> routing stage2 page fault to guest.
> 
> Indeed, this isn't a thing on arm64. Exception caused by a S2 fault are
> always routed to EL2.

Is there an ARM-approved way to reuse the S2 fault syndromes to detect
async page faults?

(By the way, another "modern" use for async page faults is for postcopy
live migration).

Thanks,

Paolo
Marc Zyngier May 31, 2020, 12:44 p.m. UTC | #10
On 2020-05-29 12:11, Paolo Bonzini wrote:
> On 29/05/20 11:41, Marc Zyngier wrote:
>>>> 
>>>> 
>>>> For x86 the advantage is that the processor can take care of raising 
>>>> the
>>>> stage2 page fault in the guest, so it's faster.
>>>> 
>>> I think there might be too much overhead if the page can be populated
>>> quickly by host. For example, it's fast to populate the pages if 
>>> swapin
>>> isn't involved.
> 
> Those would still be handled by the host.  Only those that are not
> present in the host (which you can see through the MMU notifier) would
> be routed to the guest.  You can do things differently between "not
> present fault because the page table does not exist" and "not present
> fault because the page is missing in the host".
> 
>>> If I'm correct enough, it seems arm64 doesn't have similar mechanism,
>>> routing stage2 page fault to guest.
>> 
>> Indeed, this isn't a thing on arm64. Exception caused by a S2 fault 
>> are
>> always routed to EL2.
> 
> Is there an ARM-approved way to reuse the S2 fault syndromes to detect
> async page faults?

It would mean being able to set an ESR_EL2 register value into ESR_EL1,
and there is nothing in the architecture that would allow that, with
the exception of nested virt: a VHE guest hypervisor running at EL1
must be able to observe S2 faults for its own S2, as synthesized by
the host hypervisor.

The trouble is that:
- there is so far no commercially available CPU supporting NV
- even if you could get hold of such a machine, there is no
   guarantee that such "EL2 syndrome at EL1" is valid outside of
   the nested context
- this doesn't solve the issue for non-NV CPUs anyway

> (By the way, another "modern" use for async page faults is for postcopy
> live migration).

Right. That's definitely a more interesting version of "swap-in".

         M.
Paolo Bonzini June 1, 2020, 9:21 a.m. UTC | #11
On 31/05/20 14:44, Marc Zyngier wrote:
>>
>> Is there an ARM-approved way to reuse the S2 fault syndromes to detect
>> async page faults?
> 
> It would mean being able to set an ESR_EL2 register value into ESR_EL1,
> and there is nothing in the architecture that would allow that,

I understand that this is not what you want to do and I'm not proposing
it, but I want to understand this better: _in practice_ do CPUs check
closely what is written in ESR_EL1?

In any case, the only way to implement this, it seems to me, would be a
completely paravirtualized exception vector that doesn't use ESR at all.

On the other hand, for the page ready (interrupt) side assigning a PPI
seems complicated but doable.

Paolo

> with
> the exception of nested virt: a VHE guest hypervisor running at EL1
> must be able to observe S2 faults for its own S2, as synthesized by
> the host hypervisor.

> The trouble is that:
> - there is so far no commercially available CPU supporting NV
> - even if you could get hold of such a machine, there is no
>   guarantee that such "EL2 syndrome at EL1" is valid outside of
>   the nested context
> - this doesn't solve the issue for non-NV CPUs anyway
Gavin Shan June 2, 2020, 5:44 a.m. UTC | #12
Hi Marc, Paolo,

On 6/1/20 7:21 PM, Paolo Bonzini wrote:
> On 31/05/20 14:44, Marc Zyngier wrote:
>>>
>>> Is there an ARM-approved way to reuse the S2 fault syndromes to detect
>>> async page faults?
>>
>> It would mean being able to set an ESR_EL2 register value into ESR_EL1,
>> and there is nothing in the architecture that would allow that,
> 
> I understand that this is not what you want to do and I'm not proposing
> it, but I want to understand this better: _in practice_ do CPUs check
> closely what is written in ESR_EL1?
> 
> In any case, the only way to implement this, it seems to me, would be a
> completely paravirtualized exception vector that doesn't use ESR at all.
> 
> On the other hand, for the page ready (interrupt) side assigning a PPI
> seems complicated but doable.
> 

Marc suggested to use SDEI in another reply. I think it might be the
appropriate way to deliver page-not-present. To some extent, it could
be regarded as exception, which doesn't use ESR at all. It matches with
what Paolo is thinking of: paravirtualized exception vector that doesn't
use ESR at all. However, it seems it's not supported in kvm-arm yet. So
I assume it needs to be developed from scratch. Marc, could you please
help to confirm? Thanks in advance.

I agree with Paolo PPI (interrupt) might be the best way to deliver
page-ready currently. I don't think SDEI is suitable because there
are no big difference between SDEI and currently used DABT injection
to some extent. With SDEI, We will have the issues we are facing.
For example, some critical code section isn't safe to receive SDEI
if I'm correct.


Thanks,
Gavin

[...]

Patch
diff mbox series

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 40fb05d96c60..2d5e5ee62d6d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1045,6 +1045,17 @@  config PARAVIRT
 	  under a hypervisor, potentially improving performance significantly
 	  over full virtualization.
 
+config KVM_GUEST
+	bool "KVM Guest Support"
+	depends on PARAVIRT
+	default y
+	help
+	  This option enables various optimizations for running under the KVM
+	  hypervisor. Overhead for the kernel when not running inside KVM should
+	  be minimal.
+
+	  In case of doubt, say Y
+
 config PARAVIRT_TIME_ACCOUNTING
 	bool "Paravirtual steal time accounting"
 	select PARAVIRT
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..d878afa42746 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -46,4 +46,7 @@  void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr);
 void do_cp15instr(unsigned int esr, struct pt_regs *regs);
 void do_el0_svc(struct pt_regs *regs);
 void do_el0_svc_compat(struct pt_regs *regs);
+#ifdef CONFIG_KVM_GUEST
+void kvm_async_pf_delayed_wake(void);
+#endif
 #endif	/* __ASM_EXCEPTION_H */
diff --git a/arch/arm64/include/asm/kvm_para.h b/arch/arm64/include/asm/kvm_para.h
index 0ea481dd1c7a..b2f8ef243df7 100644
--- a/arch/arm64/include/asm/kvm_para.h
+++ b/arch/arm64/include/asm/kvm_para.h
@@ -3,6 +3,20 @@ 
 #define _ASM_ARM_KVM_PARA_H
 
 #include <uapi/asm/kvm_para.h>
+#include <linux/of.h>
+#include <asm/hypervisor.h>
+
+#ifdef CONFIG_KVM_GUEST
+static inline int kvm_para_available(void)
+{
+	return 1;
+}
+#else
+static inline int kvm_para_available(void)
+{
+	return 0;
+}
+#endif /* CONFIG_KVM_GUEST */
 
 static inline bool kvm_check_and_clear_guest_paused(void)
 {
@@ -11,17 +25,16 @@  static inline bool kvm_check_and_clear_guest_paused(void)
 
 static inline unsigned int kvm_arch_para_features(void)
 {
-	return 0;
+	unsigned int features = 0;
+
+	if (kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_APF))
+		features |= (1 << KVM_FEATURE_ASYNC_PF);
+
+	return features;
 }
 
 static inline unsigned int kvm_arch_para_hints(void)
 {
 	return 0;
 }
-
-static inline bool kvm_para_available(void)
-{
-	return false;
-}
-
 #endif /* _ASM_ARM_KVM_PARA_H */
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ddcde093c433..15efd57129ff 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -751,12 +751,45 @@  finish_ret_to_user:
 	enable_step_tsk x1, x2
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 	bl	stackleak_erase
+#endif
+#ifdef CONFIG_KVM_GUEST
+	bl	kvm_async_pf_delayed_wake
 #endif
 	kernel_exit 0
 ENDPROC(ret_to_user)
 
 	.popsection				// .entry.text
 
+#ifdef CONFIG_KVM_GUEST
+	.pushsection ".rodata", "a"
+SYM_DATA_START(__exception_handlers_offset)
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	el1_sync - vectors
+	.quad	el1_irq - vectors
+	.quad	0
+	.quad	el1_error - vectors
+	.quad	el0_sync - vectors
+	.quad	el0_irq - vectors
+	.quad	0
+	.quad	el0_error - vectors
+#ifdef CONFIG_COMPAT
+	.quad	el0_sync_compat - vectors
+	.quad	el0_irq_compat - vectors
+	.quad	0
+	.quad	el0_error_compat - vectors
+#else
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+#endif
+SYM_DATA_END(__exception_handlers_offset)
+	.popsection				// .rodata
+#endif /* CONFIG_KVM_GUEST */
+
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 /*
  * Exception vectors trampoline.
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 56be4cbf771f..5e7ee553566d 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -53,6 +53,7 @@ 
 #include <asm/processor.h>
 #include <asm/pointer_auth.h>
 #include <asm/stacktrace.h>
+#include <asm/exception.h>
 
 #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
 #include <linux/stackprotector.h>
@@ -70,6 +71,9 @@  void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
 
 static void __cpu_do_idle(void)
 {
+#ifdef CONFIG_KVM_GUEST
+	kvm_async_pf_delayed_wake();
+#endif
 	dsb(sy);
 	wfi();
 }
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c9cedc0432d2..cbf8b52135c9 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -19,10 +19,12 @@ 
 #include <linux/page-flags.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/swait.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
 #include <linux/preempt.h>
 #include <linux/hugetlb.h>
+#include <linux/kvm_para.h>
 
 #include <asm/acpi.h>
 #include <asm/bug.h>
@@ -48,8 +50,31 @@  struct fault_info {
 	const char *name;
 };
 
+#ifdef CONFIG_KVM_GUEST
+struct kvm_task_sleep_node {
+	struct hlist_node	link;
+	struct swait_queue_head	wq;
+	u32			token;
+	struct task_struct	*task;
+	int			cpu;
+	bool			halted;
+	bool			delayed;
+};
+
+struct kvm_task_sleep_head {
+	raw_spinlock_t		lock;
+	struct hlist_head	list;
+};
+#endif /* CONFIG_KVM_GUEST */
+
 static const struct fault_info fault_info[];
 static struct fault_info debug_fault_info[];
+#ifdef CONFIG_KVM_GUEST
+extern char __exception_handlers_offset[];
+static bool async_pf_available = true;
+static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_task_sleep_head, apf_head);
+#endif
 
 static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
 {
@@ -717,10 +742,281 @@  static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
+#ifdef CONFIG_KVM_GUEST
+static inline unsigned int kvm_async_pf_read_enabled(void)
+{
+	return __this_cpu_read(apf_data.enabled);
+}
+
+static inline void kvm_async_pf_write_enabled(unsigned int val)
+{
+	__this_cpu_write(apf_data.enabled, val);
+}
+
+static inline unsigned int kvm_async_pf_read_reason(void)
+{
+	return __this_cpu_read(apf_data.reason);
+}
+
+static inline void kvm_async_pf_write_reason(unsigned int val)
+{
+	__this_cpu_write(apf_data.reason, val);
+}
+
+#define kvm_async_pf_lock(b, flags)					\
+	raw_spin_lock_irqsave(&(b)->lock, (flags))
+#define kvm_async_pf_trylock(b, flags)					\
+	raw_spin_trylock_irqsave(&(b)->lock, (flags))
+#define kvm_async_pf_unlock(b, flags)					\
+	raw_spin_unlock_irqrestore(&(b)->lock, (flags))
+#define kvm_async_pf_unlock_and_clear(b, flags)				\
+	do {								\
+		raw_spin_unlock_irqrestore(&(b)->lock, (flags));	\
+		kvm_async_pf_write_reason(0);				\
+	} while (0)
+
+static struct kvm_task_sleep_node *kvm_async_pf_find(
+		struct kvm_task_sleep_head *b, u32 token)
+{
+	struct kvm_task_sleep_node *n;
+	struct hlist_node *p;
+
+	hlist_for_each(p, &b->list) {
+		n = hlist_entry(p, typeof(*n), link);
+		if (n->token == token)
+			return n;
+	}
+
+	return NULL;
+}
+
+static void kvm_async_pf_wait(u32 token, int in_kernel)
+{
+	struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head);
+	struct kvm_task_sleep_node n, *e;
+	DECLARE_SWAITQUEUE(wait);
+	unsigned long flags;
+
+	kvm_async_pf_lock(b, flags);
+	e = kvm_async_pf_find(b, token);
+	if (e) {
+		/* dummy entry exist -> wake up was delivered ahead of PF */
+		hlist_del(&e->link);
+		kfree(e);
+		kvm_async_pf_unlock_and_clear(b, flags);
+
+		return;
+	}
+
+	n.token = token;
+	n.task = current;
+	n.cpu = smp_processor_id();
+	n.halted = is_idle_task(current) ||
+		   (IS_ENABLED(CONFIG_PREEMPT_COUNT) ?
+		    preempt_count() > 1 || rcu_preempt_depth() : in_kernel);
+	n.delayed = false;
+	init_swait_queue_head(&n.wq);
+	hlist_add_head(&n.link, &b->list);
+	kvm_async_pf_unlock_and_clear(b, flags);
+
+	for (;;) {
+		if (!n.halted) {
+			prepare_to_swait_exclusive(&n.wq, &wait,
+						   TASK_UNINTERRUPTIBLE);
+		}
+
+		if (hlist_unhashed(&n.link))
+			break;
+
+		if (!n.halted) {
+			schedule();
+		} else {
+			dsb(sy);
+			wfi();
+		}
+	}
+
+	if (!n.halted)
+		finish_swait(&n.wq, &wait);
+}
+
+/*
+ * There are two cases the suspended processed can't be waken up
+ * immediately: The waker is exactly the suspended process, or
+ * the current CPU runqueue has been locked. Otherwise, we might
+ * run into dead-lock.
+ */
+static inline void kvm_async_pf_wake_one(struct kvm_task_sleep_node *n)
+{
+	if (n->task == current ||
+	    cpu_rq_is_locked(smp_processor_id())) {
+		n->delayed = true;
+		return;
+	}
+
+	hlist_del_init(&n->link);
+	if (n->halted)
+		smp_send_reschedule(n->cpu);
+	else
+		swake_up_one(&n->wq);
+}
+
+void kvm_async_pf_delayed_wake(void)
+{
+	struct kvm_task_sleep_head *b;
+	struct kvm_task_sleep_node *n;
+	struct hlist_node *p, *next;
+	unsigned int reason;
+	unsigned long flags;
+
+	if (!kvm_async_pf_read_enabled())
+		return;
+
+	/*
+	 * We're running in the edging context, we need to complete
+	 * the work as quick as possible. So we have a preliminary
+	 * check without holding the lock.
+	 */
+	b = this_cpu_ptr(&apf_head);
+	if (hlist_empty(&b->list))
+		return;
+
+	/*
+	 * Set the async page fault reason to something to avoid
+	 * receiving the signals, which might cause lock contention
+	 * and possibly dead-lock. As we're in guest context, it's
+	 * safe to set the reason here.
+	 *
+	 * There might be pending signals. For that case, we needn't
+	 * do anything. Otherwise, the pending signal will be lost.
+	 */
+	reason = kvm_async_pf_read_reason();
+	if (!reason) {
+		kvm_async_pf_write_reason(KVM_PV_REASON_PAGE_NOT_PRESENT +
+					  KVM_PV_REASON_PAGE_READY);
+	}
+
+	if (!kvm_async_pf_trylock(b, flags))
+		goto done;
+
+	hlist_for_each_safe(p, next, &b->list) {
+		n = hlist_entry(p, typeof(*n), link);
+		if (n->cpu != smp_processor_id())
+			continue;
+		if (!n->delayed)
+			continue;
+
+		kvm_async_pf_wake_one(n);
+	}
+
+	kvm_async_pf_unlock(b, flags);
+
+done:
+	if (!reason)
+		kvm_async_pf_write_reason(0);
+}
+NOKPROBE_SYMBOL(kvm_async_pf_delayed_wake);
+
+static void kvm_async_pf_wake_all(void)
+{
+	struct kvm_task_sleep_head *b;
+	struct kvm_task_sleep_node *n;
+	struct hlist_node *p, *next;
+	unsigned long flags;
+
+	b = this_cpu_ptr(&apf_head);
+	kvm_async_pf_lock(b, flags);
+
+	hlist_for_each_safe(p, next, &b->list) {
+		n = hlist_entry(p, typeof(*n), link);
+		kvm_async_pf_wake_one(n);
+	}
+
+	kvm_async_pf_unlock(b, flags);
+
+	kvm_async_pf_write_reason(0);
+}
+
+static void kvm_async_pf_wake(u32 token)
+{
+	struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head);
+	struct kvm_task_sleep_node *n;
+	unsigned long flags;
+
+	if (token == ~0) {
+		kvm_async_pf_wake_all();
+		return;
+	}
+
+again:
+	kvm_async_pf_lock(b, flags);
+
+	n = kvm_async_pf_find(b, token);
+	if (!n) {
+		/*
+		 * Async PF was not yet handled. Add dummy entry
+		 * for the token. Busy wait until other CPU handles
+		 * the async PF on allocation failure.
+		 */
+		n = kzalloc(sizeof(*n), GFP_ATOMIC);
+		if (!n) {
+			kvm_async_pf_unlock(b, flags);
+			cpu_relax();
+			goto again;
+		}
+		n->token = token;
+		n->task = current;
+		n->cpu = smp_processor_id();
+		n->halted = false;
+		n->delayed = false;
+		init_swait_queue_head(&n->wq);
+		hlist_add_head(&n->link, &b->list);
+	} else {
+		kvm_async_pf_wake_one(n);
+	}
+
+	kvm_async_pf_unlock_and_clear(b, flags);
+}
+
+static bool do_async_pf(unsigned long addr, unsigned int esr,
+		       struct pt_regs *regs)
+{
+	u32 reason;
+
+	if (!kvm_async_pf_read_enabled())
+		return false;
+
+	reason = kvm_async_pf_read_reason();
+	if (!reason)
+		return false;
+
+	switch (reason) {
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+		kvm_async_pf_wait((u32)addr, !user_mode(regs));
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		kvm_async_pf_wake((u32)addr);
+		break;
+	default:
+		if (reason) {
+			pr_warn("%s: Illegal reason %d\n", __func__, reason);
+			kvm_async_pf_write_reason(0);
+		}
+	}
+
+	return true;
+}
+#endif /* CONFIG_KVM_GUEST */
+
 void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
 
+#ifdef CONFIG_KVM_GUEST
+	if (do_async_pf(addr, esr, regs))
+		return;
+#endif
+
 	if (!inf->fn(addr, esr, regs))
 		return;
 
@@ -878,3 +1174,141 @@  void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
 	debug_exception_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug_exception);
+
+#ifdef CONFIG_KVM_GUEST
+static int __init kvm_async_pf_available(char *arg)
+{
+	async_pf_available = false;
+	return 0;
+}
+early_param("no-kvmapf", kvm_async_pf_available);
+
+static void kvm_async_pf_enable(bool enable)
+{
+	struct arm_smccc_res res;
+	unsigned long *offsets = (unsigned long *)__exception_handlers_offset;
+	u32 enabled = kvm_async_pf_read_enabled();
+	u64 val;
+	int i;
+
+	if (enable == enabled)
+		return;
+
+	if (enable) {
+		/*
+		 * Asychonous page faults will be prohibited when CPU runs
+		 * instructions between the vector base and the maximal
+		 * offset, plus 4096. The 4096 is the assumped maximal
+		 * length for individual handler. The hardware registers
+		 * should be saved to stack at the beginning of the handlers,
+		 * so 4096 shuld be safe enough.
+		 */
+		val = 0;
+		for (i = 0; i < 16; i++) {
+			if (offsets[i] > val)
+				val = offsets[i];
+		}
+
+		val += 4096;
+		val |= BIT(62);
+
+		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_APF_FUNC_ID,
+				     (u32)val, (u32)(val >> 32), &res);
+		if (res.a0 != SMCCC_RET_SUCCESS) {
+			pr_warn("Async PF configuration error %ld on CPU %d\n",
+				res.a0, smp_processor_id());
+			return;
+		}
+
+		/* FIXME: Enable KVM_ASYNC_PF_SEND_ALWAYS */
+		val = BIT(63);
+		val |= virt_to_phys(this_cpu_ptr(&apf_data));
+		val |= KVM_ASYNC_PF_ENABLED;
+
+		kvm_async_pf_write_enabled(1);
+		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_APF_FUNC_ID,
+				     (u32)val, (u32)(val >> 32), &res);
+		if (res.a0 != SMCCC_RET_SUCCESS) {
+			pr_warn("Async PF enable error %ld on CPU %d\n",
+				res.a0, smp_processor_id());
+			kvm_async_pf_write_enabled(0);
+			return;
+		}
+	} else {
+		val = BIT(63);
+		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_APF_FUNC_ID,
+				     (u32)val, (u32)(val >> 32), &res);
+		if (res.a0 != SMCCC_RET_SUCCESS) {
+			pr_warn("Async PF disable error %ld on CPU %d\n",
+				res.a0, smp_processor_id());
+			return;
+		}
+
+		kvm_async_pf_write_enabled(0);
+	}
+
+	pr_info("Async PF %s on CPU %d\n",
+		enable ? "enabled" : "disabled", smp_processor_id());
+}
+
+static void kvm_async_pf_cpu_reboot(void *unused)
+{
+	kvm_async_pf_enable(false);
+}
+
+static int kvm_async_pf_cpu_reboot_notify(struct notifier_block *nb,
+					  unsigned long code, void *unused)
+{
+	if (code == SYS_RESTART)
+		on_each_cpu(kvm_async_pf_cpu_reboot, NULL, 1);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_async_pf_cpu_reboot_nb = {
+	.notifier_call = kvm_async_pf_cpu_reboot_notify,
+};
+
+static int kvm_async_pf_cpu_online(unsigned int cpu)
+{
+	struct kvm_task_sleep_head *b;
+
+	b = this_cpu_ptr(&apf_head);
+	raw_spin_lock_init(&b->lock);
+	kvm_async_pf_enable(true);
+	return 0;
+}
+
+static int kvm_async_pf_cpu_offline(unsigned int cpu)
+{
+	kvm_async_pf_enable(false);
+	return 0;
+}
+
+static int __init kvm_async_pf_cpu_init(void)
+{
+	struct kvm_task_sleep_head *b;
+	int ret;
+
+	if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) ||
+	    !async_pf_available)
+		return -EPERM;
+
+	register_reboot_notifier(&kvm_async_pf_cpu_reboot_nb);
+	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+			"arm/kvm:online", kvm_async_pf_cpu_online,
+			kvm_async_pf_cpu_offline);
+	if (ret < 0) {
+		pr_warn("%s: Error %d to install cpu hotplug callbacks\n",
+			__func__, ret);
+		return ret;
+	}
+
+	b = this_cpu_ptr(&apf_head);
+	raw_spin_lock_init(&b->lock);
+	kvm_async_pf_enable(true);
+
+	return 0;
+}
+early_initcall(kvm_async_pf_cpu_init);
+#endif /* CONFIG_KVM_GUEST */