Message ID | 20200508032919.52147-10-gshan@redhat.com (mailing list archive)
---|---
State | New, archived
Series | kvm/arm64: Support Async Page Fault
On Fri, May 08, 2020 at 01:29:19PM +1000, Gavin Shan wrote: > This supports asynchronous page fault for the guest. The design is > similar to what x86 has: on receiving a PAGE_NOT_PRESENT signal from > the host, the current task is either rescheduled or put into power > saving mode. The task will be waken up when PAGE_READY signal is > received. The PAGE_READY signal might be received in the context > of the suspended process, to be waken up. That means the suspended > process has to wake up itself, but it's not safe and prone to cause > dead-lock on CPU runqueue lock. So the wakeup is delayed on returning > from kernel space to user space or idle process is picked for running. > > The signals are conveyed through the async page fault control block, > which was passed to host on enabling the functionality. On each page > fault, the control block is checked and switch to the async page fault > handling flow if any signals exist. > > The feature is put into the CONFIG_KVM_GUEST umbrella, which is added > by this patch. So we have inline functions implemented in kvm_para.h, > like other architectures do, to check if async page fault (one of the > KVM para-virtualized features) is available. Also, the kernel boot > parameter "no-kvmapf" can be specified to disable the feature. > > Signed-off-by: Gavin Shan <gshan@redhat.com> > --- > arch/arm64/Kconfig | 11 + > arch/arm64/include/asm/exception.h | 3 + > arch/arm64/include/asm/kvm_para.h | 27 +- > arch/arm64/kernel/entry.S | 33 +++ > arch/arm64/kernel/process.c | 4 + > arch/arm64/mm/fault.c | 434 +++++++++++++++++++++++++++++ > 6 files changed, 505 insertions(+), 7 deletions(-) > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index 40fb05d96c60..2d5e5ee62d6d 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -1045,6 +1045,17 @@ config PARAVIRT > under a hypervisor, potentially improving performance significantly > over full virtualization. > > +config KVM_GUEST > + bool "KVM Guest Support" > + depends on PARAVIRT > + default y > + help > + This option enables various optimizations for running under the KVM > + hypervisor. Overhead for the kernel when not running inside KVM should > + be minimal. > + > + In case of doubt, say Y > + > config PARAVIRT_TIME_ACCOUNTING > bool "Paravirtual steal time accounting" > select PARAVIRT > diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h > index 7a6e81ca23a8..d878afa42746 100644 > --- a/arch/arm64/include/asm/exception.h > +++ b/arch/arm64/include/asm/exception.h > @@ -46,4 +46,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr); > void do_cp15instr(unsigned int esr, struct pt_regs *regs); > void do_el0_svc(struct pt_regs *regs); > void do_el0_svc_compat(struct pt_regs *regs); > +#ifdef CONFIG_KVM_GUEST > +void kvm_async_pf_delayed_wake(void); > +#endif > #endif /* __ASM_EXCEPTION_H */ > diff --git a/arch/arm64/include/asm/kvm_para.h b/arch/arm64/include/asm/kvm_para.h > index 0ea481dd1c7a..b2f8ef243df7 100644 > --- a/arch/arm64/include/asm/kvm_para.h > +++ b/arch/arm64/include/asm/kvm_para.h > @@ -3,6 +3,20 @@ > #define _ASM_ARM_KVM_PARA_H > > #include <uapi/asm/kvm_para.h> > +#include <linux/of.h> > +#include <asm/hypervisor.h> > + > +#ifdef CONFIG_KVM_GUEST > +static inline int kvm_para_available(void) > +{ > + return 1; > +} > +#else > +static inline int kvm_para_available(void) > +{ > + return 0; > +} > +#endif /* CONFIG_KVM_GUEST */ Please make these bool, and return true/false, as was the case with the existing stub. 
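A minimal sketch of the bool variants being requested here, matching the existing stub that the patch removes (illustration only, not code from the series):

#ifdef CONFIG_KVM_GUEST
static inline bool kvm_para_available(void)
{
	return true;
}
#else
static inline bool kvm_para_available(void)
{
	return false;
}
#endif /* CONFIG_KVM_GUEST */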
> > static inline bool kvm_check_and_clear_guest_paused(void) > { > @@ -11,17 +25,16 @@ static inline bool kvm_check_and_clear_guest_paused(void) > > static inline unsigned int kvm_arch_para_features(void) > { > - return 0; > + unsigned int features = 0; > + > + if (kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_APF)) > + features |= (1 << KVM_FEATURE_ASYNC_PF); > + > + return features; > } > > static inline unsigned int kvm_arch_para_hints(void) > { > return 0; > } > - > -static inline bool kvm_para_available(void) > -{ > - return false; > -} > - > #endif /* _ASM_ARM_KVM_PARA_H */ > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S > index ddcde093c433..15efd57129ff 100644 > --- a/arch/arm64/kernel/entry.S > +++ b/arch/arm64/kernel/entry.S > @@ -751,12 +751,45 @@ finish_ret_to_user: > enable_step_tsk x1, x2 > #ifdef CONFIG_GCC_PLUGIN_STACKLEAK > bl stackleak_erase > +#endif > +#ifdef CONFIG_KVM_GUEST > + bl kvm_async_pf_delayed_wake > #endif Yuck. I am very much not keen on this living in the entry assembly. What precisely is this needed for? > kernel_exit 0 > ENDPROC(ret_to_user) > > .popsection // .entry.text > > +#ifdef CONFIG_KVM_GUEST > + .pushsection ".rodata", "a" > +SYM_DATA_START(__exception_handlers_offset) > + .quad 0 > + .quad 0 > + .quad 0 > + .quad 0 > + .quad el1_sync - vectors > + .quad el1_irq - vectors > + .quad 0 > + .quad el1_error - vectors > + .quad el0_sync - vectors > + .quad el0_irq - vectors > + .quad 0 > + .quad el0_error - vectors > +#ifdef CONFIG_COMPAT > + .quad el0_sync_compat - vectors > + .quad el0_irq_compat - vectors > + .quad 0 > + .quad el0_error_compat - vectors > +#else > + .quad 0 > + .quad 0 > + .quad 0 > + .quad 0 > +#endif > +SYM_DATA_END(__exception_handlers_offset) > + .popsection // .rodata > +#endif /* CONFIG_KVM_GUEST */ This looks scary, and needs an introduction in the commit message. > + > #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 > /* > * Exception vectors trampoline. > diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c > index 56be4cbf771f..5e7ee553566d 100644 > --- a/arch/arm64/kernel/process.c > +++ b/arch/arm64/kernel/process.c > @@ -53,6 +53,7 @@ > #include <asm/processor.h> > #include <asm/pointer_auth.h> > #include <asm/stacktrace.h> > +#include <asm/exception.h> > > #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) > #include <linux/stackprotector.h> > @@ -70,6 +71,9 @@ void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); > > static void __cpu_do_idle(void) > { > +#ifdef CONFIG_KVM_GUEST > + kvm_async_pf_delayed_wake(); > +#endif > dsb(sy); > wfi(); > } This is meant to be a very low-level helper, so I don't think this should live here. If nothing else, this needs to have no overhead when async page faults are not in use, so this probably needs an inline with a static key. 
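A rough sketch of the static-key approach being suggested; the key and wrapper names below are hypothetical, and the key would presumably be flipped from the async-PF enable/disable path:

/* Hypothetical: zero-overhead wrapper for the idle path when async PF
 * is not in use. Only kvm_async_pf_delayed_wake() comes from the patch. */
DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled_key);

static __always_inline void kvm_async_pf_idle_wake(void)
{
	if (static_branch_unlikely(&kvm_async_pf_enabled_key))
		kvm_async_pf_delayed_wake();
}

__cpu_do_idle() would then call kvm_async_pf_idle_wake() instead of the unconditional function call, and kvm_async_pf_enable() would use static_branch_enable()/static_branch_disable() when the per-CPU control block is registered or torn down.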
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c > index c9cedc0432d2..cbf8b52135c9 100644 > --- a/arch/arm64/mm/fault.c > +++ b/arch/arm64/mm/fault.c > @@ -19,10 +19,12 @@ > #include <linux/page-flags.h> > #include <linux/sched/signal.h> > #include <linux/sched/debug.h> > +#include <linux/swait.h> > #include <linux/highmem.h> > #include <linux/perf_event.h> > #include <linux/preempt.h> > #include <linux/hugetlb.h> > +#include <linux/kvm_para.h> > > #include <asm/acpi.h> > #include <asm/bug.h> > @@ -48,8 +50,31 @@ struct fault_info { > const char *name; > }; > > +#ifdef CONFIG_KVM_GUEST > +struct kvm_task_sleep_node { > + struct hlist_node link; > + struct swait_queue_head wq; > + u32 token; > + struct task_struct *task; > + int cpu; > + bool halted; > + bool delayed; > +}; > + > +struct kvm_task_sleep_head { > + raw_spinlock_t lock; > + struct hlist_head list; > +}; > +#endif /* CONFIG_KVM_GUEST */ > + > static const struct fault_info fault_info[]; > static struct fault_info debug_fault_info[]; > +#ifdef CONFIG_KVM_GUEST > +extern char __exception_handlers_offset[]; > +static bool async_pf_available = true; > +static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64); > +static DEFINE_PER_CPU(struct kvm_task_sleep_head, apf_head); > +#endif > > static inline const struct fault_info *esr_to_fault_info(unsigned int esr) > { > @@ -717,10 +742,281 @@ static const struct fault_info fault_info[] = { > { do_bad, SIGKILL, SI_KERNEL, "unknown 63" }, > }; > > +#ifdef CONFIG_KVM_GUEST > +static inline unsigned int kvm_async_pf_read_enabled(void) > +{ > + return __this_cpu_read(apf_data.enabled); > +} > + > +static inline void kvm_async_pf_write_enabled(unsigned int val) > +{ > + __this_cpu_write(apf_data.enabled, val); > +} > + > +static inline unsigned int kvm_async_pf_read_reason(void) > +{ > + return __this_cpu_read(apf_data.reason); > +} > + > +static inline void kvm_async_pf_write_reason(unsigned int val) > +{ > + __this_cpu_write(apf_data.reason, val); > +} > + > +#define kvm_async_pf_lock(b, flags) \ > + raw_spin_lock_irqsave(&(b)->lock, (flags)) > +#define kvm_async_pf_trylock(b, flags) \ > + raw_spin_trylock_irqsave(&(b)->lock, (flags)) > +#define kvm_async_pf_unlock(b, flags) \ > + raw_spin_unlock_irqrestore(&(b)->lock, (flags)) > +#define kvm_async_pf_unlock_and_clear(b, flags) \ > + do { \ > + raw_spin_unlock_irqrestore(&(b)->lock, (flags)); \ > + kvm_async_pf_write_reason(0); \ > + } while (0) > + > +static struct kvm_task_sleep_node *kvm_async_pf_find( > + struct kvm_task_sleep_head *b, u32 token) > +{ > + struct kvm_task_sleep_node *n; > + struct hlist_node *p; > + > + hlist_for_each(p, &b->list) { > + n = hlist_entry(p, typeof(*n), link); > + if (n->token == token) > + return n; > + } > + > + return NULL; > +} > + > +static void kvm_async_pf_wait(u32 token, int in_kernel) > +{ > + struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head); > + struct kvm_task_sleep_node n, *e; > + DECLARE_SWAITQUEUE(wait); > + unsigned long flags; > + > + kvm_async_pf_lock(b, flags); > + e = kvm_async_pf_find(b, token); > + if (e) { > + /* dummy entry exist -> wake up was delivered ahead of PF */ > + hlist_del(&e->link); > + kfree(e); > + kvm_async_pf_unlock_and_clear(b, flags); > + > + return; > + } > + > + n.token = token; > + n.task = current; > + n.cpu = smp_processor_id(); > + n.halted = is_idle_task(current) || > + (IS_ENABLED(CONFIG_PREEMPT_COUNT) ? 
> + preempt_count() > 1 || rcu_preempt_depth() : in_kernel); > + n.delayed = false; > + init_swait_queue_head(&n.wq); > + hlist_add_head(&n.link, &b->list); > + kvm_async_pf_unlock_and_clear(b, flags); > + > + for (;;) { > + if (!n.halted) { > + prepare_to_swait_exclusive(&n.wq, &wait, > + TASK_UNINTERRUPTIBLE); > + } > + > + if (hlist_unhashed(&n.link)) > + break; > + > + if (!n.halted) { > + schedule(); > + } else { > + dsb(sy); > + wfi(); > + } Please don't open-code idle sequences. I strongly suspect this won't work with pseudo-nmi, and regardless we don't want to duplicate this. > + } > + > + if (!n.halted) > + finish_swait(&n.wq, &wait); > +} > + > +/* > + * There are two cases the suspended processed can't be waken up > + * immediately: The waker is exactly the suspended process, or > + * the current CPU runqueue has been locked. Otherwise, we might > + * run into dead-lock. > + */ > +static inline void kvm_async_pf_wake_one(struct kvm_task_sleep_node *n) > +{ > + if (n->task == current || > + cpu_rq_is_locked(smp_processor_id())) { > + n->delayed = true; > + return; > + } > + > + hlist_del_init(&n->link); > + if (n->halted) > + smp_send_reschedule(n->cpu); > + else > + swake_up_one(&n->wq); > +} > + > +void kvm_async_pf_delayed_wake(void) > +{ > + struct kvm_task_sleep_head *b; > + struct kvm_task_sleep_node *n; > + struct hlist_node *p, *next; > + unsigned int reason; > + unsigned long flags; > + > + if (!kvm_async_pf_read_enabled()) > + return; > + > + /* > + * We're running in the edging context, we need to complete > + * the work as quick as possible. So we have a preliminary > + * check without holding the lock. > + */ What is 'the edging context'? > + b = this_cpu_ptr(&apf_head); > + if (hlist_empty(&b->list)) > + return; > + > + /* > + * Set the async page fault reason to something to avoid > + * receiving the signals, which might cause lock contention > + * and possibly dead-lock. As we're in guest context, it's > + * safe to set the reason here. > + * > + * There might be pending signals. For that case, we needn't > + * do anything. Otherwise, the pending signal will be lost. > + */ > + reason = kvm_async_pf_read_reason(); > + if (!reason) { > + kvm_async_pf_write_reason(KVM_PV_REASON_PAGE_NOT_PRESENT + > + KVM_PV_REASON_PAGE_READY); > + } Huh? Are we doing this to prevent the host from writing tho this area? 
> + > + if (!kvm_async_pf_trylock(b, flags)) > + goto done; > + > + hlist_for_each_safe(p, next, &b->list) { > + n = hlist_entry(p, typeof(*n), link); > + if (n->cpu != smp_processor_id()) > + continue; > + if (!n->delayed) > + continue; > + > + kvm_async_pf_wake_one(n); > + } > + > + kvm_async_pf_unlock(b, flags); > + > +done: > + if (!reason) > + kvm_async_pf_write_reason(0); > +} > +NOKPROBE_SYMBOL(kvm_async_pf_delayed_wake); > + > +static void kvm_async_pf_wake_all(void) > +{ > + struct kvm_task_sleep_head *b; > + struct kvm_task_sleep_node *n; > + struct hlist_node *p, *next; > + unsigned long flags; > + > + b = this_cpu_ptr(&apf_head); > + kvm_async_pf_lock(b, flags); > + > + hlist_for_each_safe(p, next, &b->list) { > + n = hlist_entry(p, typeof(*n), link); > + kvm_async_pf_wake_one(n); > + } > + > + kvm_async_pf_unlock(b, flags); > + > + kvm_async_pf_write_reason(0); > +} > + > +static void kvm_async_pf_wake(u32 token) > +{ > + struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head); > + struct kvm_task_sleep_node *n; > + unsigned long flags; > + > + if (token == ~0) { > + kvm_async_pf_wake_all(); > + return; > + } > + > +again: > + kvm_async_pf_lock(b, flags); > + > + n = kvm_async_pf_find(b, token); > + if (!n) { > + /* > + * Async PF was not yet handled. Add dummy entry > + * for the token. Busy wait until other CPU handles > + * the async PF on allocation failure. > + */ > + n = kzalloc(sizeof(*n), GFP_ATOMIC); > + if (!n) { > + kvm_async_pf_unlock(b, flags); > + cpu_relax(); > + goto again; > + } > + n->token = token; > + n->task = current; > + n->cpu = smp_processor_id(); > + n->halted = false; > + n->delayed = false; > + init_swait_queue_head(&n->wq); > + hlist_add_head(&n->link, &b->list); > + } else { > + kvm_async_pf_wake_one(n); > + } > + > + kvm_async_pf_unlock_and_clear(b, flags); > +} > + > +static bool do_async_pf(unsigned long addr, unsigned int esr, > + struct pt_regs *regs) > +{ > + u32 reason; > + > + if (!kvm_async_pf_read_enabled()) > + return false; > + > + reason = kvm_async_pf_read_reason(); > + if (!reason) > + return false; > + > + switch (reason) { > + case KVM_PV_REASON_PAGE_NOT_PRESENT: > + kvm_async_pf_wait((u32)addr, !user_mode(regs)); > + break; > + case KVM_PV_REASON_PAGE_READY: > + kvm_async_pf_wake((u32)addr); > + break; > + default: > + if (reason) { > + pr_warn("%s: Illegal reason %d\n", __func__, reason); > + kvm_async_pf_write_reason(0); > + } > + } > + > + return true; > +} > +#endif /* CONFIG_KVM_GUEST */ > + > void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) > { > const struct fault_info *inf = esr_to_fault_info(esr); > > +#ifdef CONFIG_KVM_GUEST > + if (do_async_pf(addr, esr, regs)) > + return; > +#endif > + > if (!inf->fn(addr, esr, regs)) > return; > > @@ -878,3 +1174,141 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, > debug_exception_exit(regs); > } > NOKPROBE_SYMBOL(do_debug_exception); > + > +#ifdef CONFIG_KVM_GUEST > +static int __init kvm_async_pf_available(char *arg) > +{ > + async_pf_available = false; > + return 0; > +} > +early_param("no-kvmapf", kvm_async_pf_available); > + > +static void kvm_async_pf_enable(bool enable) > +{ > + struct arm_smccc_res res; > + unsigned long *offsets = (unsigned long *)__exception_handlers_offset; > + u32 enabled = kvm_async_pf_read_enabled(); > + u64 val; > + int i; > + > + if (enable == enabled) > + return; > + > + if (enable) { > + /* > + * Asychonous page faults will be prohibited when CPU runs > + * instructions between the vector 
base and the maximal > + * offset, plus 4096. The 4096 is the assumped maximal > + * length for individual handler. The hardware registers > + * should be saved to stack at the beginning of the handlers, > + * so 4096 shuld be safe enough. > + */ > + val = 0; > + for (i = 0; i < 16; i++) { > + if (offsets[i] > val) > + val = offsets[i]; > + } > + > + val += 4096; NAK. This assumption is not true, and regardless we should not make any assumptions of this sort; we should derive this from the code explicitly. Guessing is not ok. Given that non-reentrant exception handling code is scattered across at least: * kernel/debug-monitors.c * kernel/entry.S * kernel/entry-common.S * kernel/traps.c * mm/fault.c ... we *cannot* assume that fault handling code is virtually contiguous, and certainly cannot assume where this falls w.r.t. the architectural vectors that VBAR_ELx points to. How does x86 handle this? Thanks, Mark.
Hi Gavin,

I definitely appreciate the work, but this is repeating most of the mistakes done in the x86 implementation. In particular:

- the page ready signal can be done as an interrupt, rather than an exception. This is because "page ready" can be handled asynchronously, in contrast to "page not present" which must be done on the same instruction that triggers it. You can refer to the recent series from Vitaly Kuznetsov that switched "page ready" to an interrupt.

- the page not present is reusing the memory abort exception, and there's really no reason to do so. I think it would be best if ARM could reserve one ESR exception code for the hypervisor. Mark, any ideas how to proceed here?

- for x86 we're also thinking of initiating the page fault from the exception handler, rather than doing so from the hypervisor before injecting the exception. If ARM leads the way here, we would do our best to share code when x86 does the same.

- do not bother with using KVM_ASYNC_PF_SEND_ALWAYS, it's a fringe case that adds a lot of complexity.

Also, please include me on further iterations of the series. Thanks!

Paolo

On 08/05/20 05:29, Gavin Shan wrote:
[...]
Hi Paolo,

On 5/27/20 4:48 PM, Paolo Bonzini wrote:
> I definitely appreciate the work, but this is repeating most of the mistakes done in the x86 implementation. In particular:
>
> - the page ready signal can be done as an interrupt, rather than an exception. This is because "page ready" can be handled asynchronously, in contrast to "page not present" which must be done on the same instruction that triggers it. You can refer to the recent series from Vitaly Kuznetsov that switched "page ready" to an interrupt.

Yeah, page ready can be handled asynchronously. I think it would be nice for x86/arm64 to share the same design. x86 has 256 vectors and it seems 0xec is picked for the purpose. However, arm64 doesn't have so many (interrupt/exception) vectors, and a PPI might be appropriate for the purpose if I'm correct, because it has the same INTID on all CPUs. From this point of view, it's similar to x86's vector. There are 16 PPIs, in the range 16 to 31, and we might reserve one for this. According to the GICv3/v4 spec, 22 - 30 have been assigned.

> - the page not present is reusing the memory abort exception, and there's really no reason to do so. I think it would be best if ARM could reserve one ESR exception code for the hypervisor. Mark, any ideas how to proceed here?

Well, a subclass of ESR exception code, whose DFSC (Data Fault Status Code) is 0x34, was taken for this purpose in RFCv1. That code is an IMPDEF one and Mark suggested not to use it. I agree that page-not-present needs a separate subclass of exception. With that, there will be fewer conflicts and less complexity. However, the question is which subclass or DFSC code I should use for the purpose.

> - for x86 we're also thinking of initiating the page fault from the exception handler, rather than doing so from the hypervisor before injecting the exception. If ARM leads the way here, we would do our best to share code when x86 does the same.

Sorry, Paolo, I don't follow your idea here. Could you please provide more details?

> - do not bother with using KVM_ASYNC_PF_SEND_ALWAYS, it's a fringe case that adds a lot of complexity.

Yeah, I don't consider it so far.

> Also, please include me on further iterations of the series.

Sure.

Thanks,
Gavin

[...]
On 2020-05-28 07:14, Gavin Shan wrote:
> Hi Paolo,
>
> On 5/27/20 4:48 PM, Paolo Bonzini wrote:
>> I definitely appreciate the work, but this is repeating most of the mistakes done in the x86 implementation. In particular:
>>
>> - the page ready signal can be done as an interrupt, rather than an exception. This is because "page ready" can be handled asynchronously, in contrast to "page not present" which must be done on the same instruction that triggers it. You can refer to the recent series from Vitaly Kuznetsov that switched "page ready" to an interrupt.
>
> Yeah, page ready can be handled asynchronously. I think it would be nice for x86/arm64 to share the same design. x86 has 256 vectors and it seems 0xec is picked for the purpose. However, arm64 doesn't have so many (interrupt/exception) vectors, and a PPI might be appropriate for the purpose if I'm correct, because it has the same INTID on all CPUs. From this point of view, it's similar to x86's vector. There are 16 PPIs, in the range 16 to 31, and we might reserve one for this. According to the GICv3/v4 spec, 22 - 30 have been assigned.

The assignment of the PPIs is completely implementation defined, and is not part of the architecture (and certainly not in the GICv3/v4 spec). SBSA makes some statements as to the way they *could* be assigned, but that's in no way enforced.

This allocation is entirely controlled by userspace, which would need to tell KVM which PPI to use on a per-VM basis. You would then need to describe the PPI assignment through firmware (both DT and ACPI) so that the guest kernel can know which PPI the hypervisor would be signalling on.

It is also not very future proof should we move to a different interrupt architecture.

>> - the page not present is reusing the memory abort exception, and there's really no reason to do so. I think it would be best if ARM could reserve one ESR exception code for the hypervisor. Mark, any ideas how to proceed here?
>
> Well, a subclass of ESR exception code, whose DFSC (Data Fault Status Code) is 0x34, was taken for this purpose in RFCv1. That code is an IMPDEF one and Mark suggested not to use it. I agree that page-not-present needs a separate subclass of exception. With that, there will be fewer conflicts and less complexity. However, the question is which subclass or DFSC code I should use for the purpose.

The current state of the architecture doesn't seem to leave much leeway in terms of SW creativity here. You just can't allocate your own ISS encoding without risking a clash with future revisions of the architecture. It isn't even clear whether the value you put would stick in ESR_EL1 if it isn't a valid value for this CPU (see the definition of 'Reserved' in the ARM ARM).

Allocating such a syndrome would require from ARM:

- the guarantee that no existing implementation, irrespective of the implementer, can cope with the ISS encoding of your choice,

- the written promise in the architecture that some EC/ISS values are reserved for SW, and that promise to apply retrospectively.

This is somewhat unlikely to happen.

M.
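For reference, once firmware describes such a PPI, the guest side would likely follow the usual per-CPU IRQ pattern. The sketch below uses hypothetical names for the handler, cookie and probe path; only request_percpu_irq(), enable_percpu_irq() and irq_of_parse_and_map() are standard kernel APIs:

#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/of_irq.h>
#include <linux/percpu.h>

/* Hypothetical per-CPU cookie for the "page ready" PPI handler. */
static DEFINE_PER_CPU(int, kvm_apf_irq_cookie);

static irqreturn_t kvm_apf_page_ready_handler(int irq, void *dev_id)
{
	/* Token lookup and task wakeup would happen here. */
	return IRQ_HANDLED;
}

static int __init kvm_apf_init_ppi(struct device_node *np)
{
	int irq = irq_of_parse_and_map(np, 0);	/* PPI described by firmware */
	int ret;

	if (!irq)
		return -ENODEV;

	ret = request_percpu_irq(irq, kvm_apf_page_ready_handler,
				 "kvm-apf", &kvm_apf_irq_cookie);
	if (ret)
		return ret;

	/* Must be run on each online CPU, e.g. from a CPU hotplug callback. */
	enable_percpu_irq(irq, IRQ_TYPE_NONE);
	return 0;
}

An ACPI-based guest would obtain the same interrupt number from the relevant table instead of DT; either way the PPI comes from firmware rather than being hard-coded.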
On 28/05/20 08:14, Gavin Shan wrote:
>> - for x86 we're also thinking of initiating the page fault from the exception handler, rather than doing so from the hypervisor before injecting the exception. If ARM leads the way here, we would do our best to share code when x86 does the same.
>
> Sorry, Paolo, I don't follow your idea here. Could you please provide more details?

The idea is to inject stage2 page faults into the guest even before the host starts populating the page. The guest then invokes a hypercall, telling the host to populate the page table and inject the 'page ready' event (interrupt) when it's done.

For x86 the advantage is that the processor can take care of raising the stage2 page fault in the guest, so it's faster.

Paolo
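To make that concrete, here is a deliberately simplified guest-side sketch of the flow Paolo describes; every name is invented, and only the ordering (take the 'page not present' event, ask the host via hypercall to populate the page, then sleep until 'page ready' arrives) reflects the idea:

/*
 * Hypothetical guest-side flow. The hypercall wrapper and wait helper
 * are made-up names; a real implementation would track tokens much like
 * the swait-based code in the patch above.
 */
static void kvm_apf_handle_not_present(u32 token)
{
	/* Ask the host to start populating the faulting page. */
	kvm_hypercall_apf_populate(token);	/* hypothetical hypercall */

	/* Block until the "page ready" interrupt delivers this token. */
	kvm_apf_wait_for_token(token);		/* hypothetical, swait-based */
}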
On 28/05/20 09:03, Marc Zyngier wrote:
> The current state of the architecture doesn't seem to leave much leeway in terms of SW creativity here. You just can't allocate your own ISS encoding without risking a clash with future revisions of the architecture. It isn't even clear whether the value you put would stick in ESR_EL1 if it isn't a valid value for this CPU (see the definition of 'Reserved' in the ARM ARM).
>
> Allocating such a syndrome would require from ARM:
>
> - the guarantee that no existing implementation, irrespective of the implementer, can cope with the ISS encoding of your choice,
>
> - the written promise in the architecture that some EC/ISS values are reserved for SW, and that promise to apply retrospectively.
>
> This is somewhat unlikely to happen.

Well, that's a euphemism probably. On x86 we're somewhat lucky that there's an architectural way for injecting hypervisor vmexit directly in the guest, and we can piggyback on that for async page faults (which are essentially stage2 page faults that are processed by the guest).

Is it possible to reuse EL2 exception codes / syndromes somehow? (I haven't checked in the ARM ARM the differences between the EL1 and EL2 syndrome registers).

Paolo
Hi Paolo,

On 5/28/20 8:48 PM, Paolo Bonzini wrote:
> On 28/05/20 08:14, Gavin Shan wrote:
>>> - for x86 we're also thinking of initiating the page fault from the exception handler, rather than doing so from the hypervisor before injecting the exception. If ARM leads the way here, we would do our best to share code when x86 does the same.
>>
>> Sorry, Paolo, I don't follow your idea here. Could you please provide more details?
>
> The idea is to inject stage2 page faults into the guest even before the host starts populating the page. The guest then invokes a hypercall, telling the host to populate the page table and inject the 'page ready' event (interrupt) when it's done.
>
> For x86 the advantage is that the processor can take care of raising the stage2 page fault in the guest, so it's faster.

I think there might be too much overhead if the page can be populated quickly by the host. For example, it's fast to populate the pages if swapin isn't involved.

If I'm correct, it seems arm64 doesn't have a similar mechanism to route stage2 page faults to the guest.

Thanks,
Gavin
On 2020-05-29 00:02, Gavin Shan wrote:
> Hi Paolo,
>
> On 5/28/20 8:48 PM, Paolo Bonzini wrote:
>> On 28/05/20 08:14, Gavin Shan wrote:
>>>> - for x86 we're also thinking of initiating the page fault from the exception handler, rather than doing so from the hypervisor before injecting the exception. If ARM leads the way here, we would do our best to share code when x86 does the same.
>>>
>>> Sorry, Paolo, I don't follow your idea here. Could you please provide more details?
>>
>> The idea is to inject stage2 page faults into the guest even before the host starts populating the page. The guest then invokes a hypercall, telling the host to populate the page table and inject the 'page ready' event (interrupt) when it's done.
>>
>> For x86 the advantage is that the processor can take care of raising the stage2 page fault in the guest, so it's faster.
>
> I think there might be too much overhead if the page can be populated quickly by the host. For example, it's fast to populate the pages if swapin isn't involved.
>
> If I'm correct, it seems arm64 doesn't have a similar mechanism to route stage2 page faults to the guest.

Indeed, this isn't a thing on arm64. Exceptions caused by an S2 fault are always routed to EL2.

M.
On 29/05/20 11:41, Marc Zyngier wrote:
>>> For x86 the advantage is that the processor can take care of raising the stage2 page fault in the guest, so it's faster.
>>
>> I think there might be too much overhead if the page can be populated quickly by the host. For example, it's fast to populate the pages if swapin isn't involved.

Those would still be handled by the host. Only those that are not present in the host (which you can see through the MMU notifier) would be routed to the guest. You can do things differently between "not present fault because the page table does not exist" and "not present fault because the page is missing in the host".

>> If I'm correct, it seems arm64 doesn't have a similar mechanism to route stage2 page faults to the guest.
>
> Indeed, this isn't a thing on arm64. Exceptions caused by an S2 fault are always routed to EL2.

Is there an ARM-approved way to reuse the S2 fault syndromes to detect async page faults?

(By the way, another "modern" use for async page faults is postcopy live migration.)

Thanks,

Paolo
On 2020-05-29 12:11, Paolo Bonzini wrote:
> On 29/05/20 11:41, Marc Zyngier wrote:
>>>> For x86 the advantage is that the processor can take care of raising the stage2 page fault in the guest, so it's faster.
>>>
>>> I think there might be too much overhead if the page can be populated quickly by the host. For example, it's fast to populate the pages if swapin isn't involved.
>
> Those would still be handled by the host. Only those that are not present in the host (which you can see through the MMU notifier) would be routed to the guest. You can do things differently between "not present fault because the page table does not exist" and "not present fault because the page is missing in the host".
>
>>> If I'm correct, it seems arm64 doesn't have a similar mechanism to route stage2 page faults to the guest.
>>
>> Indeed, this isn't a thing on arm64. Exceptions caused by an S2 fault are always routed to EL2.
>
> Is there an ARM-approved way to reuse the S2 fault syndromes to detect async page faults?

It would mean being able to set an ESR_EL2 register value into ESR_EL1, and there is nothing in the architecture that would allow that, with the exception of nested virt: a VHE guest hypervisor running at EL1 must be able to observe S2 faults for its own S2, as synthesized by the host hypervisor.

The trouble is that:

- there is so far no commercially available CPU supporting NV
- even if you could get hold of such a machine, there is no guarantee that such "EL2 syndrome at EL1" is valid outside of the nested context
- this doesn't solve the issue for non-NV CPUs anyway

> (By the way, another "modern" use for async page faults is postcopy live migration.)

Right. That's definitely a more interesting version of "swap-in".

M.
On 31/05/20 14:44, Marc Zyngier wrote:
>> Is there an ARM-approved way to reuse the S2 fault syndromes to detect async page faults?
>
> It would mean being able to set an ESR_EL2 register value into ESR_EL1, and there is nothing in the architecture that would allow that,

I understand that this is not what you want to do and I'm not proposing it, but I want to understand this better: _in practice_ do CPUs check closely what is written in ESR_EL1?

In any case, the only way to implement this, it seems to me, would be a completely paravirtualized exception vector that doesn't use ESR at all.

On the other hand, for the page ready (interrupt) side assigning a PPI seems complicated but doable.

Paolo

> with the exception of nested virt: a VHE guest hypervisor running at EL1 must be able to observe S2 faults for its own S2, as synthesized by the host hypervisor.
>
> The trouble is that:
> - there is so far no commercially available CPU supporting NV
> - even if you could get hold of such a machine, there is no guarantee that such "EL2 syndrome at EL1" is valid outside of the nested context
> - this doesn't solve the issue for non-NV CPUs anyway
Hi Marc, Paolo,

On 6/1/20 7:21 PM, Paolo Bonzini wrote:
> On 31/05/20 14:44, Marc Zyngier wrote:
>>> Is there an ARM-approved way to reuse the S2 fault syndromes to detect async page faults?
>>
>> It would mean being able to set an ESR_EL2 register value into ESR_EL1, and there is nothing in the architecture that would allow that,
>
> I understand that this is not what you want to do and I'm not proposing it, but I want to understand this better: _in practice_ do CPUs check closely what is written in ESR_EL1?
>
> In any case, the only way to implement this, it seems to me, would be a completely paravirtualized exception vector that doesn't use ESR at all.
>
> On the other hand, for the page ready (interrupt) side assigning a PPI seems complicated but doable.

Marc suggested using SDEI in another reply. I think it might be the appropriate way to deliver page-not-present. To some extent, it can be regarded as an exception which doesn't use ESR at all, which matches what Paolo is thinking of: a paravirtualized exception vector that doesn't use ESR at all. However, it seems SDEI isn't supported in kvm-arm yet, so I assume it would need to be developed from scratch. Marc, could you please help to confirm? Thanks in advance.

I agree with Paolo that a PPI (interrupt) might be the best way to deliver page-ready currently. I don't think SDEI is suitable for that, because to some extent there is no big difference between SDEI and the currently used DABT injection. With SDEI, we would still have the issues we are facing now; for example, some critical code sections aren't safe to receive SDEI, if I'm correct.

Thanks,
Gavin

[...]
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 40fb05d96c60..2d5e5ee62d6d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1045,6 +1045,17 @@ config PARAVIRT under a hypervisor, potentially improving performance significantly over full virtualization. +config KVM_GUEST + bool "KVM Guest Support" + depends on PARAVIRT + default y + help + This option enables various optimizations for running under the KVM + hypervisor. Overhead for the kernel when not running inside KVM should + be minimal. + + In case of doubt, say Y + config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" select PARAVIRT diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 7a6e81ca23a8..d878afa42746 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -46,4 +46,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr); void do_cp15instr(unsigned int esr, struct pt_regs *regs); void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); +#ifdef CONFIG_KVM_GUEST +void kvm_async_pf_delayed_wake(void); +#endif #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/kvm_para.h b/arch/arm64/include/asm/kvm_para.h index 0ea481dd1c7a..b2f8ef243df7 100644 --- a/arch/arm64/include/asm/kvm_para.h +++ b/arch/arm64/include/asm/kvm_para.h @@ -3,6 +3,20 @@ #define _ASM_ARM_KVM_PARA_H #include <uapi/asm/kvm_para.h> +#include <linux/of.h> +#include <asm/hypervisor.h> + +#ifdef CONFIG_KVM_GUEST +static inline int kvm_para_available(void) +{ + return 1; +} +#else +static inline int kvm_para_available(void) +{ + return 0; +} +#endif /* CONFIG_KVM_GUEST */ static inline bool kvm_check_and_clear_guest_paused(void) { @@ -11,17 +25,16 @@ static inline bool kvm_check_and_clear_guest_paused(void) static inline unsigned int kvm_arch_para_features(void) { - return 0; + unsigned int features = 0; + + if (kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_APF)) + features |= (1 << KVM_FEATURE_ASYNC_PF); + + return features; } static inline unsigned int kvm_arch_para_hints(void) { return 0; } - -static inline bool kvm_para_available(void) -{ - return false; -} - #endif /* _ASM_ARM_KVM_PARA_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ddcde093c433..15efd57129ff 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -751,12 +751,45 @@ finish_ret_to_user: enable_step_tsk x1, x2 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK bl stackleak_erase +#endif +#ifdef CONFIG_KVM_GUEST + bl kvm_async_pf_delayed_wake #endif kernel_exit 0 ENDPROC(ret_to_user) .popsection // .entry.text +#ifdef CONFIG_KVM_GUEST + .pushsection ".rodata", "a" +SYM_DATA_START(__exception_handlers_offset) + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad el1_sync - vectors + .quad el1_irq - vectors + .quad 0 + .quad el1_error - vectors + .quad el0_sync - vectors + .quad el0_irq - vectors + .quad 0 + .quad el0_error - vectors +#ifdef CONFIG_COMPAT + .quad el0_sync_compat - vectors + .quad el0_irq_compat - vectors + .quad 0 + .quad el0_error_compat - vectors +#else + .quad 0 + .quad 0 + .quad 0 + .quad 0 +#endif +SYM_DATA_END(__exception_handlers_offset) + .popsection // .rodata +#endif /* CONFIG_KVM_GUEST */ + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 /* * Exception vectors trampoline. 
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 56be4cbf771f..5e7ee553566d 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -53,6 +53,7 @@
 #include <asm/processor.h>
 #include <asm/pointer_auth.h>
 #include <asm/stacktrace.h>
+#include <asm/exception.h>
 
 #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
 #include <linux/stackprotector.h>
@@ -70,6 +71,9 @@ void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
 
 static void __cpu_do_idle(void)
 {
+#ifdef CONFIG_KVM_GUEST
+        kvm_async_pf_delayed_wake();
+#endif
         dsb(sy);
         wfi();
 }
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c9cedc0432d2..cbf8b52135c9 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -19,10 +19,12 @@
 #include <linux/page-flags.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/swait.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
 #include <linux/preempt.h>
 #include <linux/hugetlb.h>
+#include <linux/kvm_para.h>
 
 #include <asm/acpi.h>
 #include <asm/bug.h>
@@ -48,8 +50,31 @@ struct fault_info {
         const char *name;
 };
 
+#ifdef CONFIG_KVM_GUEST
+struct kvm_task_sleep_node {
+        struct hlist_node link;
+        struct swait_queue_head wq;
+        u32 token;
+        struct task_struct *task;
+        int cpu;
+        bool halted;
+        bool delayed;
+};
+
+struct kvm_task_sleep_head {
+        raw_spinlock_t lock;
+        struct hlist_head list;
+};
+#endif /* CONFIG_KVM_GUEST */
+
 static const struct fault_info fault_info[];
 static struct fault_info debug_fault_info[];
+#ifdef CONFIG_KVM_GUEST
+extern char __exception_handlers_offset[];
+static bool async_pf_available = true;
+static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_task_sleep_head, apf_head);
+#endif
 
 static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
 {
@@ -717,10 +742,281 @@ static const struct fault_info fault_info[] = {
         { do_bad,       SIGKILL, SI_KERNEL,     "unknown 63"            },
 };
 
+#ifdef CONFIG_KVM_GUEST
+static inline unsigned int kvm_async_pf_read_enabled(void)
+{
+        return __this_cpu_read(apf_data.enabled);
+}
+
+static inline void kvm_async_pf_write_enabled(unsigned int val)
+{
+        __this_cpu_write(apf_data.enabled, val);
+}
+
+static inline unsigned int kvm_async_pf_read_reason(void)
+{
+        return __this_cpu_read(apf_data.reason);
+}
+
+static inline void kvm_async_pf_write_reason(unsigned int val)
+{
+        __this_cpu_write(apf_data.reason, val);
+}
+
+#define kvm_async_pf_lock(b, flags)                             \
+        raw_spin_lock_irqsave(&(b)->lock, (flags))
+#define kvm_async_pf_trylock(b, flags)                          \
+        raw_spin_trylock_irqsave(&(b)->lock, (flags))
+#define kvm_async_pf_unlock(b, flags)                           \
+        raw_spin_unlock_irqrestore(&(b)->lock, (flags))
+#define kvm_async_pf_unlock_and_clear(b, flags)                 \
+        do {                                                    \
+                raw_spin_unlock_irqrestore(&(b)->lock, (flags));\
+                kvm_async_pf_write_reason(0);                   \
+        } while (0)
+
+static struct kvm_task_sleep_node *kvm_async_pf_find(
+                struct kvm_task_sleep_head *b, u32 token)
+{
+        struct kvm_task_sleep_node *n;
+        struct hlist_node *p;
+
+        hlist_for_each(p, &b->list) {
+                n = hlist_entry(p, typeof(*n), link);
+                if (n->token == token)
+                        return n;
+        }
+
+        return NULL;
+}
+
+static void kvm_async_pf_wait(u32 token, int in_kernel)
+{
+        struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head);
+        struct kvm_task_sleep_node n, *e;
+        DECLARE_SWAITQUEUE(wait);
+        unsigned long flags;
+
+        kvm_async_pf_lock(b, flags);
+        e = kvm_async_pf_find(b, token);
+        if (e) {
+                /* dummy entry exists -> wakeup was delivered ahead of PF */
+                hlist_del(&e->link);
+                kfree(e);
+                kvm_async_pf_unlock_and_clear(b, flags);
+
+                return;
+        }
+
+        n.token = token;
+        n.task = current;
+        n.cpu = smp_processor_id();
+        n.halted = is_idle_task(current) ||
+                   (IS_ENABLED(CONFIG_PREEMPT_COUNT) ?
+                    preempt_count() > 1 || rcu_preempt_depth() : in_kernel);
+        n.delayed = false;
+        init_swait_queue_head(&n.wq);
+        hlist_add_head(&n.link, &b->list);
+        kvm_async_pf_unlock_and_clear(b, flags);
+
+        for (;;) {
+                if (!n.halted) {
+                        prepare_to_swait_exclusive(&n.wq, &wait,
+                                                   TASK_UNINTERRUPTIBLE);
+                }
+
+                if (hlist_unhashed(&n.link))
+                        break;
+
+                if (!n.halted) {
+                        schedule();
+                } else {
+                        dsb(sy);
+                        wfi();
+                }
+        }
+
+        if (!n.halted)
+                finish_swait(&n.wq, &wait);
+}
+
+/*
+ * There are two cases where the suspended process can't be woken up
+ * immediately: the waker is the suspended process itself, or the
+ * current CPU's runqueue is locked. In either case an immediate
+ * wakeup might deadlock, so it is delayed.
+ */
+static inline void kvm_async_pf_wake_one(struct kvm_task_sleep_node *n)
+{
+        if (n->task == current ||
+            cpu_rq_is_locked(smp_processor_id())) {
+                n->delayed = true;
+                return;
+        }
+
+        hlist_del_init(&n->link);
+        if (n->halted)
+                smp_send_reschedule(n->cpu);
+        else
+                swake_up_one(&n->wq);
+}
+
+void kvm_async_pf_delayed_wake(void)
+{
+        struct kvm_task_sleep_head *b;
+        struct kvm_task_sleep_node *n;
+        struct hlist_node *p, *next;
+        unsigned int reason;
+        unsigned long flags;
+
+        if (!kvm_async_pf_read_enabled())
+                return;
+
+        /*
+         * We're running on the exit-to-user or idle path and need to
+         * finish as quickly as possible, so do a preliminary check
+         * without holding the lock.
+         */
+        b = this_cpu_ptr(&apf_head);
+        if (hlist_empty(&b->list))
+                return;
+
+        /*
+         * Set the async page fault reason to a dummy value so that no
+         * new signal is delivered while we work, which might otherwise
+         * cause lock contention and possibly deadlock. As we're in
+         * guest context, it's safe to set the reason here.
+         *
+         * If a signal is already pending, leave the reason untouched
+         * so that the pending signal isn't lost.
+         */
+        reason = kvm_async_pf_read_reason();
+        if (!reason) {
+                kvm_async_pf_write_reason(KVM_PV_REASON_PAGE_NOT_PRESENT +
+                                          KVM_PV_REASON_PAGE_READY);
+        }
+
+        if (!kvm_async_pf_trylock(b, flags))
+                goto done;
+
+        hlist_for_each_safe(p, next, &b->list) {
+                n = hlist_entry(p, typeof(*n), link);
+                if (n->cpu != smp_processor_id())
+                        continue;
+                if (!n->delayed)
+                        continue;
+
+                kvm_async_pf_wake_one(n);
+        }
+
+        kvm_async_pf_unlock(b, flags);
+
+done:
+        if (!reason)
+                kvm_async_pf_write_reason(0);
+}
+NOKPROBE_SYMBOL(kvm_async_pf_delayed_wake);
+
+static void kvm_async_pf_wake_all(void)
+{
+        struct kvm_task_sleep_head *b;
+        struct kvm_task_sleep_node *n;
+        struct hlist_node *p, *next;
+        unsigned long flags;
+
+        b = this_cpu_ptr(&apf_head);
+        kvm_async_pf_lock(b, flags);
+
+        hlist_for_each_safe(p, next, &b->list) {
+                n = hlist_entry(p, typeof(*n), link);
+                kvm_async_pf_wake_one(n);
+        }
+
+        kvm_async_pf_unlock(b, flags);
+
+        kvm_async_pf_write_reason(0);
+}
+
+static void kvm_async_pf_wake(u32 token)
+{
+        struct kvm_task_sleep_head *b = this_cpu_ptr(&apf_head);
+        struct kvm_task_sleep_node *n;
+        unsigned long flags;
+
+        if (token == ~0) {
+                kvm_async_pf_wake_all();
+                return;
+        }
+
+again:
+        kvm_async_pf_lock(b, flags);
+
+        n = kvm_async_pf_find(b, token);
+        if (!n) {
+                /*
+                 * The async PF hasn't been handled yet. Add a dummy
+                 * entry for the token; on allocation failure, busy-wait
+                 * until another CPU handles the async PF.
+                 */
+                n = kzalloc(sizeof(*n), GFP_ATOMIC);
+                if (!n) {
+                        kvm_async_pf_unlock(b, flags);
+                        cpu_relax();
+                        goto again;
+                }
+                n->token = token;
+                n->task = current;
+                n->cpu = smp_processor_id();
+                n->halted = false;
+                n->delayed = false;
+                init_swait_queue_head(&n->wq);
+                hlist_add_head(&n->link, &b->list);
+        } else {
+                kvm_async_pf_wake_one(n);
+        }
+
+        kvm_async_pf_unlock_and_clear(b, flags);
+}
+
+static bool do_async_pf(unsigned long addr, unsigned int esr,
+                        struct pt_regs *regs)
+{
+        u32 reason;
+
+        if (!kvm_async_pf_read_enabled())
+                return false;
+
+        reason = kvm_async_pf_read_reason();
+        if (!reason)
+                return false;
+
+        switch (reason) {
+        case KVM_PV_REASON_PAGE_NOT_PRESENT:
+                kvm_async_pf_wait((u32)addr, !user_mode(regs));
+                break;
+        case KVM_PV_REASON_PAGE_READY:
+                kvm_async_pf_wake((u32)addr);
+                break;
+        default:
+                if (reason) {
+                        pr_warn("%s: Illegal reason %d\n", __func__, reason);
+                        kvm_async_pf_write_reason(0);
+                }
+        }
+
+        return true;
+}
+#endif /* CONFIG_KVM_GUEST */
+
 void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
         const struct fault_info *inf = esr_to_fault_info(esr);
 
+#ifdef CONFIG_KVM_GUEST
+        if (do_async_pf(addr, esr, regs))
+                return;
+#endif
+
         if (!inf->fn(addr, esr, regs))
                 return;
 
@@ -878,3 +1174,141 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
         debug_exception_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug_exception);
+
+#ifdef CONFIG_KVM_GUEST
+static int __init kvm_async_pf_available(char *arg)
+{
+        async_pf_available = false;
+        return 0;
+}
+early_param("no-kvmapf", kvm_async_pf_available);
+
+static void kvm_async_pf_enable(bool enable)
+{
+        struct arm_smccc_res res;
+        unsigned long *offsets = (unsigned long *)__exception_handlers_offset;
+        u32 enabled = kvm_async_pf_read_enabled();
+        u64 val;
+        int i;
+
+        if (enable == enabled)
+                return;
+
+        if (enable) {
+                /*
+                 * Asynchronous page faults are prohibited while the CPU
+                 * executes instructions between the vector base and the
+                 * maximal handler offset, plus 4096. The 4096 is the
+                 * assumed maximal length of an individual handler; the
+                 * hardware registers should be saved to the stack at the
+                 * beginning of each handler, so 4096 should be safe enough.
+                 */
+                val = 0;
+                for (i = 0; i < 16; i++) {
+                        if (offsets[i] > val)
+                                val = offsets[i];
+                }
+
+                val += 4096;
+                val |= BIT(62);
+
+                arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_APF_FUNC_ID,
+                                     (u32)val, (u32)(val >> 32), &res);
+                if (res.a0 != SMCCC_RET_SUCCESS) {
+                        pr_warn("Async PF configuration error %ld on CPU %d\n",
+                                res.a0, smp_processor_id());
+                        return;
+                }
+
+                /* FIXME: Enable KVM_ASYNC_PF_SEND_ALWAYS */
+                val = BIT(63);
+                val |= virt_to_phys(this_cpu_ptr(&apf_data));
+                val |= KVM_ASYNC_PF_ENABLED;
+
+                kvm_async_pf_write_enabled(1);
+                arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_APF_FUNC_ID,
+                                     (u32)val, (u32)(val >> 32), &res);
+                if (res.a0 != SMCCC_RET_SUCCESS) {
+                        pr_warn("Async PF enable error %ld on CPU %d\n",
+                                res.a0, smp_processor_id());
+                        kvm_async_pf_write_enabled(0);
+                        return;
+                }
+        } else {
+                val = BIT(63);
+                arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_APF_FUNC_ID,
+                                     (u32)val, (u32)(val >> 32), &res);
+                if (res.a0 != SMCCC_RET_SUCCESS) {
+                        pr_warn("Async PF disable error %ld on CPU %d\n",
+                                res.a0, smp_processor_id());
+                        return;
+                }
+
+                kvm_async_pf_write_enabled(0);
+        }
+
+        pr_info("Async PF %s on CPU %d\n",
+                enable ? "enabled" : "disabled", smp_processor_id());
+}
+
+static void kvm_async_pf_cpu_reboot(void *unused)
+{
+        kvm_async_pf_enable(false);
+}
+
+static int kvm_async_pf_cpu_reboot_notify(struct notifier_block *nb,
+                                          unsigned long code, void *unused)
+{
+        if (code == SYS_RESTART)
+                on_each_cpu(kvm_async_pf_cpu_reboot, NULL, 1);
+
+        return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_async_pf_cpu_reboot_nb = {
+        .notifier_call = kvm_async_pf_cpu_reboot_notify,
+};
+
+static int kvm_async_pf_cpu_online(unsigned int cpu)
+{
+        struct kvm_task_sleep_head *b;
+
+        b = this_cpu_ptr(&apf_head);
+        raw_spin_lock_init(&b->lock);
+        kvm_async_pf_enable(true);
+        return 0;
+}
+
+static int kvm_async_pf_cpu_offline(unsigned int cpu)
+{
+        kvm_async_pf_enable(false);
+        return 0;
+}
+
+static int __init kvm_async_pf_cpu_init(void)
+{
+        struct kvm_task_sleep_head *b;
+        int ret;
+
+        if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) ||
+            !async_pf_available)
+                return -EPERM;
+
+        register_reboot_notifier(&kvm_async_pf_cpu_reboot_nb);
+        ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+                        "arm/kvm:online", kvm_async_pf_cpu_online,
+                        kvm_async_pf_cpu_offline);
+        if (ret < 0) {
+                pr_warn("%s: Error %d to install cpu hotplug callbacks\n",
+                        __func__, ret);
+                return ret;
+        }
+
+        b = this_cpu_ptr(&apf_head);
+        raw_spin_lock_init(&b->lock);
+        kvm_async_pf_enable(true);
+
+        return 0;
+}
+early_initcall(kvm_async_pf_cpu_init);
+#endif /* CONFIG_KVM_GUEST */