Message ID | 1537524123-9578-9-git-send-email-paulus@ozlabs.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: PPC: Book3S HV: Nested HV virtualization | expand |
On Fri, Sep 21, 2018 at 08:01:39PM +1000, Paul Mackerras wrote: > This creates an alternative guest entry/exit path which is used for > radix guests on POWER9 systems when we have indep_threads_mode=Y. In > these circumstances there is exactly one vcpu per vcore and there is > no coordination required between vcpus or vcores; the vcpu can enter > the guest without needing to synchronize with anything else. > > The new fast path is implemented almost entirely in C in book3s_hv.c > and runs with the MMU on until the guest is entered. On guest exit > we use the existing path until the point where we are committed to > exiting the guest (as distinct from handling an interrupt in the > low-level code and returning to the guest) and we have pulled the > guest context from the XIVE. At that point we check a flag in the > stack frame to see whether we came in via the old path or the new > path; if we came in via the new path then we go back to C code to do > the rest of the process of saving the guest context and restoring the > host context. > > The C code is split into separate functions for handling the > OS-accessible state and the hypervisor state, with the idea that the > latter can be replaced by a hypercall when we implement nested > virtualization. 
> > Signed-off-by: Paul Mackerras <paulus@ozlabs.org> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> > --- > arch/powerpc/include/asm/asm-prototypes.h | 2 + > arch/powerpc/include/asm/kvm_ppc.h | 2 + > arch/powerpc/kvm/book3s_hv.c | 423 +++++++++++++++++++++++++++++- > arch/powerpc/kvm/book3s_hv_ras.c | 2 + > arch/powerpc/kvm/book3s_hv_rmhandlers.S | 95 ++++++- > arch/powerpc/kvm/book3s_xive.c | 63 +++++ > 6 files changed, 583 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h > index 0c1a2b0..5c9b00c 100644 > --- a/arch/powerpc/include/asm/asm-prototypes.h > +++ b/arch/powerpc/include/asm/asm-prototypes.h > @@ -165,4 +165,6 @@ void kvmhv_load_host_pmu(void); > void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use); > void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu); > > +int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu); > + > #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */ > diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h > index 83d61b8..245e564 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -585,6 +585,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); > > extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, > int level, bool line_status); > +extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); > #else > static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, > u32 priority) { return -1; } > @@ -607,6 +608,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur > > static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, > int level, bool line_status) { return -ENODEV; } > +static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } > #endif /* CONFIG_KVM_XIVE */ > > /* > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > 
index 0e17593..8576a7b 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -3080,6 +3080,267 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) > } > > /* > + * Load up hypervisor-mode registers on P9. > + */ > +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu) > +{ > + struct kvmppc_vcore *vc = vcpu->arch.vcore; > + s64 hdec; > + u64 tb, purr, spurr; > + int trap; > + unsigned long host_hfscr = mfspr(SPRN_HFSCR); > + unsigned long host_ciabr = mfspr(SPRN_CIABR); > + unsigned long host_dawr = mfspr(SPRN_DAWR); > + unsigned long host_dawrx = mfspr(SPRN_DAWRX); > + unsigned long host_psscr = mfspr(SPRN_PSSCR); > + unsigned long host_pidr = mfspr(SPRN_PID); > + > + hdec = local_paca->kvm_hstate.dec_expires - mftb(); > + if (hdec < 0) > + return BOOK3S_INTERRUPT_HV_DECREMENTER; > + mtspr(SPRN_HDEC, hdec); > + > + if (vc->tb_offset) { > + u64 new_tb = mftb() + vc->tb_offset; > + mtspr(SPRN_TBU40, new_tb); > + tb = mftb(); > + if ((tb & 0xffffff) < (new_tb & 0xffffff)) > + mtspr(SPRN_TBU40, new_tb + 0x1000000); > + vc->tb_offset_applied = vc->tb_offset; > + } > + > + if (vc->pcr) > + mtspr(SPRN_PCR, vc->pcr); > + mtspr(SPRN_DPDES, vc->dpdes); > + mtspr(SPRN_VTB, vc->vtb); > + > + local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); > + local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); > + mtspr(SPRN_PURR, vcpu->arch.purr); > + mtspr(SPRN_SPURR, vcpu->arch.spurr); > + > + if (cpu_has_feature(CPU_FTR_DAWR)) { > + mtspr(SPRN_DAWR, vcpu->arch.dawr); > + mtspr(SPRN_DAWRX, vcpu->arch.dawrx); > + } > + mtspr(SPRN_CIABR, vcpu->arch.ciabr); > + mtspr(SPRN_IC, vcpu->arch.ic); > + mtspr(SPRN_PID, vcpu->arch.pid); > + > + mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | > + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); > + > + mtspr(SPRN_HFSCR, vcpu->arch.hfscr); > + > + mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0); > + mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1); > + mtspr(SPRN_SPRG2, 
vcpu->arch.shregs.sprg2); > + mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3); > + > + mtspr(SPRN_AMOR, ~0UL); > + > + mtspr(SPRN_LPCR, vc->lpcr); > + isync(); > + > + kvmppc_xive_push_vcpu(vcpu); > + > + mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); > + mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); > + > + trap = __kvmhv_vcpu_entry_p9(vcpu); > + > + /* Advance host PURR/SPURR by the amount used by guest */ > + purr = mfspr(SPRN_PURR); > + spurr = mfspr(SPRN_SPURR); > + mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr + > + purr - vcpu->arch.purr); > + mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr + > + spurr - vcpu->arch.spurr); > + vcpu->arch.purr = purr; > + vcpu->arch.spurr = spurr; > + > + vcpu->arch.ic = mfspr(SPRN_IC); > + vcpu->arch.pid = mfspr(SPRN_PID); > + vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS; > + > + vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0); > + vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1); > + vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); > + vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); > + > + mtspr(SPRN_PSSCR, host_psscr); > + mtspr(SPRN_HFSCR, host_hfscr); > + mtspr(SPRN_CIABR, host_ciabr); > + mtspr(SPRN_DAWR, host_dawr); > + mtspr(SPRN_DAWRX, host_dawrx); > + mtspr(SPRN_PID, host_pidr); > + > + /* > + * Since this is radix, do a eieio; tlbsync; ptesync sequence in > + * case we interrupted the guest between a tlbie and a ptesync. 
> + */ > + asm volatile("eieio; tlbsync; ptesync"); > + > + mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */ > + isync(); > + > + vc->dpdes = mfspr(SPRN_DPDES); > + vc->vtb = mfspr(SPRN_VTB); > + mtspr(SPRN_DPDES, 0); > + if (vc->pcr) > + mtspr(SPRN_PCR, 0); > + > + if (vc->tb_offset_applied) { > + u64 new_tb = mftb() - vc->tb_offset_applied; > + mtspr(SPRN_TBU40, new_tb); > + tb = mftb(); > + if ((tb & 0xffffff) < (new_tb & 0xffffff)) > + mtspr(SPRN_TBU40, new_tb + 0x1000000); > + vc->tb_offset_applied = 0; > + } > + > + mtspr(SPRN_HDEC, 0x7fffffff); > + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); > + > + return trap; > +} > + > +/* > + * Virtual-mode guest entry for POWER9 and later when the host and > + * guest are both using the radix MMU. The LPIDR has already been set. > + */ > +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu) > +{ > + struct kvmppc_vcore *vc = vcpu->arch.vcore; > + unsigned long host_dscr = mfspr(SPRN_DSCR); > + unsigned long host_tidr = mfspr(SPRN_TIDR); > + unsigned long host_iamr = mfspr(SPRN_IAMR); > + s64 dec; > + u64 tb; > + int trap, save_pmu; > + > + dec = mfspr(SPRN_DEC); > + tb = mftb(); > + if (dec < 512) > + return BOOK3S_INTERRUPT_HV_DECREMENTER; > + local_paca->kvm_hstate.dec_expires = dec + tb; > + > + vcpu->arch.ceded = 0; > + > + kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */ > + > + kvmppc_subcore_enter_guest(); > + > + vc->entry_exit_map = 1; > + vc->in_guest = 1; > + > + if (vcpu->arch.vpa.pinned_addr) { > + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; > + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; > + lp->yield_count = cpu_to_be32(yield_count); > + vcpu->arch.vpa.dirty = 1; > + } > + > + if (cpu_has_feature(CPU_FTR_TM) || > + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) > + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > + > + kvmhv_load_guest_pmu(vcpu); > + > + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); > + load_fp_state(&vcpu->arch.fp); > + 
load_vr_state(&vcpu->arch.vr); > + > + mtspr(SPRN_DSCR, vcpu->arch.dscr); > + mtspr(SPRN_IAMR, vcpu->arch.iamr); > + mtspr(SPRN_PSPB, vcpu->arch.pspb); > + mtspr(SPRN_FSCR, vcpu->arch.fscr); > + mtspr(SPRN_TAR, vcpu->arch.tar); > + mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); > + mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); > + mtspr(SPRN_BESCR, vcpu->arch.bescr); > + mtspr(SPRN_WORT, vcpu->arch.wort); > + mtspr(SPRN_TIDR, vcpu->arch.tid); > + mtspr(SPRN_DAR, vcpu->arch.shregs.dar); > + mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); > + mtspr(SPRN_AMR, vcpu->arch.amr); > + mtspr(SPRN_UAMOR, vcpu->arch.uamor); > + > + if (!(vcpu->arch.ctrl & 1)) > + mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); > + > + mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb()); > + > + if (vcpu->arch.doorbell_request) { > + vc->dpdes = 1; > + smp_wmb(); > + vcpu->arch.doorbell_request = 0; > + } > + > + trap = kvmhv_load_hv_regs_and_go(vcpu); > + > + vcpu->arch.slb_max = 0; > + dec = mfspr(SPRN_DEC); > + tb = mftb(); > + vcpu->arch.dec_expires = dec + tb; > + vcpu->cpu = -1; > + vcpu->arch.thread_cpu = -1; > + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); > + > + vcpu->arch.iamr = mfspr(SPRN_IAMR); > + vcpu->arch.pspb = mfspr(SPRN_PSPB); > + vcpu->arch.fscr = mfspr(SPRN_FSCR); > + vcpu->arch.tar = mfspr(SPRN_TAR); > + vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); > + vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); > + vcpu->arch.bescr = mfspr(SPRN_BESCR); > + vcpu->arch.wort = mfspr(SPRN_WORT); > + vcpu->arch.tid = mfspr(SPRN_TIDR); > + vcpu->arch.amr = mfspr(SPRN_AMR); > + vcpu->arch.uamor = mfspr(SPRN_UAMOR); > + vcpu->arch.dscr = mfspr(SPRN_DSCR); > + > + mtspr(SPRN_PSPB, 0); > + mtspr(SPRN_WORT, 0); > + mtspr(SPRN_AMR, 0); > + mtspr(SPRN_UAMOR, 0); > + mtspr(SPRN_DSCR, host_dscr); > + mtspr(SPRN_TIDR, host_tidr); > + mtspr(SPRN_IAMR, host_iamr); > + mtspr(SPRN_PSPB, 0); > + > + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); > + store_fp_state(&vcpu->arch.fp); > + store_vr_state(&vcpu->arch.vr); > + > + if (cpu_has_feature(CPU_FTR_TM) || 
> + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) > + kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > + > + save_pmu = 1; > + if (vcpu->arch.vpa.pinned_addr) { > + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; > + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; > + lp->yield_count = cpu_to_be32(yield_count); > + vcpu->arch.vpa.dirty = 1; > + save_pmu = lp->pmcregs_in_use; > + } > + > + kvmhv_save_guest_pmu(vcpu, save_pmu); > + > + vc->entry_exit_map = 0x101; > + vc->in_guest = 0; > + > + mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); > + > + kvmhv_load_host_pmu(); > + > + kvmppc_subcore_exit_guest(); > + > + return trap; > +} > + > +/* > * Wait for some other vcpu thread to execute us, and > * wake us up when we need to handle something in the host. > */ > @@ -3405,6 +3666,163 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) > return vcpu->arch.ret; > } > > +static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, > + struct kvm_vcpu *vcpu) > +{ > + int trap, r, pcpu, pcpu0; > + int srcu_idx; > + struct kvmppc_vcore *vc; > + > + trace_kvmppc_run_vcpu_enter(vcpu); > + > + kvm_run->exit_reason = 0; > + vcpu->arch.ret = RESUME_GUEST; > + vcpu->arch.trap = 0; > + > + vc = vcpu->arch.vcore; > + vcpu->arch.ceded = 0; > + vcpu->arch.run_task = current; > + vcpu->arch.kvm_run = kvm_run; > + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); > + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; > + vcpu->arch.busy_preempt = TB_NIL; > + vcpu->arch.last_inst = KVM_INST_FETCH_FAILED; > + vc->runnable_threads[0] = vcpu; > + vc->n_runnable = 1; > + vc->runner = vcpu; > + > + /* See if the MMU is ready to go */ > + if (!vcpu->kvm->arch.mmu_ready) { > + r = kvmhv_setup_mmu(vcpu); > + if (r) { > + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; > + kvm_run->fail_entry. 
> + hardware_entry_failure_reason = 0; > + vcpu->arch.ret = r; > + goto out; > + } > + } > + > + if (need_resched()) > + cond_resched(); > + > + kvmppc_update_vpas(vcpu); > + > + init_vcore_to_run(vc); > + vc->preempt_tb = TB_NIL; > + > + preempt_disable(); > + pcpu = smp_processor_id(); > + vc->pcpu = pcpu; > + kvmppc_prepare_radix_vcpu(vcpu, pcpu); > + > + local_irq_disable(); > + hard_irq_disable(); > + if (signal_pending(current)) > + goto sigpend; > + if (lazy_irq_pending() || need_resched() || !vcpu->kvm->arch.mmu_ready) > + goto out; > + > + kvmppc_core_prepare_to_enter(vcpu); > + > + kvmppc_clear_host_core(pcpu); > + > + local_paca->kvm_hstate.tid = 0; > + local_paca->kvm_hstate.napping = 0; > + local_paca->kvm_hstate.kvm_split_mode = NULL; > + kvmppc_start_thread(vcpu, vc); > + kvmppc_create_dtl_entry(vcpu, vc); > + trace_kvm_guest_enter(vcpu); > + > + vc->vcore_state = VCORE_RUNNING; > + trace_kvmppc_run_core(vc, 0); > + > + mtspr(SPRN_LPID, vc->kvm->arch.lpid); > + isync(); > + > + /* See comment above in kvmppc_run_core() about this */ > + pcpu0 = pcpu; > + if (cpu_has_feature(CPU_FTR_ARCH_300)) > + pcpu0 &= ~0x3UL; > + > + if (cpumask_test_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush)) { > + radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid); > + /* Clear the bit after the TLB flush */ > + cpumask_clear_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush); > + } > + > + trace_hardirqs_on(); > + guest_enter_irqoff(); > + > + srcu_idx = srcu_read_lock(&vc->kvm->srcu); > + > + this_cpu_disable_ftrace(); > + > + trap = kvmhv_p9_guest_entry(vcpu); > + vcpu->arch.trap = trap; > + > + this_cpu_enable_ftrace(); > + > + srcu_read_unlock(&vc->kvm->srcu, srcu_idx); > + > + mtspr(SPRN_LPID, vc->kvm->arch.host_lpid); > + isync(); > + > + trace_hardirqs_off(); > + set_irq_happened(trap); > + > + kvmppc_set_host_core(pcpu); > + > + local_irq_enable(); > + guest_exit(); > + preempt_enable(); > + > + /* cancel pending decrementer exception if DEC is now positive */ > + if 
(get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) > + kvmppc_core_dequeue_dec(vcpu); > + > + trace_kvm_guest_exit(vcpu); > + r = RESUME_GUEST; > + if (trap) > + r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); > + vcpu->arch.ret = r; > + > + if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded && > + !kvmppc_vcpu_woken(vcpu)) { > + kvmppc_set_timer(vcpu); > + while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) { > + if (signal_pending(current)) { > + vcpu->stat.signal_exits++; > + kvm_run->exit_reason = KVM_EXIT_INTR; > + vcpu->arch.ret = -EINTR; > + break; > + } > + spin_lock(&vc->lock); > + kvmppc_vcore_blocked(vc); > + spin_unlock(&vc->lock); > + } > + } > + vcpu->arch.ceded = 0; > + > + vc->vcore_state = VCORE_INACTIVE; > + trace_kvmppc_run_core(vc, 1); > + > + done: > + kvmppc_remove_runnable(vc, vcpu); > + trace_kvmppc_run_vcpu_exit(vcpu, kvm_run); > + > + return vcpu->arch.ret; > + > + sigpend: > + vcpu->stat.signal_exits++; > + kvm_run->exit_reason = KVM_EXIT_INTR; > + vcpu->arch.ret = -EINTR; > + out: > + local_irq_enable(); > + preempt_enable(); > + goto done; > +} > + > static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) > { > int r; > @@ -3480,7 +3898,10 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) > vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; > > do { > - r = kvmppc_run_vcpu(run, vcpu); > + if (kvm->arch.threads_indep && kvm_is_radix(kvm)) > + r = kvmppc_run_single_vcpu(run, vcpu); > + else > + r = kvmppc_run_vcpu(run, vcpu); > > if (run->exit_reason == KVM_EXIT_PAPR_HCALL && > !(vcpu->arch.shregs.msr & MSR_PR)) { > diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c > index ee564b6..0787f12 100644 > --- a/arch/powerpc/kvm/book3s_hv_ras.c > +++ b/arch/powerpc/kvm/book3s_hv_ras.c > @@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void) > > local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; > } > 
+EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest); > > void kvmppc_subcore_exit_guest(void) > { > @@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void) > > local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; > } > +EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest); > > static bool kvmppc_tb_resync_required(void) > { > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > index 67a847f..2abc336 100644 > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > @@ -47,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) > #define NAPPING_NOVCPU 2 > > /* Stack frame offsets for kvmppc_hv_entry */ > -#define SFS 160 > +#define SFS 208 > #define STACK_SLOT_TRAP (SFS-4) > +#define STACK_SLOT_SHORT_PATH (SFS-8) > #define STACK_SLOT_TID (SFS-16) > #define STACK_SLOT_PSSCR (SFS-24) > #define STACK_SLOT_PID (SFS-32) > @@ -57,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) > #define STACK_SLOT_DAWR (SFS-56) > #define STACK_SLOT_DAWRX (SFS-64) > #define STACK_SLOT_HFSCR (SFS-72) > +/* the following is used by the P9 short path */ > +#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ > > /* > * Call kvmppc_hv_entry in real mode. 
> @@ -1020,6 +1023,9 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) > no_xive: > #endif /* CONFIG_KVM_XICS */ > > + li r0, 0 > + stw r0, STACK_SLOT_SHORT_PATH(r1) > + > deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */ > /* Check if we can deliver an external or decrementer interrupt now */ > ld r0, VCPU_PENDING_EXC(r4) > @@ -1034,13 +1040,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) > bl kvmppc_guest_entry_inject_int > ld r4, HSTATE_KVM_VCPU(r13) > 71: > - ld r10, VCPU_PC(r4) > - ld r11, VCPU_MSR(r4) > ld r6, VCPU_SRR0(r4) > ld r7, VCPU_SRR1(r4) > mtspr SPRN_SRR0, r6 > mtspr SPRN_SRR1, r7 > > +fast_guest_entry_c: > + ld r10, VCPU_PC(r4) > + ld r11, VCPU_MSR(r4) > /* r11 = vcpu->arch.msr & ~MSR_HV */ > rldicl r11, r11, 63 - MSR_HV_LG, 1 > rotldi r11, r11, 1 + MSR_HV_LG > @@ -1117,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) > HRFI_TO_GUEST > b . > > +/* > + * Enter the guest on a P9 or later system where we have exactly > + * one vcpu per vcore and we don't need to go to real mode > + * (which implies that host and guest are both using radix MMU mode). > + * r3 = vcpu pointer > + * Most SPRs and all the VSRs have been loaded already. 
> + */ > +_GLOBAL(__kvmhv_vcpu_entry_p9) > +EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9) > + mflr r0 > + std r0, PPC_LR_STKOFF(r1) > + stdu r1, -SFS(r1) > + > + li r0, 1 > + stw r0, STACK_SLOT_SHORT_PATH(r1) > + > + std r3, HSTATE_KVM_VCPU(r13) > + mfcr r4 > + stw r4, SFS+8(r1) > + > + std r1, HSTATE_HOST_R1(r13) > + > + reg = 14 > + .rept 18 > + std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) > + reg = reg + 1 > + .endr > + > + reg = 14 > + .rept 18 > + ld reg, __VCPU_GPR(reg)(r3) > + reg = reg + 1 > + .endr > + > + mfmsr r10 > + std r10, HSTATE_HOST_MSR(r13) > + > + mr r4, r3 > + b fast_guest_entry_c > +guest_exit_short_path: > + > + li r0, KVM_GUEST_MODE_NONE > + stb r0, HSTATE_IN_GUEST(r13) > + > + reg = 14 > + .rept 18 > + std reg, __VCPU_GPR(reg)(r9) > + reg = reg + 1 > + .endr > + > + reg = 14 > + .rept 18 > + ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) > + reg = reg + 1 > + .endr > + > + lwz r4, SFS+8(r1) > + mtcr r4 > + > + mr r3, r12 /* trap number */ > + > + addi r1, r1, SFS > + ld r0, PPC_LR_STKOFF(r1) > + mtlr r0 > + > + /* If we are in real mode, do a rfid to get back to the caller */ > + mfmsr r4 > + andi. r5, r4, MSR_IR > + bnelr > + rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */ > + mtspr SPRN_SRR0, r0 > + ld r10, HSTATE_HOST_MSR(r13) > + rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG > + mtspr SPRN_SRR1, r10 > + RFI_TO_KERNEL > + b . 
> + > secondary_too_late: > li r12, 0 > stw r12, STACK_SLOT_TRAP(r1) > @@ -1377,6 +1461,11 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ > 1: > #endif /* CONFIG_KVM_XICS */ > > + /* If we came in through the P9 short path, go back out to C now */ > + lwz r0, STACK_SLOT_SHORT_PATH(r1) > + cmpwi r0, 0 > + bne guest_exit_short_path > + > /* For hash guest, read the guest SLB and save it away */ > ld r5, VCPU_KVM(r9) > lbz r0, KVM_RADIX(r5) > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c > index 30c2eb7..ad4a370 100644 > --- a/arch/powerpc/kvm/book3s_xive.c > +++ b/arch/powerpc/kvm/book3s_xive.c > @@ -62,6 +62,69 @@ > #define XIVE_Q_GAP 2 > > /* > + * Push a vcpu's context to the XIVE on guest entry. > + * This assumes we are in virtual mode (MMU on) > + */ > +void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) > +{ > + void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; > + u64 pq; > + > + if (!tima) > + return; > + eieio(); > + __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); > + __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); > + vcpu->arch.xive_pushed = 1; > + eieio(); > + > + /* > + * We clear the irq_pending flag. There is a small chance of a > + * race vs. the escalation interrupt happening on another > + * processor setting it again, but the only consequence is to > + * cause a spurious wakeup on the next H_CEDE, which is not an > + * issue. > + */ > + vcpu->arch.irq_pending = 0; > + > + /* > + * In single escalation mode, if the escalation interrupt is > + * on, we mask it. 
> + */ > + if (vcpu->arch.xive_esc_on) { > + pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr + > + XIVE_ESB_SET_PQ_01)); > + mb(); > + > + /* > + * We have a possible subtle race here: The escalation > + * interrupt might have fired and be on its way to the > + * host queue while we mask it, and if we unmask it > + * early enough (re-cede right away), there is a > + * theorical possibility that it fires again, thus > + * landing in the target queue more than once which is > + * a big no-no. > + * > + * Fortunately, solving this is rather easy. If the > + * above load setting PQ to 01 returns a previous > + * value where P is set, then we know the escalation > + * interrupt is somewhere on its way to the host. In > + * that case we simply don't clear the xive_esc_on > + * flag below. It will be eventually cleared by the > + * handler for the escalation interrupt. > + * > + * Then, when doing a cede, we check that flag again > + * before re-enabling the escalation interrupt, and if > + * set, we abort the cede. > + */ > + if (!(pq & XIVE_ESB_VAL_P)) > + /* Now P is 0, we can clear the flag */ > + vcpu->arch.xive_esc_on = 0; > + } > +} > +EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu); > + > +/* > * This is a simple trigger for a generic XIVE IRQ. This must > * only be called for interrupts that support a trigger page > */
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 0c1a2b0..5c9b00c 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -165,4 +165,6 @@ void kvmhv_load_host_pmu(void); void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use); void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu); +int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu); + #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 83d61b8..245e564 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -585,6 +585,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); +extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) { return -1; } @@ -607,6 +608,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { return -ENODEV; } +static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } #endif /* CONFIG_KVM_XIVE */ /* diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0e17593..8576a7b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3080,6 +3080,267 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } /* + * Load up hypervisor-mode registers on P9. 
+ */ +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + s64 hdec; + u64 tb, purr, spurr; + int trap; + unsigned long host_hfscr = mfspr(SPRN_HFSCR); + unsigned long host_ciabr = mfspr(SPRN_CIABR); + unsigned long host_dawr = mfspr(SPRN_DAWR); + unsigned long host_dawrx = mfspr(SPRN_DAWRX); + unsigned long host_psscr = mfspr(SPRN_PSSCR); + unsigned long host_pidr = mfspr(SPRN_PID); + + hdec = local_paca->kvm_hstate.dec_expires - mftb(); + if (hdec < 0) + return BOOK3S_INTERRUPT_HV_DECREMENTER; + mtspr(SPRN_HDEC, hdec); + + if (vc->tb_offset) { + u64 new_tb = mftb() + vc->tb_offset; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = vc->tb_offset; + } + + if (vc->pcr) + mtspr(SPRN_PCR, vc->pcr); + mtspr(SPRN_DPDES, vc->dpdes); + mtspr(SPRN_VTB, vc->vtb); + + local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); + local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); + mtspr(SPRN_PURR, vcpu->arch.purr); + mtspr(SPRN_SPURR, vcpu->arch.spurr); + + if (cpu_has_feature(CPU_FTR_DAWR)) { + mtspr(SPRN_DAWR, vcpu->arch.dawr); + mtspr(SPRN_DAWRX, vcpu->arch.dawrx); + } + mtspr(SPRN_CIABR, vcpu->arch.ciabr); + mtspr(SPRN_IC, vcpu->arch.ic); + mtspr(SPRN_PID, vcpu->arch.pid); + + mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); + + mtspr(SPRN_HFSCR, vcpu->arch.hfscr); + + mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0); + mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1); + mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2); + mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3); + + mtspr(SPRN_AMOR, ~0UL); + + mtspr(SPRN_LPCR, vc->lpcr); + isync(); + + kvmppc_xive_push_vcpu(vcpu); + + mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); + mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); + + trap = __kvmhv_vcpu_entry_p9(vcpu); + + /* Advance host PURR/SPURR by the amount used by guest */ + purr = 
mfspr(SPRN_PURR); + spurr = mfspr(SPRN_SPURR); + mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr + + purr - vcpu->arch.purr); + mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr + + spurr - vcpu->arch.spurr); + vcpu->arch.purr = purr; + vcpu->arch.spurr = spurr; + + vcpu->arch.ic = mfspr(SPRN_IC); + vcpu->arch.pid = mfspr(SPRN_PID); + vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS; + + vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0); + vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1); + vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); + vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); + + mtspr(SPRN_PSSCR, host_psscr); + mtspr(SPRN_HFSCR, host_hfscr); + mtspr(SPRN_CIABR, host_ciabr); + mtspr(SPRN_DAWR, host_dawr); + mtspr(SPRN_DAWRX, host_dawrx); + mtspr(SPRN_PID, host_pidr); + + /* + * Since this is radix, do a eieio; tlbsync; ptesync sequence in + * case we interrupted the guest between a tlbie and a ptesync. + */ + asm volatile("eieio; tlbsync; ptesync"); + + mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */ + isync(); + + vc->dpdes = mfspr(SPRN_DPDES); + vc->vtb = mfspr(SPRN_VTB); + mtspr(SPRN_DPDES, 0); + if (vc->pcr) + mtspr(SPRN_PCR, 0); + + if (vc->tb_offset_applied) { + u64 new_tb = mftb() - vc->tb_offset_applied; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = 0; + } + + mtspr(SPRN_HDEC, 0x7fffffff); + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); + + return trap; +} + +/* + * Virtual-mode guest entry for POWER9 and later when the host and + * guest are both using the radix MMU. The LPIDR has already been set. 
+ */ +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long host_dscr = mfspr(SPRN_DSCR); + unsigned long host_tidr = mfspr(SPRN_TIDR); + unsigned long host_iamr = mfspr(SPRN_IAMR); + s64 dec; + u64 tb; + int trap, save_pmu; + + dec = mfspr(SPRN_DEC); + tb = mftb(); + if (dec < 512) + return BOOK3S_INTERRUPT_HV_DECREMENTER; + local_paca->kvm_hstate.dec_expires = dec + tb; + + vcpu->arch.ceded = 0; + + kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */ + + kvmppc_subcore_enter_guest(); + + vc->entry_exit_map = 1; + vc->in_guest = 1; + + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; + lp->yield_count = cpu_to_be32(yield_count); + vcpu->arch.vpa.dirty = 1; + } + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + + kvmhv_load_guest_pmu(vcpu); + + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); + load_fp_state(&vcpu->arch.fp); + load_vr_state(&vcpu->arch.vr); + + mtspr(SPRN_DSCR, vcpu->arch.dscr); + mtspr(SPRN_IAMR, vcpu->arch.iamr); + mtspr(SPRN_PSPB, vcpu->arch.pspb); + mtspr(SPRN_FSCR, vcpu->arch.fscr); + mtspr(SPRN_TAR, vcpu->arch.tar); + mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); + mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); + mtspr(SPRN_BESCR, vcpu->arch.bescr); + mtspr(SPRN_WORT, vcpu->arch.wort); + mtspr(SPRN_TIDR, vcpu->arch.tid); + mtspr(SPRN_DAR, vcpu->arch.shregs.dar); + mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); + mtspr(SPRN_AMR, vcpu->arch.amr); + mtspr(SPRN_UAMOR, vcpu->arch.uamor); + + if (!(vcpu->arch.ctrl & 1)) + mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); + + mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb()); + + if (vcpu->arch.doorbell_request) { + vc->dpdes = 1; + smp_wmb(); + vcpu->arch.doorbell_request = 0; + } + + trap = kvmhv_load_hv_regs_and_go(vcpu); + + vcpu->arch.slb_max = 0; + dec = mfspr(SPRN_DEC); + tb = 
mftb(); + vcpu->arch.dec_expires = dec + tb; + vcpu->cpu = -1; + vcpu->arch.thread_cpu = -1; + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); + + vcpu->arch.iamr = mfspr(SPRN_IAMR); + vcpu->arch.pspb = mfspr(SPRN_PSPB); + vcpu->arch.fscr = mfspr(SPRN_FSCR); + vcpu->arch.tar = mfspr(SPRN_TAR); + vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); + vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); + vcpu->arch.bescr = mfspr(SPRN_BESCR); + vcpu->arch.wort = mfspr(SPRN_WORT); + vcpu->arch.tid = mfspr(SPRN_TIDR); + vcpu->arch.amr = mfspr(SPRN_AMR); + vcpu->arch.uamor = mfspr(SPRN_UAMOR); + vcpu->arch.dscr = mfspr(SPRN_DSCR); + + mtspr(SPRN_PSPB, 0); + mtspr(SPRN_WORT, 0); + mtspr(SPRN_AMR, 0); + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_DSCR, host_dscr); + mtspr(SPRN_TIDR, host_tidr); + mtspr(SPRN_IAMR, host_iamr); + mtspr(SPRN_PSPB, 0); + + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); + store_fp_state(&vcpu->arch.fp); + store_vr_state(&vcpu->arch.vr); + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + + save_pmu = 1; + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; + lp->yield_count = cpu_to_be32(yield_count); + vcpu->arch.vpa.dirty = 1; + save_pmu = lp->pmcregs_in_use; + } + + kvmhv_save_guest_pmu(vcpu, save_pmu); + + vc->entry_exit_map = 0x101; + vc->in_guest = 0; + + mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); + + kvmhv_load_host_pmu(); + + kvmppc_subcore_exit_guest(); + + return trap; +} + +/* * Wait for some other vcpu thread to execute us, and * wake us up when we need to handle something in the host. 
*/ @@ -3405,6 +3666,163 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return vcpu->arch.ret; } +static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, + struct kvm_vcpu *vcpu) +{ + int trap, r, pcpu, pcpu0; + int srcu_idx; + struct kvmppc_vcore *vc; + + trace_kvmppc_run_vcpu_enter(vcpu); + + kvm_run->exit_reason = 0; + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + + vc = vcpu->arch.vcore; + vcpu->arch.ceded = 0; + vcpu->arch.run_task = current; + vcpu->arch.kvm_run = kvm_run; + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.last_inst = KVM_INST_FETCH_FAILED; + vc->runnable_threads[0] = vcpu; + vc->n_runnable = 1; + vc->runner = vcpu; + + /* See if the MMU is ready to go */ + if (!vcpu->kvm->arch.mmu_ready) { + r = kvmhv_setup_mmu(vcpu); + if (r) { + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry. + hardware_entry_failure_reason = 0; + vcpu->arch.ret = r; + goto out; + } + } + + if (need_resched()) + cond_resched(); + + kvmppc_update_vpas(vcpu); + + init_vcore_to_run(vc); + vc->preempt_tb = TB_NIL; + + preempt_disable(); + pcpu = smp_processor_id(); + vc->pcpu = pcpu; + kvmppc_prepare_radix_vcpu(vcpu, pcpu); + + local_irq_disable(); + hard_irq_disable(); + if (signal_pending(current)) + goto sigpend; + if (lazy_irq_pending() || need_resched() || !vcpu->kvm->arch.mmu_ready) + goto out; + + kvmppc_core_prepare_to_enter(vcpu); + + kvmppc_clear_host_core(pcpu); + + local_paca->kvm_hstate.tid = 0; + local_paca->kvm_hstate.napping = 0; + local_paca->kvm_hstate.kvm_split_mode = NULL; + kvmppc_start_thread(vcpu, vc); + kvmppc_create_dtl_entry(vcpu, vc); + trace_kvm_guest_enter(vcpu); + + vc->vcore_state = VCORE_RUNNING; + trace_kvmppc_run_core(vc, 0); + + mtspr(SPRN_LPID, vc->kvm->arch.lpid); + isync(); + + /* See comment above in kvmppc_run_core() about this */ + pcpu0 = pcpu; + if 
(cpu_has_feature(CPU_FTR_ARCH_300)) + pcpu0 &= ~0x3UL; + + if (cpumask_test_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush)) { + radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid); + /* Clear the bit after the TLB flush */ + cpumask_clear_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush); + } + + trace_hardirqs_on(); + guest_enter_irqoff(); + + srcu_idx = srcu_read_lock(&vc->kvm->srcu); + + this_cpu_disable_ftrace(); + + trap = kvmhv_p9_guest_entry(vcpu); + vcpu->arch.trap = trap; + + this_cpu_enable_ftrace(); + + srcu_read_unlock(&vc->kvm->srcu, srcu_idx); + + mtspr(SPRN_LPID, vc->kvm->arch.host_lpid); + isync(); + + trace_hardirqs_off(); + set_irq_happened(trap); + + kvmppc_set_host_core(pcpu); + + local_irq_enable(); + guest_exit(); + preempt_enable(); + + /* cancel pending decrementer exception if DEC is now positive */ + if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + + trace_kvm_guest_exit(vcpu); + r = RESUME_GUEST; + if (trap) + r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); + vcpu->arch.ret = r; + + if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded && + !kvmppc_vcpu_woken(vcpu)) { + kvmppc_set_timer(vcpu); + while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) { + if (signal_pending(current)) { + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + break; + } + spin_lock(&vc->lock); + kvmppc_vcore_blocked(vc); + spin_unlock(&vc->lock); + } + } + vcpu->arch.ceded = 0; + + vc->vcore_state = VCORE_INACTIVE; + trace_kvmppc_run_core(vc, 1); + + done: + kvmppc_remove_runnable(vc, vcpu); + trace_kvmppc_run_vcpu_exit(vcpu, kvm_run); + + return vcpu->arch.ret; + + sigpend: + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + out: + local_irq_enable(); + preempt_enable(); + goto done; +} + static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; @@ -3480,7 +3898,10 @@ static int kvmppc_vcpu_run_hv(struct 
kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { - r = kvmppc_run_vcpu(run, vcpu); + if (kvm->arch.threads_indep && kvm_is_radix(kvm)) + r = kvmppc_run_single_vcpu(run, vcpu); + else + r = kvmppc_run_vcpu(run, vcpu); if (run->exit_reason == KVM_EXIT_PAPR_HCALL && !(vcpu->arch.shregs.msr & MSR_PR)) { diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index ee564b6..0787f12 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void) local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; } +EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest); void kvmppc_subcore_exit_guest(void) { @@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void) local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; } +EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest); static bool kvmppc_tb_resync_required(void) { diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 67a847f..2abc336 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -47,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define NAPPING_NOVCPU 2 /* Stack frame offsets for kvmppc_hv_entry */ -#define SFS 160 +#define SFS 208 #define STACK_SLOT_TRAP (SFS-4) +#define STACK_SLOT_SHORT_PATH (SFS-8) #define STACK_SLOT_TID (SFS-16) #define STACK_SLOT_PSSCR (SFS-24) #define STACK_SLOT_PID (SFS-32) @@ -57,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_DAWR (SFS-56) #define STACK_SLOT_DAWRX (SFS-64) #define STACK_SLOT_HFSCR (SFS-72) +/* the following is used by the P9 short path */ +#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ /* * Call kvmppc_hv_entry in real mode. 
@@ -1020,6 +1023,9 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) no_xive: #endif /* CONFIG_KVM_XICS */ + li r0, 0 + stw r0, STACK_SLOT_SHORT_PATH(r1) + deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */ /* Check if we can deliver an external or decrementer interrupt now */ ld r0, VCPU_PENDING_EXC(r4) @@ -1034,13 +1040,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) bl kvmppc_guest_entry_inject_int ld r4, HSTATE_KVM_VCPU(r13) 71: - ld r10, VCPU_PC(r4) - ld r11, VCPU_MSR(r4) ld r6, VCPU_SRR0(r4) ld r7, VCPU_SRR1(r4) mtspr SPRN_SRR0, r6 mtspr SPRN_SRR1, r7 +fast_guest_entry_c: + ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG @@ -1117,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) HRFI_TO_GUEST b . +/* + * Enter the guest on a P9 or later system where we have exactly + * one vcpu per vcore and we don't need to go to real mode + * (which implies that host and guest are both using radix MMU mode). + * r3 = vcpu pointer + * Most SPRs and all the VSRs have been loaded already. 
+ */ +_GLOBAL(__kvmhv_vcpu_entry_p9) +EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -SFS(r1) + + li r0, 1 + stw r0, STACK_SLOT_SHORT_PATH(r1) + + std r3, HSTATE_KVM_VCPU(r13) + mfcr r4 + stw r4, SFS+8(r1) + + std r1, HSTATE_HOST_R1(r13) + + reg = 14 + .rept 18 + std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) + reg = reg + 1 + .endr + + reg = 14 + .rept 18 + ld reg, __VCPU_GPR(reg)(r3) + reg = reg + 1 + .endr + + mfmsr r10 + std r10, HSTATE_HOST_MSR(r13) + + mr r4, r3 + b fast_guest_entry_c +guest_exit_short_path: + + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + + reg = 14 + .rept 18 + std reg, __VCPU_GPR(reg)(r9) + reg = reg + 1 + .endr + + reg = 14 + .rept 18 + ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) + reg = reg + 1 + .endr + + lwz r4, SFS+8(r1) + mtcr r4 + + mr r3, r12 /* trap number */ + + addi r1, r1, SFS + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + + /* If we are in real mode, do a rfid to get back to the caller */ + mfmsr r4 + andi. r5, r4, MSR_IR + bnelr + rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */ + mtspr SPRN_SRR0, r0 + ld r10, HSTATE_HOST_MSR(r13) + rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG + mtspr SPRN_SRR1, r10 + RFI_TO_KERNEL + b . + secondary_too_late: li r12, 0 stw r12, STACK_SLOT_TRAP(r1) @@ -1377,6 +1461,11 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 1: #endif /* CONFIG_KVM_XICS */ + /* If we came in through the P9 short path, go back out to C now */ + lwz r0, STACK_SLOT_SHORT_PATH(r1) + cmpwi r0, 0 + bne guest_exit_short_path + /* For hash guest, read the guest SLB and save it away */ ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 30c2eb7..ad4a370 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -62,6 +62,69 @@ #define XIVE_Q_GAP 2 /* + * Push a vcpu's context to the XIVE on guest entry. 
+ * This assumes we are in virtual mode (MMU on) + */ +void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) +{ + void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; + u64 pq; + + if (!tima) + return; + eieio(); + __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); + __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); + vcpu->arch.xive_pushed = 1; + eieio(); + + /* + * We clear the irq_pending flag. There is a small chance of a + * race vs. the escalation interrupt happening on another + * processor setting it again, but the only consequence is to + * cause a spurious wakeup on the next H_CEDE, which is not an + * issue. + */ + vcpu->arch.irq_pending = 0; + + /* + * In single escalation mode, if the escalation interrupt is + * on, we mask it. + */ + if (vcpu->arch.xive_esc_on) { + pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr + + XIVE_ESB_SET_PQ_01)); + mb(); + + /* + * We have a possible subtle race here: The escalation + * interrupt might have fired and be on its way to the + * host queue while we mask it, and if we unmask it + * early enough (re-cede right away), there is a + * theorical possibility that it fires again, thus + * landing in the target queue more than once which is + * a big no-no. + * + * Fortunately, solving this is rather easy. If the + * above load setting PQ to 01 returns a previous + * value where P is set, then we know the escalation + * interrupt is somewhere on its way to the host. In + * that case we simply don't clear the xive_esc_on + * flag below. It will be eventually cleared by the + * handler for the escalation interrupt. + * + * Then, when doing a cede, we check that flag again + * before re-enabling the escalation interrupt, and if + * set, we abort the cede. + */ + if (!(pq & XIVE_ESB_VAL_P)) + /* Now P is 0, we can clear the flag */ + vcpu->arch.xive_esc_on = 0; + } +} +EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu); + +/* * This is a simple trigger for a generic XIVE IRQ. 
This must * only be called for interrupts that support a trigger page */
This creates an alternative guest entry/exit path which is used for radix guests on POWER9 systems when we have indep_threads_mode=Y. In these circumstances there is exactly one vcpu per vcore and there is no coordination required between vcpus or vcores; the vcpu can enter the guest without needing to synchronize with anything else. The new fast path is implemented almost entirely in C in book3s_hv.c and runs with the MMU on until the guest is entered. On guest exit we use the existing path until the point where we are committed to exiting the guest (as distinct from handling an interrupt in the low-level code and returning to the guest) and we have pulled the guest context from the XIVE. At that point we check a flag in the stack frame to see whether we came in via the old path or the new path; if we came in via the new path then we go back to C code to do the rest of the process of saving the guest context and restoring the host context. The C code is split into separate functions for handling the OS-accessible state and the hypervisor state, with the idea that the latter can be replaced by a hypercall when we implement nested virtualization. Signed-off-by: Paul Mackerras <paulus@ozlabs.org> --- arch/powerpc/include/asm/asm-prototypes.h | 2 + arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/kvm/book3s_hv.c | 423 +++++++++++++++++++++++++++++- arch/powerpc/kvm/book3s_hv_ras.c | 2 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 95 ++++++- arch/powerpc/kvm/book3s_xive.c | 63 +++++ 6 files changed, 583 insertions(+), 4 deletions(-)