[RFC,20/32] KVM: PPC: Book3S HV: Nested guest entry via hypercall

Message ID	1537524123-9578-21-git-send-email-paulus@ozlabs.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <kvm-owner@kernel.org> From: Paul Mackerras <paulus@ozlabs.org> To: kvm-ppc@vger.kernel.org, kvm@vger.kernel.org Cc: David Gibson <david@gibson.dropbear.id.au> Subject: [RFC PATCH 20/32] KVM: PPC: Book3S HV: Nested guest entry via hypercall Date: Fri, 21 Sep 2018 20:01:51 +1000 Message-Id: <1537524123-9578-21-git-send-email-paulus@ozlabs.org> In-Reply-To: <1537524123-9578-1-git-send-email-paulus@ozlabs.org> References: <1537524123-9578-1-git-send-email-paulus@ozlabs.org> Sender: kvm-owner@vger.kernel.org Precedence: bulk
Series	KVM: PPC: Book3S HV: Nested HV virtualization \| expand [RFC,00/32] KVM: PPC: Book3S HV: Nested HV virtualization [RFC,01/32] KVM: PPC: Book3S: Simplify external interrupt handling [RFC,02/32] KVM: PPC: Book3S HV: Remove left-over code in XICS-on-XIVE emulation [RFC,03/32] KVM: PPC: Book3S HV: Move interrupt delivery on guest entry to C code [RFC,04/32] KVM: PPC: Book3S HV: Extract PMU save/restore operations as C-callable functions [RFC,05/32] KVM: PPC: Book3S HV: Simplify real-mode interrupt handling [RFC,06/32] KVM: PPC: Book3S: Rework TM save/restore code and make it C-callable [RFC,07/32] KVM: PPC: Book3S HV: Call kvmppc_handle_exit_hv() with vcore unlocked [RFC,08/32] KVM: PPC: Book3S HV: Streamlined guest entry/exit path on P9 for radix guests [RFC,09/32] KVM: PPC: Book3S HV: Handle hypervisor instruction faults better [RFC,10/32] KVM: PPC: Book3S HV: Add a debugfs file to dump radix mappings [RFC,11/32] powerpc: Add LPCR_EVIRT define [RFC,12/32] powerpc: Provide dummy versions of plpar_hcall functions when !CONFIG_PPC_PSERIES [RFC,13/32] powerpc: Turn off CPU_FTR_P9_TM_HV_ASSIST in non-hypervisor mode [RFC,14/32] KVM: PPC: Use ccr field in pt_regs struct embedded in vcpu struct [RFC,15/32] KVM: PPC: Book3S HV: Clear partition table entry on vm teardown [RFC,16/32] KVM: PPC: Book3S HV: Make kvmppc_mmu_radix_xlate process/partition table agnostic [RFC,17/32] KVM: PPC: Book3S HV: Refactor radix page fault handler [RFC,18/32] KVM: PPC: Book3S HV: Use kvmppc_unmap_pte() in kvm_unmap_radix() [RFC,19/32] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization [RFC,20/32] KVM: PPC: Book3S HV: Nested guest entry via hypercall [RFC,21/32] KVM: PPC: Book3S HV: Use XICS hypercalls when running as a nested hypervisor [RFC,22/32] KVM: PPC: Book3S HV: Handle hypercalls correctly when nested [RFC,23/32] KVM: PPC: Book3S HV: Framework to handle HV Emulation Assist Interrupt [RFC,24/32] KVM: PPC: Book3S HV: Handle page fault for a nested guest [RFC,25/32] KVM: PPC: Book3S HV: Introduce rmap to track nested guest mappings [RFC,26/32] KVM: PPC: Book3S HV: Emulate Privileged TLBIE for guest hypervisors [RFC,27/32] KVM: PPC: Book3S HV: Invalidate TLB when nested vcpu moves physical cpu [RFC,28/32] KVM: PPC: Book3S HV: Don't access HFSCR, LPIDR or LPCR when running nested [RFC,29/32] KVM: PPC: Book3S HV: Add one-reg interface to virtual PTCR register [RFC,30/32] KVM: PPC: Book3S HV: Add a VM capability to enable nested virtualization [RFC,31/32] KVM: PPC: Book3S HV: Allow HV module to load without hypervisor mode [RFC,32/32] KVM: PPC: Book3S HV: Add nested shadow page tables to debugfs

On Fri, Sep 21, 2018 at 08:01:51PM +1000, Paul Mackerras wrote: > This adds a new hypercall, H_ENTER_NESTED, which is used by a nested > hypervisor to enter one of its nested guests. The hypercall supplies > register values in two structs. Those values are copied by the level 0 > (L0) hypervisor (the one which is running in hypervisor mode) into the > vcpu struct of the L1 guest, and then the guest is run until an > interrupt or error occurs which needs to be reported to L1 via the > hypercall return value. > > Currently this assumes that the L0 and L1 hypervisors are the same > endianness, and the structs passed as arguments are in native > endianness. That's nasty. It'd be good to at least detect this and bail. > > Nested hypervisors do not support indep_threads_mode=N, so this adds > code to print a warning message if the administrator has set > indep_threads_mode=N, and treat it as Y. > > Signed-off-by: Paul Mackerras <paulus@ozlabs.org> > --- > arch/powerpc/include/asm/hvcall.h | 36 +++++ > arch/powerpc/include/asm/kvm_book3s.h | 7 + > arch/powerpc/include/asm/kvm_host.h | 5 + > arch/powerpc/kernel/asm-offsets.c | 1 + > arch/powerpc/kvm/book3s_hv.c | 194 +++++++++++++++++++++++---- > arch/powerpc/kvm/book3s_hv_nested.c | 230 ++++++++++++++++++++++++++++++++ > arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 + > 7 files changed, 452 insertions(+), 25 deletions(-) > > diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h > index 9afaa82..dfcf43d 100644 > --- a/arch/powerpc/include/asm/hvcall.h > +++ b/arch/powerpc/include/asm/hvcall.h > @@ -487,6 +487,42 @@ struct h_cpu_char_result { > u64 behaviour; > }; > > +/* Register state for entering a nested guest with H_ENTER_NESTED */ > +struct hv_guest_state { > + u64 version; /* version of this structure layout */ > + u32 lpid; > + u32 vcpu_token; > + /* These registers are hypervisor privileged (at least for writing) */ > + u64 lpcr; > + u64 pcr; > + u64 amor; > + u64 dpdes; > + u64 hfscr; > + s64 tb_offset; > + u64 dawr0; > + u64 dawrx0; > + u64 ciabr; > + u64 hdec_expiry; > + u64 purr; > + u64 spurr; > + u64 ic; > + u64 vtb; > + u64 hdar; > + u64 hdsisr; > + u64 heir; > + u64 asdr; > + /* These are OS privileged but need to be set late in guest entry */ > + u64 srr0; > + u64 srr1; > + u64 sprg[4]; > + u64 pidr; > + u64 cfar; > + u64 ppr; > +}; > + > +/* Latest version of hv_guest_state structure */ > +#define HV_GUEST_STATE_VERSION 1 > + > #endif /* __ASSEMBLY__ */ > #endif /* __KERNEL__ */ > #endif /* _ASM_POWERPC_HVCALL_H */ > diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h > index 7719ca5..125bc5b 100644 > --- a/arch/powerpc/include/asm/kvm_book3s.h > +++ b/arch/powerpc/include/asm/kvm_book3s.h > @@ -280,6 +280,13 @@ void kvmhv_vm_nested_init(struct kvm *kvm); > long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); > void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1); > void kvmhv_release_all_nested(struct kvm *kvm); > +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); > +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu, > + u64 time_limit); > +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); > +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, > + struct hv_guest_state *hr); > +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu); > > void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); > > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h > index c35d4f2..ceb9f20 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -95,6 +95,7 @@ struct dtl_entry; > > struct kvmppc_vcpu_book3s; > struct kvmppc_book3s_shadow_vcpu; > +struct kvm_nested_guest; > > struct kvm_vm_stat { > ulong remote_tlb_flush; > @@ -786,6 +787,10 @@ struct kvm_vcpu_arch { > u32 emul_inst; > > u32 online; > + > + /* For support of nested guests */ > + struct kvm_nested_guest *nested; > + u32 nested_vcpu_id; > #endif > > #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING > diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c > index 7c3738d..d0abcbb 100644 > --- a/arch/powerpc/kernel/asm-offsets.c > +++ b/arch/powerpc/kernel/asm-offsets.c > @@ -503,6 +503,7 @@ int main(void) > OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); > OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty); > OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst); > + OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested); > OFFSET(VCPU_CPU, kvm_vcpu, cpu); > OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu); > #endif > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 82c9a1e..da380cb 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -942,6 +942,13 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) > break; > case H_ENTER_NESTED: > ret = H_FUNCTION; > + if (!vcpu->kvm->arch.nested_enable) > + break; Wouldn't H_AUTHORITY make more sense than H_FUNCTION for the no-nested-allowed case? > + ret = kvmhv_enter_nested_guest(vcpu); > + if (ret == H_INTERRUPT) { > + kvmppc_set_gpr(vcpu, 3, 0); > + return RESUME_HOST; > + } > break; > > default: > @@ -1269,6 +1276,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, > return r; > } > > +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) Might be nice to rename to make it clear if this is the L0 or L1 handling for a nested exit. > +{ > + int r; > + int srcu_idx; > + > + vcpu->stat.sum_exits++; > + > + /* > + * This can happen if an interrupt occurs in the last stages > + * of guest entry or the first stages of guest exit (i.e. after > + * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV > + * and before setting it to KVM_GUEST_MODE_HOST_HV). > + * That can happen due to a bug, or due to a machine check > + * occurring at just the wrong time. > + */ > + if (vcpu->arch.shregs.msr & MSR_HV) { > + printk(KERN_EMERG "KVM trap in HV mode while nested!\n"); > + printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", > + vcpu->arch.trap, kvmppc_get_pc(vcpu), > + vcpu->arch.shregs.msr); > + kvmppc_dump_regs(vcpu); > + BUG(); > + } > + switch (vcpu->arch.trap) { > + /* We're good on these - the host merely wanted to get our attention */ > + case BOOK3S_INTERRUPT_HV_DECREMENTER: > + vcpu->stat.dec_exits++; > + r = RESUME_GUEST; > + break; > + case BOOK3S_INTERRUPT_EXTERNAL: > + vcpu->stat.ext_intr_exits++; > + r = RESUME_HOST; > + break; > + case BOOK3S_INTERRUPT_H_DOORBELL: > + case BOOK3S_INTERRUPT_H_VIRT: > + vcpu->stat.ext_intr_exits++; > + r = RESUME_GUEST; > + break; > + /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ > + case BOOK3S_INTERRUPT_HMI: > + case BOOK3S_INTERRUPT_PERFMON: > + case BOOK3S_INTERRUPT_SYSTEM_RESET: > + r = RESUME_GUEST; > + break; > + case BOOK3S_INTERRUPT_MACHINE_CHECK: > + /* Pass the machine check to the L1 guest */ > + r = RESUME_HOST; > + /* Print the MCE event to host console. */ > + machine_check_print_event_info(&vcpu->arch.mce_evt, false); > + break; > + /* > + * We get these next two if the guest accesses a page which it thinks > + * it has mapped but which is not actually present, either because > + * it is for an emulated I/O device or because the corresonding > + * host page has been paged out. > + */ > + case BOOK3S_INTERRUPT_H_DATA_STORAGE: > + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); > + r = kvmhv_nested_page_fault(vcpu); > + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); > + break; > + case BOOK3S_INTERRUPT_H_INST_STORAGE: > + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); > + vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) & > + DSISR_SRR1_MATCH_64S; > + if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) > + vcpu->arch.fault_dsisr |= DSISR_ISSTORE; > + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); > + r = kvmhv_nested_page_fault(vcpu); > + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); > + break; > + > +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM > + case BOOK3S_INTERRUPT_HV_SOFTPATCH: > + /* > + * This occurs for various TM-related instructions that > + * we need to emulate on POWER9 DD2.2. We have already > + * handled the cases where the guest was in real-suspend > + * mode and was transitioning to transactional state. > + */ > + r = kvmhv_p9_tm_emulation(vcpu); > + break; > +#endif > + > + case BOOK3S_INTERRUPT_HV_RM_HARD: > + vcpu->arch.trap = 0; > + r = RESUME_GUEST; > + if (!xive_enabled()) > + kvmppc_xics_rm_complete(vcpu, 0); > + break; > + default: > + r = RESUME_HOST; > + break; > + } > + > + return r; > +} > + > static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, > struct kvm_sregs *sregs) > { > @@ -3095,7 +3200,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) > /* > * Load up hypervisor-mode registers on P9. > */ > -static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu) > +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit) > { > struct kvmppc_vcore *vc = vcpu->arch.vcore; > s64 hdec; > @@ -3108,7 +3213,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu) > unsigned long host_psscr = mfspr(SPRN_PSSCR); > unsigned long host_pidr = mfspr(SPRN_PID); > > - hdec = local_paca->kvm_hstate.dec_expires - mftb(); > + hdec = time_limit - mftb(); Looks like this change might better belong in the earlier patch creating kvmhv_load_hv_regs_and_go(). > if (hdec < 0) > return BOOK3S_INTERRUPT_HV_DECREMENTER; > mtspr(SPRN_HDEC, hdec); > @@ -3222,7 +3327,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu) > * Virtual-mode guest entry for POWER9 and later when the host and > * guest are both using the radix MMU. The LPIDR has already been set. > */ > -int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu) > +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit) > { > struct kvmppc_vcore *vc = vcpu->arch.vcore; > unsigned long host_dscr = mfspr(SPRN_DSCR); > @@ -3237,6 +3342,8 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu) > if (dec < 512) > return BOOK3S_INTERRUPT_HV_DECREMENTER; > local_paca->kvm_hstate.dec_expires = dec + tb; > + if (local_paca->kvm_hstate.dec_expires < time_limit) > + time_limit = local_paca->kvm_hstate.dec_expires; > > vcpu->arch.ceded = 0; > > @@ -3290,7 +3397,28 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu) > vcpu->arch.doorbell_request = 0; > } > > - trap = kvmhv_load_hv_regs_and_go(vcpu); > + if (!cpu_has_feature(CPU_FTR_HVMODE)) { > + /* call our hypervisor to load up HV regs and go */ > + struct hv_guest_state hvregs; > + > + kvmhv_save_hv_regs(vcpu, &hvregs); > + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; > + hvregs.version = HV_GUEST_STATE_VERSION; > + hvregs.lpid = vcpu->kvm->arch.lpid; > + hvregs.vcpu_token = vcpu->vcpu_id; > + hvregs.hdec_expiry = time_limit; > + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, > + &vcpu->arch.pending_exceptions)) > + hvregs.lpcr |= LPCR_MER; > + trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), > + __pa(&vcpu->arch.regs)); > + kvmhv_restore_hv_return_state(vcpu, &hvregs); > + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; > + vcpu->arch.shregs.dar = mfspr(SPRN_DAR); > + vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); > + } else { > + trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit); > + } > > vcpu->arch.slb_max = 0; > dec = mfspr(SPRN_DEC); > @@ -3530,6 +3658,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) > trace_kvmppc_vcore_wakeup(do_sleep, block_ns); > } > > +/* > + * This is assumed not to be able to fail for a radix guest in > + * kvmhv_run_single_vcpu(). > + */ Might be clearer to split that change out with a rationale as to why it's correct. > static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) > { > int r = 0; > @@ -3679,12 +3811,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) > return vcpu->arch.ret; > } > > -static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, > - struct kvm_vcpu *vcpu) > +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, > + struct kvm_vcpu *vcpu, u64 time_limit) > { > int trap, r, pcpu, pcpu0; > int srcu_idx; > struct kvmppc_vcore *vc; > + struct kvm_nested_guest *nested = vcpu->arch.nested; > + unsigned long lpid; > > trace_kvmppc_run_vcpu_enter(vcpu); > > @@ -3705,16 +3839,8 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, > vc->runner = vcpu; > > /* See if the MMU is ready to go */ > - if (!vcpu->kvm->arch.mmu_ready) { > - r = kvmhv_setup_mmu(vcpu); > - if (r) { > - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; > - kvm_run->fail_entry. > - hardware_entry_failure_reason = 0; > - vcpu->arch.ret = r; > - goto out; > - } > - } > + if (!vcpu->kvm->arch.mmu_ready) > + kvmhv_setup_mmu(vcpu); > > if (need_resched()) > cond_resched(); > @@ -3736,7 +3862,12 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, > if (lazy_irq_pending() || need_resched() || !vcpu->kvm->arch.mmu_ready) > goto out; > > - kvmppc_core_prepare_to_enter(vcpu); > + if (!nested) { > + kvmppc_core_prepare_to_enter(vcpu); > + } else if (vcpu->arch.pending_exceptions) { > + vcpu->arch.ret = RESUME_HOST; > + goto out; > + } > > kvmppc_clear_host_core(pcpu); > > @@ -3750,7 +3881,10 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, > vc->vcore_state = VCORE_RUNNING; > trace_kvmppc_run_core(vc, 0); > > - mtspr(SPRN_LPID, vc->kvm->arch.lpid); > + lpid = vc->kvm->arch.lpid; > + if (nested) > + lpid = nested->shadow_lpid; > + mtspr(SPRN_LPID, lpid); > isync(); > > /* See comment above in kvmppc_run_core() about this */ > @@ -3759,7 +3893,7 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, > pcpu0 &= ~0x3UL; > > if (cpumask_test_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush)) { > - radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid); > + radix__local_flush_tlb_lpid_guest(lpid); > /* Clear the bit after the TLB flush */ > cpumask_clear_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush); > } > @@ -3771,7 +3905,7 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, > > this_cpu_disable_ftrace(); > > - trap = kvmhv_p9_guest_entry(vcpu); > + trap = kvmhv_p9_guest_entry(vcpu, time_limit); > vcpu->arch.trap = trap; > > this_cpu_enable_ftrace(); > @@ -3796,8 +3930,12 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, > > trace_kvm_guest_exit(vcpu); > r = RESUME_GUEST; > - if (trap) > - r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); > + if (trap) { > + if (!nested) > + r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); > + else > + r = kvmppc_handle_nested_exit(vcpu); > + } > vcpu->arch.ret = r; > > if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded && > @@ -3912,7 +4050,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) > > do { > if (kvm->arch.threads_indep && kvm_is_radix(kvm)) > - r = kvmppc_run_single_vcpu(run, vcpu); > + r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0); > else > r = kvmppc_run_vcpu(run, vcpu); > > @@ -4462,8 +4600,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) > * On POWER9, we only need to do this if the "indep_threads_mode" > * module parameter has been set to N. > */ > - if (cpu_has_feature(CPU_FTR_ARCH_300)) > - kvm->arch.threads_indep = indep_threads_mode; > + if (cpu_has_feature(CPU_FTR_ARCH_300)) { > + if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) { > + pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n"); > + kvm->arch.threads_indep = true; Wouldn't it be cleaner to enforce this at the point indep_threads_mode is set, rather than altering the value here? > + } else { > + kvm->arch.threads_indep = indep_threads_mode; > + } > + } > if (!kvm->arch.threads_indep) > kvm_hv_vm_activated(); > > diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c > index 5fe3ea4..a7f3da9 100644 > --- a/arch/powerpc/kvm/book3s_hv_nested.c > +++ b/arch/powerpc/kvm/book3s_hv_nested.c > @@ -20,6 +20,231 @@ static struct patb_entry *pseries_partition_tb; > > static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp); > > +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) > +{ > + struct kvmppc_vcore *vc = vcpu->arch.vcore; > + > + hr->lpcr = vc->lpcr; > + hr->pcr = vc->pcr; > + hr->dpdes = vc->dpdes; > + hr->hfscr = vcpu->arch.hfscr; > + hr->tb_offset = vc->tb_offset; > + hr->dawr0 = vcpu->arch.dawr; > + hr->dawrx0 = vcpu->arch.dawrx; > + hr->ciabr = vcpu->arch.ciabr; > + hr->purr = vcpu->arch.purr; > + hr->spurr = vcpu->arch.spurr; > + hr->ic = vcpu->arch.ic; > + hr->vtb = vc->vtb; > + hr->srr0 = vcpu->arch.shregs.srr0; > + hr->srr1 = vcpu->arch.shregs.srr1; > + hr->sprg[0] = vcpu->arch.shregs.sprg0; > + hr->sprg[1] = vcpu->arch.shregs.sprg1; > + hr->sprg[2] = vcpu->arch.shregs.sprg2; > + hr->sprg[3] = vcpu->arch.shregs.sprg3; > + hr->pidr = vcpu->arch.pid; > + hr->cfar = vcpu->arch.cfar; > + hr->ppr = vcpu->arch.ppr; > +} > + > +static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, > + struct hv_guest_state *hr) > +{ > + struct kvmppc_vcore *vc = vcpu->arch.vcore; > + > + hr->dpdes = vc->dpdes; > + hr->hfscr = vcpu->arch.hfscr; > + hr->purr = vcpu->arch.purr; > + hr->spurr = vcpu->arch.spurr; > + hr->ic = vcpu->arch.ic; > + hr->vtb = vc->vtb; > + hr->srr0 = vcpu->arch.shregs.srr0; > + hr->srr1 = vcpu->arch.shregs.srr1; > + hr->sprg[0] = vcpu->arch.shregs.sprg0; > + hr->sprg[1] = vcpu->arch.shregs.sprg1; > + hr->sprg[2] = vcpu->arch.shregs.sprg2; > + hr->sprg[3] = vcpu->arch.shregs.sprg3; > + hr->pidr = vcpu->arch.pid; > + hr->cfar = vcpu->arch.cfar; > + hr->ppr = vcpu->arch.ppr; > + switch (trap) { > + case BOOK3S_INTERRUPT_H_DATA_STORAGE: > + hr->hdar = vcpu->arch.fault_dar; > + hr->hdsisr = vcpu->arch.fault_dsisr; > + hr->asdr = vcpu->arch.fault_gpa; > + break; > + case BOOK3S_INTERRUPT_H_INST_STORAGE: > + hr->asdr = vcpu->arch.fault_gpa; > + break; > + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: > + hr->heir = vcpu->arch.emul_inst; > + break; > + } > +} > + > +static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) > +{ > + struct kvmppc_vcore *vc = vcpu->arch.vcore; > + > + vc->pcr = hr->pcr; > + vc->dpdes = hr->dpdes; > + vcpu->arch.hfscr = hr->hfscr; > + vcpu->arch.dawr = hr->dawr0; > + vcpu->arch.dawrx = hr->dawrx0; > + vcpu->arch.ciabr = hr->ciabr; > + vcpu->arch.purr = hr->purr; > + vcpu->arch.spurr = hr->spurr; > + vcpu->arch.ic = hr->ic; > + vc->vtb = hr->vtb; > + vcpu->arch.shregs.srr0 = hr->srr0; > + vcpu->arch.shregs.srr1 = hr->srr1; > + vcpu->arch.shregs.sprg0 = hr->sprg[0]; > + vcpu->arch.shregs.sprg1 = hr->sprg[1]; > + vcpu->arch.shregs.sprg2 = hr->sprg[2]; > + vcpu->arch.shregs.sprg3 = hr->sprg[3]; > + vcpu->arch.pid = hr->pidr; > + vcpu->arch.cfar = hr->cfar; > + vcpu->arch.ppr = hr->ppr; > +} > + > +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, > + struct hv_guest_state *hr) > +{ > + struct kvmppc_vcore *vc = vcpu->arch.vcore; > + > + vc->dpdes = hr->dpdes; > + vcpu->arch.hfscr = hr->hfscr; > + vcpu->arch.purr = hr->purr; > + vcpu->arch.spurr = hr->spurr; > + vcpu->arch.ic = hr->ic; > + vc->vtb = hr->vtb; > + vcpu->arch.fault_dar = hr->hdar; > + vcpu->arch.fault_dsisr = hr->hdsisr; > + vcpu->arch.fault_gpa = hr->asdr; > + vcpu->arch.emul_inst = hr->heir; > + vcpu->arch.shregs.srr0 = hr->srr0; > + vcpu->arch.shregs.srr1 = hr->srr1; > + vcpu->arch.shregs.sprg0 = hr->sprg[0]; > + vcpu->arch.shregs.sprg1 = hr->sprg[1]; > + vcpu->arch.shregs.sprg2 = hr->sprg[2]; > + vcpu->arch.shregs.sprg3 = hr->sprg[3]; > + vcpu->arch.pid = hr->pidr; > + vcpu->arch.cfar = hr->cfar; > + vcpu->arch.ppr = hr->ppr; > +} > + > +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) > +{ > + long int err, r; > + struct kvm_nested_guest *l2; > + struct pt_regs l2_regs, saved_l1_regs; > + struct hv_guest_state l2_hv, saved_l1_hv; > + struct kvmppc_vcore *vc = vcpu->arch.vcore; > + u64 hv_ptr, regs_ptr; > + u64 hdec_exp; > + s64 delta_purr, delta_spurr, delta_ic, delta_vtb; > + u64 mask; > + > + if (!kvm_is_radix(vcpu->kvm)) > + return H_FUNCTION; Would it be safer / cleaner to have this instead check that the L1 has completed an H_SET_PARTITION_TABLE? Which wouldn't be allowed for an HPT guest. > + > + /* copy parameters in */ > + hv_ptr = kvmppc_get_gpr(vcpu, 4); > + err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv, > + sizeof(struct hv_guest_state)); > + if (err) > + return H_PARAMETER; > + if (l2_hv.version != HV_GUEST_STATE_VERSION) > + return H_P2; > + > + regs_ptr = kvmppc_get_gpr(vcpu, 5); > + err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs, > + sizeof(struct pt_regs)); > + if (err) > + return H_PARAMETER; > + > + /* translate lpid */ > + l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true); > + if (!l2) > + return H_PARAMETER; > + if (!l2->l1_gr_to_hr) { > + mutex_lock(&l2->tlb_lock); > + kvmhv_update_ptbl_cache(l2); > + mutex_unlock(&l2->tlb_lock); > + } > + > + /* save l1 values of things */ > + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; > + saved_l1_regs = vcpu->arch.regs; > + kvmhv_save_hv_regs(vcpu, &saved_l1_hv); > + > + /* convert TB values/offsets to host (L0) values */ > + hdec_exp = l2_hv.hdec_expiry - vc->tb_offset; > + vc->tb_offset += l2_hv.tb_offset; > + > + /* set L1 state to L2 state */ > + vcpu->arch.nested = l2; > + vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; > + vcpu->arch.regs = l2_regs; > + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; > + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | > + LPCR_LPES | LPCR_MER; > + vc->lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask); > + restore_hv_regs(vcpu, &l2_hv); > + > + vcpu->arch.ret = RESUME_GUEST; > + vcpu->arch.trap = 0; > + do { > + if (mftb() >= hdec_exp) { > + vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER; > + r = RESUME_HOST; > + break; > + } > + r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp); > + } while (is_kvmppc_resume_guest(r)); > + > + /* save L2 state for return */ > + l2_regs = vcpu->arch.regs; > + l2_regs.msr = vcpu->arch.shregs.msr; > + delta_purr = vcpu->arch.purr - l2_hv.purr; > + delta_spurr = vcpu->arch.spurr - l2_hv.spurr; > + delta_ic = vcpu->arch.ic - l2_hv.ic; > + delta_vtb = vc->vtb - l2_hv.vtb; > + save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); > + > + /* restore L1 state */ > + vcpu->arch.nested = NULL; > + vcpu->arch.regs = saved_l1_regs; > + vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK; > + /* set L1 MSR TS field according to L2 transaction state */ > + if (l2_regs.msr & MSR_TS_MASK) > + vcpu->arch.shregs.msr |= MSR_TS_S; > + vc->lpcr = saved_l1_hv.lpcr; > + vc->tb_offset = saved_l1_hv.tb_offset; > + restore_hv_regs(vcpu, &saved_l1_hv); > + vcpu->arch.purr += delta_purr; > + vcpu->arch.spurr += delta_spurr; > + vcpu->arch.ic += delta_ic; > + vc->vtb += delta_vtb; > + > + kvmhv_put_nested(l2); > + > + /* copy l2_hv_state and regs back to guest */ > + err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv, > + sizeof(struct hv_guest_state)); > + if (err) > + return H_AUTHORITY; > + err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs, > + sizeof(struct pt_regs)); > + if (err) > + return H_AUTHORITY; > + > + if (r == -EINTR) > + return H_INTERRUPT; > + > + return vcpu->arch.trap; > +} > + > /* Only called when we're not in hypervisor mode */ > bool kvmhv_nested_init(void) > { > @@ -284,3 +509,8 @@ struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid) > return NULL; > return kvm->arch.nested_guests[lpid]; > } > + > +long kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) > +{ > + return RESUME_HOST; > +} > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > index 83efc13..04fcaa4 100644 > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > @@ -2199,6 +2199,10 @@ hcall_try_real_mode: > andi. r0,r11,MSR_PR > /* sc 1 from userspace - reflect to guest syscall */ > bne sc_1_fast_return > + /* sc 1 from nested guest - give it to L1 to handle */ > + ld r0, VCPU_NESTED(r9) > + cmpdi r0, 0 > + bne guest_exit_cont > clrrdi r3,r3,2 > cmpldi r3,hcall_real_table_end - hcall_real_table > bge guest_exit_cont

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 9afaa82..dfcf43d 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -487,6 +487,42 @@ struct h_cpu_char_result { u64 behaviour; }; +/* Register state for entering a nested guest with H_ENTER_NESTED */ +struct hv_guest_state { + u64 version; /* version of this structure layout */ + u32 lpid; + u32 vcpu_token; + /* These registers are hypervisor privileged (at least for writing) */ + u64 lpcr; + u64 pcr; + u64 amor; + u64 dpdes; + u64 hfscr; + s64 tb_offset; + u64 dawr0; + u64 dawrx0; + u64 ciabr; + u64 hdec_expiry; + u64 purr; + u64 spurr; + u64 ic; + u64 vtb; + u64 hdar; + u64 hdsisr; + u64 heir; + u64 asdr; + /* These are OS privileged but need to be set late in guest entry */ + u64 srr0; + u64 srr1; + u64 sprg[4]; + u64 pidr; + u64 cfar; + u64 ppr; +}; + +/* Latest version of hv_guest_state structure */ +#define HV_GUEST_STATE_VERSION 1 + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 7719ca5..125bc5b 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -280,6 +280,13 @@ void kvmhv_vm_nested_init(struct kvm *kvm); long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1); void kvmhv_release_all_nested(struct kvm *kvm); +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu, + u64 time_limit); +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, + struct hv_guest_state *hr); +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu); void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index c35d4f2..ceb9f20 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -95,6 +95,7 @@ struct dtl_entry; struct kvmppc_vcpu_book3s; struct kvmppc_book3s_shadow_vcpu; +struct kvm_nested_guest; struct kvm_vm_stat { ulong remote_tlb_flush; @@ -786,6 +787,10 @@ struct kvm_vcpu_arch { u32 emul_inst; u32 online; + + /* For support of nested guests */ + struct kvm_nested_guest *nested; + u32 nested_vcpu_id; #endif #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 7c3738d..d0abcbb 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -503,6 +503,7 @@ int main(void) OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty); OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst); + OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested); OFFSET(VCPU_CPU, kvm_vcpu, cpu); OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu); #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 82c9a1e..da380cb 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -942,6 +942,13 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; case H_ENTER_NESTED: ret = H_FUNCTION; + if (!vcpu->kvm->arch.nested_enable) + break; + ret = kvmhv_enter_nested_guest(vcpu); + if (ret == H_INTERRUPT) { + kvmppc_set_gpr(vcpu, 3, 0); + return RESUME_HOST; + } break; default: @@ -1269,6 +1276,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, return r; } +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) +{ + int r; + int srcu_idx; + + vcpu->stat.sum_exits++; + + /* + * This can happen if an interrupt occurs in the last stages + * of guest entry or the first stages of guest exit (i.e. after + * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV + * and before setting it to KVM_GUEST_MODE_HOST_HV). + * That can happen due to a bug, or due to a machine check + * occurring at just the wrong time. + */ + if (vcpu->arch.shregs.msr & MSR_HV) { + printk(KERN_EMERG "KVM trap in HV mode while nested!\n"); + printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", + vcpu->arch.trap, kvmppc_get_pc(vcpu), + vcpu->arch.shregs.msr); + kvmppc_dump_regs(vcpu); + BUG(); + } + switch (vcpu->arch.trap) { + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_HV_DECREMENTER: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + vcpu->stat.ext_intr_exits++; + r = RESUME_HOST; + break; + case BOOK3S_INTERRUPT_H_DOORBELL: + case BOOK3S_INTERRUPT_H_VIRT: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ + case BOOK3S_INTERRUPT_HMI: + case BOOK3S_INTERRUPT_PERFMON: + case BOOK3S_INTERRUPT_SYSTEM_RESET: + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* Pass the machine check to the L1 guest */ + r = RESUME_HOST; + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false); + break; + /* + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresonding + * host page has been paged out. + */ + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmhv_nested_page_fault(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); + vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) & + DSISR_SRR1_MATCH_64S; + if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) + vcpu->arch.fault_dsisr |= DSISR_ISSTORE; + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmhv_nested_page_fault(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + break; + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case BOOK3S_INTERRUPT_HV_SOFTPATCH: + /* + * This occurs for various TM-related instructions that + * we need to emulate on POWER9 DD2.2. We have already + * handled the cases where the guest was in real-suspend + * mode and was transitioning to transactional state. + */ + r = kvmhv_p9_tm_emulation(vcpu); + break; +#endif + + case BOOK3S_INTERRUPT_HV_RM_HARD: + vcpu->arch.trap = 0; + r = RESUME_GUEST; + if (!xive_enabled()) + kvmppc_xics_rm_complete(vcpu, 0); + break; + default: + r = RESUME_HOST; + break; + } + + return r; +} + static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -3095,7 +3200,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* * Load up hypervisor-mode registers on P9. */ -static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu) +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit) { struct kvmppc_vcore *vc = vcpu->arch.vcore; s64 hdec; @@ -3108,7 +3213,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu) unsigned long host_psscr = mfspr(SPRN_PSSCR); unsigned long host_pidr = mfspr(SPRN_PID); - hdec = local_paca->kvm_hstate.dec_expires - mftb(); + hdec = time_limit - mftb(); if (hdec < 0) return BOOK3S_INTERRUPT_HV_DECREMENTER; mtspr(SPRN_HDEC, hdec); @@ -3222,7 +3327,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu) * Virtual-mode guest entry for POWER9 and later when the host and * guest are both using the radix MMU. The LPIDR has already been set. */ -int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu) +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit) { struct kvmppc_vcore *vc = vcpu->arch.vcore; unsigned long host_dscr = mfspr(SPRN_DSCR); @@ -3237,6 +3342,8 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu) if (dec < 512) return BOOK3S_INTERRUPT_HV_DECREMENTER; local_paca->kvm_hstate.dec_expires = dec + tb; + if (local_paca->kvm_hstate.dec_expires < time_limit) + time_limit = local_paca->kvm_hstate.dec_expires; vcpu->arch.ceded = 0; @@ -3290,7 +3397,28 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu) vcpu->arch.doorbell_request = 0; } - trap = kvmhv_load_hv_regs_and_go(vcpu); + if (!cpu_has_feature(CPU_FTR_HVMODE)) { + /* call our hypervisor to load up HV regs and go */ + struct hv_guest_state hvregs; + + kvmhv_save_hv_regs(vcpu, &hvregs); + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; + hvregs.version = HV_GUEST_STATE_VERSION; + hvregs.lpid = vcpu->kvm->arch.lpid; + hvregs.vcpu_token = vcpu->vcpu_id; + hvregs.hdec_expiry = time_limit; + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, + &vcpu->arch.pending_exceptions)) + hvregs.lpcr |= LPCR_MER; + trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), + __pa(&vcpu->arch.regs)); + kvmhv_restore_hv_return_state(vcpu, &hvregs); + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; + vcpu->arch.shregs.dar = mfspr(SPRN_DAR); + vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); + } else { + trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit); + } vcpu->arch.slb_max = 0; dec = mfspr(SPRN_DEC); @@ -3530,6 +3658,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) trace_kvmppc_vcore_wakeup(do_sleep, block_ns); } +/* + * This is assumed not to be able to fail for a radix guest in + * kvmhv_run_single_vcpu(). + */ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) { int r = 0; @@ -3679,12 +3811,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return vcpu->arch.ret; } -static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, - struct kvm_vcpu *vcpu) +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, + struct kvm_vcpu *vcpu, u64 time_limit) { int trap, r, pcpu, pcpu0; int srcu_idx; struct kvmppc_vcore *vc; + struct kvm_nested_guest *nested = vcpu->arch.nested; + unsigned long lpid; trace_kvmppc_run_vcpu_enter(vcpu); @@ -3705,16 +3839,8 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, vc->runner = vcpu; /* See if the MMU is ready to go */ - if (!vcpu->kvm->arch.mmu_ready) { - r = kvmhv_setup_mmu(vcpu); - if (r) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry. - hardware_entry_failure_reason = 0; - vcpu->arch.ret = r; - goto out; - } - } + if (!vcpu->kvm->arch.mmu_ready) + kvmhv_setup_mmu(vcpu); if (need_resched()) cond_resched(); @@ -3736,7 +3862,12 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, if (lazy_irq_pending() || need_resched() || !vcpu->kvm->arch.mmu_ready) goto out; - kvmppc_core_prepare_to_enter(vcpu); + if (!nested) { + kvmppc_core_prepare_to_enter(vcpu); + } else if (vcpu->arch.pending_exceptions) { + vcpu->arch.ret = RESUME_HOST; + goto out; + } kvmppc_clear_host_core(pcpu); @@ -3750,7 +3881,10 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, vc->vcore_state = VCORE_RUNNING; trace_kvmppc_run_core(vc, 0); - mtspr(SPRN_LPID, vc->kvm->arch.lpid); + lpid = vc->kvm->arch.lpid; + if (nested) + lpid = nested->shadow_lpid; + mtspr(SPRN_LPID, lpid); isync(); /* See comment above in kvmppc_run_core() about this */ @@ -3759,7 +3893,7 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, pcpu0 &= ~0x3UL; if (cpumask_test_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush)) { - radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid); + radix__local_flush_tlb_lpid_guest(lpid); /* Clear the bit after the TLB flush */ cpumask_clear_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush); } @@ -3771,7 +3905,7 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, this_cpu_disable_ftrace(); - trap = kvmhv_p9_guest_entry(vcpu); + trap = kvmhv_p9_guest_entry(vcpu, time_limit); vcpu->arch.trap = trap; this_cpu_enable_ftrace(); @@ -3796,8 +3930,12 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run, trace_kvm_guest_exit(vcpu); r = RESUME_GUEST; - if (trap) - r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); + if (trap) { + if (!nested) + r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); + else + r = kvmppc_handle_nested_exit(vcpu); + } vcpu->arch.ret = r; if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded && @@ -3912,7 +4050,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) do { if (kvm->arch.threads_indep && kvm_is_radix(kvm)) - r = kvmppc_run_single_vcpu(run, vcpu); + r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0); else r = kvmppc_run_vcpu(run, vcpu); @@ -4462,8 +4600,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * On POWER9, we only need to do this if the "indep_threads_mode" * module parameter has been set to N. */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - kvm->arch.threads_indep = indep_threads_mode; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) { + pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n"); + kvm->arch.threads_indep = true; + } else { + kvm->arch.threads_indep = indep_threads_mode; + } + } if (!kvm->arch.threads_indep) kvm_hv_vm_activated(); diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 5fe3ea4..a7f3da9 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -20,6 +20,231 @@ static struct patb_entry *pseries_partition_tb; static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp); +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + hr->lpcr = vc->lpcr; + hr->pcr = vc->pcr; + hr->dpdes = vc->dpdes; + hr->hfscr = vcpu->arch.hfscr; + hr->tb_offset = vc->tb_offset; + hr->dawr0 = vcpu->arch.dawr; + hr->dawrx0 = vcpu->arch.dawrx; + hr->ciabr = vcpu->arch.ciabr; + hr->purr = vcpu->arch.purr; + hr->spurr = vcpu->arch.spurr; + hr->ic = vcpu->arch.ic; + hr->vtb = vc->vtb; + hr->srr0 = vcpu->arch.shregs.srr0; + hr->srr1 = vcpu->arch.shregs.srr1; + hr->sprg[0] = vcpu->arch.shregs.sprg0; + hr->sprg[1] = vcpu->arch.shregs.sprg1; + hr->sprg[2] = vcpu->arch.shregs.sprg2; + hr->sprg[3] = vcpu->arch.shregs.sprg3; + hr->pidr = vcpu->arch.pid; + hr->cfar = vcpu->arch.cfar; + hr->ppr = vcpu->arch.ppr; +} + +static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, + struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + hr->dpdes = vc->dpdes; + hr->hfscr = vcpu->arch.hfscr; + hr->purr = vcpu->arch.purr; + hr->spurr = vcpu->arch.spurr; + hr->ic = vcpu->arch.ic; + hr->vtb = vc->vtb; + hr->srr0 = vcpu->arch.shregs.srr0; + hr->srr1 = vcpu->arch.shregs.srr1; + hr->sprg[0] = vcpu->arch.shregs.sprg0; + hr->sprg[1] = vcpu->arch.shregs.sprg1; + hr->sprg[2] = vcpu->arch.shregs.sprg2; + hr->sprg[3] = vcpu->arch.shregs.sprg3; + hr->pidr = vcpu->arch.pid; + hr->cfar = vcpu->arch.cfar; + hr->ppr = vcpu->arch.ppr; + switch (trap) { + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + hr->hdar = vcpu->arch.fault_dar; + hr->hdsisr = vcpu->arch.fault_dsisr; + hr->asdr = vcpu->arch.fault_gpa; + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + hr->asdr = vcpu->arch.fault_gpa; + break; + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: + hr->heir = vcpu->arch.emul_inst; + break; + } +} + +static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + vc->pcr = hr->pcr; + vc->dpdes = hr->dpdes; + vcpu->arch.hfscr = hr->hfscr; + vcpu->arch.dawr = hr->dawr0; + vcpu->arch.dawrx = hr->dawrx0; + vcpu->arch.ciabr = hr->ciabr; + vcpu->arch.purr = hr->purr; + vcpu->arch.spurr = hr->spurr; + vcpu->arch.ic = hr->ic; + vc->vtb = hr->vtb; + vcpu->arch.shregs.srr0 = hr->srr0; + vcpu->arch.shregs.srr1 = hr->srr1; + vcpu->arch.shregs.sprg0 = hr->sprg[0]; + vcpu->arch.shregs.sprg1 = hr->sprg[1]; + vcpu->arch.shregs.sprg2 = hr->sprg[2]; + vcpu->arch.shregs.sprg3 = hr->sprg[3]; + vcpu->arch.pid = hr->pidr; + vcpu->arch.cfar = hr->cfar; + vcpu->arch.ppr = hr->ppr; +} + +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, + struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + vc->dpdes = hr->dpdes; + vcpu->arch.hfscr = hr->hfscr; + vcpu->arch.purr = hr->purr; + vcpu->arch.spurr = hr->spurr; + vcpu->arch.ic = hr->ic; + vc->vtb = hr->vtb; + vcpu->arch.fault_dar = hr->hdar; + vcpu->arch.fault_dsisr = hr->hdsisr; + vcpu->arch.fault_gpa = hr->asdr; + vcpu->arch.emul_inst = hr->heir; + vcpu->arch.shregs.srr0 = hr->srr0; + vcpu->arch.shregs.srr1 = hr->srr1; + vcpu->arch.shregs.sprg0 = hr->sprg[0]; + vcpu->arch.shregs.sprg1 = hr->sprg[1]; + vcpu->arch.shregs.sprg2 = hr->sprg[2]; + vcpu->arch.shregs.sprg3 = hr->sprg[3]; + vcpu->arch.pid = hr->pidr; + vcpu->arch.cfar = hr->cfar; + vcpu->arch.ppr = hr->ppr; +} + +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) +{ + long int err, r; + struct kvm_nested_guest *l2; + struct pt_regs l2_regs, saved_l1_regs; + struct hv_guest_state l2_hv, saved_l1_hv; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 hv_ptr, regs_ptr; + u64 hdec_exp; + s64 delta_purr, delta_spurr, delta_ic, delta_vtb; + u64 mask; + + if (!kvm_is_radix(vcpu->kvm)) + return H_FUNCTION; + + /* copy parameters in */ + hv_ptr = kvmppc_get_gpr(vcpu, 4); + err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv, + sizeof(struct hv_guest_state)); + if (err) + return H_PARAMETER; + if (l2_hv.version != HV_GUEST_STATE_VERSION) + return H_P2; + + regs_ptr = kvmppc_get_gpr(vcpu, 5); + err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs, + sizeof(struct pt_regs)); + if (err) + return H_PARAMETER; + + /* translate lpid */ + l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true); + if (!l2) + return H_PARAMETER; + if (!l2->l1_gr_to_hr) { + mutex_lock(&l2->tlb_lock); + kvmhv_update_ptbl_cache(l2); + mutex_unlock(&l2->tlb_lock); + } + + /* save l1 values of things */ + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; + saved_l1_regs = vcpu->arch.regs; + kvmhv_save_hv_regs(vcpu, &saved_l1_hv); + + /* convert TB values/offsets to host (L0) values */ + hdec_exp = l2_hv.hdec_expiry - vc->tb_offset; + vc->tb_offset += l2_hv.tb_offset; + + /* set L1 state to L2 state */ + vcpu->arch.nested = l2; + vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; + vcpu->arch.regs = l2_regs; + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + vc->lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask); + restore_hv_regs(vcpu, &l2_hv); + + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + do { + if (mftb() >= hdec_exp) { + vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER; + r = RESUME_HOST; + break; + } + r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp); + } while (is_kvmppc_resume_guest(r)); + + /* save L2 state for return */ + l2_regs = vcpu->arch.regs; + l2_regs.msr = vcpu->arch.shregs.msr; + delta_purr = vcpu->arch.purr - l2_hv.purr; + delta_spurr = vcpu->arch.spurr - l2_hv.spurr; + delta_ic = vcpu->arch.ic - l2_hv.ic; + delta_vtb = vc->vtb - l2_hv.vtb; + save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); + + /* restore L1 state */ + vcpu->arch.nested = NULL; + vcpu->arch.regs = saved_l1_regs; + vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK; + /* set L1 MSR TS field according to L2 transaction state */ + if (l2_regs.msr & MSR_TS_MASK) + vcpu->arch.shregs.msr |= MSR_TS_S; + vc->lpcr = saved_l1_hv.lpcr; + vc->tb_offset = saved_l1_hv.tb_offset; + restore_hv_regs(vcpu, &saved_l1_hv); + vcpu->arch.purr += delta_purr; + vcpu->arch.spurr += delta_spurr; + vcpu->arch.ic += delta_ic; + vc->vtb += delta_vtb; + + kvmhv_put_nested(l2); + + /* copy l2_hv_state and regs back to guest */ + err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv, + sizeof(struct hv_guest_state)); + if (err) + return H_AUTHORITY; + err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs, + sizeof(struct pt_regs)); + if (err) + return H_AUTHORITY; + + if (r == -EINTR) + return H_INTERRUPT; + + return vcpu->arch.trap; +} + /* Only called when we're not in hypervisor mode */ bool kvmhv_nested_init(void) { @@ -284,3 +509,8 @@ struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid) return NULL; return kvm->arch.nested_guests[lpid]; } + +long kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) +{ + return RESUME_HOST; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 83efc13..04fcaa4 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2199,6 +2199,10 @@ hcall_try_real_mode: andi. r0,r11,MSR_PR /* sc 1 from userspace - reflect to guest syscall */ bne sc_1_fast_return + /* sc 1 from nested guest - give it to L1 to handle */ + ld r0, VCPU_NESTED(r9) + cmpdi r0, 0 + bne guest_exit_cont clrrdi r3,r3,2 cmpldi r3,hcall_real_table_end - hcall_real_table bge guest_exit_cont

[RFC,20/32] KVM: PPC: Book3S HV: Nested guest entry via hypercall

Commit Message

Comments

Patch