
[v5,13/14] spapr: nested: Introduce H_GUEST_RUN_VCPU hcall.

Message ID 20240308111940.1617660-14-harshpb@linux.ibm.com (mailing list archive)
State New, archived
Series Nested PAPR API (KVM on PowerVM)

Commit Message

Harsh Prateek Bora March 8, 2024, 11:19 a.m. UTC
The H_GUEST_RUN_VCPU hcall is used to start execution of a Guest VCPU.
The Hypervisor will update the state of the Guest VCPU based on the
input buffer, restore the saved Guest VCPU state, and start its
execution.

The Guest VCPU can stop running for numerous reasons including HCALLs,
hypervisor exceptions, or an outstanding Host Partition Interrupt.
The reason that the Guest VCPU stopped running is communicated through
R4, and the output buffer is filled in with any relevant state.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
---
 target/ppc/cpu.h      |   2 +
 hw/ppc/ppc.c          |  10 ++
 hw/ppc/spapr_nested.c | 334 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 316 insertions(+), 30 deletions(-)
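
For context, the sketch below shows how an L1 hypervisor might drive this
hcall, following the register layout implied by the handler (args[0] = flags,
args[1] = guest ID, args[2] = vCPU ID, i.e. R4/R5/R6 on entry; R3 returns the
hcall status and R4 the reason the vCPU stopped). The opcode value, the
do_hcall() wrapper and the run_l2_vcpu() helper are illustrative assumptions,
not part of this patch or of any particular kernel API; the input/output
guest-state buffers are assumed to have been registered earlier via
H_GUEST_SET_STATE.

    /*
     * Illustrative only: not part of this patch. On return, R3 holds the
     * hcall status and R4 the interrupt vector on which the L2 vCPU
     * stopped (e.g. 0xc00 for an hcall, 0xe00 for an HDSI).
     */
    #include <stdio.h>

    #define H_GUEST_RUN_VCPU 0x480  /* assumed opcode value, for the sketch */
    #define H_SUCCESS        0

    /* Hypothetical hcall wrapper: real code would trap to the hypervisor. */
    static long do_hcall(unsigned long opcode, unsigned long *r4_out,
                         unsigned long flags, unsigned long guest_id,
                         unsigned long vcpu_id)
    {
        (void)opcode; (void)flags; (void)guest_id; (void)vcpu_id;
        *r4_out = 0xc00;        /* pretend the L2 vCPU exited on an hcall */
        return H_SUCCESS;
    }

    /* Run one L2 vCPU and report why it stopped. */
    static long run_l2_vcpu(unsigned long guest_id, unsigned long vcpu_id)
    {
        unsigned long exit_reason;
        long rc;

        /*
         * The run input/output buffers were registered earlier with
         * H_GUEST_SET_STATE; the hypervisor applies the input buffer before
         * entry and fills the output buffer with exit-relevant state.
         */
        rc = do_hcall(H_GUEST_RUN_VCPU, &exit_reason, 0 /* flags */,
                      guest_id, vcpu_id);
        if (rc != H_SUCCESS) {
            return rc;
        }
        printf("L2 vCPU %lu stopped, exit vector 0x%lx\n",
               vcpu_id, exit_reason);
        return 0;
    }

    int main(void)
    {
        return (int)run_l2_vcpu(1 /* guest id */, 0 /* vcpu id */);
    }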

Comments

Nicholas Piggin March 12, 2024, 11:42 a.m. UTC | #1
On Fri Mar 8, 2024 at 9:19 PM AEST, Harsh Prateek Bora wrote:
> The H_GUEST_RUN_VCPU hcall is used to start execution of a Guest VCPU.
> The Hypervisor will update the state of the Guest VCPU based on the
> input buffer, restore the saved Guest VCPU state, and start its
> execution.
>
> The Guest VCPU can stop running for numerous reasons including HCALLs,
> hypervisor exceptions, or an outstanding Host Partition Interrupt.
> The reason that the Guest VCPU stopped running is communicated through
> R4 and the output buffer will be filled in with any relevant state.
>
> Signed-off-by: Michael Neuling <mikey@neuling.org>
> Signed-off-by: Harsh Prateek Bora <harshpb@linux.ibm.com>

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>

> ---
>  target/ppc/cpu.h      |   2 +
>  hw/ppc/ppc.c          |  10 ++
>  hw/ppc/spapr_nested.c | 334 ++++++++++++++++++++++++++++++++++++++----
>  3 files changed, 316 insertions(+), 30 deletions(-)
>
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index 4cffd46c79..95b7c86eb3 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -1584,6 +1584,8 @@ uint64_t cpu_ppc_load_atbl(CPUPPCState *env);
>  uint32_t cpu_ppc_load_atbu(CPUPPCState *env);
>  void cpu_ppc_store_atbl(CPUPPCState *env, uint32_t value);
>  void cpu_ppc_store_atbu(CPUPPCState *env, uint32_t value);
> +void cpu_ppc_increase_tb_by_offset (CPUPPCState *env, int64_t offset);
> +void cpu_ppc_decrease_tb_by_offset (CPUPPCState *env, int64_t offset);
>  uint64_t cpu_ppc_load_vtb(CPUPPCState *env);
>  void cpu_ppc_store_vtb(CPUPPCState *env, uint64_t value);
>  bool ppc_decr_clear_on_delivery(CPUPPCState *env);
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index fadb8f5239..55860b9a83 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -633,6 +633,16 @@ void cpu_ppc_store_atbu (CPUPPCState *env, uint32_t value)
>                       ((uint64_t)value << 32) | tb);
>  }
>  
> +void cpu_ppc_increase_tb_by_offset (CPUPPCState *env, int64_t offset)
> +{
> +    env->tb_env->tb_offset += offset;
> +}
> +
> +void cpu_ppc_decrease_tb_by_offset (CPUPPCState *env, int64_t offset)
> +{
> +    env->tb_env->tb_offset -= offset;
> +}
> +
>  uint64_t cpu_ppc_load_vtb(CPUPPCState *env)
>  {
>      ppc_tb_t *tb_env = env->tb_env;
> diff --git a/hw/ppc/spapr_nested.c b/hw/ppc/spapr_nested.c
> index e0b234c786..597dba7fdc 100644
> --- a/hw/ppc/spapr_nested.c
> +++ b/hw/ppc/spapr_nested.c
> @@ -200,14 +200,28 @@ static void nested_save_state(struct nested_ppc_state *save, PowerPCCPU *cpu)
>          save->sier = env->spr[SPR_POWER_SIER];
>          save->vscr = ppc_get_vscr(env);
>          save->fpscr = env->fpscr;
> +    } else if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
> +        save->tb_offset = env->tb_env->tb_offset;
>      }
> +}
>  
> -    save->tb_offset = env->tb_env->tb_offset;
> +static void nested_post_load_state(CPUPPCState *env, CPUState *cs)
> +{
> +    /*
> +     * compute hflags and possible interrupts.
> +     */
> +    hreg_compute_hflags(env);
> +    ppc_maybe_interrupt(env);
> +    /*
> +     * Nested HV does not tag TLB entries between L1 and L2, so must
> +     * flush on transition.
> +     */
> +    tlb_flush(cs);
> +    env->reserve_addr = -1; /* Reset the reservation */
>  }
>  
>  static void nested_load_state(PowerPCCPU *cpu, struct nested_ppc_state *load)
>  {
> -    CPUState *cs = CPU(cpu);
>      CPUPPCState *env = &cpu->env;
>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>  
> @@ -285,22 +299,9 @@ static void nested_load_state(PowerPCCPU *cpu, struct nested_ppc_state *load)
>          env->spr[SPR_POWER_SIER] = load->sier;
>          ppc_store_vscr(env, load->vscr);
>          ppc_store_fpscr(env, load->fpscr);
> +    } else if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
> +        env->tb_env->tb_offset = load->tb_offset;
>      }
> -
> -    env->tb_env->tb_offset = load->tb_offset;
> -
> -    /*
> -     * MSR updated, compute hflags and possible interrupts.
> -     */
> -    hreg_compute_hflags(env);
> -    ppc_maybe_interrupt(env);
> -
> -    /*
> -     * Nested HV does not tag TLB entries between L1 and L2, so must
> -     * flush on transition.
> -     */
> -    tlb_flush(cs);
> -    env->reserve_addr = -1; /* Reset the reservation */
>  }
>  
>  /*
> @@ -315,6 +316,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
>  {
>      PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
>      CPUPPCState *env = &cpu->env;
> +    CPUState *cs = CPU(cpu);
>      SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
>      struct nested_ppc_state l2_state;
>      target_ulong hv_ptr = args[0];
> @@ -413,6 +415,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
>       * Switch to the nested guest environment and start the "hdec" timer.
>       */
>      nested_load_state(cpu, &l2_state);
> +    nested_post_load_state(env, cs);
>  
>      hdec = hv_state.hdec_expiry - now;
>      cpu_ppc_hdecr_init(env);
> @@ -444,6 +447,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
>  static void spapr_exit_nested_hv(PowerPCCPU *cpu, int excp)
>  {
>      CPUPPCState *env = &cpu->env;
> +    CPUState *cs = CPU(cpu);
>      SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
>      struct nested_ppc_state l2_state;
>      target_ulong hv_ptr = spapr_cpu->nested_host_state->gpr[4];
> @@ -465,6 +469,7 @@ static void spapr_exit_nested_hv(PowerPCCPU *cpu, int excp)
>       */
>      assert(env->spr[SPR_LPIDR] != 0);
>      nested_load_state(cpu, spapr_cpu->nested_host_state);
> +    nested_post_load_state(env, cs);
>      env->gpr[3] = env->excp_vectors[excp]; /* hcall return value */
>  
>      cpu_ppc_hdecr_exit(env);
> @@ -542,19 +547,6 @@ static void spapr_exit_nested_hv(PowerPCCPU *cpu, int excp)
>      address_space_unmap(CPU(cpu)->as, regs, len, len, true);
>  }
>  
> -void spapr_exit_nested(PowerPCCPU *cpu, int excp)
> -{
> -    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> -    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> -
> -    assert(spapr_cpu->in_nested);
> -    if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
> -        spapr_exit_nested_hv(cpu, excp);
> -    } else {
> -        g_assert_not_reached();
> -    }
> -}
> -
>  SpaprMachineStateNestedGuest *spapr_get_nested_guest(SpaprMachineState *spapr,
>                                                       target_ulong guestid)
>  {
> @@ -1539,6 +1531,286 @@ static target_ulong h_guest_get_state(PowerPCCPU *cpu,
>      return h_guest_getset_state(cpu, spapr, args, false);
>  }
>  
> +static void exit_nested_store_l2(PowerPCCPU *cpu, int excp,
> +                                 SpaprMachineStateNestedGuestVcpu *vcpu)
> +{
> +    CPUPPCState *env = &cpu->env;
> +    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> +    target_ulong now, hdar, hdsisr, asdr;
> +
> +    assert(sizeof(env->gpr) == sizeof(vcpu->state.gpr)); /* sanity check */
> +
> +    now = cpu_ppc_load_tbl(env); /* L2 timebase */
> +    now -= vcpu->tb_offset; /* L1 timebase */
> +    vcpu->state.dec_expiry_tb = now - cpu_ppc_load_decr(env);
> +    cpu_ppc_store_decr(env, spapr_cpu->nested_host_state->dec_expiry_tb - now);
> +    /* backup hdar, hdsisr, asdr if reqd later below */
> +    hdar   = vcpu->state.hdar;
> +    hdsisr = vcpu->state.hdsisr;
> +    asdr   = vcpu->state.asdr;
> +
> +    nested_save_state(&vcpu->state, cpu);
> +
> +    if (excp == POWERPC_EXCP_MCHECK ||
> +        excp == POWERPC_EXCP_RESET ||
> +        excp == POWERPC_EXCP_SYSCALL) {
> +        vcpu->state.nip = env->spr[SPR_SRR0];
> +        vcpu->state.msr = env->spr[SPR_SRR1] & env->msr_mask;
> +    } else {
> +        vcpu->state.nip = env->spr[SPR_HSRR0];
> +        vcpu->state.msr = env->spr[SPR_HSRR1] & env->msr_mask;
> +    }
> +
> +    /* hdar, hdsisr, asdr should be retained unless certain exceptions */
> +    if ((excp != POWERPC_EXCP_HDSI) && (excp != POWERPC_EXCP_HISI)) {
> +        vcpu->state.asdr = asdr;
> +    } else if (excp != POWERPC_EXCP_HDSI) {
> +        vcpu->state.hdar   = hdar;
> +        vcpu->state.hdsisr = hdsisr;
> +    }
> +}
> +
> +static int get_exit_ids(uint64_t srr0, uint16_t ids[16])
> +{
> +    int nr;
> +
> +    switch (srr0) {
> +    case 0xc00:
> +        nr = 10;
> +        ids[0] = GSB_VCPU_GPR3;
> +        ids[1] = GSB_VCPU_GPR4;
> +        ids[2] = GSB_VCPU_GPR5;
> +        ids[3] = GSB_VCPU_GPR6;
> +        ids[4] = GSB_VCPU_GPR7;
> +        ids[5] = GSB_VCPU_GPR8;
> +        ids[6] = GSB_VCPU_GPR9;
> +        ids[7] = GSB_VCPU_GPR10;
> +        ids[8] = GSB_VCPU_GPR11;
> +        ids[9] = GSB_VCPU_GPR12;
> +        break;
> +    case 0xe00:
> +        nr = 5;
> +        ids[0] = GSB_VCPU_SPR_HDAR;
> +        ids[1] = GSB_VCPU_SPR_HDSISR;
> +        ids[2] = GSB_VCPU_SPR_ASDR;
> +        ids[3] = GSB_VCPU_SPR_NIA;
> +        ids[4] = GSB_VCPU_SPR_MSR;
> +        break;
> +    case 0xe20:
> +        nr = 4;
> +        ids[0] = GSB_VCPU_SPR_HDAR;
> +        ids[1] = GSB_VCPU_SPR_ASDR;
> +        ids[2] = GSB_VCPU_SPR_NIA;
> +        ids[3] = GSB_VCPU_SPR_MSR;
> +        break;
> +    case 0xe40:
> +        nr = 3;
> +        ids[0] = GSB_VCPU_SPR_HEIR;
> +        ids[1] = GSB_VCPU_SPR_NIA;
> +        ids[2] = GSB_VCPU_SPR_MSR;
> +        break;
> +    case 0xf80:
> +        nr = 3;
> +        ids[0] = GSB_VCPU_SPR_HFSCR;
> +        ids[1] = GSB_VCPU_SPR_NIA;
> +        ids[2] = GSB_VCPU_SPR_MSR;
> +        break;
> +    default:
> +        nr = 0;
> +        break;
> +    }
> +
> +    return nr;
> +}
> +
> +static void exit_process_output_buffer(PowerPCCPU *cpu,
> +                                       SpaprMachineStateNestedGuest *guest,
> +                                       target_ulong vcpuid,
> +                                       target_ulong *r3)
> +{
> +    SpaprMachineStateNestedGuestVcpu *vcpu = &guest->vcpus[vcpuid];
> +    struct guest_state_request gsr;
> +    struct guest_state_buffer *gsb;
> +    struct guest_state_element *element;
> +    struct guest_state_element_type *type;
> +    int exit_id_count = 0;
> +    uint16_t exit_cause_ids[16];
> +    hwaddr len;
> +
> +    len = vcpu->runbufout.size;
> +    gsb = address_space_map(CPU(cpu)->as, vcpu->runbufout.addr, &len, true,
> +                            MEMTXATTRS_UNSPECIFIED);
> +    if (!gsb || len != vcpu->runbufout.size) {
> +        address_space_unmap(CPU(cpu)->as, gsb, len, true, len);
> +        *r3 = H_P2;
> +        return;
> +    }
> +
> +    exit_id_count = get_exit_ids(*r3, exit_cause_ids);
> +
> +    /* Create a buffer of elements to send back */
> +    gsb->num_elements = cpu_to_be32(exit_id_count);
> +    element = gsb->elements;
> +    for (int i = 0; i < exit_id_count; i++) {
> +        type = guest_state_element_type_find(exit_cause_ids[i]);
> +        assert(type);
> +        element->id = cpu_to_be16(exit_cause_ids[i]);
> +        element->size = cpu_to_be16(type->size);
> +        element = guest_state_element_next(element, NULL, NULL);
> +    }
> +    gsr.gsb = gsb;
> +    gsr.len = VCPU_OUT_BUF_MIN_SZ;
> +    gsr.flags = 0; /* get + never guest wide */
> +    getset_state(guest, vcpuid, &gsr);
> +
> +    address_space_unmap(CPU(cpu)->as, gsb, len, true, len);
> +    return;
> +}
> +
> +static
> +void spapr_exit_nested_papr(SpaprMachineState *spapr, PowerPCCPU *cpu, int excp)
> +{
> +    CPUPPCState *env = &cpu->env;
> +    CPUState *cs = CPU(cpu);
> +    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> +    target_ulong r3_return = env->excp_vectors[excp]; /* hcall return value */
> +    target_ulong lpid = 0, vcpuid = 0;
> +    struct SpaprMachineStateNestedGuestVcpu *vcpu = NULL;
> +    struct SpaprMachineStateNestedGuest *guest = NULL;
> +
> +    lpid = spapr_cpu->nested_host_state->gpr[5];
> +    vcpuid = spapr_cpu->nested_host_state->gpr[6];
> +    guest = spapr_get_nested_guest(spapr, lpid);
> +    assert(guest);
> +    spapr_nested_vcpu_check(guest, vcpuid, false);
> +    vcpu = &guest->vcpus[vcpuid];
> +
> +    exit_nested_store_l2(cpu, excp, vcpu);
> +    /* do the output buffer for run_vcpu*/
> +    exit_process_output_buffer(cpu, guest, vcpuid, &r3_return);
> +
> +    assert(env->spr[SPR_LPIDR] != 0);
> +    nested_load_state(cpu, spapr_cpu->nested_host_state);
> +    cpu_ppc_decrease_tb_by_offset(env, vcpu->tb_offset);
> +    env->gpr[3] = H_SUCCESS;
> +    env->gpr[4] = r3_return;
> +    nested_post_load_state(env, cs);
> +    cpu_ppc_hdecr_exit(env);
> +
> +    spapr_cpu->in_nested = false;
> +    g_free(spapr_cpu->nested_host_state);
> +    spapr_cpu->nested_host_state = NULL;
> +}
> +
> +void spapr_exit_nested(PowerPCCPU *cpu, int excp)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> +
> +    assert(spapr_cpu->in_nested);
> +    if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
> +        spapr_exit_nested_hv(cpu, excp);
> +    } else if (spapr_nested_api(spapr) == NESTED_API_PAPR) {
> +        spapr_exit_nested_papr(spapr, cpu, excp);
> +    } else {
> +        g_assert_not_reached();
> +    }
> +}
> +
> +static void nested_papr_load_l2(PowerPCCPU *cpu,
> +                                CPUPPCState *env,
> +                                SpaprMachineStateNestedGuestVcpu *vcpu,
> +                                target_ulong now)
> +{
> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> +    target_ulong lpcr, lpcr_mask, hdec;
> +    lpcr_mask = LPCR_DPFD | LPCR_ILE | LPCR_AIL | LPCR_LD | LPCR_MER;
> +
> +    assert(vcpu);
> +    assert(sizeof(env->gpr) == sizeof(vcpu->state.gpr));
> +    nested_load_state(cpu, &vcpu->state);
> +    lpcr = (env->spr[SPR_LPCR] & ~lpcr_mask) |
> +           (vcpu->state.lpcr & lpcr_mask);
> +    lpcr |= LPCR_HR | LPCR_UPRT | LPCR_GTSE | LPCR_HVICE | LPCR_HDICE;
> +    lpcr &= ~LPCR_LPES0;
> +    env->spr[SPR_LPCR] = lpcr & pcc->lpcr_mask;
> +
> +    hdec = vcpu->hdecr_expiry_tb - now;
> +    cpu_ppc_store_decr(env, vcpu->state.dec_expiry_tb - now);
> +    cpu_ppc_hdecr_init(env);
> +    cpu_ppc_store_hdecr(env, hdec);
> +
> +    cpu_ppc_increase_tb_by_offset(env, vcpu->tb_offset);
> +}
> +
> +static void nested_papr_run_vcpu(PowerPCCPU *cpu,
> +                                 uint64_t lpid,
> +                                 SpaprMachineStateNestedGuestVcpu *vcpu)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    CPUPPCState *env = &cpu->env;
> +    CPUState *cs = CPU(cpu);
> +    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> +    target_ulong now = cpu_ppc_load_tbl(env);
> +
> +    assert(env->spr[SPR_LPIDR] == 0);
> +    assert(spapr->nested.api); /* ensure API version is initialized */
> +    spapr_cpu->nested_host_state = g_try_new(struct nested_ppc_state, 1);
> +    assert(spapr_cpu->nested_host_state);
> +    nested_save_state(spapr_cpu->nested_host_state, cpu);
> +    spapr_cpu->nested_host_state->dec_expiry_tb = now - cpu_ppc_load_decr(env);
> +    nested_papr_load_l2(cpu, env, vcpu, now);
> +    env->spr[SPR_LPIDR] = lpid; /* post load l2 */
> +
> +    spapr_cpu->in_nested = true;
> +    nested_post_load_state(env, cs);
> +}
> +
> +static target_ulong h_guest_run_vcpu(PowerPCCPU *cpu,
> +                                     SpaprMachineState *spapr,
> +                                     target_ulong opcode,
> +                                     target_ulong *args)
> +{
> +    CPUPPCState *env = &cpu->env;
> +    target_ulong flags = args[0];
> +    target_ulong lpid = args[1];
> +    target_ulong vcpuid = args[2];
> +    struct SpaprMachineStateNestedGuestVcpu *vcpu;
> +    struct guest_state_request gsr;
> +    SpaprMachineStateNestedGuest *guest;
> +    target_ulong rc;
> +
> +    if (flags) /* don't handle any flags for now */
> +        return H_PARAMETER;
> +
> +    guest = spapr_get_nested_guest(spapr, lpid);
> +    if (!guest) {
> +        return H_P2;
> +    }
> +    if (!spapr_nested_vcpu_check(guest, vcpuid, true)) {
> +        return H_P3;
> +    }
> +
> +    if (guest->parttbl[0] == 0) {
> +        /* At least need a partition scoped radix tree */
> +        return H_NOT_AVAILABLE;
> +    }
> +
> +    vcpu = &guest->vcpus[vcpuid];
> +
> +    /* Read run_vcpu input buffer to update state */
> +    gsr.buf = vcpu->runbufin.addr;
> +    gsr.len = vcpu->runbufin.size;
> +    gsr.flags = GUEST_STATE_REQUEST_SET; /* Thread wide + writing */
> +    rc = map_and_getset_state(cpu, guest, vcpuid, &gsr);
> +    if (rc == H_SUCCESS) {
> +        nested_papr_run_vcpu(cpu, lpid, vcpu);
> +    } else {
> +        env->gpr[3] = rc;
> +    }
> +    return env->gpr[3];
> +}
> +
>  void spapr_register_nested_hv(void)
>  {
>      spapr_register_hypercall(KVMPPC_H_SET_PARTITION_TABLE, h_set_ptbl);
> @@ -1564,6 +1836,7 @@ void spapr_register_nested_papr(void)
>      spapr_register_hypercall(H_GUEST_CREATE_VCPU     , h_guest_create_vcpu);
>      spapr_register_hypercall(H_GUEST_SET_STATE       , h_guest_set_state);
>      spapr_register_hypercall(H_GUEST_GET_STATE       , h_guest_get_state);
> +    spapr_register_hypercall(H_GUEST_RUN_VCPU        , h_guest_run_vcpu);
>  }
>  
>  void spapr_unregister_nested_papr(void)
> @@ -1575,6 +1848,7 @@ void spapr_unregister_nested_papr(void)
>      spapr_unregister_hypercall(H_GUEST_CREATE_VCPU);
>      spapr_unregister_hypercall(H_GUEST_SET_STATE);
>      spapr_unregister_hypercall(H_GUEST_GET_STATE);
> +    spapr_unregister_hypercall(H_GUEST_RUN_VCPU);
>  }
>  
>  #else

Patch

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 4cffd46c79..95b7c86eb3 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1584,6 +1584,8 @@  uint64_t cpu_ppc_load_atbl(CPUPPCState *env);
 uint32_t cpu_ppc_load_atbu(CPUPPCState *env);
 void cpu_ppc_store_atbl(CPUPPCState *env, uint32_t value);
 void cpu_ppc_store_atbu(CPUPPCState *env, uint32_t value);
+void cpu_ppc_increase_tb_by_offset (CPUPPCState *env, int64_t offset);
+void cpu_ppc_decrease_tb_by_offset (CPUPPCState *env, int64_t offset);
 uint64_t cpu_ppc_load_vtb(CPUPPCState *env);
 void cpu_ppc_store_vtb(CPUPPCState *env, uint64_t value);
 bool ppc_decr_clear_on_delivery(CPUPPCState *env);
diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index fadb8f5239..55860b9a83 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -633,6 +633,16 @@  void cpu_ppc_store_atbu (CPUPPCState *env, uint32_t value)
                      ((uint64_t)value << 32) | tb);
 }
 
+void cpu_ppc_increase_tb_by_offset (CPUPPCState *env, int64_t offset)
+{
+    env->tb_env->tb_offset += offset;
+}
+
+void cpu_ppc_decrease_tb_by_offset (CPUPPCState *env, int64_t offset)
+{
+    env->tb_env->tb_offset -= offset;
+}
+
 uint64_t cpu_ppc_load_vtb(CPUPPCState *env)
 {
     ppc_tb_t *tb_env = env->tb_env;
diff --git a/hw/ppc/spapr_nested.c b/hw/ppc/spapr_nested.c
index e0b234c786..597dba7fdc 100644
--- a/hw/ppc/spapr_nested.c
+++ b/hw/ppc/spapr_nested.c
@@ -200,14 +200,28 @@  static void nested_save_state(struct nested_ppc_state *save, PowerPCCPU *cpu)
         save->sier = env->spr[SPR_POWER_SIER];
         save->vscr = ppc_get_vscr(env);
         save->fpscr = env->fpscr;
+    } else if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
+        save->tb_offset = env->tb_env->tb_offset;
     }
+}
 
-    save->tb_offset = env->tb_env->tb_offset;
+static void nested_post_load_state(CPUPPCState *env, CPUState *cs)
+{
+    /*
+     * compute hflags and possible interrupts.
+     */
+    hreg_compute_hflags(env);
+    ppc_maybe_interrupt(env);
+    /*
+     * Nested HV does not tag TLB entries between L1 and L2, so must
+     * flush on transition.
+     */
+    tlb_flush(cs);
+    env->reserve_addr = -1; /* Reset the reservation */
 }
 
 static void nested_load_state(PowerPCCPU *cpu, struct nested_ppc_state *load)
 {
-    CPUState *cs = CPU(cpu);
     CPUPPCState *env = &cpu->env;
     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
 
@@ -285,22 +299,9 @@  static void nested_load_state(PowerPCCPU *cpu, struct nested_ppc_state *load)
         env->spr[SPR_POWER_SIER] = load->sier;
         ppc_store_vscr(env, load->vscr);
         ppc_store_fpscr(env, load->fpscr);
+    } else if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
+        env->tb_env->tb_offset = load->tb_offset;
     }
-
-    env->tb_env->tb_offset = load->tb_offset;
-
-    /*
-     * MSR updated, compute hflags and possible interrupts.
-     */
-    hreg_compute_hflags(env);
-    ppc_maybe_interrupt(env);
-
-    /*
-     * Nested HV does not tag TLB entries between L1 and L2, so must
-     * flush on transition.
-     */
-    tlb_flush(cs);
-    env->reserve_addr = -1; /* Reset the reservation */
 }
 
 /*
@@ -315,6 +316,7 @@  static target_ulong h_enter_nested(PowerPCCPU *cpu,
 {
     PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
     CPUPPCState *env = &cpu->env;
+    CPUState *cs = CPU(cpu);
     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
     struct nested_ppc_state l2_state;
     target_ulong hv_ptr = args[0];
@@ -413,6 +415,7 @@  static target_ulong h_enter_nested(PowerPCCPU *cpu,
      * Switch to the nested guest environment and start the "hdec" timer.
      */
     nested_load_state(cpu, &l2_state);
+    nested_post_load_state(env, cs);
 
     hdec = hv_state.hdec_expiry - now;
     cpu_ppc_hdecr_init(env);
@@ -444,6 +447,7 @@  static target_ulong h_enter_nested(PowerPCCPU *cpu,
 static void spapr_exit_nested_hv(PowerPCCPU *cpu, int excp)
 {
     CPUPPCState *env = &cpu->env;
+    CPUState *cs = CPU(cpu);
     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
     struct nested_ppc_state l2_state;
     target_ulong hv_ptr = spapr_cpu->nested_host_state->gpr[4];
@@ -465,6 +469,7 @@  static void spapr_exit_nested_hv(PowerPCCPU *cpu, int excp)
      */
     assert(env->spr[SPR_LPIDR] != 0);
     nested_load_state(cpu, spapr_cpu->nested_host_state);
+    nested_post_load_state(env, cs);
     env->gpr[3] = env->excp_vectors[excp]; /* hcall return value */
 
     cpu_ppc_hdecr_exit(env);
@@ -542,19 +547,6 @@  static void spapr_exit_nested_hv(PowerPCCPU *cpu, int excp)
     address_space_unmap(CPU(cpu)->as, regs, len, len, true);
 }
 
-void spapr_exit_nested(PowerPCCPU *cpu, int excp)
-{
-    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
-    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
-
-    assert(spapr_cpu->in_nested);
-    if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
-        spapr_exit_nested_hv(cpu, excp);
-    } else {
-        g_assert_not_reached();
-    }
-}
-
 SpaprMachineStateNestedGuest *spapr_get_nested_guest(SpaprMachineState *spapr,
                                                      target_ulong guestid)
 {
@@ -1539,6 +1531,286 @@  static target_ulong h_guest_get_state(PowerPCCPU *cpu,
     return h_guest_getset_state(cpu, spapr, args, false);
 }
 
+static void exit_nested_store_l2(PowerPCCPU *cpu, int excp,
+                                 SpaprMachineStateNestedGuestVcpu *vcpu)
+{
+    CPUPPCState *env = &cpu->env;
+    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
+    target_ulong now, hdar, hdsisr, asdr;
+
+    assert(sizeof(env->gpr) == sizeof(vcpu->state.gpr)); /* sanity check */
+
+    now = cpu_ppc_load_tbl(env); /* L2 timebase */
+    now -= vcpu->tb_offset; /* L1 timebase */
+    vcpu->state.dec_expiry_tb = now - cpu_ppc_load_decr(env);
+    cpu_ppc_store_decr(env, spapr_cpu->nested_host_state->dec_expiry_tb - now);
+    /* backup hdar, hdsisr, asdr if reqd later below */
+    hdar   = vcpu->state.hdar;
+    hdsisr = vcpu->state.hdsisr;
+    asdr   = vcpu->state.asdr;
+
+    nested_save_state(&vcpu->state, cpu);
+
+    if (excp == POWERPC_EXCP_MCHECK ||
+        excp == POWERPC_EXCP_RESET ||
+        excp == POWERPC_EXCP_SYSCALL) {
+        vcpu->state.nip = env->spr[SPR_SRR0];
+        vcpu->state.msr = env->spr[SPR_SRR1] & env->msr_mask;
+    } else {
+        vcpu->state.nip = env->spr[SPR_HSRR0];
+        vcpu->state.msr = env->spr[SPR_HSRR1] & env->msr_mask;
+    }
+
+    /* hdar, hdsisr, asdr should be retained unless certain exceptions */
+    if ((excp != POWERPC_EXCP_HDSI) && (excp != POWERPC_EXCP_HISI)) {
+        vcpu->state.asdr = asdr;
+    } else if (excp != POWERPC_EXCP_HDSI) {
+        vcpu->state.hdar   = hdar;
+        vcpu->state.hdsisr = hdsisr;
+    }
+}
+
+static int get_exit_ids(uint64_t srr0, uint16_t ids[16])
+{
+    int nr;
+
+    switch (srr0) {
+    case 0xc00:
+        nr = 10;
+        ids[0] = GSB_VCPU_GPR3;
+        ids[1] = GSB_VCPU_GPR4;
+        ids[2] = GSB_VCPU_GPR5;
+        ids[3] = GSB_VCPU_GPR6;
+        ids[4] = GSB_VCPU_GPR7;
+        ids[5] = GSB_VCPU_GPR8;
+        ids[6] = GSB_VCPU_GPR9;
+        ids[7] = GSB_VCPU_GPR10;
+        ids[8] = GSB_VCPU_GPR11;
+        ids[9] = GSB_VCPU_GPR12;
+        break;
+    case 0xe00:
+        nr = 5;
+        ids[0] = GSB_VCPU_SPR_HDAR;
+        ids[1] = GSB_VCPU_SPR_HDSISR;
+        ids[2] = GSB_VCPU_SPR_ASDR;
+        ids[3] = GSB_VCPU_SPR_NIA;
+        ids[4] = GSB_VCPU_SPR_MSR;
+        break;
+    case 0xe20:
+        nr = 4;
+        ids[0] = GSB_VCPU_SPR_HDAR;
+        ids[1] = GSB_VCPU_SPR_ASDR;
+        ids[2] = GSB_VCPU_SPR_NIA;
+        ids[3] = GSB_VCPU_SPR_MSR;
+        break;
+    case 0xe40:
+        nr = 3;
+        ids[0] = GSB_VCPU_SPR_HEIR;
+        ids[1] = GSB_VCPU_SPR_NIA;
+        ids[2] = GSB_VCPU_SPR_MSR;
+        break;
+    case 0xf80:
+        nr = 3;
+        ids[0] = GSB_VCPU_SPR_HFSCR;
+        ids[1] = GSB_VCPU_SPR_NIA;
+        ids[2] = GSB_VCPU_SPR_MSR;
+        break;
+    default:
+        nr = 0;
+        break;
+    }
+
+    return nr;
+}
+
+static void exit_process_output_buffer(PowerPCCPU *cpu,
+                                       SpaprMachineStateNestedGuest *guest,
+                                       target_ulong vcpuid,
+                                       target_ulong *r3)
+{
+    SpaprMachineStateNestedGuestVcpu *vcpu = &guest->vcpus[vcpuid];
+    struct guest_state_request gsr;
+    struct guest_state_buffer *gsb;
+    struct guest_state_element *element;
+    struct guest_state_element_type *type;
+    int exit_id_count = 0;
+    uint16_t exit_cause_ids[16];
+    hwaddr len;
+
+    len = vcpu->runbufout.size;
+    gsb = address_space_map(CPU(cpu)->as, vcpu->runbufout.addr, &len, true,
+                            MEMTXATTRS_UNSPECIFIED);
+    if (!gsb || len != vcpu->runbufout.size) {
+        address_space_unmap(CPU(cpu)->as, gsb, len, true, len);
+        *r3 = H_P2;
+        return;
+    }
+
+    exit_id_count = get_exit_ids(*r3, exit_cause_ids);
+
+    /* Create a buffer of elements to send back */
+    gsb->num_elements = cpu_to_be32(exit_id_count);
+    element = gsb->elements;
+    for (int i = 0; i < exit_id_count; i++) {
+        type = guest_state_element_type_find(exit_cause_ids[i]);
+        assert(type);
+        element->id = cpu_to_be16(exit_cause_ids[i]);
+        element->size = cpu_to_be16(type->size);
+        element = guest_state_element_next(element, NULL, NULL);
+    }
+    gsr.gsb = gsb;
+    gsr.len = VCPU_OUT_BUF_MIN_SZ;
+    gsr.flags = 0; /* get + never guest wide */
+    getset_state(guest, vcpuid, &gsr);
+
+    address_space_unmap(CPU(cpu)->as, gsb, len, true, len);
+    return;
+}
+
+static
+void spapr_exit_nested_papr(SpaprMachineState *spapr, PowerPCCPU *cpu, int excp)
+{
+    CPUPPCState *env = &cpu->env;
+    CPUState *cs = CPU(cpu);
+    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
+    target_ulong r3_return = env->excp_vectors[excp]; /* hcall return value */
+    target_ulong lpid = 0, vcpuid = 0;
+    struct SpaprMachineStateNestedGuestVcpu *vcpu = NULL;
+    struct SpaprMachineStateNestedGuest *guest = NULL;
+
+    lpid = spapr_cpu->nested_host_state->gpr[5];
+    vcpuid = spapr_cpu->nested_host_state->gpr[6];
+    guest = spapr_get_nested_guest(spapr, lpid);
+    assert(guest);
+    spapr_nested_vcpu_check(guest, vcpuid, false);
+    vcpu = &guest->vcpus[vcpuid];
+
+    exit_nested_store_l2(cpu, excp, vcpu);
+    /* do the output buffer for run_vcpu*/
+    exit_process_output_buffer(cpu, guest, vcpuid, &r3_return);
+
+    assert(env->spr[SPR_LPIDR] != 0);
+    nested_load_state(cpu, spapr_cpu->nested_host_state);
+    cpu_ppc_decrease_tb_by_offset(env, vcpu->tb_offset);
+    env->gpr[3] = H_SUCCESS;
+    env->gpr[4] = r3_return;
+    nested_post_load_state(env, cs);
+    cpu_ppc_hdecr_exit(env);
+
+    spapr_cpu->in_nested = false;
+    g_free(spapr_cpu->nested_host_state);
+    spapr_cpu->nested_host_state = NULL;
+}
+
+void spapr_exit_nested(PowerPCCPU *cpu, int excp)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
+
+    assert(spapr_cpu->in_nested);
+    if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
+        spapr_exit_nested_hv(cpu, excp);
+    } else if (spapr_nested_api(spapr) == NESTED_API_PAPR) {
+        spapr_exit_nested_papr(spapr, cpu, excp);
+    } else {
+        g_assert_not_reached();
+    }
+}
+
+static void nested_papr_load_l2(PowerPCCPU *cpu,
+                                CPUPPCState *env,
+                                SpaprMachineStateNestedGuestVcpu *vcpu,
+                                target_ulong now)
+{
+    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
+    target_ulong lpcr, lpcr_mask, hdec;
+    lpcr_mask = LPCR_DPFD | LPCR_ILE | LPCR_AIL | LPCR_LD | LPCR_MER;
+
+    assert(vcpu);
+    assert(sizeof(env->gpr) == sizeof(vcpu->state.gpr));
+    nested_load_state(cpu, &vcpu->state);
+    lpcr = (env->spr[SPR_LPCR] & ~lpcr_mask) |
+           (vcpu->state.lpcr & lpcr_mask);
+    lpcr |= LPCR_HR | LPCR_UPRT | LPCR_GTSE | LPCR_HVICE | LPCR_HDICE;
+    lpcr &= ~LPCR_LPES0;
+    env->spr[SPR_LPCR] = lpcr & pcc->lpcr_mask;
+
+    hdec = vcpu->hdecr_expiry_tb - now;
+    cpu_ppc_store_decr(env, vcpu->state.dec_expiry_tb - now);
+    cpu_ppc_hdecr_init(env);
+    cpu_ppc_store_hdecr(env, hdec);
+
+    cpu_ppc_increase_tb_by_offset(env, vcpu->tb_offset);
+}
+
+static void nested_papr_run_vcpu(PowerPCCPU *cpu,
+                                 uint64_t lpid,
+                                 SpaprMachineStateNestedGuestVcpu *vcpu)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    CPUPPCState *env = &cpu->env;
+    CPUState *cs = CPU(cpu);
+    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
+    target_ulong now = cpu_ppc_load_tbl(env);
+
+    assert(env->spr[SPR_LPIDR] == 0);
+    assert(spapr->nested.api); /* ensure API version is initialized */
+    spapr_cpu->nested_host_state = g_try_new(struct nested_ppc_state, 1);
+    assert(spapr_cpu->nested_host_state);
+    nested_save_state(spapr_cpu->nested_host_state, cpu);
+    spapr_cpu->nested_host_state->dec_expiry_tb = now - cpu_ppc_load_decr(env);
+    nested_papr_load_l2(cpu, env, vcpu, now);
+    env->spr[SPR_LPIDR] = lpid; /* post load l2 */
+
+    spapr_cpu->in_nested = true;
+    nested_post_load_state(env, cs);
+}
+
+static target_ulong h_guest_run_vcpu(PowerPCCPU *cpu,
+                                     SpaprMachineState *spapr,
+                                     target_ulong opcode,
+                                     target_ulong *args)
+{
+    CPUPPCState *env = &cpu->env;
+    target_ulong flags = args[0];
+    target_ulong lpid = args[1];
+    target_ulong vcpuid = args[2];
+    struct SpaprMachineStateNestedGuestVcpu *vcpu;
+    struct guest_state_request gsr;
+    SpaprMachineStateNestedGuest *guest;
+    target_ulong rc;
+
+    if (flags) /* don't handle any flags for now */
+        return H_PARAMETER;
+
+    guest = spapr_get_nested_guest(spapr, lpid);
+    if (!guest) {
+        return H_P2;
+    }
+    if (!spapr_nested_vcpu_check(guest, vcpuid, true)) {
+        return H_P3;
+    }
+
+    if (guest->parttbl[0] == 0) {
+        /* At least need a partition scoped radix tree */
+        return H_NOT_AVAILABLE;
+    }
+
+    vcpu = &guest->vcpus[vcpuid];
+
+    /* Read run_vcpu input buffer to update state */
+    gsr.buf = vcpu->runbufin.addr;
+    gsr.len = vcpu->runbufin.size;
+    gsr.flags = GUEST_STATE_REQUEST_SET; /* Thread wide + writing */
+    rc = map_and_getset_state(cpu, guest, vcpuid, &gsr);
+    if (rc == H_SUCCESS) {
+        nested_papr_run_vcpu(cpu, lpid, vcpu);
+    } else {
+        env->gpr[3] = rc;
+    }
+    return env->gpr[3];
+}
+
 void spapr_register_nested_hv(void)
 {
     spapr_register_hypercall(KVMPPC_H_SET_PARTITION_TABLE, h_set_ptbl);
@@ -1564,6 +1836,7 @@  void spapr_register_nested_papr(void)
     spapr_register_hypercall(H_GUEST_CREATE_VCPU     , h_guest_create_vcpu);
     spapr_register_hypercall(H_GUEST_SET_STATE       , h_guest_set_state);
     spapr_register_hypercall(H_GUEST_GET_STATE       , h_guest_get_state);
+    spapr_register_hypercall(H_GUEST_RUN_VCPU        , h_guest_run_vcpu);
 }
 
 void spapr_unregister_nested_papr(void)
@@ -1575,6 +1848,7 @@  void spapr_unregister_nested_papr(void)
     spapr_unregister_hypercall(H_GUEST_CREATE_VCPU);
     spapr_unregister_hypercall(H_GUEST_SET_STATE);
     spapr_unregister_hypercall(H_GUEST_GET_STATE);
+    spapr_unregister_hypercall(H_GUEST_RUN_VCPU);
 }
 
 #else