diff mbox series

[v2,14/14] spapr: nested: Introduce H_GUEST_RUN_VCPU hcall.

Message ID 20231012104951.194876-15-harshpb@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series Nested PAPR API (KVM on PowerVM) | expand

Commit Message

Harsh Prateek Bora Oct. 12, 2023, 10:49 a.m. UTC
The H_GUEST_RUN_VCPU hcall is used to start execution of a Guest VCPU.
The Hypervisor will update the state of the Guest VCPU based on the
input buffer, restore the saved Guest VCPU state, and start its execution.

The Guest VCPU can stop running for numerous reasons including HCALLs,
hypervisor exceptions, or an outstanding Host Partition Interrupt.
The reason that the Guest VCPU stopped running is communicated through
R4 and the output buffer will be filled in with any relevant state.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Kautuk Consul <kconsul@linux.vnet.ibm.com>
Signed-off-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
---
 hw/ppc/spapr_nested.c           | 308 ++++++++++++++++++++++++++++++--
 include/hw/ppc/spapr.h          |   1 +
 include/hw/ppc/spapr_cpu_core.h |   7 +-
 3 files changed, 302 insertions(+), 14 deletions(-)

Comments

Nicholas Piggin Nov. 29, 2023, 4:58 a.m. UTC | #1
On Thu Oct 12, 2023 at 8:49 PM AEST, Harsh Prateek Bora wrote:
> The H_GUEST_RUN_VCPU hcall is used to start execution of a Guest VCPU.
> The Hypervisor will update the state of the Guest VCPU based on the
> input buffer, restore the saved Guest VCPU state, and start its execution.
>
> The Guest VCPU can stop running for numerous reasons including HCALLs,
> hypervisor exceptions, or an outstanding Host Partition Interrupt.
> The reason that the Guest VCPU stopped running is communicated through
> R4 and the output buffer will be filled in with any relevant state.
>
> Signed-off-by: Michael Neuling <mikey@neuling.org>
> Signed-off-by: Kautuk Consul <kconsul@linux.vnet.ibm.com>
> Signed-off-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
> ---
>  hw/ppc/spapr_nested.c           | 308 ++++++++++++++++++++++++++++++--
>  include/hw/ppc/spapr.h          |   1 +
>  include/hw/ppc/spapr_cpu_core.h |   7 +-
>  3 files changed, 302 insertions(+), 14 deletions(-)
>
> diff --git a/hw/ppc/spapr_nested.c b/hw/ppc/spapr_nested.c
> index e2d0cb5559..d3e7629f63 100644
> --- a/hw/ppc/spapr_nested.c
> +++ b/hw/ppc/spapr_nested.c
> @@ -141,6 +141,15 @@ static void nested_save_state(struct nested_ppc_state *save, PowerPCCPU *cpu)
>      save->tb_offset = env->tb_env->tb_offset;
>  }
>  
> +static void nested_post_state_update(CPUPPCState *env, CPUState *cs)
> +{
> +    hreg_compute_hflags(env);
> +    ppc_maybe_interrupt(env);
> +    tlb_flush(cs);
> +    env->reserve_addr = -1; /* Reset the reservation */
> +

Extra newline. And don't strip out comments please.

> +}
> +
>  static void nested_load_state(PowerPCCPU *cpu, struct nested_ppc_state *load)
>  {
>      CPUState *cs = CPU(cpu);
> @@ -172,19 +181,7 @@ static void nested_load_state(PowerPCCPU *cpu, struct nested_ppc_state *load)
>      env->spr[SPR_PPR] = load->ppr;
>  
>      env->tb_env->tb_offset = load->tb_offset;
> -
> -    /*
> -     * MSR updated, compute hflags and possible interrupts.
> -     */
> -    hreg_compute_hflags(env);
> -    ppc_maybe_interrupt(env);
> -
> -    /*
> -     * Nested HV does not tag TLB entries between L1 and L2, so must
> -     * flush on transition.
> -     */
> -    tlb_flush(cs);
> -    env->reserve_addr = -1; /* Reset the reservation */
> +    nested_post_state_update(env, cs);
>  }
>  
>  /*
> @@ -426,6 +423,9 @@ static void spapr_exit_nested_hv(PowerPCCPU *cpu, int excp)
>      address_space_unmap(CPU(cpu)->as, regs, len, len, true);
>  }
>  
> +static
> +void spapr_exit_nested_papr(SpaprMachineState *spapr, PowerPCCPU *cpu, int excp);

Would be nicer if the implementations could go above, then the APIs
below so you don't need this.

> +
>  void spapr_exit_nested(PowerPCCPU *cpu, int excp)
>  {
>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> @@ -434,6 +434,10 @@ void spapr_exit_nested(PowerPCCPU *cpu, int excp)
>      assert(spapr_cpu->in_nested);
>      if (spapr->nested.api == NESTED_API_KVM_HV) {
>          spapr_exit_nested_hv(cpu, excp);
> +    } else if (spapr->nested.api == NESTED_API_PAPR) {
> +        spapr_exit_nested_papr(spapr, cpu, excp);
> +    } else {
> +        g_assert_not_reached();

The assert leg should have been introduced in the previous patch.

>      }
>  }
>  
> @@ -1455,6 +1459,283 @@ static target_ulong h_guest_get_state(PowerPCCPU *cpu,
>      return h_guest_getset_state(cpu, spapr, args, false);
>  }
>  
> +static void restore_common_regs(CPUPPCState *dst, CPUPPCState *src)
> +{
> +    memcpy(dst->gpr, src->gpr, sizeof(dst->gpr));
> +    memcpy(dst->crf, src->crf, sizeof(dst->crf));
> +    memcpy(dst->vsr, src->vsr, sizeof(dst->vsr));
> +    dst->nip = src->nip;
> +    dst->msr = src->msr;
> +    dst->lr  = src->lr;
> +    dst->ctr = src->ctr;
> +    dst->cfar = src->cfar;
> +    cpu_write_xer(dst, src->xer);
> +    ppc_store_vscr(dst, ppc_get_vscr(src));
> +    ppc_store_fpscr(dst, src->fpscr);

Still don't like having a CPUPPCState to save into. It does give you a
bunch of variables you need, but you still need a lot of bespoke code
for FPSCR, XER, NIP, etc. so it's not really buying you anything.

If there was a general ppc_cpu_state_save/restore() function then sure,
but I don't think we have it.

Now I think about it we could probably use the same struct and mostly
the same code for both APIs. They both do the same thing conceptually,
the difference is just that the PAPR API saves a few more registers.

> +    memcpy(dst->spr, src->spr, sizeof(dst->spr));

It's not really clear we can do this, since some SPRs have special
implementations. And AFAIKS the current code already has at least
one bug at a glance because of this, and that's DPDES.

Fortunately Linux doesn't really use it in !SMT and we don't support
SMT + nested at the moment. The problem just gets worse by doing a
wholesale memcpy of SPRs.

> +}

> +
> +static void exit_nested_restore_vcpu(PowerPCCPU *cpu, int excp,
> +                                     SpaprMachineStateNestedGuestVcpu *vcpu)

The "restore" here and in nested_papr_restore_l2_state confuses me,
I think. This isn't "restoring" the nested vcpu, is it? It's saving
it. And nested_papr_restore_l2_state is *loading* the l2 state.

Restore is not a great word, it's a bit ambiguous. But in general
this is all to be read from the point of view of the L1 (host), so
"restore" is more suitable for restoring back to host state, but
IMO it should be avoided if possible. "save" can refer to saving
current machine state to somewhere, "load" loading machine state
from somewhere.

> +{
> +    CPUPPCState *env = &cpu->env;
> +    target_ulong now, hdar, hdsisr, asdr;
> +
> +    assert(sizeof(env->gpr) == sizeof(vcpu->env.gpr)); /* sanity check */
> +
> +    now = cpu_ppc_load_tbl(env); /* L2 timebase */
> +    now -= vcpu->tb_offset; /* L1 timebase */
> +    vcpu->dec_expiry_tb = now - cpu_ppc_load_decr(env);
> +    /* backup hdar, hdsisr, asdr if reqd later below */
> +    hdar   = vcpu->env.spr[SPR_HDAR];
> +    hdsisr = vcpu->env.spr[SPR_HDSISR];
> +    asdr   = vcpu->env.spr[SPR_ASDR];
> +
> +    restore_common_regs(&vcpu->env, env);
> +
> +    if (excp == POWERPC_EXCP_MCHECK ||
> +        excp == POWERPC_EXCP_RESET ||
> +        excp == POWERPC_EXCP_SYSCALL) {
> +        vcpu->env.nip = env->spr[SPR_SRR0];
> +        vcpu->env.msr = env->spr[SPR_SRR1] & env->msr_mask;
> +    } else {
> +        vcpu->env.nip = env->spr[SPR_HSRR0];
> +        vcpu->env.msr = env->spr[SPR_HSRR1] & env->msr_mask;
> +    }
> +
> +    /* hdar, hdsisr, asdr should be retained unless certain exceptions */
> +    if ((excp != POWERPC_EXCP_HDSI) && (excp != POWERPC_EXCP_HISI)) {
> +        vcpu->env.spr[SPR_ASDR] = asdr;
> +    } else if (excp != POWERPC_EXCP_HDSI) {
> +        vcpu->env.spr[SPR_HDAR]   = hdar;
> +        vcpu->env.spr[SPR_HDSISR] = hdsisr;
> +    }
> +}
> +
> +static int get_exit_ids(uint64_t srr0, uint16_t ids[16])
> +{
> +    int nr;
> +
> +    switch (srr0) {
> +    case 0xc00:
> +        nr = 10;
> +        ids[0] = GSB_VCPU_GPR3;
> +        ids[1] = GSB_VCPU_GPR4;
> +        ids[2] = GSB_VCPU_GPR5;
> +        ids[3] = GSB_VCPU_GPR6;
> +        ids[4] = GSB_VCPU_GPR7;
> +        ids[5] = GSB_VCPU_GPR8;
> +        ids[6] = GSB_VCPU_GPR9;
> +        ids[7] = GSB_VCPU_GPR10;
> +        ids[8] = GSB_VCPU_GPR11;
> +        ids[9] = GSB_VCPU_GPR12;
> +        break;
> +    case 0xe00:
> +        nr = 5;
> +        ids[0] = GSB_VCPU_SPR_HDAR;
> +        ids[1] = GSB_VCPU_SPR_HDSISR;
> +        ids[2] = GSB_VCPU_SPR_ASDR;
> +        ids[3] = GSB_VCPU_SPR_NIA;
> +        ids[4] = GSB_VCPU_SPR_MSR;
> +        break;
> +    case 0xe20:
> +        nr = 4;
> +        ids[0] = GSB_VCPU_SPR_HDAR;
> +        ids[1] = GSB_VCPU_SPR_ASDR;
> +        ids[2] = GSB_VCPU_SPR_NIA;
> +        ids[3] = GSB_VCPU_SPR_MSR;
> +        break;
> +    case 0xe40:
> +        nr = 3;
> +        ids[0] = GSB_VCPU_SPR_HEIR;
> +        ids[1] = GSB_VCPU_SPR_NIA;
> +        ids[2] = GSB_VCPU_SPR_MSR;
> +        break;
> +    case 0xf80:
> +        nr = 3;
> +        ids[0] = GSB_VCPU_SPR_HFSCR;
> +        ids[1] = GSB_VCPU_SPR_NIA;
> +        ids[2] = GSB_VCPU_SPR_MSR;
> +        break;
> +    default:
> +        nr = 0;
> +        break;
> +    }
> +
> +    return nr;
> +}
> +
> +static void exit_process_output_buffer(PowerPCCPU *cpu,
> +                                      SpaprMachineStateNestedGuest *guest,
> +                                      target_ulong vcpuid,
> +                                      target_ulong *r3)
> +{
> +    SpaprMachineStateNestedGuestVcpu *vcpu = &guest->vcpu[vcpuid];
> +    struct guest_state_request gsr;
> +    struct guest_state_buffer *gsb;
> +    struct guest_state_element *element;
> +    struct guest_state_element_type *type;
> +    int exit_id_count = 0;
> +    uint16_t exit_cause_ids[16];
> +    hwaddr len;
> +
> +    len = vcpu->runbufout.size;
> +    gsb = address_space_map(CPU(cpu)->as, vcpu->runbufout.addr, &len, true,
> +                            MEMTXATTRS_UNSPECIFIED);
> +    if (!gsb || len != vcpu->runbufout.size) {
> +        address_space_unmap(CPU(cpu)->as, gsb, len, true, len);
> +        *r3 = H_P2;
> +        return;
> +    }
> +
> +    exit_id_count = get_exit_ids(*r3, exit_cause_ids);
> +
> +    /* Create a buffer of elements to send back */
> +    gsb->num_elements = cpu_to_be32(exit_id_count);
> +    element = gsb->elements;
> +    for (int i = 0; i < exit_id_count; i++) {
> +        type = guest_state_element_type_find(exit_cause_ids[i]);
> +        assert(type);
> +        element->id = cpu_to_be16(exit_cause_ids[i]);
> +        element->size = cpu_to_be16(type->size);
> +        element = guest_state_element_next(element, NULL, NULL);
> +    }
> +    gsr.gsb = gsb;
> +    gsr.len = VCPU_OUT_BUF_MIN_SZ;
> +    gsr.flags = 0; /* get + never guest wide */
> +    getset_state(guest, vcpuid, &gsr);
> +
> +    address_space_unmap(CPU(cpu)->as, gsb, len, true, len);
> +    return;
> +}
> +
> +static
> +void spapr_exit_nested_papr(SpaprMachineState *spapr, PowerPCCPU *cpu, int excp)
> +{
> +    CPUState *cs = CPU(cpu);
> +    CPUPPCState *env = &cpu->env;
> +    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> +    target_ulong r3_return = env->excp_vectors[excp]; /* hcall return value */
> +    target_ulong lpid = 0, vcpuid = 0;
> +    struct SpaprMachineStateNestedGuestVcpu *vcpu = NULL;
> +    struct SpaprMachineStateNestedGuest *guest = NULL;
> +
> +    lpid = spapr_cpu->nested_papr_host->gpr[5];
> +    vcpuid = spapr_cpu->nested_papr_host->gpr[6];
> +    guest = spapr_get_nested_guest(spapr, lpid);
> +    assert(guest);
> +    spapr_nested_vcpu_check(guest, vcpuid, false);
> +    vcpu = &guest->vcpu[vcpuid];
> +
> +    exit_nested_restore_vcpu(cpu, excp, vcpu);
> +    /* do the output buffer for run_vcpu*/
> +    exit_process_output_buffer(cpu, guest, vcpuid, &r3_return);
> +
> +    assert(env->spr[SPR_LPIDR] != 0);
> +    restore_common_regs(env, spapr_cpu->nested_papr_host);

I need to take a look a bit closer, but AFAIKS you aren't
loading L1 decr when exiting to host...

> +    env->tb_env->tb_offset -= vcpu->tb_offset;

This is nasty, and I can say that because I did it in the
original code. We should call a timebase helper to do this
adjustment for us.

> +    env->gpr[3] = H_SUCCESS;
> +    env->gpr[4] = r3_return;
> +    nested_post_state_update(env, cs);
> +    cpu_ppc_hdecr_exit(env);
> +
> +    spapr_cpu->in_nested = false;
> +    g_free(spapr_cpu->nested_papr_host);
> +    spapr_cpu->nested_papr_host = NULL;
> +}
> +
> +static void nested_papr_restore_l2_state(PowerPCCPU *cpu,
> +                                         CPUPPCState *env,
> +                                         SpaprMachineStateNestedGuestVcpu *vcpu,
> +                                         target_ulong now)
> +{
> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> +    target_ulong lpcr, lpcr_mask, hdec;
> +    lpcr_mask = LPCR_DPFD | LPCR_ILE | LPCR_AIL | LPCR_LD | LPCR_MER;
> +
> +    assert(vcpu);
> +    assert(sizeof(env->gpr) == sizeof(vcpu->env.gpr));
> +    restore_common_regs(env, &vcpu->env);
> +    lpcr = (env->spr[SPR_LPCR] & ~lpcr_mask) |
> +           (vcpu->env.spr[SPR_LPCR] & lpcr_mask);
> +    lpcr |= LPCR_HR | LPCR_UPRT | LPCR_GTSE | LPCR_HVICE | LPCR_HDICE;
> +    lpcr &= ~LPCR_LPES0;
> +    env->spr[SPR_LPCR] = lpcr & pcc->lpcr_mask;
> +
> +    hdec = vcpu->hdecr_expiry_tb - now;
> +    cpu_ppc_store_decr(env, vcpu->dec_expiry_tb - now);
> +    cpu_ppc_hdecr_init(env);
> +    cpu_ppc_store_hdecr(env, hdec);
> +
> +    env->tb_env->tb_offset += vcpu->tb_offset;
> +}
> +
> +static void nested_papr_run_vcpu(PowerPCCPU *cpu,
> +                                 uint64_t lpid,
> +                                 SpaprMachineStateNestedGuestVcpu *vcpu)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    CPUState *cs = CPU(cpu);
> +    CPUPPCState *env = &cpu->env;
> +    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> +    target_ulong now = cpu_ppc_load_tbl(env);
> +
> +    assert(env->spr[SPR_LPIDR] == 0);
> +    assert(spapr->nested.api); /* ensure API version is initialized */
> +    spapr_cpu->nested_papr_host = g_try_new(CPUPPCState, 1);
> +    assert(spapr_cpu->nested_papr_host);
> +    memcpy(spapr_cpu->nested_papr_host, env, sizeof(CPUPPCState));
> +
> +    nested_papr_restore_l2_state(cpu, env, vcpu, now);
> +    env->spr[SPR_LPIDR] = lpid; /* post restore l2 state */
> +
> +    spapr_cpu->in_nested = true;
> +
> +    nested_post_state_update(env, cs);
> +}
> +
> +static target_ulong h_guest_run_vcpu(PowerPCCPU *cpu,
> +                                     SpaprMachineState *spapr,
> +                                     target_ulong opcode,
> +                                     target_ulong *args)
> +{
> +    CPUPPCState *env = &cpu->env;
> +    target_ulong flags = args[0];
> +    target_ulong lpid = args[1];
> +    target_ulong vcpuid = args[2];
> +    struct SpaprMachineStateNestedGuestVcpu *vcpu;
> +    struct guest_state_request gsr;
> +    SpaprMachineStateNestedGuest *guest;
> +
> +    if (flags) /* don't handle any flags for now */
> +        return H_PARAMETER;
> +
> +    guest = spapr_get_nested_guest(spapr, lpid);
> +    if (!guest) {
> +        return H_P2;
> +    }
> +    if (!spapr_nested_vcpu_check(guest, vcpuid, true)) {
> +        return H_P3;
> +    }
> +
> +    if (guest->parttbl[0] == 0) {
> +        /* At least need a partition scoped radix tree */
> +        return H_NOT_AVAILABLE;
> +    }
> +
> +    vcpu = &guest->vcpu[vcpuid];
> +
> +    /* Read run_vcpu input buffer to update state */
> +    gsr.buf = vcpu->runbufin.addr;
> +    gsr.len = vcpu->runbufin.size;
> +    gsr.flags = GUEST_STATE_REQUEST_SET; /* Thread wide + writing */
> +    if (!map_and_getset_state(cpu, guest, vcpuid, &gsr)) {
> +        nested_papr_run_vcpu(cpu, lpid, vcpu);
> +    }

If there is an error with map_and_getset, does it set gpr[3] to
an error code? IMO may be nicer if map_and_getset_state returns
the error itself, and this caller can set it in r3.

Thanks,
Nick

> +
> +    return env->gpr[3];
> +}
> +
>  void spapr_register_nested(void)
>  {
>      spapr_register_hypercall(KVMPPC_H_SET_PARTITION_TABLE, h_set_ptbl);
> @@ -1473,6 +1754,7 @@ void spapr_register_nested_papr(void)
>      spapr_register_hypercall(H_GUEST_CREATE_VCPU     , h_guest_create_vcpu);
>      spapr_register_hypercall(H_GUEST_SET_STATE       , h_guest_set_state);
>      spapr_register_hypercall(H_GUEST_GET_STATE       , h_guest_get_state);
> +    spapr_register_hypercall(H_GUEST_RUN_VCPU        , h_guest_run_vcpu);
>  }
>  #else
>  void spapr_exit_nested(PowerPCCPU *cpu, int excp)
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index b9a67895bb..e278ddc7cf 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -594,6 +594,7 @@ struct SpaprMachineState {
>  #define H_GUEST_CREATE_VCPU      0x474
>  #define H_GUEST_GET_STATE        0x478
>  #define H_GUEST_SET_STATE        0x47C
> +#define H_GUEST_RUN_VCPU         0x480
>  #define H_GUEST_DELETE           0x488
>  
>  #define MAX_HCALL_OPCODE         H_GUEST_DELETE
> diff --git a/include/hw/ppc/spapr_cpu_core.h b/include/hw/ppc/spapr_cpu_core.h
> index 9c8c59f173..a9749a2df1 100644
> --- a/include/hw/ppc/spapr_cpu_core.h
> +++ b/include/hw/ppc/spapr_cpu_core.h
> @@ -53,7 +53,12 @@ typedef struct SpaprCpuState {
>  
>      /* Fields for nested-HV support */
>      bool in_nested; /* true while the L2 is executing */
> -    struct nested_ppc_state *nested_hv_host; /* holds the L1 state while L2 executes */
> +    union {
> +        /* holds the L1 state while L2 executes */
> +        struct nested_ppc_state *nested_hv_host;
> +        CPUPPCState             *nested_papr_host;
> +    };
> +
>  } SpaprCpuState;
>  
>  static inline SpaprCpuState *spapr_cpu_state(PowerPCCPU *cpu)
diff mbox series

Patch

diff --git a/hw/ppc/spapr_nested.c b/hw/ppc/spapr_nested.c
index e2d0cb5559..d3e7629f63 100644
--- a/hw/ppc/spapr_nested.c
+++ b/hw/ppc/spapr_nested.c
@@ -141,6 +141,15 @@  static void nested_save_state(struct nested_ppc_state *save, PowerPCCPU *cpu)
     save->tb_offset = env->tb_env->tb_offset;
 }
 
+static void nested_post_state_update(CPUPPCState *env, CPUState *cs)
+{
+    hreg_compute_hflags(env);
+    ppc_maybe_interrupt(env);
+    tlb_flush(cs);
+    env->reserve_addr = -1; /* Reset the reservation */
+
+}
+
 static void nested_load_state(PowerPCCPU *cpu, struct nested_ppc_state *load)
 {
     CPUState *cs = CPU(cpu);
@@ -172,19 +181,7 @@  static void nested_load_state(PowerPCCPU *cpu, struct nested_ppc_state *load)
     env->spr[SPR_PPR] = load->ppr;
 
     env->tb_env->tb_offset = load->tb_offset;
-
-    /*
-     * MSR updated, compute hflags and possible interrupts.
-     */
-    hreg_compute_hflags(env);
-    ppc_maybe_interrupt(env);
-
-    /*
-     * Nested HV does not tag TLB entries between L1 and L2, so must
-     * flush on transition.
-     */
-    tlb_flush(cs);
-    env->reserve_addr = -1; /* Reset the reservation */
+    nested_post_state_update(env, cs);
 }
 
 /*
@@ -426,6 +423,9 @@  static void spapr_exit_nested_hv(PowerPCCPU *cpu, int excp)
     address_space_unmap(CPU(cpu)->as, regs, len, len, true);
 }
 
+static
+void spapr_exit_nested_papr(SpaprMachineState *spapr, PowerPCCPU *cpu, int excp);
+
 void spapr_exit_nested(PowerPCCPU *cpu, int excp)
 {
     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
@@ -434,6 +434,10 @@  void spapr_exit_nested(PowerPCCPU *cpu, int excp)
     assert(spapr_cpu->in_nested);
     if (spapr->nested.api == NESTED_API_KVM_HV) {
         spapr_exit_nested_hv(cpu, excp);
+    } else if (spapr->nested.api == NESTED_API_PAPR) {
+        spapr_exit_nested_papr(spapr, cpu, excp);
+    } else {
+        g_assert_not_reached();
     }
 }
 
@@ -1455,6 +1459,283 @@  static target_ulong h_guest_get_state(PowerPCCPU *cpu,
     return h_guest_getset_state(cpu, spapr, args, false);
 }
 
+static void restore_common_regs(CPUPPCState *dst, CPUPPCState *src)
+{
+    memcpy(dst->gpr, src->gpr, sizeof(dst->gpr));
+    memcpy(dst->crf, src->crf, sizeof(dst->crf));
+    memcpy(dst->vsr, src->vsr, sizeof(dst->vsr));
+    dst->nip = src->nip;
+    dst->msr = src->msr;
+    dst->lr  = src->lr;
+    dst->ctr = src->ctr;
+    dst->cfar = src->cfar;
+    cpu_write_xer(dst, src->xer);
+    ppc_store_vscr(dst, ppc_get_vscr(src));
+    ppc_store_fpscr(dst, src->fpscr);
+    memcpy(dst->spr, src->spr, sizeof(dst->spr));
+}
+
+static void exit_nested_restore_vcpu(PowerPCCPU *cpu, int excp,
+                                     SpaprMachineStateNestedGuestVcpu *vcpu)
+{
+    CPUPPCState *env = &cpu->env;
+    target_ulong now, hdar, hdsisr, asdr;
+
+    assert(sizeof(env->gpr) == sizeof(vcpu->env.gpr)); /* sanity check */
+
+    now = cpu_ppc_load_tbl(env); /* L2 timebase */
+    now -= vcpu->tb_offset; /* L1 timebase */
+    vcpu->dec_expiry_tb = now - cpu_ppc_load_decr(env);
+    /* backup hdar, hdsisr, asdr if reqd later below */
+    hdar   = vcpu->env.spr[SPR_HDAR];
+    hdsisr = vcpu->env.spr[SPR_HDSISR];
+    asdr   = vcpu->env.spr[SPR_ASDR];
+
+    restore_common_regs(&vcpu->env, env);
+
+    if (excp == POWERPC_EXCP_MCHECK ||
+        excp == POWERPC_EXCP_RESET ||
+        excp == POWERPC_EXCP_SYSCALL) {
+        vcpu->env.nip = env->spr[SPR_SRR0];
+        vcpu->env.msr = env->spr[SPR_SRR1] & env->msr_mask;
+    } else {
+        vcpu->env.nip = env->spr[SPR_HSRR0];
+        vcpu->env.msr = env->spr[SPR_HSRR1] & env->msr_mask;
+    }
+
+    /* hdar, hdsisr, asdr should be retained unless certain exceptions */
+    if ((excp != POWERPC_EXCP_HDSI) && (excp != POWERPC_EXCP_HISI)) {
+        vcpu->env.spr[SPR_ASDR] = asdr;
+    } else if (excp != POWERPC_EXCP_HDSI) {
+        vcpu->env.spr[SPR_HDAR]   = hdar;
+        vcpu->env.spr[SPR_HDSISR] = hdsisr;
+    }
+}
+
+static int get_exit_ids(uint64_t srr0, uint16_t ids[16])
+{
+    int nr;
+
+    switch (srr0) {
+    case 0xc00:
+        nr = 10;
+        ids[0] = GSB_VCPU_GPR3;
+        ids[1] = GSB_VCPU_GPR4;
+        ids[2] = GSB_VCPU_GPR5;
+        ids[3] = GSB_VCPU_GPR6;
+        ids[4] = GSB_VCPU_GPR7;
+        ids[5] = GSB_VCPU_GPR8;
+        ids[6] = GSB_VCPU_GPR9;
+        ids[7] = GSB_VCPU_GPR10;
+        ids[8] = GSB_VCPU_GPR11;
+        ids[9] = GSB_VCPU_GPR12;
+        break;
+    case 0xe00:
+        nr = 5;
+        ids[0] = GSB_VCPU_SPR_HDAR;
+        ids[1] = GSB_VCPU_SPR_HDSISR;
+        ids[2] = GSB_VCPU_SPR_ASDR;
+        ids[3] = GSB_VCPU_SPR_NIA;
+        ids[4] = GSB_VCPU_SPR_MSR;
+        break;
+    case 0xe20:
+        nr = 4;
+        ids[0] = GSB_VCPU_SPR_HDAR;
+        ids[1] = GSB_VCPU_SPR_ASDR;
+        ids[2] = GSB_VCPU_SPR_NIA;
+        ids[3] = GSB_VCPU_SPR_MSR;
+        break;
+    case 0xe40:
+        nr = 3;
+        ids[0] = GSB_VCPU_SPR_HEIR;
+        ids[1] = GSB_VCPU_SPR_NIA;
+        ids[2] = GSB_VCPU_SPR_MSR;
+        break;
+    case 0xf80:
+        nr = 3;
+        ids[0] = GSB_VCPU_SPR_HFSCR;
+        ids[1] = GSB_VCPU_SPR_NIA;
+        ids[2] = GSB_VCPU_SPR_MSR;
+        break;
+    default:
+        nr = 0;
+        break;
+    }
+
+    return nr;
+}
+
+static void exit_process_output_buffer(PowerPCCPU *cpu,
+                                      SpaprMachineStateNestedGuest *guest,
+                                      target_ulong vcpuid,
+                                      target_ulong *r3)
+{
+    SpaprMachineStateNestedGuestVcpu *vcpu = &guest->vcpu[vcpuid];
+    struct guest_state_request gsr;
+    struct guest_state_buffer *gsb;
+    struct guest_state_element *element;
+    struct guest_state_element_type *type;
+    int exit_id_count = 0;
+    uint16_t exit_cause_ids[16];
+    hwaddr len;
+
+    len = vcpu->runbufout.size;
+    gsb = address_space_map(CPU(cpu)->as, vcpu->runbufout.addr, &len, true,
+                            MEMTXATTRS_UNSPECIFIED);
+    if (!gsb || len != vcpu->runbufout.size) {
+        address_space_unmap(CPU(cpu)->as, gsb, len, true, len);
+        *r3 = H_P2;
+        return;
+    }
+
+    exit_id_count = get_exit_ids(*r3, exit_cause_ids);
+
+    /* Create a buffer of elements to send back */
+    gsb->num_elements = cpu_to_be32(exit_id_count);
+    element = gsb->elements;
+    for (int i = 0; i < exit_id_count; i++) {
+        type = guest_state_element_type_find(exit_cause_ids[i]);
+        assert(type);
+        element->id = cpu_to_be16(exit_cause_ids[i]);
+        element->size = cpu_to_be16(type->size);
+        element = guest_state_element_next(element, NULL, NULL);
+    }
+    gsr.gsb = gsb;
+    gsr.len = VCPU_OUT_BUF_MIN_SZ;
+    gsr.flags = 0; /* get + never guest wide */
+    getset_state(guest, vcpuid, &gsr);
+
+    address_space_unmap(CPU(cpu)->as, gsb, len, true, len);
+    return;
+}
+
+static
+void spapr_exit_nested_papr(SpaprMachineState *spapr, PowerPCCPU *cpu, int excp)
+{
+    CPUState *cs = CPU(cpu);
+    CPUPPCState *env = &cpu->env;
+    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
+    target_ulong r3_return = env->excp_vectors[excp]; /* hcall return value */
+    target_ulong lpid = 0, vcpuid = 0;
+    struct SpaprMachineStateNestedGuestVcpu *vcpu = NULL;
+    struct SpaprMachineStateNestedGuest *guest = NULL;
+
+    lpid = spapr_cpu->nested_papr_host->gpr[5];
+    vcpuid = spapr_cpu->nested_papr_host->gpr[6];
+    guest = spapr_get_nested_guest(spapr, lpid);
+    assert(guest);
+    spapr_nested_vcpu_check(guest, vcpuid, false);
+    vcpu = &guest->vcpu[vcpuid];
+
+    exit_nested_restore_vcpu(cpu, excp, vcpu);
+    /* do the output buffer for run_vcpu*/
+    exit_process_output_buffer(cpu, guest, vcpuid, &r3_return);
+
+    assert(env->spr[SPR_LPIDR] != 0);
+    restore_common_regs(env, spapr_cpu->nested_papr_host);
+    env->tb_env->tb_offset -= vcpu->tb_offset;
+    env->gpr[3] = H_SUCCESS;
+    env->gpr[4] = r3_return;
+    nested_post_state_update(env, cs);
+    cpu_ppc_hdecr_exit(env);
+
+    spapr_cpu->in_nested = false;
+    g_free(spapr_cpu->nested_papr_host);
+    spapr_cpu->nested_papr_host = NULL;
+}
+
+static void nested_papr_restore_l2_state(PowerPCCPU *cpu,
+                                         CPUPPCState *env,
+                                         SpaprMachineStateNestedGuestVcpu *vcpu,
+                                         target_ulong now)
+{
+    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
+    target_ulong lpcr, lpcr_mask, hdec;
+    lpcr_mask = LPCR_DPFD | LPCR_ILE | LPCR_AIL | LPCR_LD | LPCR_MER;
+
+    assert(vcpu);
+    assert(sizeof(env->gpr) == sizeof(vcpu->env.gpr));
+    restore_common_regs(env, &vcpu->env);
+    lpcr = (env->spr[SPR_LPCR] & ~lpcr_mask) |
+           (vcpu->env.spr[SPR_LPCR] & lpcr_mask);
+    lpcr |= LPCR_HR | LPCR_UPRT | LPCR_GTSE | LPCR_HVICE | LPCR_HDICE;
+    lpcr &= ~LPCR_LPES0;
+    env->spr[SPR_LPCR] = lpcr & pcc->lpcr_mask;
+
+    hdec = vcpu->hdecr_expiry_tb - now;
+    cpu_ppc_store_decr(env, vcpu->dec_expiry_tb - now);
+    cpu_ppc_hdecr_init(env);
+    cpu_ppc_store_hdecr(env, hdec);
+
+    env->tb_env->tb_offset += vcpu->tb_offset;
+}
+
+static void nested_papr_run_vcpu(PowerPCCPU *cpu,
+                                 uint64_t lpid,
+                                 SpaprMachineStateNestedGuestVcpu *vcpu)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    CPUState *cs = CPU(cpu);
+    CPUPPCState *env = &cpu->env;
+    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
+    target_ulong now = cpu_ppc_load_tbl(env);
+
+    assert(env->spr[SPR_LPIDR] == 0);
+    assert(spapr->nested.api); /* ensure API version is initialized */
+    spapr_cpu->nested_papr_host = g_try_new(CPUPPCState, 1);
+    assert(spapr_cpu->nested_papr_host);
+    memcpy(spapr_cpu->nested_papr_host, env, sizeof(CPUPPCState));
+
+    nested_papr_restore_l2_state(cpu, env, vcpu, now);
+    env->spr[SPR_LPIDR] = lpid; /* post restore l2 state */
+
+    spapr_cpu->in_nested = true;
+
+    nested_post_state_update(env, cs);
+}
+
+static target_ulong h_guest_run_vcpu(PowerPCCPU *cpu,
+                                     SpaprMachineState *spapr,
+                                     target_ulong opcode,
+                                     target_ulong *args)
+{
+    CPUPPCState *env = &cpu->env;
+    target_ulong flags = args[0];
+    target_ulong lpid = args[1];
+    target_ulong vcpuid = args[2];
+    struct SpaprMachineStateNestedGuestVcpu *vcpu;
+    struct guest_state_request gsr;
+    SpaprMachineStateNestedGuest *guest;
+
+    if (flags) /* don't handle any flags for now */
+        return H_PARAMETER;
+
+    guest = spapr_get_nested_guest(spapr, lpid);
+    if (!guest) {
+        return H_P2;
+    }
+    if (!spapr_nested_vcpu_check(guest, vcpuid, true)) {
+        return H_P3;
+    }
+
+    if (guest->parttbl[0] == 0) {
+        /* At least need a partition scoped radix tree */
+        return H_NOT_AVAILABLE;
+    }
+
+    vcpu = &guest->vcpu[vcpuid];
+
+    /* Read run_vcpu input buffer to update state */
+    gsr.buf = vcpu->runbufin.addr;
+    gsr.len = vcpu->runbufin.size;
+    gsr.flags = GUEST_STATE_REQUEST_SET; /* Thread wide + writing */
+    if (!map_and_getset_state(cpu, guest, vcpuid, &gsr)) {
+        nested_papr_run_vcpu(cpu, lpid, vcpu);
+    }
+
+    return env->gpr[3];
+}
+
 void spapr_register_nested(void)
 {
     spapr_register_hypercall(KVMPPC_H_SET_PARTITION_TABLE, h_set_ptbl);
@@ -1473,6 +1754,7 @@  void spapr_register_nested_papr(void)
     spapr_register_hypercall(H_GUEST_CREATE_VCPU     , h_guest_create_vcpu);
     spapr_register_hypercall(H_GUEST_SET_STATE       , h_guest_set_state);
     spapr_register_hypercall(H_GUEST_GET_STATE       , h_guest_get_state);
+    spapr_register_hypercall(H_GUEST_RUN_VCPU        , h_guest_run_vcpu);
 }
 #else
 void spapr_exit_nested(PowerPCCPU *cpu, int excp)
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index b9a67895bb..e278ddc7cf 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -594,6 +594,7 @@  struct SpaprMachineState {
 #define H_GUEST_CREATE_VCPU      0x474
 #define H_GUEST_GET_STATE        0x478
 #define H_GUEST_SET_STATE        0x47C
+#define H_GUEST_RUN_VCPU         0x480
 #define H_GUEST_DELETE           0x488
 
 #define MAX_HCALL_OPCODE         H_GUEST_DELETE
diff --git a/include/hw/ppc/spapr_cpu_core.h b/include/hw/ppc/spapr_cpu_core.h
index 9c8c59f173..a9749a2df1 100644
--- a/include/hw/ppc/spapr_cpu_core.h
+++ b/include/hw/ppc/spapr_cpu_core.h
@@ -53,7 +53,12 @@  typedef struct SpaprCpuState {
 
     /* Fields for nested-HV support */
     bool in_nested; /* true while the L2 is executing */
-    struct nested_ppc_state *nested_hv_host; /* holds the L1 state while L2 executes */
+    union {
+        /* holds the L1 state while L2 executes */
+        struct nested_ppc_state *nested_hv_host;
+        CPUPPCState             *nested_papr_host;
+    };
+
 } SpaprCpuState;
 
 static inline SpaprCpuState *spapr_cpu_state(PowerPCCPU *cpu)