diff mbox series

[v7,4/6] target/ppc: Build rtas error log

Message ID 155323644727.18748.14700690930436255677.stgit@aravinda (mailing list archive)
State New, archived
Headers show
Series target-ppc/spapr: Add FWNMI support in QEMU for PowerKVM guests | expand

Commit Message

Aravinda Prasad March 22, 2019, 6:34 a.m. UTC
This patch builds the rtas error log, copies it to the
rtas_addr and then invokes the guest registered machine
check handler.

Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
---
 hw/ppc/spapr.c         |    4 +
 hw/ppc/spapr_events.c  |  247 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/ppc/spapr.h |    4 +
 3 files changed, 255 insertions(+)

Comments

David Gibson March 25, 2019, 6:30 a.m. UTC | #1
On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:
> This patch builds the rtas error log, copies it to the
> rtas_addr and then invokes the guest registered machine
> check handler.

This commit message needs more context.  When is this occurring, why
do we need this?

[I can answer those questions now, but whether I - or anyone else -
 will be able to looking back at this commit from years in the future
 is a different question]

> 
> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c         |    4 +
>  hw/ppc/spapr_events.c  |  247 ++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h |    4 +
>  3 files changed, 255 insertions(+)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 744dcad..562d405 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2910,6 +2910,10 @@ static void spapr_machine_init(MachineState *machine)
>          error_report("Could not get size of LPAR rtas '%s'", filename);
>          exit(1);
>      }
> +
> +    /* Resize blob to accommodate error log. */
> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
> +
>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
>          error_report("Could not load LPAR rtas '%s'", filename);
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index e7a24ad..d7cc0a4 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -212,6 +212,106 @@ struct hp_extended_log {
>      struct rtas_event_log_v6_hp hp;
>  } QEMU_PACKED;
>  
> +struct rtas_event_log_v6_mc {
> +#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
> +    struct rtas_event_log_v6_section_header hdr;
> +    uint32_t fru_id;
> +    uint32_t proc_id;
> +    uint8_t error_type;
> +#define RTAS_LOG_V6_MC_TYPE_UE                           0
> +#define RTAS_LOG_V6_MC_TYPE_SLB                          1
> +#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
> +#define RTAS_LOG_V6_MC_TYPE_TLB                          4
> +#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
> +#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
> +    uint8_t sub_err_type;
> +#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
> +#define RTAS_LOG_V6_MC_UE_IFETCH                         1
> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
> +#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
> +#define RTAS_LOG_V6_MC_SLB_PARITY                        0
> +#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
> +#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
> +#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
> +#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
> +#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
> +#define RTAS_LOG_V6_MC_TLB_PARITY                        1
> +#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
> +#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
> +    uint8_t reserved_1[6];
> +    uint64_t effective_address;
> +    uint64_t logical_address;
> +} QEMU_PACKED;
> +
> +struct mc_extended_log {
> +    struct rtas_event_log_v6 v6hdr;
> +    struct rtas_event_log_v6_mc mc;
> +} QEMU_PACKED;
> +
> +struct MC_ierror_table {
> +    unsigned long srr1_mask;
> +    unsigned long srr1_value;
> +    bool nip_valid; /* nip is a valid indicator of faulting address */
> +    uint8_t error_type;
> +    uint8_t error_subtype;
> +    unsigned int initiator;
> +    unsigned int severity;
> +};
> +
> +static const struct MC_ierror_table mc_ierror_table[] = {
> +{ 0x00000000081c0000, 0x0000000000040000, true,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000080000, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x00000000000c0000, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000100000, true,
> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000140000, true,
> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000180000, true,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0, 0, 0, 0, 0, 0 } };
> +
> +struct MC_derror_table {
> +    unsigned long dsisr_value;
> +    bool dar_valid; /* dar is a valid indicator of faulting address */
> +    uint8_t error_type;
> +    uint8_t error_subtype;
> +    unsigned int initiator;
> +    unsigned int severity;
> +};
> +
> +static const struct MC_derror_table mc_derror_table[] = {
> +{ 0x00008000, false,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00004000, true,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000800, true,
> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000400, true,
> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000080, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000100, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0, false, 0, 0, 0, 0 } };
> +
> +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
> +
>  typedef enum EventClass {
>      EVENT_CLASS_INTERNAL_ERRORS     = 0,
>      EVENT_CLASS_EPOW                = 1,
> @@ -620,6 +720,149 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
>                              RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
>  }
>  
> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
> +{
> +    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
> +    return RTAS_ERROR_LOG_MAX;
> +}
> +
> +static uint64_t spapr_get_rtas_addr(void)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    int rtas_node;
> +    const struct fdt_property *rtas_addr_prop;
> +    void *fdt = spapr->fdt_blob;
> +    uint32_t rtas_addr;
> +
> +    /* fetch rtas addr from fdt */
> +    rtas_node = fdt_path_offset(fdt, "/rtas");
> +    g_assert(rtas_node >= 0);
> +
> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
> +    g_assert(rtas_addr_prop);
> +
> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
> +    return (uint64_t)rtas_addr;

It seems a bit roundabout to pull the rtas address out of the device
tree, since it was us that put it in there in the first place.

> +}
> +
> +static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
> +                                        struct mc_extended_log *ext_elog)
> +{
> +    int i;
> +    CPUPPCState *env = &cpu->env;
> +    uint32_t summary;
> +    uint64_t dsisr = env->spr[SPR_DSISR];
> +
> +    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
> +    if (recovered) {
> +        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
> +    } else {
> +        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
> +    }
> +
> +    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
> +        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
> +            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
> +                continue;
> +            }
> +
> +            ext_elog->mc.error_type = mc_derror_table[i].error_type;
> +            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
> +            if (mc_derror_table[i].dar_valid) {
> +                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
> +            }
> +
> +            summary |= mc_derror_table[i].initiator
> +                        | mc_derror_table[i].severity;
> +
> +            return summary;
> +        }
> +    } else {
> +        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
> +            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
> +                    mc_ierror_table[i].srr1_value) {
> +                continue;
> +            }
> +
> +            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
> +            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
> +            if (mc_ierror_table[i].nip_valid) {
> +                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
> +            }
> +
> +            summary |= mc_ierror_table[i].initiator
> +                        | mc_ierror_table[i].severity;
> +
> +            return summary;
> +        }
> +    }
> +
> +    summary |= RTAS_LOG_INITIATOR_CPU;
> +    return summary;
> +}
> +
> +static void spapr_mce_build_elog(PowerPCCPU *cpu, bool recovered)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    uint64_t rtas_addr;
> +    CPUPPCState *env = &cpu->env;
> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> +    target_ulong r3, msr = 0;
> +    struct rtas_error_log log;
> +    struct mc_extended_log *ext_elog;
> +    uint32_t summary;
> +
> +    ext_elog = g_malloc0(sizeof(*ext_elog));

g_new0() is preferred to g_malloc0().

> +
> +    /*
> +     * Properly set bits in MSR before we invoke the handler.
> +     * SRR0/1, DAR and DSISR are properly set by KVM
> +     */
> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
> +        msr |= (1ULL << MSR_LE);
> +    }
> +
> +    if (env->msr && (1ULL << MSR_SF)) {
> +        msr |= (1ULL << MSR_SF);
> +    }
> +
> +    msr |= (1ULL << MSR_ME);
> +    env->msr = msr;

The name seems a bit misleading, since this is clearly doing more than
just building the error log, it's doing at least some of the work of
dispatching it as well.

> +
> +    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
> +
> +    log.summary = cpu_to_be32(summary);
> +    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
> +
> +    /* r3 should be in BE always */
> +    r3 = cpu_to_be64(env->gpr[3]);
> +
> +    spapr_init_v6hdr(&ext_elog->v6hdr);
> +    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
> +    ext_elog->mc.hdr.section_length =
> +                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
> +    ext_elog->mc.hdr.section_version = 1;
> +
> +    /* get rtas addr from fdt */
> +    rtas_addr = spapr_get_rtas_addr();
> +
> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
> +                              &log, sizeof(log));
> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
> +                              sizeof(log), ext_elog,
> +                              sizeof(struct mc_extended_log));
> +
> +    /* Save gpr[3] in the guest endian mode */
> +    if ((*pcc->interrupts_big_endian)(cpu)) {
> +        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
> +    } else {
> +        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
> +    }
> +
> +    env->nip = spapr->guest_machine_check_addr;
> +}
> +
>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>  {
>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> @@ -640,6 +883,10 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>          }
>      }
>      spapr->mc_status = cpu->vcpu_id;
> +
> +    spapr_mce_build_elog(cpu, recovered);
> +
> +    return;
>  }
>  
>  static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index b0d8c18..0c9dec1 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -662,6 +662,9 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>  #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
>  #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
>  
> +/* Offset from rtas-base where error log is placed */
> +#define RTAS_ERRLOG_OFFSET       0x25
> +
>  static inline uint64_t ppc64_phys_to_real(uint64_t addr)
>  {
>      return addr & ~0xF000000000000000ULL;
> @@ -793,6 +796,7 @@ void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift,
>  void spapr_clear_pending_events(SpaprMachineState *spapr);
>  int spapr_max_server_number(SpaprMachineState *spapr);
>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_sizea);
>  
>  /* DRC callbacks. */
>  void spapr_core_release(DeviceState *dev);
>
Aravinda Prasad March 25, 2019, 8:26 a.m. UTC | #2
On Monday 25 March 2019 12:00 PM, David Gibson wrote:
> On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:
>> This patch builds the rtas error log, copies it to the
>> rtas_addr and then invokes the guest registered machine
>> check handler.
> 
> This commit message needs more context.  When is this occurring, why
> do we need this?
> 
> [I can answer those questions now, but whether I - or anyone else -
>  will be able to looking back at this commit from years in the future
>  is a different question]

will add more info.

> 
>>
>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>> ---
>>  hw/ppc/spapr.c         |    4 +
>>  hw/ppc/spapr_events.c  |  247 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/spapr.h |    4 +
>>  3 files changed, 255 insertions(+)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 744dcad..562d405 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -2910,6 +2910,10 @@ static void spapr_machine_init(MachineState *machine)
>>          error_report("Could not get size of LPAR rtas '%s'", filename);
>>          exit(1);
>>      }
>> +
>> +    /* Resize blob to accommodate error log. */
>> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
>> +
>>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
>>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
>>          error_report("Could not load LPAR rtas '%s'", filename);
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index e7a24ad..d7cc0a4 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -212,6 +212,106 @@ struct hp_extended_log {
>>      struct rtas_event_log_v6_hp hp;
>>  } QEMU_PACKED;
>>  
>> +struct rtas_event_log_v6_mc {
>> +#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
>> +    struct rtas_event_log_v6_section_header hdr;
>> +    uint32_t fru_id;
>> +    uint32_t proc_id;
>> +    uint8_t error_type;
>> +#define RTAS_LOG_V6_MC_TYPE_UE                           0
>> +#define RTAS_LOG_V6_MC_TYPE_SLB                          1
>> +#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
>> +#define RTAS_LOG_V6_MC_TYPE_TLB                          4
>> +#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
>> +#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
>> +    uint8_t sub_err_type;
>> +#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
>> +#define RTAS_LOG_V6_MC_UE_IFETCH                         1
>> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
>> +#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
>> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
>> +#define RTAS_LOG_V6_MC_SLB_PARITY                        0
>> +#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
>> +#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
>> +#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
>> +#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
>> +#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
>> +#define RTAS_LOG_V6_MC_TLB_PARITY                        1
>> +#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
>> +#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
>> +    uint8_t reserved_1[6];
>> +    uint64_t effective_address;
>> +    uint64_t logical_address;
>> +} QEMU_PACKED;
>> +
>> +struct mc_extended_log {
>> +    struct rtas_event_log_v6 v6hdr;
>> +    struct rtas_event_log_v6_mc mc;
>> +} QEMU_PACKED;
>> +
>> +struct MC_ierror_table {
>> +    unsigned long srr1_mask;
>> +    unsigned long srr1_value;
>> +    bool nip_valid; /* nip is a valid indicator of faulting address */
>> +    uint8_t error_type;
>> +    uint8_t error_subtype;
>> +    unsigned int initiator;
>> +    unsigned int severity;
>> +};
>> +
>> +static const struct MC_ierror_table mc_ierror_table[] = {
>> +{ 0x00000000081c0000, 0x0000000000040000, true,
>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x0000000000080000, true,
>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x00000000000c0000, true,
>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x0000000000100000, true,
>> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x0000000000140000, true,
>> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x0000000000180000, true,
>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0, 0, 0, 0, 0, 0 } };
>> +
>> +struct MC_derror_table {
>> +    unsigned long dsisr_value;
>> +    bool dar_valid; /* dar is a valid indicator of faulting address */
>> +    uint8_t error_type;
>> +    uint8_t error_subtype;
>> +    unsigned int initiator;
>> +    unsigned int severity;
>> +};
>> +
>> +static const struct MC_derror_table mc_derror_table[] = {
>> +{ 0x00008000, false,
>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00004000, true,
>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000800, true,
>> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000400, true,
>> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000080, true,
>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000100, true,
>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0, false, 0, 0, 0, 0 } };
>> +
>> +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
>> +
>>  typedef enum EventClass {
>>      EVENT_CLASS_INTERNAL_ERRORS     = 0,
>>      EVENT_CLASS_EPOW                = 1,
>> @@ -620,6 +720,149 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
>>                              RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
>>  }
>>  
>> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
>> +{
>> +    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
>> +    return RTAS_ERROR_LOG_MAX;
>> +}
>> +
>> +static uint64_t spapr_get_rtas_addr(void)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +    int rtas_node;
>> +    const struct fdt_property *rtas_addr_prop;
>> +    void *fdt = spapr->fdt_blob;
>> +    uint32_t rtas_addr;
>> +
>> +    /* fetch rtas addr from fdt */
>> +    rtas_node = fdt_path_offset(fdt, "/rtas");
>> +    g_assert(rtas_node >= 0);
>> +
>> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
>> +    g_assert(rtas_addr_prop);
>> +
>> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
>> +    return (uint64_t)rtas_addr;
> 
> It seems a bit roundabout to pull the rtas address out of the device
> tree, since it was us that put it in there in the first place.

Slof can change the rtas address. So we need to get the updated rtas
address.

> 
>> +}
>> +
>> +static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
>> +                                        struct mc_extended_log *ext_elog)
>> +{
>> +    int i;
>> +    CPUPPCState *env = &cpu->env;
>> +    uint32_t summary;
>> +    uint64_t dsisr = env->spr[SPR_DSISR];
>> +
>> +    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
>> +    if (recovered) {
>> +        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
>> +    } else {
>> +        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
>> +    }
>> +
>> +    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
>> +        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
>> +            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
>> +                continue;
>> +            }
>> +
>> +            ext_elog->mc.error_type = mc_derror_table[i].error_type;
>> +            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
>> +            if (mc_derror_table[i].dar_valid) {
>> +                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
>> +            }
>> +
>> +            summary |= mc_derror_table[i].initiator
>> +                        | mc_derror_table[i].severity;
>> +
>> +            return summary;
>> +        }
>> +    } else {
>> +        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
>> +            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
>> +                    mc_ierror_table[i].srr1_value) {
>> +                continue;
>> +            }
>> +
>> +            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
>> +            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
>> +            if (mc_ierror_table[i].nip_valid) {
>> +                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
>> +            }
>> +
>> +            summary |= mc_ierror_table[i].initiator
>> +                        | mc_ierror_table[i].severity;
>> +
>> +            return summary;
>> +        }
>> +    }
>> +
>> +    summary |= RTAS_LOG_INITIATOR_CPU;
>> +    return summary;
>> +}
>> +
>> +static void spapr_mce_build_elog(PowerPCCPU *cpu, bool recovered)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +    uint64_t rtas_addr;
>> +    CPUPPCState *env = &cpu->env;
>> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
>> +    target_ulong r3, msr = 0;
>> +    struct rtas_error_log log;
>> +    struct mc_extended_log *ext_elog;
>> +    uint32_t summary;
>> +
>> +    ext_elog = g_malloc0(sizeof(*ext_elog));
> 
> g_new0() is preferred to g_malloc0().

ok..

> 
>> +
>> +    /*
>> +     * Properly set bits in MSR before we invoke the handler.
>> +     * SRR0/1, DAR and DSISR are properly set by KVM
>> +     */
>> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
>> +        msr |= (1ULL << MSR_LE);
>> +    }
>> +
>> +    if (env->msr && (1ULL << MSR_SF)) {
>> +        msr |= (1ULL << MSR_SF);
>> +    }
>> +
>> +    msr |= (1ULL << MSR_ME);
>> +    env->msr = msr;
> 
> The name seems a bit misleading, since this is clearly doing more than
> just building the error log, it's doing at least some of the work of
> dispatching it as well.

ok will rename.

> 
>> +
>> +    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
>> +
>> +    log.summary = cpu_to_be32(summary);
>> +    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
>> +
>> +    /* r3 should be in BE always */
>> +    r3 = cpu_to_be64(env->gpr[3]);
>> +
>> +    spapr_init_v6hdr(&ext_elog->v6hdr);
>> +    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
>> +    ext_elog->mc.hdr.section_length =
>> +                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
>> +    ext_elog->mc.hdr.section_version = 1;
>> +
>> +    /* get rtas addr from fdt */
>> +    rtas_addr = spapr_get_rtas_addr();
>> +
>> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
>> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
>> +                              &log, sizeof(log));
>> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
>> +                              sizeof(log), ext_elog,
>> +                              sizeof(struct mc_extended_log));
>> +
>> +    /* Save gpr[3] in the guest endian mode */
>> +    if ((*pcc->interrupts_big_endian)(cpu)) {
>> +        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
>> +    } else {
>> +        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
>> +    }
>> +
>> +    env->nip = spapr->guest_machine_check_addr;
>> +}
>> +
>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>  {
>>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> @@ -640,6 +883,10 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>          }
>>      }
>>      spapr->mc_status = cpu->vcpu_id;
>> +
>> +    spapr_mce_build_elog(cpu, recovered);
>> +
>> +    return;
>>  }
>>  
>>  static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index b0d8c18..0c9dec1 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -662,6 +662,9 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>>  #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
>>  #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
>>  
>> +/* Offset from rtas-base where error log is placed */
>> +#define RTAS_ERRLOG_OFFSET       0x25
>> +
>>  static inline uint64_t ppc64_phys_to_real(uint64_t addr)
>>  {
>>      return addr & ~0xF000000000000000ULL;
>> @@ -793,6 +796,7 @@ void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift,
>>  void spapr_clear_pending_events(SpaprMachineState *spapr);
>>  int spapr_max_server_number(SpaprMachineState *spapr);
>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
>> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_sizea);
>>  
>>  /* DRC callbacks. */
>>  void spapr_core_release(DeviceState *dev);
>>
>
David Gibson March 25, 2019, 11:32 p.m. UTC | #3
On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:
> 
> 
> On Monday 25 March 2019 12:00 PM, David Gibson wrote:
> > On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:
> >> This patch builds the rtas error log, copies it to the
> >> rtas_addr and then invokes the guest registered machine
> >> check handler.
> > 
> > This commit message needs more context.  When is this occurring, why
> > do we need this?
> > 
> > [I can answer those questions now, but whether I - or anyone else -
> >  will be able to looking back at this commit from years in the future
> >  is a different question]
> 
> will add more info.

Thanks.

[snip]
> >> +static uint64_t spapr_get_rtas_addr(void)
> >> +{
> >> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >> +    int rtas_node;
> >> +    const struct fdt_property *rtas_addr_prop;
> >> +    void *fdt = spapr->fdt_blob;
> >> +    uint32_t rtas_addr;
> >> +
> >> +    /* fetch rtas addr from fdt */
> >> +    rtas_node = fdt_path_offset(fdt, "/rtas");
> >> +    g_assert(rtas_node >= 0);
> >> +
> >> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
> >> +    g_assert(rtas_addr_prop);
> >> +
> >> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
> >> +    return (uint64_t)rtas_addr;
> > 
> > It seems a bit roundabout to pull the rtas address out of the device
> > tree, since it was us that put it in there in the first place.
> 
> Slof can change the rtas address. So we need to get the updated rtas
> address.

Ah, ok.
Greg Kurz March 26, 2019, 8:30 a.m. UTC | #4
On Tue, 26 Mar 2019 10:32:35 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:
> > 
> > 
> > On Monday 25 March 2019 12:00 PM, David Gibson wrote:  
> > > On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:  
> > >> This patch builds the rtas error log, copies it to the
> > >> rtas_addr and then invokes the guest registered machine
> > >> check handler.  
> > > 
> > > This commit message needs more context.  When is this occurring, why
> > > do we need this?
> > > 
> > > [I can answer those questions now, but whether I - or anyone else -
> > >  will be able to looking back at this commit from years in the future
> > >  is a different question]  
> > 
> > will add more info.  
> 
> Thanks.
> 
> [snip]
> > >> +static uint64_t spapr_get_rtas_addr(void)
> > >> +{
> > >> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > >> +    int rtas_node;
> > >> +    const struct fdt_property *rtas_addr_prop;
> > >> +    void *fdt = spapr->fdt_blob;
> > >> +    uint32_t rtas_addr;
> > >> +
> > >> +    /* fetch rtas addr from fdt */
> > >> +    rtas_node = fdt_path_offset(fdt, "/rtas");
> > >> +    g_assert(rtas_node >= 0);
> > >> +
> > >> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
> > >> +    g_assert(rtas_addr_prop);
> > >> +
> > >> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
> > >> +    return (uint64_t)rtas_addr;  
> > > 
> > > It seems a bit roundabout to pull the rtas address out of the device
> > > tree, since it was us that put it in there in the first place.  
> > 
> > Slof can change the rtas address. So we need to get the updated rtas
> > address.  
> 
> Ah, ok.
> 

Yeah, and knowing that the DT is guest originated makes me a bit
nervous when I see the g_assert()... a misbehaving guest could
possibly abort QEMU. Either there should be some sanity checks
performed earlier or an non-fatal error path should be added in
this function IMHO.
Aravinda Prasad March 26, 2019, 9:15 a.m. UTC | #5
On Tuesday 26 March 2019 02:00 PM, Greg Kurz wrote:
> On Tue, 26 Mar 2019 10:32:35 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
>> On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:
>>>
>>>
>>> On Monday 25 March 2019 12:00 PM, David Gibson wrote:  
>>>> On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:  
>>>>> This patch builds the rtas error log, copies it to the
>>>>> rtas_addr and then invokes the guest registered machine
>>>>> check handler.  
>>>>
>>>> This commit message needs more context.  When is this occurring, why
>>>> do we need this?
>>>>
>>>> [I can answer those questions now, but whether I - or anyone else -
>>>>  will be able to looking back at this commit from years in the future
>>>>  is a different question]  
>>>
>>> will add more info.  
>>
>> Thanks.
>>
>> [snip]
>>>>> +static uint64_t spapr_get_rtas_addr(void)
>>>>> +{
>>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>>>> +    int rtas_node;
>>>>> +    const struct fdt_property *rtas_addr_prop;
>>>>> +    void *fdt = spapr->fdt_blob;
>>>>> +    uint32_t rtas_addr;
>>>>> +
>>>>> +    /* fetch rtas addr from fdt */
>>>>> +    rtas_node = fdt_path_offset(fdt, "/rtas");
>>>>> +    g_assert(rtas_node >= 0);
>>>>> +
>>>>> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
>>>>> +    g_assert(rtas_addr_prop);
>>>>> +
>>>>> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
>>>>> +    return (uint64_t)rtas_addr;  
>>>>
>>>> It seems a bit roundabout to pull the rtas address out of the device
>>>> tree, since it was us that put it in there in the first place.  
>>>
>>> Slof can change the rtas address. So we need to get the updated rtas
>>> address.  
>>
>> Ah, ok.
>>
> 
> Yeah, and knowing that the DT is guest originated makes me a bit
> nervous when I see the g_assert()... a misbehaving guest could
> possibly abort QEMU. Either there should be some sanity checks
> performed earlier or an non-fatal error path should be added in
> this function IMHO.

Is it not the QEMU that builds the DT and provides it to the guest?

Also, spapr_get_rtas_addr() is called during physical memory corruption
which is a fatal error. So, if we cannot fetch rtas_addr (required to
build and pass the error info to the guest) then I think we should abort.

>
Greg Kurz March 26, 2019, 10:05 a.m. UTC | #6
On Tue, 26 Mar 2019 14:45:57 +0530
Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:

> On Tuesday 26 March 2019 02:00 PM, Greg Kurz wrote:
> > On Tue, 26 Mar 2019 10:32:35 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> >> On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:  
> >>>
> >>>
> >>> On Monday 25 March 2019 12:00 PM, David Gibson wrote:    
> >>>> On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:    
> >>>>> This patch builds the rtas error log, copies it to the
> >>>>> rtas_addr and then invokes the guest registered machine
> >>>>> check handler.    
> >>>>
> >>>> This commit message needs more context.  When is this occurring, why
> >>>> do we need this?
> >>>>
> >>>> [I can answer those questions now, but whether I - or anyone else -
> >>>>  will be able to looking back at this commit from years in the future
> >>>>  is a different question]    
> >>>
> >>> will add more info.    
> >>
> >> Thanks.
> >>
> >> [snip]  
> >>>>> +static uint64_t spapr_get_rtas_addr(void)
> >>>>> +{
> >>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >>>>> +    int rtas_node;
> >>>>> +    const struct fdt_property *rtas_addr_prop;
> >>>>> +    void *fdt = spapr->fdt_blob;
> >>>>> +    uint32_t rtas_addr;
> >>>>> +
> >>>>> +    /* fetch rtas addr from fdt */
> >>>>> +    rtas_node = fdt_path_offset(fdt, "/rtas");
> >>>>> +    g_assert(rtas_node >= 0);
> >>>>> +
> >>>>> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
> >>>>> +    g_assert(rtas_addr_prop);
> >>>>> +
> >>>>> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
> >>>>> +    return (uint64_t)rtas_addr;    
> >>>>
> >>>> It seems a bit roundabout to pull the rtas address out of the device
> >>>> tree, since it was us that put it in there in the first place.    
> >>>
> >>> Slof can change the rtas address. So we need to get the updated rtas
> >>> address.    
> >>
> >> Ah, ok.
> >>  
> > 
> > Yeah, and knowing that the DT is guest originated makes me a bit
> > nervous when I see the g_assert()... a misbehaving guest could
> > possibly abort QEMU. Either there should be some sanity checks
> > performed earlier or an non-fatal error path should be added in
> > this function IMHO.  
> 
> Is it not the QEMU that builds the DT and provides it to the guest?
> 

Yes, but then the guest can push a new DT with the KVMPPC_H_UPDATE_DT hcall.
We only do some minimalist sanity checks in h_update_dt(). I don't think
we want to abort QEMU because the guest sent a DT where "linux,rtas-base"
is missing for example.

> Also, spapr_get_rtas_addr() is called during physical memory corruption
> which is a fatal error.

Not that fatal since we care to report it to the guest.

> So, if we cannot fetch rtas_addr (required to
> build and pass the error info to the guest) then I think we should abort.
> 

Maybe we cannot do anything better at this point, but then we should
do some earlier checks and switch to the old machine check behaviour
if what we need is missing from the updated DT for example.

> >   
>
David Gibson March 27, 2019, 6:01 a.m. UTC | #7
On Tue, Mar 26, 2019 at 11:05:44AM +0100, Greg Kurz wrote:
> On Tue, 26 Mar 2019 14:45:57 +0530
> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> 
> > On Tuesday 26 March 2019 02:00 PM, Greg Kurz wrote:
> > > On Tue, 26 Mar 2019 10:32:35 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > >> On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:  
> > >>>
> > >>>
> > >>> On Monday 25 March 2019 12:00 PM, David Gibson wrote:    
> > >>>> On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:    
> > >>>>> This patch builds the rtas error log, copies it to the
> > >>>>> rtas_addr and then invokes the guest registered machine
> > >>>>> check handler.    
> > >>>>
> > >>>> This commit message needs more context.  When is this occurring, why
> > >>>> do we need this?
> > >>>>
> > >>>> [I can answer those questions now, but whether I - or anyone else -
> > >>>>  will be able to looking back at this commit from years in the future
> > >>>>  is a different question]    
> > >>>
> > >>> will add more info.    
> > >>
> > >> Thanks.
> > >>
> > >> [snip]  
> > >>>>> +static uint64_t spapr_get_rtas_addr(void)
> > >>>>> +{
> > >>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > >>>>> +    int rtas_node;
> > >>>>> +    const struct fdt_property *rtas_addr_prop;
> > >>>>> +    void *fdt = spapr->fdt_blob;
> > >>>>> +    uint32_t rtas_addr;
> > >>>>> +
> > >>>>> +    /* fetch rtas addr from fdt */
> > >>>>> +    rtas_node = fdt_path_offset(fdt, "/rtas");
> > >>>>> +    g_assert(rtas_node >= 0);
> > >>>>> +
> > >>>>> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
> > >>>>> +    g_assert(rtas_addr_prop);
> > >>>>> +
> > >>>>> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
> > >>>>> +    return (uint64_t)rtas_addr;    
> > >>>>
> > >>>> It seems a bit roundabout to pull the rtas address out of the device
> > >>>> tree, since it was us that put it in there in the first place.    
> > >>>
> > >>> Slof can change the rtas address. So we need to get the updated rtas
> > >>> address.    
> > >>
> > >> Ah, ok.
> > >>  
> > > 
> > > Yeah, and knowing that the DT is guest originated makes me a bit
> > > nervous when I see the g_assert()... a misbehaving guest could
> > > possibly abort QEMU. Either there should be some sanity checks
> > > performed earlier or an non-fatal error path should be added in
> > > this function IMHO.  
> > 
> > Is it not the QEMU that builds the DT and provides it to the guest?
> > 
> 
> Yes, but then the guest can push a new DT with the KVMPPC_H_UPDATE_DT hcall.
> We only do some minimalist sanity checks in h_update_dt(). I don't think
> we want to abort QEMU because the guest sent a DT where "linux,rtas-base"
> is missing for example.

Yeah, that's a good point.

> > Also, spapr_get_rtas_addr() is called during physical memory corruption
> > which is a fatal error.
> 
> Not that fatal since we care to report it to the guest.
> 
> > So, if we cannot fetch rtas_addr (required to
> > build and pass the error info to the guest) then I think we should abort.
> > 
> 
> Maybe we cannot do anything better at this point, but then we should
> do some earlier checks and switch to the old machine check behaviour
> if what we need is missing from the updated DT for example.
> 
> > >   
> > 
>
Aravinda Prasad March 27, 2019, 6:48 a.m. UTC | #8
On Tuesday 26 March 2019 03:35 PM, Greg Kurz wrote:
> On Tue, 26 Mar 2019 14:45:57 +0530
> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> 
>> On Tuesday 26 March 2019 02:00 PM, Greg Kurz wrote:
>>> On Tue, 26 Mar 2019 10:32:35 +1100
>>> David Gibson <david@gibson.dropbear.id.au> wrote:
>>>   
>>>> On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:  
>>>>>
>>>>>
>>>>> On Monday 25 March 2019 12:00 PM, David Gibson wrote:    
>>>>>> On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:    
>>>>>>> This patch builds the rtas error log, copies it to the
>>>>>>> rtas_addr and then invokes the guest registered machine
>>>>>>> check handler.    
>>>>>>
>>>>>> This commit message needs more context.  When is this occurring, why
>>>>>> do we need this?
>>>>>>
>>>>>> [I can answer those questions now, but whether I - or anyone else -
>>>>>>  will be able to looking back at this commit from years in the future
>>>>>>  is a different question]    
>>>>>
>>>>> will add more info.    
>>>>
>>>> Thanks.
>>>>
>>>> [snip]  
>>>>>>> +static uint64_t spapr_get_rtas_addr(void)
>>>>>>> +{
>>>>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>>>>>> +    int rtas_node;
>>>>>>> +    const struct fdt_property *rtas_addr_prop;
>>>>>>> +    void *fdt = spapr->fdt_blob;
>>>>>>> +    uint32_t rtas_addr;
>>>>>>> +
>>>>>>> +    /* fetch rtas addr from fdt */
>>>>>>> +    rtas_node = fdt_path_offset(fdt, "/rtas");
>>>>>>> +    g_assert(rtas_node >= 0);
>>>>>>> +
>>>>>>> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
>>>>>>> +    g_assert(rtas_addr_prop);
>>>>>>> +
>>>>>>> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
>>>>>>> +    return (uint64_t)rtas_addr;    
>>>>>>
>>>>>> It seems a bit roundabout to pull the rtas address out of the device
>>>>>> tree, since it was us that put it in there in the first place.    
>>>>>
>>>>> Slof can change the rtas address. So we need to get the updated rtas
>>>>> address.    
>>>>
>>>> Ah, ok.
>>>>  
>>>
>>> Yeah, and knowing that the DT is guest originated makes me a bit
>>> nervous when I see the g_assert()... a misbehaving guest could
>>> possibly abort QEMU. Either there should be some sanity checks
>>> performed earlier or an non-fatal error path should be added in
>>> this function IMHO.  
>>
>> Is it not the QEMU that builds the DT and provides it to the guest?
>>
> 
> Yes, but then the guest can push a new DT with the KVMPPC_H_UPDATE_DT hcall.
> We only do some minimalist sanity checks in h_update_dt(). I don't think
> we want to abort QEMU because the guest sent a DT where "linux,rtas-base"
> is missing for example.
> 
>> Also, spapr_get_rtas_addr() is called during physical memory corruption
>> which is a fatal error.
> 
> Not that fatal since we care to report it to the guest.

True, but if guest does not provide rtas_addr then I am not getting the
point on why terminating the QEMU instance (which actually terminates
the guest) is a problem. Am I missing something?

> 
>> So, if we cannot fetch rtas_addr (required to
>> build and pass the error info to the guest) then I think we should abort.
>>
> 
> Maybe we cannot do anything better at this point, but then we should
> do some earlier checks and switch to the old machine check behaviour
> if what we need is missing from the updated DT for example.

We can do some checks earlier, may be during fwnmi registration to see
if rtas entry is missing. Again, the guest can possibly update DT after
fwnmi registration.

But I think we cannot switch to old machine check behavior if we cannot
fetch the rtas_addr, because according to PAPR the guest would have
relinquished 0x200 vector to the firmware when fwnmi is registered. So
we cannot expect the guest to handle 0x200 interrupt.


> 
>>>   
>>
>
Greg Kurz March 27, 2019, 2:46 p.m. UTC | #9
On Wed, 27 Mar 2019 12:18:31 +0530
Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:

> On Tuesday 26 March 2019 03:35 PM, Greg Kurz wrote:
> > On Tue, 26 Mar 2019 14:45:57 +0530
> > Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> >   
> >> On Tuesday 26 March 2019 02:00 PM, Greg Kurz wrote:  
> >>> On Tue, 26 Mar 2019 10:32:35 +1100
> >>> David Gibson <david@gibson.dropbear.id.au> wrote:
> >>>     
> >>>> On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:    
> >>>>>
> >>>>>
> >>>>> On Monday 25 March 2019 12:00 PM, David Gibson wrote:      
> >>>>>> On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:      
> >>>>>>> This patch builds the rtas error log, copies it to the
> >>>>>>> rtas_addr and then invokes the guest registered machine
> >>>>>>> check handler.      
> >>>>>>
> >>>>>> This commit message needs more context.  When is this occurring, why
> >>>>>> do we need this?
> >>>>>>
> >>>>>> [I can answer those questions now, but whether I - or anyone else -
> >>>>>>  will be able to looking back at this commit from years in the future
> >>>>>>  is a different question]      
> >>>>>
> >>>>> will add more info.      
> >>>>
> >>>> Thanks.
> >>>>
> >>>> [snip]    
> >>>>>>> +static uint64_t spapr_get_rtas_addr(void)
> >>>>>>> +{
> >>>>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >>>>>>> +    int rtas_node;
> >>>>>>> +    const struct fdt_property *rtas_addr_prop;
> >>>>>>> +    void *fdt = spapr->fdt_blob;
> >>>>>>> +    uint32_t rtas_addr;
> >>>>>>> +
> >>>>>>> +    /* fetch rtas addr from fdt */
> >>>>>>> +    rtas_node = fdt_path_offset(fdt, "/rtas");
> >>>>>>> +    g_assert(rtas_node >= 0);
> >>>>>>> +
> >>>>>>> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
> >>>>>>> +    g_assert(rtas_addr_prop);
> >>>>>>> +
> >>>>>>> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
> >>>>>>> +    return (uint64_t)rtas_addr;      
> >>>>>>
> >>>>>> It seems a bit roundabout to pull the rtas address out of the device
> >>>>>> tree, since it was us that put it in there in the first place.      
> >>>>>
> >>>>> Slof can change the rtas address. So we need to get the updated rtas
> >>>>> address.      
> >>>>
> >>>> Ah, ok.
> >>>>    
> >>>
> >>> Yeah, and knowing that the DT is guest originated makes me a bit
> >>> nervous when I see the g_assert()... a misbehaving guest could
> >>> possibly abort QEMU. Either there should be some sanity checks
> >>> performed earlier or an non-fatal error path should be added in
> >>> this function IMHO.    
> >>
> >> Is it not the QEMU that builds the DT and provides it to the guest?
> >>  
> > 
> > Yes, but then the guest can push a new DT with the KVMPPC_H_UPDATE_DT hcall.
> > We only do some minimalist sanity checks in h_update_dt(). I don't think
> > we want to abort QEMU because the guest sent a DT where "linux,rtas-base"
> > is missing for example.
> >   
> >> Also, spapr_get_rtas_addr() is called during physical memory corruption
> >> which is a fatal error.  
> > 

BTW... IIUC this can also occur if the guest created duplicate SLB entries,
which isn't fatal from an hypervisor perspective.

> > Not that fatal since we care to report it to the guest.  
> 
> True, but if guest does not provide rtas_addr then I am not getting the
> point on why terminating the QEMU instance (which actually terminates
> the guest) is a problem. Am I missing something?
> 

This isn't terminating QEMU, this is aborting QEMU, which should only
ever happen in case of a QEMU bug or g_malloc() failure.

LoPAPR v1.1 7.3.14 Firmware Assisted Non-Maskable Interrupts Option (FWNMI)
doesn't describe this specific case of course. However, it gives a hint on
what to do in case of a fatal condition where it isn't possible to notify
the guest:

	"...the firmware has no choice but to declare the condition
	fatal, log the result and execute the partition’s reboot policy."

ie, either triggering a system_reset or exiting QEMU.

> >   
> >> So, if we cannot fetch rtas_addr (required to
> >> build and pass the error info to the guest) then I think we should abort.
> >>  
> > 
> > Maybe we cannot do anything better at this point, but then we should
> > do some earlier checks and switch to the old machine check behaviour
> > if what we need is missing from the updated DT for example.  
> 
> We can do some checks earlier, may be during fwnmi registration to see
> if rtas entry is missing. Again, the guest can possibly update DT after
> fwnmi registration.
> 

Hmm... you're right. Maybe also add these checks to h_update_dt() if the
guest registered fwnmi ? Or even forbid DT updates since we really expect
this to occur only once from SLOF between machine resets ? In which case,
it would be appropriate to keep the asserts in spapr_get_rtas_addr().

> But I think we cannot switch to old machine check behavior if we cannot
> fetch the rtas_addr, because according to PAPR the guest would have
> relinquished 0x200 vector to the firmware when fwnmi is registered. So
> we cannot expect the guest to handle 0x200 interrupt.
> 

I wonder how kexec fits in the picture... I mean what if the guest switches
from an FWNMI-capable kernel to an FWNMI-not-capable one ? There's no machine
reset in this case...

> 
> >   
> >>>     
> >>  
> >   
>
David Gibson March 28, 2019, 12:50 a.m. UTC | #10
On Wed, Mar 27, 2019 at 12:18:31PM +0530, Aravinda Prasad wrote:
> 
> 
> On Tuesday 26 March 2019 03:35 PM, Greg Kurz wrote:
> > On Tue, 26 Mar 2019 14:45:57 +0530
> > Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> > 
> >> On Tuesday 26 March 2019 02:00 PM, Greg Kurz wrote:
> >>> On Tue, 26 Mar 2019 10:32:35 +1100
> >>> David Gibson <david@gibson.dropbear.id.au> wrote:
> >>>   
> >>>> On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:  
> >>>>>
> >>>>>
> >>>>> On Monday 25 March 2019 12:00 PM, David Gibson wrote:    
> >>>>>> On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:    
> >>>>>>> This patch builds the rtas error log, copies it to the
> >>>>>>> rtas_addr and then invokes the guest registered machine
> >>>>>>> check handler.    
> >>>>>>
> >>>>>> This commit message needs more context.  When is this occurring, why
> >>>>>> do we need this?
> >>>>>>
> >>>>>> [I can answer those questions now, but whether I - or anyone else -
> >>>>>>  will be able to looking back at this commit from years in the future
> >>>>>>  is a different question]    
> >>>>>
> >>>>> will add more info.    
> >>>>
> >>>> Thanks.
> >>>>
> >>>> [snip]  
> >>>>>>> +static uint64_t spapr_get_rtas_addr(void)
> >>>>>>> +{
> >>>>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >>>>>>> +    int rtas_node;
> >>>>>>> +    const struct fdt_property *rtas_addr_prop;
> >>>>>>> +    void *fdt = spapr->fdt_blob;
> >>>>>>> +    uint32_t rtas_addr;
> >>>>>>> +
> >>>>>>> +    /* fetch rtas addr from fdt */
> >>>>>>> +    rtas_node = fdt_path_offset(fdt, "/rtas");
> >>>>>>> +    g_assert(rtas_node >= 0);
> >>>>>>> +
> >>>>>>> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
> >>>>>>> +    g_assert(rtas_addr_prop);
> >>>>>>> +
> >>>>>>> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
> >>>>>>> +    return (uint64_t)rtas_addr;    
> >>>>>>
> >>>>>> It seems a bit roundabout to pull the rtas address out of the device
> >>>>>> tree, since it was us that put it in there in the first place.    
> >>>>>
> >>>>> Slof can change the rtas address. So we need to get the updated rtas
> >>>>> address.    
> >>>>
> >>>> Ah, ok.
> >>>>  
> >>>
> >>> Yeah, and knowing that the DT is guest originated makes me a bit
> >>> nervous when I see the g_assert()... a misbehaving guest could
> >>> possibly abort QEMU. Either there should be some sanity checks
> >>> performed earlier or an non-fatal error path should be added in
> >>> this function IMHO.  
> >>
> >> Is it not the QEMU that builds the DT and provides it to the guest?
> >>
> > 
> > Yes, but then the guest can push a new DT with the KVMPPC_H_UPDATE_DT hcall.
> > We only do some minimalist sanity checks in h_update_dt(). I don't think
> > we want to abort QEMU because the guest sent a DT where "linux,rtas-base"
> > is missing for example.
> > 
> >> Also, spapr_get_rtas_addr() is called during physical memory corruption
> >> which is a fatal error.
> > 
> > Not that fatal since we care to report it to the guest.
> 
> True, but if guest does not provide rtas_addr then I am not getting the
> point on why terminating the QEMU instance (which actually terminates
> the guest) is a problem. Am I missing something?
> 
> > 
> >> So, if we cannot fetch rtas_addr (required to
> >> build and pass the error info to the guest) then I think we should abort.
> >>
> > 
> > Maybe we cannot do anything better at this point, but then we should
> > do some earlier checks and switch to the old machine check behaviour
> > if what we need is missing from the updated DT for example.
> 
> We can do some checks earlier, may be during fwnmi registration to see
> if rtas entry is missing. Again, the guest can possibly update DT after
> fwnmi registration.
> 
> But I think we cannot switch to old machine check behavior if we cannot
> fetch the rtas_addr, because according to PAPR the guest would have
> relinquished 0x200 vector to the firmware when fwnmi is registered. So
> we cannot expect the guest to handle 0x200 interrupt.

I think that's fair.

I think we want to verify that we can get the rtas_addr and store it
at nmi-register time.  If we can't we can fail the RTAS call.  If the
guest changes the RTAS address after that point (which we don't
expect), then it has shot itself in the foot and that's fine.
Aravinda Prasad March 28, 2019, 6:29 a.m. UTC | #11
On Wednesday 27 March 2019 08:16 PM, Greg Kurz wrote:
> On Wed, 27 Mar 2019 12:18:31 +0530
> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> 
>> On Tuesday 26 March 2019 03:35 PM, Greg Kurz wrote:
>>> On Tue, 26 Mar 2019 14:45:57 +0530
>>> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
>>>   
>>>> On Tuesday 26 March 2019 02:00 PM, Greg Kurz wrote:  
>>>>> On Tue, 26 Mar 2019 10:32:35 +1100
>>>>> David Gibson <david@gibson.dropbear.id.au> wrote:
>>>>>     
>>>>>> On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:    
>>>>>>>
>>>>>>>
>>>>>>> On Monday 25 March 2019 12:00 PM, David Gibson wrote:      
>>>>>>>> On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:      
>>>>>>>>> This patch builds the rtas error log, copies it to the
>>>>>>>>> rtas_addr and then invokes the guest registered machine
>>>>>>>>> check handler.      
>>>>>>>>
>>>>>>>> This commit message needs more context.  When is this occurring, why
>>>>>>>> do we need this?
>>>>>>>>
>>>>>>>> [I can answer those questions now, but whether I - or anyone else -
>>>>>>>>  will be able to looking back at this commit from years in the future
>>>>>>>>  is a different question]      
>>>>>>>
>>>>>>> will add more info.      
>>>>>>
>>>>>> Thanks.
>>>>>>
>>>>>> [snip]    
>>>>>>>>> +static uint64_t spapr_get_rtas_addr(void)
>>>>>>>>> +{
>>>>>>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>>>>>>>> +    int rtas_node;
>>>>>>>>> +    const struct fdt_property *rtas_addr_prop;
>>>>>>>>> +    void *fdt = spapr->fdt_blob;
>>>>>>>>> +    uint32_t rtas_addr;
>>>>>>>>> +
>>>>>>>>> +    /* fetch rtas addr from fdt */
>>>>>>>>> +    rtas_node = fdt_path_offset(fdt, "/rtas");
>>>>>>>>> +    g_assert(rtas_node >= 0);
>>>>>>>>> +
>>>>>>>>> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
>>>>>>>>> +    g_assert(rtas_addr_prop);
>>>>>>>>> +
>>>>>>>>> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
>>>>>>>>> +    return (uint64_t)rtas_addr;      
>>>>>>>>
>>>>>>>> It seems a bit roundabout to pull the rtas address out of the device
>>>>>>>> tree, since it was us that put it in there in the first place.      
>>>>>>>
>>>>>>> Slof can change the rtas address. So we need to get the updated rtas
>>>>>>> address.      
>>>>>>
>>>>>> Ah, ok.
>>>>>>    
>>>>>
>>>>> Yeah, and knowing that the DT is guest originated makes me a bit
>>>>> nervous when I see the g_assert()... a misbehaving guest could
>>>>> possibly abort QEMU. Either there should be some sanity checks
>>>>> performed earlier or an non-fatal error path should be added in
>>>>> this function IMHO.    
>>>>
>>>> Is it not the QEMU that builds the DT and provides it to the guest?
>>>>  
>>>
>>> Yes, but then the guest can push a new DT with the KVMPPC_H_UPDATE_DT hcall.
>>> We only do some minimalist sanity checks in h_update_dt(). I don't think
>>> we want to abort QEMU because the guest sent a DT where "linux,rtas-base"
>>> is missing for example.
>>>   
>>>> Also, spapr_get_rtas_addr() is called during physical memory corruption
>>>> which is a fatal error.  
>>>
> 
> BTW... IIUC this can also occur if the guest created duplicate SLB entries,
> which isn't fatal from an hypervisor perspective.

yes.. right..

> 
>>> Not that fatal since we care to report it to the guest.  
>>
>> True, but if guest does not provide rtas_addr then I am not getting the
>> point on why terminating the QEMU instance (which actually terminates
>> the guest) is a problem. Am I missing something?
>>
> 
> This isn't terminating QEMU, this is aborting QEMU, which should only
> ever happen in case of a QEMU bug or g_malloc() failure.
> 
> LoPAPR v1.1 7.3.14 Firmware Assisted Non-Maskable Interrupts Option (FWNMI)
> doesn't describe this specific case of course. However, it gives a hint on
> what to do in case of a fatal condition where it isn't possible to notify
> the guest:
> 
> 	"...the firmware has no choice but to declare the condition
> 	fatal, log the result and execute the partition’s reboot policy."
> 
> ie, either triggering a system_reset or exiting QEMU.

Now I see the point. I will modify it to trigger system_reset instead of
aborting.

> 
>>>   
>>>> So, if we cannot fetch rtas_addr (required to
>>>> build and pass the error info to the guest) then I think we should abort.
>>>>  
>>>
>>> Maybe we cannot do anything better at this point, but then we should
>>> do some earlier checks and switch to the old machine check behaviour
>>> if what we need is missing from the updated DT for example.  
>>
>> We can do some checks earlier, may be during fwnmi registration to see
>> if rtas entry is missing. Again, the guest can possibly update DT after
>> fwnmi registration.
>>
> 
> Hmm... you're right. Maybe also add these checks to h_update_dt() if the
> guest registered fwnmi ? Or even forbid DT updates since we really expect
> this to occur only once from SLOF between machine resets ? In which case,
> it would be appropriate to keep the asserts in spapr_get_rtas_addr().

As you pointed out above, we can trigger system_reset if we fail to
fetch rtas_addr. I think this approach is simple and straight forward
and will not forbid guests from updating the DT.

> 
>> But I think we cannot switch to old machine check behavior if we cannot
>> fetch the rtas_addr, because according to PAPR the guest would have
>> relinquished 0x200 vector to the firmware when fwnmi is registered. So
>> we cannot expect the guest to handle 0x200 interrupt.
>>
> 
> I wonder how kexec fits in the picture... I mean what if the guest switches
> from an FWNMI-capable kernel to an FWNMI-not-capable one ? There's no machine
> reset in this case...

Ah.. not sure I need to check. So in this case QEMU will never know that
the guest rebooted into an FWNMI-not-capable kexec kernel.

> 
>>
>>>   
>>>>>     
>>>>  
>>>   
>>
>
Aravinda Prasad March 28, 2019, 6:33 a.m. UTC | #12
On Thursday 28 March 2019 06:20 AM, David Gibson wrote:
> On Wed, Mar 27, 2019 at 12:18:31PM +0530, Aravinda Prasad wrote:
>>
>>
>> On Tuesday 26 March 2019 03:35 PM, Greg Kurz wrote:
>>> On Tue, 26 Mar 2019 14:45:57 +0530
>>> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
>>>
>>>> On Tuesday 26 March 2019 02:00 PM, Greg Kurz wrote:
>>>>> On Tue, 26 Mar 2019 10:32:35 +1100
>>>>> David Gibson <david@gibson.dropbear.id.au> wrote:
>>>>>   
>>>>>> On Mon, Mar 25, 2019 at 01:56:50PM +0530, Aravinda Prasad wrote:  
>>>>>>>
>>>>>>>
>>>>>>> On Monday 25 March 2019 12:00 PM, David Gibson wrote:    
>>>>>>>> On Fri, Mar 22, 2019 at 12:04:07PM +0530, Aravinda Prasad wrote:    
>>>>>>>>> This patch builds the rtas error log, copies it to the
>>>>>>>>> rtas_addr and then invokes the guest registered machine
>>>>>>>>> check handler.    
>>>>>>>>
>>>>>>>> This commit message needs more context.  When is this occurring, why
>>>>>>>> do we need this?
>>>>>>>>
>>>>>>>> [I can answer those questions now, but whether I - or anyone else -
>>>>>>>>  will be able to looking back at this commit from years in the future
>>>>>>>>  is a different question]    
>>>>>>>
>>>>>>> will add more info.    
>>>>>>
>>>>>> Thanks.
>>>>>>
>>>>>> [snip]  
>>>>>>>>> +static uint64_t spapr_get_rtas_addr(void)
>>>>>>>>> +{
>>>>>>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>>>>>>>> +    int rtas_node;
>>>>>>>>> +    const struct fdt_property *rtas_addr_prop;
>>>>>>>>> +    void *fdt = spapr->fdt_blob;
>>>>>>>>> +    uint32_t rtas_addr;
>>>>>>>>> +
>>>>>>>>> +    /* fetch rtas addr from fdt */
>>>>>>>>> +    rtas_node = fdt_path_offset(fdt, "/rtas");
>>>>>>>>> +    g_assert(rtas_node >= 0);
>>>>>>>>> +
>>>>>>>>> +    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
>>>>>>>>> +    g_assert(rtas_addr_prop);
>>>>>>>>> +
>>>>>>>>> +    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
>>>>>>>>> +    return (uint64_t)rtas_addr;    
>>>>>>>>
>>>>>>>> It seems a bit roundabout to pull the rtas address out of the device
>>>>>>>> tree, since it was us that put it in there in the first place.    
>>>>>>>
>>>>>>> Slof can change the rtas address. So we need to get the updated rtas
>>>>>>> address.    
>>>>>>
>>>>>> Ah, ok.
>>>>>>  
>>>>>
>>>>> Yeah, and knowing that the DT is guest originated makes me a bit
>>>>> nervous when I see the g_assert()... a misbehaving guest could
>>>>> possibly abort QEMU. Either there should be some sanity checks
>>>>> performed earlier or an non-fatal error path should be added in
>>>>> this function IMHO.  
>>>>
>>>> Is it not the QEMU that builds the DT and provides it to the guest?
>>>>
>>>
>>> Yes, but then the guest can push a new DT with the KVMPPC_H_UPDATE_DT hcall.
>>> We only do some minimalist sanity checks in h_update_dt(). I don't think
>>> we want to abort QEMU because the guest sent a DT where "linux,rtas-base"
>>> is missing for example.
>>>
>>>> Also, spapr_get_rtas_addr() is called during physical memory corruption
>>>> which is a fatal error.
>>>
>>> Not that fatal since we care to report it to the guest.
>>
>> True, but if guest does not provide rtas_addr then I am not getting the
>> point on why terminating the QEMU instance (which actually terminates
>> the guest) is a problem. Am I missing something?
>>
>>>
>>>> So, if we cannot fetch rtas_addr (required to
>>>> build and pass the error info to the guest) then I think we should abort.
>>>>
>>>
>>> Maybe we cannot do anything better at this point, but then we should
>>> do some earlier checks and switch to the old machine check behaviour
>>> if what we need is missing from the updated DT for example.
>>
>> We can do some checks earlier, may be during fwnmi registration to see
>> if rtas entry is missing. Again, the guest can possibly update DT after
>> fwnmi registration.
>>
>> But I think we cannot switch to old machine check behavior if we cannot
>> fetch the rtas_addr, because according to PAPR the guest would have
>> relinquished 0x200 vector to the firmware when fwnmi is registered. So
>> we cannot expect the guest to handle 0x200 interrupt.
> 
> I think that's fair.
> 
> I think we want to verify that we can get the rtas_addr and store it
> at nmi-register time.  If we can't we can fail the RTAS call.  If the
> guest changes the RTAS address after that point (which we don't
> expect), then it has shot itself in the foot and that's fine.

Yes we can do that during nmi-register (a fail-early approach). Further
we can also try to re-fetch rtas_addr during machine check (in case the
guest has updated the DT) and if we fail to re-fetch we can trigger a
system_reset.

>
diff mbox series

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 744dcad..562d405 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2910,6 +2910,10 @@  static void spapr_machine_init(MachineState *machine)
         error_report("Could not get size of LPAR rtas '%s'", filename);
         exit(1);
     }
+
+    /* Resize blob to accommodate error log. */
+    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
+
     spapr->rtas_blob = g_malloc(spapr->rtas_size);
     if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
         error_report("Could not load LPAR rtas '%s'", filename);
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index e7a24ad..d7cc0a4 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -212,6 +212,106 @@  struct hp_extended_log {
     struct rtas_event_log_v6_hp hp;
 } QEMU_PACKED;
 
+struct rtas_event_log_v6_mc {
+#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
+    struct rtas_event_log_v6_section_header hdr;
+    uint32_t fru_id;
+    uint32_t proc_id;
+    uint8_t error_type;
+#define RTAS_LOG_V6_MC_TYPE_UE                           0
+#define RTAS_LOG_V6_MC_TYPE_SLB                          1
+#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
+#define RTAS_LOG_V6_MC_TYPE_TLB                          4
+#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
+#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
+    uint8_t sub_err_type;
+#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
+#define RTAS_LOG_V6_MC_UE_IFETCH                         1
+#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
+#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
+#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
+#define RTAS_LOG_V6_MC_SLB_PARITY                        0
+#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
+#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
+#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
+#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
+#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
+#define RTAS_LOG_V6_MC_TLB_PARITY                        1
+#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
+#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
+    uint8_t reserved_1[6];
+    uint64_t effective_address;
+    uint64_t logical_address;
+} QEMU_PACKED;
+
+struct mc_extended_log {
+    struct rtas_event_log_v6 v6hdr;
+    struct rtas_event_log_v6_mc mc;
+} QEMU_PACKED;
+
+struct MC_ierror_table {
+    unsigned long srr1_mask;
+    unsigned long srr1_value;
+    bool nip_valid; /* nip is a valid indicator of faulting address */
+    uint8_t error_type;
+    uint8_t error_subtype;
+    unsigned int initiator;
+    unsigned int severity;
+};
+
+static const struct MC_ierror_table mc_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+struct MC_derror_table {
+    unsigned long dsisr_value;
+    bool dar_valid; /* dar is a valid indicator of faulting address */
+    uint8_t error_type;
+    uint8_t error_subtype;
+    unsigned int initiator;
+    unsigned int severity;
+};
+
+static const struct MC_derror_table mc_derror_table[] = {
+{ 0x00008000, false,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00004000, true,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000800, true,
+  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000400, true,
+  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000080, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000100, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
+
 typedef enum EventClass {
     EVENT_CLASS_INTERNAL_ERRORS     = 0,
     EVENT_CLASS_EPOW                = 1,
@@ -620,6 +720,149 @@  void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
                             RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
 }
 
+ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
+{
+    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
+    return RTAS_ERROR_LOG_MAX;
+}
+
+static uint64_t spapr_get_rtas_addr(void)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    int rtas_node;
+    const struct fdt_property *rtas_addr_prop;
+    void *fdt = spapr->fdt_blob;
+    uint32_t rtas_addr;
+
+    /* fetch rtas addr from fdt */
+    rtas_node = fdt_path_offset(fdt, "/rtas");
+    g_assert(rtas_node >= 0);
+
+    rtas_addr_prop = fdt_get_property(fdt, rtas_node, "linux,rtas-base", NULL);
+    g_assert(rtas_addr_prop);
+
+    rtas_addr = fdt32_to_cpu(*(uint32_t *)rtas_addr_prop->data);
+    return (uint64_t)rtas_addr;
+}
+
+static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
+                                        struct mc_extended_log *ext_elog)
+{
+    int i;
+    CPUPPCState *env = &cpu->env;
+    uint32_t summary;
+    uint64_t dsisr = env->spr[SPR_DSISR];
+
+    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
+    if (recovered) {
+        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
+    } else {
+        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
+    }
+
+    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
+        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
+            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
+                continue;
+            }
+
+            ext_elog->mc.error_type = mc_derror_table[i].error_type;
+            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
+            if (mc_derror_table[i].dar_valid) {
+                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
+            }
+
+            summary |= mc_derror_table[i].initiator
+                        | mc_derror_table[i].severity;
+
+            return summary;
+        }
+    } else {
+        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
+            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
+                    mc_ierror_table[i].srr1_value) {
+                continue;
+            }
+
+            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
+            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
+            if (mc_ierror_table[i].nip_valid) {
+                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
+            }
+
+            summary |= mc_ierror_table[i].initiator
+                        | mc_ierror_table[i].severity;
+
+            return summary;
+        }
+    }
+
+    summary |= RTAS_LOG_INITIATOR_CPU;
+    return summary;
+}
+
+static void spapr_mce_build_elog(PowerPCCPU *cpu, bool recovered)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    uint64_t rtas_addr;
+    CPUPPCState *env = &cpu->env;
+    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
+    target_ulong r3, msr = 0;
+    struct rtas_error_log log;
+    struct mc_extended_log *ext_elog;
+    uint32_t summary;
+
+    ext_elog = g_malloc0(sizeof(*ext_elog));
+
+    /*
+     * Properly set bits in MSR before we invoke the handler.
+     * SRR0/1, DAR and DSISR are properly set by KVM
+     */
+    if (!(*pcc->interrupts_big_endian)(cpu)) {
+        msr |= (1ULL << MSR_LE);
+    }
+
+    if (env->msr && (1ULL << MSR_SF)) {
+        msr |= (1ULL << MSR_SF);
+    }
+
+    msr |= (1ULL << MSR_ME);
+    env->msr = msr;
+
+    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
+
+    log.summary = cpu_to_be32(summary);
+    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
+
+    /* r3 should be in BE always */
+    r3 = cpu_to_be64(env->gpr[3]);
+
+    spapr_init_v6hdr(&ext_elog->v6hdr);
+    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
+    ext_elog->mc.hdr.section_length =
+                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
+    ext_elog->mc.hdr.section_version = 1;
+
+    /* get rtas addr from fdt */
+    rtas_addr = spapr_get_rtas_addr();
+
+    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
+    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
+                              &log, sizeof(log));
+    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
+                              sizeof(log), ext_elog,
+                              sizeof(struct mc_extended_log));
+
+    /* Save gpr[3] in the guest endian mode */
+    if ((*pcc->interrupts_big_endian)(cpu)) {
+        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
+    } else {
+        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
+    }
+
+    env->nip = spapr->guest_machine_check_addr;
+}
+
 void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
 {
     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
@@ -640,6 +883,10 @@  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
         }
     }
     spapr->mc_status = cpu->vcpu_id;
+
+    spapr_mce_build_elog(cpu, recovered);
+
+    return;
 }
 
 static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index b0d8c18..0c9dec1 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -662,6 +662,9 @@  target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
 #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
 #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
 
+/* Offset from rtas-base where error log is placed */
+#define RTAS_ERRLOG_OFFSET       0x25
+
 static inline uint64_t ppc64_phys_to_real(uint64_t addr)
 {
     return addr & ~0xF000000000000000ULL;
@@ -793,6 +796,7 @@  void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift,
 void spapr_clear_pending_events(SpaprMachineState *spapr);
 int spapr_max_server_number(SpaprMachineState *spapr);
 void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
+ssize_t spapr_get_rtas_size(ssize_t old_rtas_sizea);
 
 /* DRC callbacks. */
 void spapr_core_release(DeviceState *dev);