Message ID | 20250217071711.83735-5-adityag@linux.ibm.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Implement Firmware Assisted Dump for PSeries | expand |
On Mon Feb 17, 2025 at 5:17 PM AEST, Aditya Gupta wrote: > Kernel expects CPU states/register states in the format mentioned in > "Register Save Area" in PAPR. > > The platform (in our case, QEMU) saves each CPU register in the form of > an array of "register entries", the start and end of this array is > signified by "CPUSTRT" and "CPUEND" register entries respectively. > > The CPUSTRT and CPUEND register entry also has 4-byte logical CPU ID, > thus storing the CPU ID corresponding to the array of register entries. > > Each register, and CPUSTRT, CPUEND has a predefined identifier. > Implement calculating identifier for a given register in > 'fadump_str_to_u64', which has been taken from the linux kernel > > Similarly GPRs also have predefined identifiers, and a corresponding > 64-bit resiter value (split into two 32-bit cells). Implement > calculation of GPR identifiers with 'fadump_gpr_id_to_u64' > > PAPR has restrictions on particular order of few registers, and is > free to be in any order for other registers. > Some registers mentioned in PAPR have not been exported as they are not > implemented in QEMU / don't make sense in QEMU. > > Implement saving of CPU state according to the PAPR document > > Signed-off-by: Aditya Gupta <adityag@linux.ibm.com> > --- > hw/ppc/spapr_rtas.c | 200 ++++++++++++++++++++++++++++++++++++++++- > include/hw/ppc/spapr.h | 83 +++++++++++++++++ > 2 files changed, 281 insertions(+), 2 deletions(-) > > diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c > index 9b29cadab2c9..0aca4270aee8 100644 > --- a/hw/ppc/spapr_rtas.c > +++ b/hw/ppc/spapr_rtas.c > @@ -348,9 +348,12 @@ bool is_next_boot_fadump; > static bool fadump_preserve_mem(void) > { > struct rtas_fadump_mem_struct *fdm = &fadump_metadata.registered_fdm; > + struct rtas_fadump_section *cpu_state_region; > uint64_t next_section_addr; > int dump_num_sections, data_type; > uint64_t src_addr, src_len, dest_addr; > + uint64_t cpu_state_addr, cpu_state_len = 0; > + void *cpu_state_buffer; > void *copy_buffer; > > assert(fadump_metadata.fadump_registered); > @@ -413,9 +416,174 @@ static bool fadump_preserve_mem(void) > } > > switch (data_type) { > - case FADUMP_CPU_STATE_DATA: > - /* TODO: Add CPU state data */ > + case FADUMP_CPU_STATE_DATA: { I would split these out into their own functions if they grow more than a few lines. > + struct rtas_fadump_reg_save_area_header reg_save_hdr; > + struct rtas_fadump_reg_entry **reg_entries; > + struct rtas_fadump_reg_entry *curr_reg_entry; > + > + uint32_t fadump_reg_entries_size; > + __be32 num_cpus = 0; > + uint32_t num_regs_per_cpu = 0; > + CPUState *cpu; > + CPUPPCState *env; > + PowerPCCPU *ppc_cpu; > + > + CPU_FOREACH(cpu) { > + ++num_cpus; > + } > + > + reg_save_hdr.version = cpu_to_be32(1); > + reg_save_hdr.magic_number = > + cpu_to_be64(fadump_str_to_u64("REGSAVE")); > + > + /* Reg save area header is immediately followed by num cpus */ > + reg_save_hdr.num_cpu_offset = > + cpu_to_be32(sizeof(struct rtas_fadump_reg_save_area_header)); > + > + fadump_reg_entries_size = num_cpus * > + FADUMP_NUM_PER_CPU_REGS * > + sizeof(struct rtas_fadump_reg_entry); > + > + reg_entries = malloc(fadump_reg_entries_size); > + curr_reg_entry = (struct rtas_fadump_reg_entry *)reg_entries; > + > + /* This must loop num_cpus time */ > + CPU_FOREACH(cpu) { > + ppc_cpu = POWERPC_CPU(cpu); > + env = cpu_env(cpu); > + num_regs_per_cpu = 0; > + > + curr_reg_entry->reg_id = > + cpu_to_be64(fadump_str_to_u64("CPUSTRT")); > + curr_reg_entry->reg_value = ppc_cpu->vcpu_id; > + ++curr_reg_entry; > + > +#define REG_ENTRY(id, val) \ > + do { \ > + curr_reg_entry->reg_id = \ > + cpu_to_be64(fadump_str_to_u64(#id)); \ > + curr_reg_entry->reg_value = val; \ > + ++curr_reg_entry; \ > + ++num_regs_per_cpu; \ > + } while (0) > + > + REG_ENTRY(ACOP, env->spr[SPR_ACOP]); > + REG_ENTRY(AMR, env->spr[SPR_AMR]); > + REG_ENTRY(BESCR, env->spr[SPR_BESCR]); > + REG_ENTRY(CFAR, env->spr[SPR_CFAR]); > + REG_ENTRY(CIABR, env->spr[SPR_CIABR]); > + > + /* Save the condition register */ > + uint64_t cr = 0; > + cr |= (env->crf[0] & 0xf); > + cr |= (env->crf[1] & 0xf) << 1; > + cr |= (env->crf[2] & 0xf) << 2; > + cr |= (env->crf[3] & 0xf) << 3; > + cr |= (env->crf[4] & 0xf) << 4; > + cr |= (env->crf[5] & 0xf) << 5; > + cr |= (env->crf[6] & 0xf) << 6; > + cr |= (env->crf[7] & 0xf) << 7; Shift values wrong here I think... Use ppc_get_cr() > + REG_ENTRY(CR, cr); > + > + REG_ENTRY(CTR, env->spr[SPR_CTR]); > + REG_ENTRY(CTRL, env->spr[SPR_CTRL]); > + REG_ENTRY(DABR, env->spr[SPR_DABR]); > + REG_ENTRY(DABRX, env->spr[SPR_DABRX]); > + REG_ENTRY(DAR, env->spr[SPR_DAR]); > + REG_ENTRY(DAWR0, env->spr[SPR_DAWR0]); > + REG_ENTRY(DAWR1, env->spr[SPR_DAWR1]); > + REG_ENTRY(DAWRX0, env->spr[SPR_DAWRX0]); > + REG_ENTRY(DAWRX1, env->spr[SPR_DAWRX1]); > + REG_ENTRY(DPDES, env->spr[SPR_DPDES]); > + REG_ENTRY(DSCR, env->spr[SPR_DSCR]); > + REG_ENTRY(DSISR, env->spr[SPR_DSISR]); > + REG_ENTRY(EBBHR, env->spr[SPR_EBBHR]); > + REG_ENTRY(EBBRR, env->spr[SPR_EBBRR]); > + > + REG_ENTRY(FPSCR, env->fpscr); > + REG_ENTRY(FSCR, env->spr[SPR_FSCR]); > + > + /* Save the GPRs */ > + for (int gpr_id = 0; gpr_id < 32; ++gpr_id) { > + curr_reg_entry->reg_id = > + cpu_to_be64(fadump_gpr_id_to_u64(gpr_id)); > + curr_reg_entry->reg_value = env->gpr[i]; > + ++curr_reg_entry; > + ++num_regs_per_cpu; > + } > + > + REG_ENTRY(IAMR, env->spr[SPR_IAMR]); > + REG_ENTRY(IC, env->spr[SPR_IC]); > + REG_ENTRY(LR, env->spr[SPR_LR]); > + > + REG_ENTRY(MSR, env->msr); > + REG_ENTRY(NIA, env->nip); /* NIA */ > + REG_ENTRY(PIR, env->spr[SPR_PIR]); > + REG_ENTRY(PSPB, env->spr[SPR_PSPB]); > + REG_ENTRY(PVR, env->spr[SPR_PVR]); > + REG_ENTRY(RPR, env->spr[SPR_RPR]); > + REG_ENTRY(SPURR, env->spr[SPR_SPURR]); > + REG_ENTRY(SRR0, env->spr[SPR_SRR0]); > + REG_ENTRY(SRR1, env->spr[SPR_SRR1]); > + REG_ENTRY(TAR, env->spr[SPR_TAR]); > + REG_ENTRY(TEXASR, env->spr[SPR_TEXASR]); > + REG_ENTRY(TFHAR, env->spr[SPR_TFHAR]); > + REG_ENTRY(TFIAR, env->spr[SPR_TFIAR]); > + REG_ENTRY(TIR, env->spr[SPR_TIR]); > + REG_ENTRY(UAMOR, env->spr[SPR_UAMOR]); > + REG_ENTRY(VRSAVE, env->spr[SPR_VRSAVE]); > + REG_ENTRY(VSCR, env->vscr); > + REG_ENTRY(VTB, env->spr[SPR_VTB]); > + REG_ENTRY(WORT, env->spr[SPR_WORT]); > + REG_ENTRY(XER, env->spr[SPR_XER]); > + > + /* > + * Ignoring transaction checkpoint and few other registers > + * mentioned in PAPR as not supported in QEMU > + */ > +#undef REG_ENTRY > + > + /* End the registers for this CPU with "CPUEND" reg entry */ > + curr_reg_entry->reg_id = > + cpu_to_be64(fadump_str_to_u64("CPUEND")); > + > + /* Ensure the number of registers match (+2 for STRT & END) */ > + assert(FADUMP_NUM_PER_CPU_REGS == num_regs_per_cpu + 2); > + > + ++curr_reg_entry; > + } > + > + cpu_state_len = 0; > + cpu_state_len += sizeof(reg_save_hdr); /* reg save header */ > + cpu_state_len += sizeof(__be32); /* num_cpus */ > + cpu_state_len += fadump_reg_entries_size; /* reg entries */ > + > + cpu_state_region = &fdm->rgn[i]; > + cpu_state_addr = dest_addr; > + cpu_state_buffer = g_malloc(cpu_state_len); > + > + uint64_t offset = 0; > + memcpy(cpu_state_buffer + offset, > + ®_save_hdr, sizeof(reg_save_hdr)); > + offset += sizeof(reg_save_hdr); > + > + /* Write num_cpus */ > + num_cpus = cpu_to_be32(num_cpus); > + memcpy(cpu_state_buffer + offset, &num_cpus, sizeof(__be32)); > + offset += sizeof(__be32); > + > + /* Write the register entries */ > + memcpy(cpu_state_buffer + offset, > + reg_entries, fadump_reg_entries_size); > + offset += fadump_reg_entries_size; > + > + /* > + * We will write the cpu state data later, as otherwise it > + * might get overwritten by other fadump regions > + */ > + > break; > + } > case FADUMP_HPTE_REGION: > /* TODO: Add hpte state data */ > break; > @@ -455,6 +623,34 @@ static bool fadump_preserve_mem(void) > } > } > > + /* > + * Write the Register Save Area > + * > + * CPU State/Register Save Area should be written after dumping the > + * memory to prevent overwritting while saving other memory regions > + * > + * eg. If boot memory region is 1G, then both the first 1GB memory, and > + * the Register Save Area needs to be saved at 1GB. > + * And as the CPU_STATE_DATA region comes first than the > + * REAL_MODE_REGION region to be copied, the CPU_STATE_DATA will get > + * overwritten if saved before the 0GB - 1GB region is copied after > + * saving CPU state data > + */ > + cpu_physical_memory_write(cpu_state_addr, cpu_state_buffer, cpu_state_len); Check docs/devel/loads-stores.rst, address_space_* is preferred to check for failures. It also says devices should operate on their own address spaces and that doesn't really apply to spapr since the "virtual hypervisor" doesn't really fit the model of a device... Perhaps look at h_enter_nested which uses CPU(cpu)->as. > + g_free(cpu_state_buffer); > + > + /* > + * Set bytes_dumped in cpu state region, so kernel knows platform have > + * exported it > + */ > + cpu_state_region->bytes_dumped = cpu_to_be64(cpu_state_len); > + > + if (cpu_state_region->source_len != cpu_state_region->bytes_dumped) { > + qemu_log_mask(LOG_GUEST_ERROR, > + "CPU State region's length passed by kernel, doesn't match" > + " with CPU State region length exported by QEMU"); > + } > + > return true; > } > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index a80704187583..0e8002bad9e0 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -792,6 +792,9 @@ void push_sregs_to_kvm_pr(SpaprMachineState *spapr); > #define FADUMP_HPTE_REGION 0x0002 > #define FADUMP_REAL_MODE_REGION 0x0011 > > +/* Number of registers stored per cpu */ > +#define FADUMP_NUM_PER_CPU_REGS (32 /*GPR*/ + 45 /*others*/ + 2 /*STRT & END*/) > + > /* OS defined sections */ > #define FADUMP_PARAM_AREA 0x0100 > > @@ -845,6 +848,86 @@ struct rtas_fadump_mem_struct { > struct rtas_fadump_section rgn[FADUMP_MAX_SECTIONS]; > }; > > +/* > + * The firmware-assisted dump format. > + * > + * The register save area is an area in the partition's memory used to preserve > + * the register contents (CPU state data) for the active CPUs during a firmware > + * assisted dump. The dump format contains register save area header followed > + * by register entries. Each list of registers for a CPU starts with "CPUSTRT" > + * and ends with "CPUEND". > + */ > + > +/* Register save area header. */ > +struct rtas_fadump_reg_save_area_header { > + __be64 magic_number; > + __be32 version; > + __be32 num_cpu_offset; > +}; > + > +/* Register entry. */ > +struct rtas_fadump_reg_entry { > + __be64 reg_id; > + __be64 reg_value; > +}; > + > +/* > + * Copy the ascii values for first 8 characters from a string into u64 > + * variable at their respective indexes. > + * e.g. > + * The string "FADMPINF" will be converted into 0x4641444d50494e46 > + */ > +static inline uint64_t fadump_str_to_u64(const char *str) > +{ > + uint64_t val = 0; > + int i; > + > + for (i = 0; i < sizeof(val); i++) { > + val = (*str) ? (val << 8) | *str++ : val << 8; > + } > + return val; > +} > + > +/** > + * Get the identifier id for register entries of GPRs > + * > + * It gives the same id as 'fadump_str_to_u64' when the complete string id > + * of the GPR is given, ie. > + * > + * fadump_str_to_u64("GPR05") == fadump_gpr_id_to_u64(5); > + * fadump_str_to_u64("GPR12") == fadump_gpr_id_to_u64(12); > + * > + * And so on. Hence this can be implemented by creating a dynamic > + * string for each GPR, such as "GPR00", "GPR01", ... "GPR31" > + * Instead of allocating a string, an observation from the math of > + * 'fadump_str_to_u64' or from PAPR tells us that there's a pattern > + * in the identifier IDs, such that the first 8 bytes are affected only by > + * whether it is GPR0*, GPR1*, GPR2*, GPR3*. 9th byte is always 0x3. And > + * the the 10th byte is the index of the GPR modulo 10. > + */ > +static inline uint64_t fadump_gpr_id_to_u64(uint32_t gpr_id) > +{ > + uint64_t val = 0; > + > + /* Valid range of GPR id is only GPR0 to GPR31 */ > + assert(gpr_id < 32); > + > + if (gpr_id <= 9) { > + val = fadump_str_to_u64("GPR0"); > + } else if (gpr_id <= 19) { > + val = fadump_str_to_u64("GPR1"); > + } else if (gpr_id <= 29) { > + val = fadump_str_to_u64("GPR2"); > + } else { > + val = fadump_str_to_u64("GPR3"); > + } > + > + val |= 0x30000000; > + val |= ((gpr_id % 10) << 12); > + > + return val; > +} These two functions could probably go out of line, I doubt they are performance critical and make them static if not used outside the file. Thanks, Nick
On 27/02/25 08:57, Nicholas Piggin wrote: > On Mon Feb 17, 2025 at 5:17 PM AEST, Aditya Gupta wrote: >> <...snip...> >> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c >> index 9b29cadab2c9..0aca4270aee8 100644 >> <...snip...> >> --- a/hw/ppc/spapr_rtas.c >> +++ b/hw/ppc/spapr_rtas.c >> @@ -413,9 +416,174 @@ static bool fadump_preserve_mem(void) >> } >> >> switch (data_type) { >> - case FADUMP_CPU_STATE_DATA: >> - /* TODO: Add CPU state data */ >> + case FADUMP_CPU_STATE_DATA: { > I would split these out into their own functions if they grow more than > a few lines. Makes sense. Will add this into a new helper function. > >> + struct rtas_fadump_reg_save_area_header reg_save_hdr; >> + struct rtas_fadump_reg_entry **reg_entries; >> + struct rtas_fadump_reg_entry *curr_reg_entry; >> + >> + uint32_t fadump_reg_entries_size; >> + __be32 num_cpus = 0; >> + uint32_t num_regs_per_cpu = 0; >> + CPUState *cpu; >> + CPUPPCState *env; >> + PowerPCCPU *ppc_cpu; >> + >> + CPU_FOREACH(cpu) { >> + ++num_cpus; >> + } >> + >> + reg_save_hdr.version = cpu_to_be32(1); >> + reg_save_hdr.magic_number = >> + cpu_to_be64(fadump_str_to_u64("REGSAVE")); >> + >> + /* Reg save area header is immediately followed by num cpus */ >> + reg_save_hdr.num_cpu_offset = >> + cpu_to_be32(sizeof(struct rtas_fadump_reg_save_area_header)); >> + >> + fadump_reg_entries_size = num_cpus * >> + FADUMP_NUM_PER_CPU_REGS * >> + sizeof(struct rtas_fadump_reg_entry); >> + >> + reg_entries = malloc(fadump_reg_entries_size); >> + curr_reg_entry = (struct rtas_fadump_reg_entry *)reg_entries; >> + >> + /* This must loop num_cpus time */ >> + CPU_FOREACH(cpu) { >> + ppc_cpu = POWERPC_CPU(cpu); >> + env = cpu_env(cpu); >> + num_regs_per_cpu = 0; >> + >> + curr_reg_entry->reg_id = >> + cpu_to_be64(fadump_str_to_u64("CPUSTRT")); >> + curr_reg_entry->reg_value = ppc_cpu->vcpu_id; >> + ++curr_reg_entry; >> + >> +#define REG_ENTRY(id, val) \ >> + do { \ >> + curr_reg_entry->reg_id = \ >> + cpu_to_be64(fadump_str_to_u64(#id)); \ >> + curr_reg_entry->reg_value = val; \ >> + ++curr_reg_entry; \ >> + ++num_regs_per_cpu; \ >> + } while (0) >> + >> + REG_ENTRY(ACOP, env->spr[SPR_ACOP]); >> + REG_ENTRY(AMR, env->spr[SPR_AMR]); >> + REG_ENTRY(BESCR, env->spr[SPR_BESCR]); >> + REG_ENTRY(CFAR, env->spr[SPR_CFAR]); >> + REG_ENTRY(CIABR, env->spr[SPR_CIABR]); >> + >> + /* Save the condition register */ >> + uint64_t cr = 0; >> + cr |= (env->crf[0] & 0xf); >> + cr |= (env->crf[1] & 0xf) << 1; >> + cr |= (env->crf[2] & 0xf) << 2; >> + cr |= (env->crf[3] & 0xf) << 3; >> + cr |= (env->crf[4] & 0xf) << 4; >> + cr |= (env->crf[5] & 0xf) << 5; >> + cr |= (env->crf[6] & 0xf) << 6; >> + cr |= (env->crf[7] & 0xf) << 7; > Shift values wrong here I think... Use ppc_get_cr() Okay, I had some issues getting this CR. Will use 'ppc_get_cr', thanks ! > >> + REG_ENTRY(CR, cr); >> + >> + REG_ENTRY(CTR, env->spr[SPR_CTR]); >> + REG_ENTRY(CTRL, env->spr[SPR_CTRL]); >> + REG_ENTRY(DABR, env->spr[SPR_DABR]); >> + REG_ENTRY(DABRX, env->spr[SPR_DABRX]); >> + REG_ENTRY(DAR, env->spr[SPR_DAR]); >> + REG_ENTRY(DAWR0, env->spr[SPR_DAWR0]); >> + REG_ENTRY(DAWR1, env->spr[SPR_DAWR1]); >> + REG_ENTRY(DAWRX0, env->spr[SPR_DAWRX0]); >> + REG_ENTRY(DAWRX1, env->spr[SPR_DAWRX1]); >> + REG_ENTRY(DPDES, env->spr[SPR_DPDES]); >> + REG_ENTRY(DSCR, env->spr[SPR_DSCR]); >> + REG_ENTRY(DSISR, env->spr[SPR_DSISR]); >> + REG_ENTRY(EBBHR, env->spr[SPR_EBBHR]); >> + REG_ENTRY(EBBRR, env->spr[SPR_EBBRR]); >> + >> + REG_ENTRY(FPSCR, env->fpscr); >> + REG_ENTRY(FSCR, env->spr[SPR_FSCR]); >> + >> + /* Save the GPRs */ >> + for (int gpr_id = 0; gpr_id < 32; ++gpr_id) { >> + curr_reg_entry->reg_id = >> + cpu_to_be64(fadump_gpr_id_to_u64(gpr_id)); >> + curr_reg_entry->reg_value = env->gpr[i]; >> + ++curr_reg_entry; >> + ++num_regs_per_cpu; >> + } >> + >> + REG_ENTRY(IAMR, env->spr[SPR_IAMR]); >> + REG_ENTRY(IC, env->spr[SPR_IC]); >> + REG_ENTRY(LR, env->spr[SPR_LR]); >> + >> + REG_ENTRY(MSR, env->msr); >> + REG_ENTRY(NIA, env->nip); /* NIA */ >> + REG_ENTRY(PIR, env->spr[SPR_PIR]); >> + REG_ENTRY(PSPB, env->spr[SPR_PSPB]); >> + REG_ENTRY(PVR, env->spr[SPR_PVR]); >> + REG_ENTRY(RPR, env->spr[SPR_RPR]); >> + REG_ENTRY(SPURR, env->spr[SPR_SPURR]); >> + REG_ENTRY(SRR0, env->spr[SPR_SRR0]); >> + REG_ENTRY(SRR1, env->spr[SPR_SRR1]); >> + REG_ENTRY(TAR, env->spr[SPR_TAR]); >> + REG_ENTRY(TEXASR, env->spr[SPR_TEXASR]); >> + REG_ENTRY(TFHAR, env->spr[SPR_TFHAR]); >> + REG_ENTRY(TFIAR, env->spr[SPR_TFIAR]); >> + REG_ENTRY(TIR, env->spr[SPR_TIR]); >> + REG_ENTRY(UAMOR, env->spr[SPR_UAMOR]); >> + REG_ENTRY(VRSAVE, env->spr[SPR_VRSAVE]); >> + REG_ENTRY(VSCR, env->vscr); >> + REG_ENTRY(VTB, env->spr[SPR_VTB]); >> + REG_ENTRY(WORT, env->spr[SPR_WORT]); >> + REG_ENTRY(XER, env->spr[SPR_XER]); >> + >> + /* >> + * Ignoring transaction checkpoint and few other registers >> + * mentioned in PAPR as not supported in QEMU >> + */ >> +#undef REG_ENTRY >> + >> + /* End the registers for this CPU with "CPUEND" reg entry */ >> + curr_reg_entry->reg_id = >> + cpu_to_be64(fadump_str_to_u64("CPUEND")); >> + >> + /* Ensure the number of registers match (+2 for STRT & END) */ >> + assert(FADUMP_NUM_PER_CPU_REGS == num_regs_per_cpu + 2); >> + >> + ++curr_reg_entry; >> + } >> + >> + cpu_state_len = 0; >> + cpu_state_len += sizeof(reg_save_hdr); /* reg save header */ >> + cpu_state_len += sizeof(__be32); /* num_cpus */ >> + cpu_state_len += fadump_reg_entries_size; /* reg entries */ >> + >> + cpu_state_region = &fdm->rgn[i]; >> + cpu_state_addr = dest_addr; >> + cpu_state_buffer = g_malloc(cpu_state_len); >> + >> + uint64_t offset = 0; >> + memcpy(cpu_state_buffer + offset, >> + ®_save_hdr, sizeof(reg_save_hdr)); >> + offset += sizeof(reg_save_hdr); >> + >> + /* Write num_cpus */ >> + num_cpus = cpu_to_be32(num_cpus); >> + memcpy(cpu_state_buffer + offset, &num_cpus, sizeof(__be32)); >> + offset += sizeof(__be32); >> + >> + /* Write the register entries */ >> + memcpy(cpu_state_buffer + offset, >> + reg_entries, fadump_reg_entries_size); >> + offset += fadump_reg_entries_size; >> + >> + /* >> + * We will write the cpu state data later, as otherwise it >> + * might get overwritten by other fadump regions >> + */ >> + >> break; >> + } >> case FADUMP_HPTE_REGION: >> /* TODO: Add hpte state data */ >> break; >> @@ -455,6 +623,34 @@ static bool fadump_preserve_mem(void) >> } >> } >> >> + /* >> + * Write the Register Save Area >> + * >> + * CPU State/Register Save Area should be written after dumping the >> + * memory to prevent overwritting while saving other memory regions >> + * >> + * eg. If boot memory region is 1G, then both the first 1GB memory, and >> + * the Register Save Area needs to be saved at 1GB. >> + * And as the CPU_STATE_DATA region comes first than the >> + * REAL_MODE_REGION region to be copied, the CPU_STATE_DATA will get >> + * overwritten if saved before the 0GB - 1GB region is copied after >> + * saving CPU state data >> + */ >> + cpu_physical_memory_write(cpu_state_addr, cpu_state_buffer, cpu_state_len); > Check docs/devel/loads-stores.rst, address_space_* is preferred to check > for failures. It also says devices should operate on their own address > spaces and that doesn't really apply to spapr since the "virtual > hypervisor" doesn't really fit the model of a device... > > Perhaps look at h_enter_nested which uses CPU(cpu)->as. Got it. Will try to use address_space_read/write in v2. >> + g_free(cpu_state_buffer); >> + >> + /* >> + * Set bytes_dumped in cpu state region, so kernel knows platform have >> + * exported it >> + */ >> + cpu_state_region->bytes_dumped = cpu_to_be64(cpu_state_len); >> + >> + if (cpu_state_region->source_len != cpu_state_region->bytes_dumped) { >> + qemu_log_mask(LOG_GUEST_ERROR, >> + "CPU State region's length passed by kernel, doesn't match" >> + " with CPU State region length exported by QEMU"); >> + } >> + >> return true; >> } >> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h >> index a80704187583..0e8002bad9e0 100644 >> --- a/include/hw/ppc/spapr.h >> +++ b/include/hw/ppc/spapr.h >> @@ -792,6 +792,9 @@ void push_sregs_to_kvm_pr(SpaprMachineState *spapr); >> #define FADUMP_HPTE_REGION 0x0002 >> #define FADUMP_REAL_MODE_REGION 0x0011 >> >> +/* Number of registers stored per cpu */ >> +#define FADUMP_NUM_PER_CPU_REGS (32 /*GPR*/ + 45 /*others*/ + 2 /*STRT & END*/) >> + >> /* OS defined sections */ >> #define FADUMP_PARAM_AREA 0x0100 >> >> @@ -845,6 +848,86 @@ struct rtas_fadump_mem_struct { >> struct rtas_fadump_section rgn[FADUMP_MAX_SECTIONS]; >> }; >> >> +/* >> + * The firmware-assisted dump format. >> + * >> + * The register save area is an area in the partition's memory used to preserve >> + * the register contents (CPU state data) for the active CPUs during a firmware >> + * assisted dump. The dump format contains register save area header followed >> + * by register entries. Each list of registers for a CPU starts with "CPUSTRT" >> + * and ends with "CPUEND". >> + */ >> + >> +/* Register save area header. */ >> +struct rtas_fadump_reg_save_area_header { >> + __be64 magic_number; >> + __be32 version; >> + __be32 num_cpu_offset; >> +}; >> + >> +/* Register entry. */ >> +struct rtas_fadump_reg_entry { >> + __be64 reg_id; >> + __be64 reg_value; >> +}; >> + >> +/* >> + * Copy the ascii values for first 8 characters from a string into u64 >> + * variable at their respective indexes. >> + * e.g. >> + * The string "FADMPINF" will be converted into 0x4641444d50494e46 >> + */ >> +static inline uint64_t fadump_str_to_u64(const char *str) >> +{ >> + uint64_t val = 0; >> + int i; >> + >> + for (i = 0; i < sizeof(val); i++) { >> + val = (*str) ? (val << 8) | *str++ : val << 8; >> + } >> + return val; >> +} >> + >> +/** >> + * Get the identifier id for register entries of GPRs >> + * >> + * It gives the same id as 'fadump_str_to_u64' when the complete string id >> + * of the GPR is given, ie. >> + * >> + * fadump_str_to_u64("GPR05") == fadump_gpr_id_to_u64(5); >> + * fadump_str_to_u64("GPR12") == fadump_gpr_id_to_u64(12); >> + * >> + * And so on. Hence this can be implemented by creating a dynamic >> + * string for each GPR, such as "GPR00", "GPR01", ... "GPR31" >> + * Instead of allocating a string, an observation from the math of >> + * 'fadump_str_to_u64' or from PAPR tells us that there's a pattern >> + * in the identifier IDs, such that the first 8 bytes are affected only by >> + * whether it is GPR0*, GPR1*, GPR2*, GPR3*. 9th byte is always 0x3. And >> + * the the 10th byte is the index of the GPR modulo 10. >> + */ >> +static inline uint64_t fadump_gpr_id_to_u64(uint32_t gpr_id) >> +{ >> + uint64_t val = 0; >> + >> + /* Valid range of GPR id is only GPR0 to GPR31 */ >> + assert(gpr_id < 32); >> + >> + if (gpr_id <= 9) { >> + val = fadump_str_to_u64("GPR0"); >> + } else if (gpr_id <= 19) { >> + val = fadump_str_to_u64("GPR1"); >> + } else if (gpr_id <= 29) { >> + val = fadump_str_to_u64("GPR2"); >> + } else { >> + val = fadump_str_to_u64("GPR3"); >> + } >> + >> + val |= 0x30000000; >> + val |= ((gpr_id % 10) << 12); >> + >> + return val; >> +} > These two functions could probably go out of line, I doubt they > are performance critical and make them static if not used outside > the file. True, have marked them static (but in a header file), can see I can move it into some .c file if not needed to be shared. Thanks for your reviews Nick. - Aditya G > > Thanks, > Nick
On 2/17/25 12:47, Aditya Gupta wrote: > Kernel expects CPU states/register states in the format mentioned in > "Register Save Area" in PAPR. > > The platform (in our case, QEMU) saves each CPU register in the form of > an array of "register entries", the start and end of this array is > signified by "CPUSTRT" and "CPUEND" register entries respectively. > > The CPUSTRT and CPUEND register entry also has 4-byte logical CPU ID, > thus storing the CPU ID corresponding to the array of register entries. > > Each register, and CPUSTRT, CPUEND has a predefined identifier. > Implement calculating identifier for a given register in > 'fadump_str_to_u64', which has been taken from the linux kernel > > Similarly GPRs also have predefined identifiers, and a corresponding > 64-bit resiter value (split into two 32-bit cells). Implement > calculation of GPR identifiers with 'fadump_gpr_id_to_u64' > > PAPR has restrictions on particular order of few registers, and is > free to be in any order for other registers. > Some registers mentioned in PAPR have not been exported as they are not > implemented in QEMU / don't make sense in QEMU. > > Implement saving of CPU state according to the PAPR document > > Signed-off-by: Aditya Gupta <adityag@linux.ibm.com> > --- > hw/ppc/spapr_rtas.c | 200 ++++++++++++++++++++++++++++++++++++++++- > include/hw/ppc/spapr.h | 83 +++++++++++++++++ > 2 files changed, 281 insertions(+), 2 deletions(-) > > diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c > index 9b29cadab2c9..0aca4270aee8 100644 > --- a/hw/ppc/spapr_rtas.c > +++ b/hw/ppc/spapr_rtas.c > @@ -348,9 +348,12 @@ bool is_next_boot_fadump; > static bool fadump_preserve_mem(void) > { > struct rtas_fadump_mem_struct *fdm = &fadump_metadata.registered_fdm; > + struct rtas_fadump_section *cpu_state_region; > uint64_t next_section_addr; > int dump_num_sections, data_type; > uint64_t src_addr, src_len, dest_addr; > + uint64_t cpu_state_addr, cpu_state_len = 0; > + void *cpu_state_buffer; > void *copy_buffer; > > assert(fadump_metadata.fadump_registered); > @@ -413,9 +416,174 @@ static bool fadump_preserve_mem(void) > } > > switch (data_type) { > - case FADUMP_CPU_STATE_DATA: > - /* TODO: Add CPU state data */ > + case FADUMP_CPU_STATE_DATA: { > + struct rtas_fadump_reg_save_area_header reg_save_hdr; > + struct rtas_fadump_reg_entry **reg_entries; > + struct rtas_fadump_reg_entry *curr_reg_entry; > + > + uint32_t fadump_reg_entries_size; > + __be32 num_cpus = 0; > + uint32_t num_regs_per_cpu = 0; > + CPUState *cpu; > + CPUPPCState *env; > + PowerPCCPU *ppc_cpu; > + > + CPU_FOREACH(cpu) { > + ++num_cpus; > + } > + > + reg_save_hdr.version = cpu_to_be32(1); PAPR spec mentions version value as 0. Do we need to update ? > + reg_save_hdr.magic_number = > + cpu_to_be64(fadump_str_to_u64("REGSAVE")); > + > + /* Reg save area header is immediately followed by num cpus */ > + reg_save_hdr.num_cpu_offset = > + cpu_to_be32(sizeof(struct rtas_fadump_reg_save_area_header)); > + Above inits could go into a helper fadump_init_reg_save_header(®_save_hdr); BTW, the PAPR spec also mentions about padding followed by num_cpus_offset, see another comment later below. > + fadump_reg_entries_size = num_cpus * > + FADUMP_NUM_PER_CPU_REGS * > + sizeof(struct rtas_fadump_reg_entry); > + > + reg_entries = malloc(fadump_reg_entries_size); This was declared as double pointer, but being used as a pointer. > + curr_reg_entry = (struct rtas_fadump_reg_entry *)reg_entries; > + > + /* This must loop num_cpus time */ > + CPU_FOREACH(cpu) { > + ppc_cpu = POWERPC_CPU(cpu); > + env = cpu_env(cpu); > + num_regs_per_cpu = 0; > + > + curr_reg_entry->reg_id = > + cpu_to_be64(fadump_str_to_u64("CPUSTRT")); > + curr_reg_entry->reg_value = ppc_cpu->vcpu_id; > + ++curr_reg_entry; > + > +#define REG_ENTRY(id, val) \ > + do { \ > + curr_reg_entry->reg_id = \ > + cpu_to_be64(fadump_str_to_u64(#id)); \ > + curr_reg_entry->reg_value = val; \ > + ++curr_reg_entry; \ > + ++num_regs_per_cpu; \ > + } while (0) > + > + REG_ENTRY(ACOP, env->spr[SPR_ACOP]); > + REG_ENTRY(AMR, env->spr[SPR_AMR]); > + REG_ENTRY(BESCR, env->spr[SPR_BESCR]); > + REG_ENTRY(CFAR, env->spr[SPR_CFAR]); > + REG_ENTRY(CIABR, env->spr[SPR_CIABR]); > + > + /* Save the condition register */ > + uint64_t cr = 0; > + cr |= (env->crf[0] & 0xf); > + cr |= (env->crf[1] & 0xf) << 1; > + cr |= (env->crf[2] & 0xf) << 2; > + cr |= (env->crf[3] & 0xf) << 3; > + cr |= (env->crf[4] & 0xf) << 4; > + cr |= (env->crf[5] & 0xf) << 5; > + cr |= (env->crf[6] & 0xf) << 6; > + cr |= (env->crf[7] & 0xf) << 7; > + REG_ENTRY(CR, cr); ppc_get_cr ? > + > + REG_ENTRY(CTR, env->spr[SPR_CTR]); > + REG_ENTRY(CTRL, env->spr[SPR_CTRL]); > + REG_ENTRY(DABR, env->spr[SPR_DABR]); > + REG_ENTRY(DABRX, env->spr[SPR_DABRX]); > + REG_ENTRY(DAR, env->spr[SPR_DAR]); > + REG_ENTRY(DAWR0, env->spr[SPR_DAWR0]); > + REG_ENTRY(DAWR1, env->spr[SPR_DAWR1]); > + REG_ENTRY(DAWRX0, env->spr[SPR_DAWRX0]); > + REG_ENTRY(DAWRX1, env->spr[SPR_DAWRX1]); > + REG_ENTRY(DPDES, env->spr[SPR_DPDES]); > + REG_ENTRY(DSCR, env->spr[SPR_DSCR]); > + REG_ENTRY(DSISR, env->spr[SPR_DSISR]); > + REG_ENTRY(EBBHR, env->spr[SPR_EBBHR]); > + REG_ENTRY(EBBRR, env->spr[SPR_EBBRR]); > + > + REG_ENTRY(FPSCR, env->fpscr); > + REG_ENTRY(FSCR, env->spr[SPR_FSCR]); > + > + /* Save the GPRs */ > + for (int gpr_id = 0; gpr_id < 32; ++gpr_id) { > + curr_reg_entry->reg_id = > + cpu_to_be64(fadump_gpr_id_to_u64(gpr_id)); > + curr_reg_entry->reg_value = env->gpr[i]; > + ++curr_reg_entry; > + ++num_regs_per_cpu; > + } > + > + REG_ENTRY(IAMR, env->spr[SPR_IAMR]); > + REG_ENTRY(IC, env->spr[SPR_IC]); > + REG_ENTRY(LR, env->spr[SPR_LR]); > + > + REG_ENTRY(MSR, env->msr); > + REG_ENTRY(NIA, env->nip); /* NIA */ > + REG_ENTRY(PIR, env->spr[SPR_PIR]); > + REG_ENTRY(PSPB, env->spr[SPR_PSPB]); > + REG_ENTRY(PVR, env->spr[SPR_PVR]); > + REG_ENTRY(RPR, env->spr[SPR_RPR]); > + REG_ENTRY(SPURR, env->spr[SPR_SPURR]); > + REG_ENTRY(SRR0, env->spr[SPR_SRR0]); > + REG_ENTRY(SRR1, env->spr[SPR_SRR1]); > + REG_ENTRY(TAR, env->spr[SPR_TAR]); > + REG_ENTRY(TEXASR, env->spr[SPR_TEXASR]); > + REG_ENTRY(TFHAR, env->spr[SPR_TFHAR]); > + REG_ENTRY(TFIAR, env->spr[SPR_TFIAR]); > + REG_ENTRY(TIR, env->spr[SPR_TIR]); > + REG_ENTRY(UAMOR, env->spr[SPR_UAMOR]); > + REG_ENTRY(VRSAVE, env->spr[SPR_VRSAVE]); > + REG_ENTRY(VSCR, env->vscr); > + REG_ENTRY(VTB, env->spr[SPR_VTB]); > + REG_ENTRY(WORT, env->spr[SPR_WORT]); > + REG_ENTRY(XER, env->spr[SPR_XER]); > + > + /* > + * Ignoring transaction checkpoint and few other registers > + * mentioned in PAPR as not supported in QEMU > + */ > +#undef REG_ENTRY > + > + /* End the registers for this CPU with "CPUEND" reg entry */ > + curr_reg_entry->reg_id = > + cpu_to_be64(fadump_str_to_u64("CPUEND")); > + > + /* Ensure the number of registers match (+2 for STRT & END) */ > + assert(FADUMP_NUM_PER_CPU_REGS == num_regs_per_cpu + 2); > + > + ++curr_reg_entry; > + } > + > + cpu_state_len = 0; > + cpu_state_len += sizeof(reg_save_hdr); /* reg save header */ > + cpu_state_len += sizeof(__be32); /* num_cpus */ > + cpu_state_len += fadump_reg_entries_size; /* reg entries */ > + above 4 inits could be a single line init. > + cpu_state_region = &fdm->rgn[i]; > + cpu_state_addr = dest_addr; > + cpu_state_buffer = g_malloc(cpu_state_len); > + > + uint64_t offset = 0; > + memcpy(cpu_state_buffer + offset, > + ®_save_hdr, sizeof(reg_save_hdr)); > + offset += sizeof(reg_save_hdr); > + > + /* Write num_cpus */ > + num_cpus = cpu_to_be32(num_cpus); > + memcpy(cpu_state_buffer + offset, &num_cpus, sizeof(__be32)); > + offset += sizeof(__be32); As per PAPR spec, NumCpusOffset is followed by a padding of 0xC bytes initialized to 0. Any reasons for skipping that here ? > + > + /* Write the register entries */ > + memcpy(cpu_state_buffer + offset, > + reg_entries, fadump_reg_entries_size); > + offset += fadump_reg_entries_size; > + > + /* > + * We will write the cpu state data later, as otherwise it > + * might get overwritten by other fadump regions > + */ > + Better to have a separate routine fadump_preserve_cpu_state_data() when the switch case logic grows this large, applies wherever applicable. > break; > + } > case FADUMP_HPTE_REGION: > /* TODO: Add hpte state data */ > break; > @@ -455,6 +623,34 @@ static bool fadump_preserve_mem(void) > } > } > > + /* > + * Write the Register Save Area > + * > + * CPU State/Register Save Area should be written after dumping the > + * memory to prevent overwritting while saving other memory regions > + * > + * eg. If boot memory region is 1G, then both the first 1GB memory, and > + * the Register Save Area needs to be saved at 1GB. > + * And as the CPU_STATE_DATA region comes first than the > + * REAL_MODE_REGION region to be copied, the CPU_STATE_DATA will get > + * overwritten if saved before the 0GB - 1GB region is copied after > + * saving CPU state data > + */ > + cpu_physical_memory_write(cpu_state_addr, cpu_state_buffer, cpu_state_len); > + g_free(cpu_state_buffer); > + > + /* > + * Set bytes_dumped in cpu state region, so kernel knows platform have > + * exported it > + */ > + cpu_state_region->bytes_dumped = cpu_to_be64(cpu_state_len); > + > + if (cpu_state_region->source_len != cpu_state_region->bytes_dumped) { > + qemu_log_mask(LOG_GUEST_ERROR, > + "CPU State region's length passed by kernel, doesn't match" > + " with CPU State region length exported by QEMU"); return error ? Thanks Harsh > + } > + > return true; > } > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index a80704187583..0e8002bad9e0 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -792,6 +792,9 @@ void push_sregs_to_kvm_pr(SpaprMachineState *spapr); > #define FADUMP_HPTE_REGION 0x0002 > #define FADUMP_REAL_MODE_REGION 0x0011 > > +/* Number of registers stored per cpu */ > +#define FADUMP_NUM_PER_CPU_REGS (32 /*GPR*/ + 45 /*others*/ + 2 /*STRT & END*/) > + > /* OS defined sections */ > #define FADUMP_PARAM_AREA 0x0100 > > @@ -845,6 +848,86 @@ struct rtas_fadump_mem_struct { > struct rtas_fadump_section rgn[FADUMP_MAX_SECTIONS]; > }; > > +/* > + * The firmware-assisted dump format. > + * > + * The register save area is an area in the partition's memory used to preserve > + * the register contents (CPU state data) for the active CPUs during a firmware > + * assisted dump. The dump format contains register save area header followed > + * by register entries. Each list of registers for a CPU starts with "CPUSTRT" > + * and ends with "CPUEND". > + */ > + > +/* Register save area header. */ > +struct rtas_fadump_reg_save_area_header { > + __be64 magic_number; > + __be32 version; > + __be32 num_cpu_offset; > +}; > + > +/* Register entry. */ > +struct rtas_fadump_reg_entry { > + __be64 reg_id; > + __be64 reg_value; > +}; > + > +/* > + * Copy the ascii values for first 8 characters from a string into u64 > + * variable at their respective indexes. > + * e.g. > + * The string "FADMPINF" will be converted into 0x4641444d50494e46 > + */ > +static inline uint64_t fadump_str_to_u64(const char *str) > +{ > + uint64_t val = 0; > + int i; > + > + for (i = 0; i < sizeof(val); i++) { > + val = (*str) ? (val << 8) | *str++ : val << 8; > + } > + return val; > +} > + > +/** > + * Get the identifier id for register entries of GPRs > + * > + * It gives the same id as 'fadump_str_to_u64' when the complete string id > + * of the GPR is given, ie. > + * > + * fadump_str_to_u64("GPR05") == fadump_gpr_id_to_u64(5); > + * fadump_str_to_u64("GPR12") == fadump_gpr_id_to_u64(12); > + * > + * And so on. Hence this can be implemented by creating a dynamic > + * string for each GPR, such as "GPR00", "GPR01", ... "GPR31" > + * Instead of allocating a string, an observation from the math of > + * 'fadump_str_to_u64' or from PAPR tells us that there's a pattern > + * in the identifier IDs, such that the first 8 bytes are affected only by > + * whether it is GPR0*, GPR1*, GPR2*, GPR3*. 9th byte is always 0x3. And > + * the the 10th byte is the index of the GPR modulo 10. > + */ > +static inline uint64_t fadump_gpr_id_to_u64(uint32_t gpr_id) > +{ > + uint64_t val = 0; > + > + /* Valid range of GPR id is only GPR0 to GPR31 */ > + assert(gpr_id < 32); > + > + if (gpr_id <= 9) { > + val = fadump_str_to_u64("GPR0"); > + } else if (gpr_id <= 19) { > + val = fadump_str_to_u64("GPR1"); > + } else if (gpr_id <= 29) { > + val = fadump_str_to_u64("GPR2"); > + } else { > + val = fadump_str_to_u64("GPR3"); > + } > + > + val |= 0x30000000; > + val |= ((gpr_id % 10) << 12); > + > + return val; > +} > + > struct fadump_metadata { > bool fadump_registered; > bool fadump_dump_active;
On 05/03/25 12:53, Harsh Prateek Bora wrote: > > > On 2/17/25 12:47, Aditya Gupta wrote: >> <...snip...> >> >> + case FADUMP_CPU_STATE_DATA: { >> + struct rtas_fadump_reg_save_area_header reg_save_hdr; >> + struct rtas_fadump_reg_entry **reg_entries; >> + struct rtas_fadump_reg_entry *curr_reg_entry; >> + >> + uint32_t fadump_reg_entries_size; >> + __be32 num_cpus = 0; >> + uint32_t num_regs_per_cpu = 0; >> + CPUState *cpu; >> + CPUPPCState *env; >> + PowerPCCPU *ppc_cpu; >> + >> + CPU_FOREACH(cpu) { >> + ++num_cpus; >> + } >> + >> + reg_save_hdr.version = cpu_to_be32(1); > > PAPR spec mentions version value as 0. Do we need to update ? Yes, will fix, thanks Harsh. > >> + reg_save_hdr.magic_number = >> + cpu_to_be64(fadump_str_to_u64("REGSAVE")); >> + >> + /* Reg save area header is immediately followed by num >> cpus */ >> + reg_save_hdr.num_cpu_offset = >> + cpu_to_be32(sizeof(struct >> rtas_fadump_reg_save_area_header)); >> + > > Above inits could go into a helper > fadump_init_reg_save_header(®_save_hdr); > BTW, the PAPR spec also mentions about padding followed by > num_cpus_offset, see another comment later below. > > >> + fadump_reg_entries_size = num_cpus * >> + FADUMP_NUM_PER_CPU_REGS * >> + sizeof(struct >> rtas_fadump_reg_entry); >> + >> + reg_entries = malloc(fadump_reg_entries_size); > > This was declared as double pointer, but being used as a pointer. Agreed, not needed to keep it as double pointer. My initial plan for this variable was different, that's why i was using double pointer earlier to point to a list of CPU registers, and each CPU registers itself an array. Not needed in current implementation. Will fix it. > >> + curr_reg_entry = (struct rtas_fadump_reg_entry >> *)reg_entries; >> + >> + /* This must loop num_cpus time */ >> + CPU_FOREACH(cpu) { >> + ppc_cpu = POWERPC_CPU(cpu); >> + env = cpu_env(cpu); >> + num_regs_per_cpu = 0; >> + >> + curr_reg_entry->reg_id = >> + cpu_to_be64(fadump_str_to_u64("CPUSTRT")); >> + curr_reg_entry->reg_value = ppc_cpu->vcpu_id; >> + ++curr_reg_entry; >> + >> +#define REG_ENTRY(id, val) \ >> + do { \ >> + curr_reg_entry->reg_id = \ >> + cpu_to_be64(fadump_str_to_u64(#id)); \ >> + curr_reg_entry->reg_value = val; \ >> + ++curr_reg_entry; \ >> + ++num_regs_per_cpu; \ >> + } while (0) >> + >> + REG_ENTRY(ACOP, env->spr[SPR_ACOP]); >> + REG_ENTRY(AMR, env->spr[SPR_AMR]); >> + REG_ENTRY(BESCR, env->spr[SPR_BESCR]); >> + REG_ENTRY(CFAR, env->spr[SPR_CFAR]); >> + REG_ENTRY(CIABR, env->spr[SPR_CIABR]); >> + >> + /* Save the condition register */ >> + uint64_t cr = 0; >> + cr |= (env->crf[0] & 0xf); >> + cr |= (env->crf[1] & 0xf) << 1; >> + cr |= (env->crf[2] & 0xf) << 2; >> + cr |= (env->crf[3] & 0xf) << 3; >> + cr |= (env->crf[4] & 0xf) << 4; >> + cr |= (env->crf[5] & 0xf) << 5; >> + cr |= (env->crf[6] & 0xf) << 6; >> + cr |= (env->crf[7] & 0xf) << 7; >> + REG_ENTRY(CR, cr); > > ppc_get_cr ? Thanks, will use it. > >> + >> + REG_ENTRY(CTR, env->spr[SPR_CTR]); >> + REG_ENTRY(CTRL, env->spr[SPR_CTRL]); >> + REG_ENTRY(DABR, env->spr[SPR_DABR]); >> + REG_ENTRY(DABRX, env->spr[SPR_DABRX]); >> + REG_ENTRY(DAR, env->spr[SPR_DAR]); >> + REG_ENTRY(DAWR0, env->spr[SPR_DAWR0]); >> + REG_ENTRY(DAWR1, env->spr[SPR_DAWR1]); >> + REG_ENTRY(DAWRX0, env->spr[SPR_DAWRX0]); >> + REG_ENTRY(DAWRX1, env->spr[SPR_DAWRX1]); >> + REG_ENTRY(DPDES, env->spr[SPR_DPDES]); >> + REG_ENTRY(DSCR, env->spr[SPR_DSCR]); >> + REG_ENTRY(DSISR, env->spr[SPR_DSISR]); >> + REG_ENTRY(EBBHR, env->spr[SPR_EBBHR]); >> + REG_ENTRY(EBBRR, env->spr[SPR_EBBRR]); >> + >> + REG_ENTRY(FPSCR, env->fpscr); >> + REG_ENTRY(FSCR, env->spr[SPR_FSCR]); >> + >> + /* Save the GPRs */ >> + for (int gpr_id = 0; gpr_id < 32; ++gpr_id) { >> + curr_reg_entry->reg_id = >> + cpu_to_be64(fadump_gpr_id_to_u64(gpr_id)); >> + curr_reg_entry->reg_value = env->gpr[i]; >> + ++curr_reg_entry; >> + ++num_regs_per_cpu; >> + } >> + >> + REG_ENTRY(IAMR, env->spr[SPR_IAMR]); >> + REG_ENTRY(IC, env->spr[SPR_IC]); >> + REG_ENTRY(LR, env->spr[SPR_LR]); >> + >> + REG_ENTRY(MSR, env->msr); >> + REG_ENTRY(NIA, env->nip); /* NIA */ >> + REG_ENTRY(PIR, env->spr[SPR_PIR]); >> + REG_ENTRY(PSPB, env->spr[SPR_PSPB]); >> + REG_ENTRY(PVR, env->spr[SPR_PVR]); >> + REG_ENTRY(RPR, env->spr[SPR_RPR]); >> + REG_ENTRY(SPURR, env->spr[SPR_SPURR]); >> + REG_ENTRY(SRR0, env->spr[SPR_SRR0]); >> + REG_ENTRY(SRR1, env->spr[SPR_SRR1]); >> + REG_ENTRY(TAR, env->spr[SPR_TAR]); >> + REG_ENTRY(TEXASR, env->spr[SPR_TEXASR]); >> + REG_ENTRY(TFHAR, env->spr[SPR_TFHAR]); >> + REG_ENTRY(TFIAR, env->spr[SPR_TFIAR]); >> + REG_ENTRY(TIR, env->spr[SPR_TIR]); >> + REG_ENTRY(UAMOR, env->spr[SPR_UAMOR]); >> + REG_ENTRY(VRSAVE, env->spr[SPR_VRSAVE]); >> + REG_ENTRY(VSCR, env->vscr); >> + REG_ENTRY(VTB, env->spr[SPR_VTB]); >> + REG_ENTRY(WORT, env->spr[SPR_WORT]); >> + REG_ENTRY(XER, env->spr[SPR_XER]); >> + >> + /* >> + * Ignoring transaction checkpoint and few other >> registers >> + * mentioned in PAPR as not supported in QEMU >> + */ >> +#undef REG_ENTRY >> + >> + /* End the registers for this CPU with "CPUEND" reg >> entry */ >> + curr_reg_entry->reg_id = >> + cpu_to_be64(fadump_str_to_u64("CPUEND")); >> + >> + /* Ensure the number of registers match (+2 for STRT >> & END) */ >> + assert(FADUMP_NUM_PER_CPU_REGS == num_regs_per_cpu + >> 2); >> + >> + ++curr_reg_entry; >> + } >> + >> + cpu_state_len = 0; >> + cpu_state_len += sizeof(reg_save_hdr); /* reg save >> header */ >> + cpu_state_len += sizeof(__be32); /* num_cpus */ >> + cpu_state_len += fadump_reg_entries_size; /* reg >> entries */ >> + > > above 4 inits could be a single line init. Yes it could be, but i kept it this way as it looks more readable to me with the comments, what do you think ? > >> + cpu_state_region = &fdm->rgn[i]; >> + cpu_state_addr = dest_addr; >> + cpu_state_buffer = g_malloc(cpu_state_len); >> + >> + uint64_t offset = 0; >> + memcpy(cpu_state_buffer + offset, >> + ®_save_hdr, sizeof(reg_save_hdr)); >> + offset += sizeof(reg_save_hdr); >> + >> + /* Write num_cpus */ >> + num_cpus = cpu_to_be32(num_cpus); >> + memcpy(cpu_state_buffer + offset, &num_cpus, >> sizeof(__be32)); >> + offset += sizeof(__be32); > > As per PAPR spec, NumCpusOffset is followed by a padding of 0xC bytes > initialized to 0. Any reasons for skipping that here ? Missed that padding, didn't notice as kernel also was picking up the correct num cpus in my testing, will fix this, and see how the kernel got the correct value then. > >> + >> + /* Write the register entries */ >> + memcpy(cpu_state_buffer + offset, >> + reg_entries, fadump_reg_entries_size); >> + offset += fadump_reg_entries_size; >> + >> + /* >> + * We will write the cpu state data later, as otherwise it >> + * might get overwritten by other fadump regions >> + */ >> + > > Better to have a separate routine fadump_preserve_cpu_state_data() > when the switch case logic grows this large, applies wherever applicable. Yes, multiple switch cases have huge logic in my patches, will move them to helpers. > >> break; >> + } >> case FADUMP_HPTE_REGION: >> /* TODO: Add hpte state data */ >> break; >> @@ -455,6 +623,34 @@ static bool fadump_preserve_mem(void) >> } >> } >> + /* >> + * Write the Register Save Area >> + * >> + * CPU State/Register Save Area should be written after dumping the >> + * memory to prevent overwritting while saving other memory regions >> + * >> + * eg. If boot memory region is 1G, then both the first 1GB >> memory, and >> + * the Register Save Area needs to be saved at 1GB. >> + * And as the CPU_STATE_DATA region comes first than the >> + * REAL_MODE_REGION region to be copied, the CPU_STATE_DATA will >> get >> + * overwritten if saved before the 0GB - 1GB region is copied after >> + * saving CPU state data >> + */ >> + cpu_physical_memory_write(cpu_state_addr, cpu_state_buffer, >> cpu_state_len); >> + g_free(cpu_state_buffer); >> + >> + /* >> + * Set bytes_dumped in cpu state region, so kernel knows >> platform have >> + * exported it >> + */ >> + cpu_state_region->bytes_dumped = cpu_to_be64(cpu_state_len); >> + >> + if (cpu_state_region->source_len != >> cpu_state_region->bytes_dumped) { >> + qemu_log_mask(LOG_GUEST_ERROR, >> + "CPU State region's length passed by kernel, doesn't >> match" >> + " with CPU State region length exported by QEMU"); > > return error ? Yes, will return PARAM_ERROR here ? Thanks Harsh for the detailed reviews. - Aditya Gupta
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c index 9b29cadab2c9..0aca4270aee8 100644 --- a/hw/ppc/spapr_rtas.c +++ b/hw/ppc/spapr_rtas.c @@ -348,9 +348,12 @@ bool is_next_boot_fadump; static bool fadump_preserve_mem(void) { struct rtas_fadump_mem_struct *fdm = &fadump_metadata.registered_fdm; + struct rtas_fadump_section *cpu_state_region; uint64_t next_section_addr; int dump_num_sections, data_type; uint64_t src_addr, src_len, dest_addr; + uint64_t cpu_state_addr, cpu_state_len = 0; + void *cpu_state_buffer; void *copy_buffer; assert(fadump_metadata.fadump_registered); @@ -413,9 +416,174 @@ static bool fadump_preserve_mem(void) } switch (data_type) { - case FADUMP_CPU_STATE_DATA: - /* TODO: Add CPU state data */ + case FADUMP_CPU_STATE_DATA: { + struct rtas_fadump_reg_save_area_header reg_save_hdr; + struct rtas_fadump_reg_entry **reg_entries; + struct rtas_fadump_reg_entry *curr_reg_entry; + + uint32_t fadump_reg_entries_size; + __be32 num_cpus = 0; + uint32_t num_regs_per_cpu = 0; + CPUState *cpu; + CPUPPCState *env; + PowerPCCPU *ppc_cpu; + + CPU_FOREACH(cpu) { + ++num_cpus; + } + + reg_save_hdr.version = cpu_to_be32(1); + reg_save_hdr.magic_number = + cpu_to_be64(fadump_str_to_u64("REGSAVE")); + + /* Reg save area header is immediately followed by num cpus */ + reg_save_hdr.num_cpu_offset = + cpu_to_be32(sizeof(struct rtas_fadump_reg_save_area_header)); + + fadump_reg_entries_size = num_cpus * + FADUMP_NUM_PER_CPU_REGS * + sizeof(struct rtas_fadump_reg_entry); + + reg_entries = malloc(fadump_reg_entries_size); + curr_reg_entry = (struct rtas_fadump_reg_entry *)reg_entries; + + /* This must loop num_cpus time */ + CPU_FOREACH(cpu) { + ppc_cpu = POWERPC_CPU(cpu); + env = cpu_env(cpu); + num_regs_per_cpu = 0; + + curr_reg_entry->reg_id = + cpu_to_be64(fadump_str_to_u64("CPUSTRT")); + curr_reg_entry->reg_value = ppc_cpu->vcpu_id; + ++curr_reg_entry; + +#define REG_ENTRY(id, val) \ + do { \ + curr_reg_entry->reg_id = \ + cpu_to_be64(fadump_str_to_u64(#id)); \ + curr_reg_entry->reg_value = val; \ + ++curr_reg_entry; \ + ++num_regs_per_cpu; \ + } while (0) + + REG_ENTRY(ACOP, env->spr[SPR_ACOP]); + REG_ENTRY(AMR, env->spr[SPR_AMR]); + REG_ENTRY(BESCR, env->spr[SPR_BESCR]); + REG_ENTRY(CFAR, env->spr[SPR_CFAR]); + REG_ENTRY(CIABR, env->spr[SPR_CIABR]); + + /* Save the condition register */ + uint64_t cr = 0; + cr |= (env->crf[0] & 0xf); + cr |= (env->crf[1] & 0xf) << 1; + cr |= (env->crf[2] & 0xf) << 2; + cr |= (env->crf[3] & 0xf) << 3; + cr |= (env->crf[4] & 0xf) << 4; + cr |= (env->crf[5] & 0xf) << 5; + cr |= (env->crf[6] & 0xf) << 6; + cr |= (env->crf[7] & 0xf) << 7; + REG_ENTRY(CR, cr); + + REG_ENTRY(CTR, env->spr[SPR_CTR]); + REG_ENTRY(CTRL, env->spr[SPR_CTRL]); + REG_ENTRY(DABR, env->spr[SPR_DABR]); + REG_ENTRY(DABRX, env->spr[SPR_DABRX]); + REG_ENTRY(DAR, env->spr[SPR_DAR]); + REG_ENTRY(DAWR0, env->spr[SPR_DAWR0]); + REG_ENTRY(DAWR1, env->spr[SPR_DAWR1]); + REG_ENTRY(DAWRX0, env->spr[SPR_DAWRX0]); + REG_ENTRY(DAWRX1, env->spr[SPR_DAWRX1]); + REG_ENTRY(DPDES, env->spr[SPR_DPDES]); + REG_ENTRY(DSCR, env->spr[SPR_DSCR]); + REG_ENTRY(DSISR, env->spr[SPR_DSISR]); + REG_ENTRY(EBBHR, env->spr[SPR_EBBHR]); + REG_ENTRY(EBBRR, env->spr[SPR_EBBRR]); + + REG_ENTRY(FPSCR, env->fpscr); + REG_ENTRY(FSCR, env->spr[SPR_FSCR]); + + /* Save the GPRs */ + for (int gpr_id = 0; gpr_id < 32; ++gpr_id) { + curr_reg_entry->reg_id = + cpu_to_be64(fadump_gpr_id_to_u64(gpr_id)); + curr_reg_entry->reg_value = env->gpr[i]; + ++curr_reg_entry; + ++num_regs_per_cpu; + } + + REG_ENTRY(IAMR, env->spr[SPR_IAMR]); + REG_ENTRY(IC, env->spr[SPR_IC]); + REG_ENTRY(LR, env->spr[SPR_LR]); + + REG_ENTRY(MSR, env->msr); + REG_ENTRY(NIA, env->nip); /* NIA */ + REG_ENTRY(PIR, env->spr[SPR_PIR]); + REG_ENTRY(PSPB, env->spr[SPR_PSPB]); + REG_ENTRY(PVR, env->spr[SPR_PVR]); + REG_ENTRY(RPR, env->spr[SPR_RPR]); + REG_ENTRY(SPURR, env->spr[SPR_SPURR]); + REG_ENTRY(SRR0, env->spr[SPR_SRR0]); + REG_ENTRY(SRR1, env->spr[SPR_SRR1]); + REG_ENTRY(TAR, env->spr[SPR_TAR]); + REG_ENTRY(TEXASR, env->spr[SPR_TEXASR]); + REG_ENTRY(TFHAR, env->spr[SPR_TFHAR]); + REG_ENTRY(TFIAR, env->spr[SPR_TFIAR]); + REG_ENTRY(TIR, env->spr[SPR_TIR]); + REG_ENTRY(UAMOR, env->spr[SPR_UAMOR]); + REG_ENTRY(VRSAVE, env->spr[SPR_VRSAVE]); + REG_ENTRY(VSCR, env->vscr); + REG_ENTRY(VTB, env->spr[SPR_VTB]); + REG_ENTRY(WORT, env->spr[SPR_WORT]); + REG_ENTRY(XER, env->spr[SPR_XER]); + + /* + * Ignoring transaction checkpoint and few other registers + * mentioned in PAPR as not supported in QEMU + */ +#undef REG_ENTRY + + /* End the registers for this CPU with "CPUEND" reg entry */ + curr_reg_entry->reg_id = + cpu_to_be64(fadump_str_to_u64("CPUEND")); + + /* Ensure the number of registers match (+2 for STRT & END) */ + assert(FADUMP_NUM_PER_CPU_REGS == num_regs_per_cpu + 2); + + ++curr_reg_entry; + } + + cpu_state_len = 0; + cpu_state_len += sizeof(reg_save_hdr); /* reg save header */ + cpu_state_len += sizeof(__be32); /* num_cpus */ + cpu_state_len += fadump_reg_entries_size; /* reg entries */ + + cpu_state_region = &fdm->rgn[i]; + cpu_state_addr = dest_addr; + cpu_state_buffer = g_malloc(cpu_state_len); + + uint64_t offset = 0; + memcpy(cpu_state_buffer + offset, + ®_save_hdr, sizeof(reg_save_hdr)); + offset += sizeof(reg_save_hdr); + + /* Write num_cpus */ + num_cpus = cpu_to_be32(num_cpus); + memcpy(cpu_state_buffer + offset, &num_cpus, sizeof(__be32)); + offset += sizeof(__be32); + + /* Write the register entries */ + memcpy(cpu_state_buffer + offset, + reg_entries, fadump_reg_entries_size); + offset += fadump_reg_entries_size; + + /* + * We will write the cpu state data later, as otherwise it + * might get overwritten by other fadump regions + */ + break; + } case FADUMP_HPTE_REGION: /* TODO: Add hpte state data */ break; @@ -455,6 +623,34 @@ static bool fadump_preserve_mem(void) } } + /* + * Write the Register Save Area + * + * CPU State/Register Save Area should be written after dumping the + * memory to prevent overwritting while saving other memory regions + * + * eg. If boot memory region is 1G, then both the first 1GB memory, and + * the Register Save Area needs to be saved at 1GB. + * And as the CPU_STATE_DATA region comes first than the + * REAL_MODE_REGION region to be copied, the CPU_STATE_DATA will get + * overwritten if saved before the 0GB - 1GB region is copied after + * saving CPU state data + */ + cpu_physical_memory_write(cpu_state_addr, cpu_state_buffer, cpu_state_len); + g_free(cpu_state_buffer); + + /* + * Set bytes_dumped in cpu state region, so kernel knows platform have + * exported it + */ + cpu_state_region->bytes_dumped = cpu_to_be64(cpu_state_len); + + if (cpu_state_region->source_len != cpu_state_region->bytes_dumped) { + qemu_log_mask(LOG_GUEST_ERROR, + "CPU State region's length passed by kernel, doesn't match" + " with CPU State region length exported by QEMU"); + } + return true; } diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index a80704187583..0e8002bad9e0 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -792,6 +792,9 @@ void push_sregs_to_kvm_pr(SpaprMachineState *spapr); #define FADUMP_HPTE_REGION 0x0002 #define FADUMP_REAL_MODE_REGION 0x0011 +/* Number of registers stored per cpu */ +#define FADUMP_NUM_PER_CPU_REGS (32 /*GPR*/ + 45 /*others*/ + 2 /*STRT & END*/) + /* OS defined sections */ #define FADUMP_PARAM_AREA 0x0100 @@ -845,6 +848,86 @@ struct rtas_fadump_mem_struct { struct rtas_fadump_section rgn[FADUMP_MAX_SECTIONS]; }; +/* + * The firmware-assisted dump format. + * + * The register save area is an area in the partition's memory used to preserve + * the register contents (CPU state data) for the active CPUs during a firmware + * assisted dump. The dump format contains register save area header followed + * by register entries. Each list of registers for a CPU starts with "CPUSTRT" + * and ends with "CPUEND". + */ + +/* Register save area header. */ +struct rtas_fadump_reg_save_area_header { + __be64 magic_number; + __be32 version; + __be32 num_cpu_offset; +}; + +/* Register entry. */ +struct rtas_fadump_reg_entry { + __be64 reg_id; + __be64 reg_value; +}; + +/* + * Copy the ascii values for first 8 characters from a string into u64 + * variable at their respective indexes. + * e.g. + * The string "FADMPINF" will be converted into 0x4641444d50494e46 + */ +static inline uint64_t fadump_str_to_u64(const char *str) +{ + uint64_t val = 0; + int i; + + for (i = 0; i < sizeof(val); i++) { + val = (*str) ? (val << 8) | *str++ : val << 8; + } + return val; +} + +/** + * Get the identifier id for register entries of GPRs + * + * It gives the same id as 'fadump_str_to_u64' when the complete string id + * of the GPR is given, ie. + * + * fadump_str_to_u64("GPR05") == fadump_gpr_id_to_u64(5); + * fadump_str_to_u64("GPR12") == fadump_gpr_id_to_u64(12); + * + * And so on. Hence this can be implemented by creating a dynamic + * string for each GPR, such as "GPR00", "GPR01", ... "GPR31" + * Instead of allocating a string, an observation from the math of + * 'fadump_str_to_u64' or from PAPR tells us that there's a pattern + * in the identifier IDs, such that the first 8 bytes are affected only by + * whether it is GPR0*, GPR1*, GPR2*, GPR3*. 9th byte is always 0x3. And + * the the 10th byte is the index of the GPR modulo 10. + */ +static inline uint64_t fadump_gpr_id_to_u64(uint32_t gpr_id) +{ + uint64_t val = 0; + + /* Valid range of GPR id is only GPR0 to GPR31 */ + assert(gpr_id < 32); + + if (gpr_id <= 9) { + val = fadump_str_to_u64("GPR0"); + } else if (gpr_id <= 19) { + val = fadump_str_to_u64("GPR1"); + } else if (gpr_id <= 29) { + val = fadump_str_to_u64("GPR2"); + } else { + val = fadump_str_to_u64("GPR3"); + } + + val |= 0x30000000; + val |= ((gpr_id % 10) << 12); + + return val; +} + struct fadump_metadata { bool fadump_registered; bool fadump_dump_active;
Kernel expects CPU states/register states in the format mentioned in "Register Save Area" in PAPR. The platform (in our case, QEMU) saves each CPU register in the form of an array of "register entries", the start and end of this array is signified by "CPUSTRT" and "CPUEND" register entries respectively. The CPUSTRT and CPUEND register entry also has 4-byte logical CPU ID, thus storing the CPU ID corresponding to the array of register entries. Each register, and CPUSTRT, CPUEND has a predefined identifier. Implement calculating identifier for a given register in 'fadump_str_to_u64', which has been taken from the linux kernel Similarly GPRs also have predefined identifiers, and a corresponding 64-bit resiter value (split into two 32-bit cells). Implement calculation of GPR identifiers with 'fadump_gpr_id_to_u64' PAPR has restrictions on particular order of few registers, and is free to be in any order for other registers. Some registers mentioned in PAPR have not been exported as they are not implemented in QEMU / don't make sense in QEMU. Implement saving of CPU state according to the PAPR document Signed-off-by: Aditya Gupta <adityag@linux.ibm.com> --- hw/ppc/spapr_rtas.c | 200 ++++++++++++++++++++++++++++++++++++++++- include/hw/ppc/spapr.h | 83 +++++++++++++++++ 2 files changed, 281 insertions(+), 2 deletions(-)