Message ID | 20250217071934.86131-6-adityag@linux.ibm.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Implement MPIPL for PowerNV | expand |
On 2/17/25 12:49, Aditya Gupta wrote: > When MPIPL is used, OPAL/Linux registers memory regions to be preserved > on a Memory-Preserving boot ('crashkernel boot'). > > The regions are added to two tables: MDST and MDDT (source and > destination tables) > > The MDST contains the start address of the region, and size of region > > The MDDT contains the destination address where the region should be > copied (and size of region which will be same as in MDST entry) > > Then after a crash, when hostboot (pnv_sbe.c in case of QEMU) > preserves the memory region, it adds the details of preserved regions to > MDRT (results table) > > Copy memory regions mentioned in MDST to addresses mentioned in MDDT. > And accordingly update the copied region details in MDRT table. > > Note: If we did not preserve the regions, and MDRT is empty then OPAL > simply logs "OPAL dump is not available", while kernel will assume that > firmware would have preserved the regions, and export /proc/vmcore, but > the vmcore won't have most basic kernel structures hence crash will be > unable to analyse the vmcore > > Signed-off-by: Aditya Gupta <adityag@linux.ibm.com> > --- > hw/ppc/pnv_sbe.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 57 insertions(+) > > diff --git a/hw/ppc/pnv_sbe.c b/hw/ppc/pnv_sbe.c > index 361a3854307d..ee905df4e0a6 100644 > --- a/hw/ppc/pnv_sbe.c > +++ b/hw/ppc/pnv_sbe.c > @@ -227,6 +227,60 @@ static uint64_t pnv_sbe_power9_xscom_ctrl_read(void *opaque, hwaddr addr, > return val; > } > > +static void pnv_mpipl_preserve_mem(void) > +{ > + /* Get access to metadata */ > + struct mpipl_metadata *metadata = malloc(DUMP_METADATA_AREA_SIZE); > + struct mdst_table *mdst = malloc(MDST_TABLE_SIZE); > + struct mddt_table *mddt = malloc(MDDT_TABLE_SIZE); > + struct mdrt_table *mdrt = malloc(MDRT_TABLE_SIZE); Where are these getting free()ed? Mem leak ? 
> + __be64 source_addr, dest_addr, bytes_to_copy; > + uint8_t *copy_buffer; > + > + cpu_physical_memory_read(DUMP_METADATA_AREA_BASE, metadata, DUMP_METADATA_AREA_SIZE); > + cpu_physical_memory_read(MDST_TABLE_BASE, mdst, MDST_TABLE_SIZE); > + cpu_physical_memory_read(MDDT_TABLE_BASE, mddt, MDDT_TABLE_SIZE); > + > + /* HRMOR_BIT copied from skiboot */ > + #define HRMOR_BIT (1ul << 63) This could be moved to the pnv_sbe.h file. > + > + for (int i = 0;; ++i) { > + /* NOTE: Assuming uninitialised will be all zeroes */ > + if ((mdst[i].addr == 0) && (mdst[i].size == 0)) { > + break; > + } What if there is no uninitialized entry before the end of the array? That would be an out-of-bounds access, since the loop has no exit condition. > + > + if (mdst[i].size != mddt[i].size) { > + qemu_log_mask(LOG_TRACE, > + "Warning: Invalid entry, size mismatch in MDST & MDDT\n"); > + continue; > + } > + > + if (mdst[i].data_region != mddt[i].data_region) { > + qemu_log_mask(LOG_TRACE, > + "Warning: Invalid entry, region mismatch in MDST & MDDT\n"); > + continue; > + } > + > + mdrt[i].src_addr = mdst[i].addr; > + mdrt[i].dest_addr = mddt[i].addr; > + mdrt[i].size = mdst[i].size; > + mdrt[i].data_region = mdst[i].data_region; > + > + source_addr = cpu_to_be64(mdst[i].addr) & ~HRMOR_BIT; > + dest_addr = cpu_to_be64(mddt[i].addr) & ~HRMOR_BIT; > + bytes_to_copy = cpu_to_be32(mddt[i].size); > + > + /* XXX: Am i assuming we are in big endian mode ? */ If the patches assume they will only work with BE, the LE case should be handled gracefully. 
Thanks Harsh > + copy_buffer = malloc(bytes_to_copy); > + cpu_physical_memory_read(source_addr, copy_buffer, bytes_to_copy); > + cpu_physical_memory_write(dest_addr, copy_buffer, bytes_to_copy); > + free(copy_buffer); > + } > + > + cpu_physical_memory_write(MDRT_TABLE_BASE, mdrt, MDRT_TABLE_SIZE); > +} > + > static void pnv_sbe_power9_xscom_ctrl_write(void *opaque, hwaddr addr, > uint64_t val, unsigned size) > { > @@ -250,6 +304,9 @@ static void pnv_sbe_power9_xscom_ctrl_write(void *opaque, hwaddr addr, > */ > pause_all_vcpus(); > > + /* Preserve the memory locations registered for MPIPL */ > + pnv_mpipl_preserve_mem(); > + > /* > * TODO: Pass `mpipl` node in device tree to signify next > * boot is an MPIPL boot
diff --git a/hw/ppc/pnv_sbe.c b/hw/ppc/pnv_sbe.c index 361a3854307d..ee905df4e0a6 100644 --- a/hw/ppc/pnv_sbe.c +++ b/hw/ppc/pnv_sbe.c @@ -227,6 +227,60 @@ static uint64_t pnv_sbe_power9_xscom_ctrl_read(void *opaque, hwaddr addr, return val; } +static void pnv_mpipl_preserve_mem(void) +{ + /* Get access to metadata */ + struct mpipl_metadata *metadata = malloc(DUMP_METADATA_AREA_SIZE); + struct mdst_table *mdst = malloc(MDST_TABLE_SIZE); + struct mddt_table *mddt = malloc(MDDT_TABLE_SIZE); + struct mdrt_table *mdrt = malloc(MDRT_TABLE_SIZE); + __be64 source_addr, dest_addr, bytes_to_copy; + uint8_t *copy_buffer; + + cpu_physical_memory_read(DUMP_METADATA_AREA_BASE, metadata, DUMP_METADATA_AREA_SIZE); + cpu_physical_memory_read(MDST_TABLE_BASE, mdst, MDST_TABLE_SIZE); + cpu_physical_memory_read(MDDT_TABLE_BASE, mddt, MDDT_TABLE_SIZE); + + /* HRMOR_BIT copied from skiboot */ + #define HRMOR_BIT (1ul << 63) + + for (int i = 0;; ++i) { + /* NOTE: Assuming uninitialised will be all zeroes */ + if ((mdst[i].addr == 0) && (mdst[i].size == 0)) { + break; + } + + if (mdst[i].size != mddt[i].size) { + qemu_log_mask(LOG_TRACE, + "Warning: Invalid entry, size mismatch in MDST & MDDT\n"); + continue; + } + + if (mdst[i].data_region != mddt[i].data_region) { + qemu_log_mask(LOG_TRACE, + "Warning: Invalid entry, region mismatch in MDST & MDDT\n"); + continue; + } + + mdrt[i].src_addr = mdst[i].addr; + mdrt[i].dest_addr = mddt[i].addr; + mdrt[i].size = mdst[i].size; + mdrt[i].data_region = mdst[i].data_region; + + source_addr = cpu_to_be64(mdst[i].addr) & ~HRMOR_BIT; + dest_addr = cpu_to_be64(mddt[i].addr) & ~HRMOR_BIT; + bytes_to_copy = cpu_to_be32(mddt[i].size); + + /* XXX: Am i assuming we are in big endian mode ? 
*/ + copy_buffer = malloc(bytes_to_copy); + cpu_physical_memory_read(source_addr, copy_buffer, bytes_to_copy); + cpu_physical_memory_write(dest_addr, copy_buffer, bytes_to_copy); + free(copy_buffer); + } + + cpu_physical_memory_write(MDRT_TABLE_BASE, mdrt, MDRT_TABLE_SIZE); +} + static void pnv_sbe_power9_xscom_ctrl_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) { @@ -250,6 +304,9 @@ static void pnv_sbe_power9_xscom_ctrl_write(void *opaque, hwaddr addr, */ pause_all_vcpus(); + /* Preserve the memory locations registered for MPIPL */ + pnv_mpipl_preserve_mem(); + /* * TODO: Pass `mpipl` node in device tree to signify next * boot is an MPIPL boot
When MPIPL is used, OPAL/Linux registers memory regions to be preserved on a Memory-Preserving boot ('crashkernel boot'). The regions are added to two tables: MDST and MDDT (the source and destination tables). The MDST contains the start address and the size of each region. The MDDT contains the destination address to which the region should be copied (and the size of the region, which will be the same as in the MDST entry). Then, after a crash, when hostboot (pnv_sbe.c in the case of QEMU) preserves the memory regions, it adds the details of the preserved regions to the MDRT (results table). Copy the memory regions mentioned in the MDST to the addresses mentioned in the MDDT, and accordingly update the copied region details in the MDRT table. Note: If we did not preserve the regions and the MDRT is empty, then OPAL simply logs "OPAL dump is not available", while the kernel will assume that firmware has preserved the regions and export /proc/vmcore; but the vmcore won't have even the most basic kernel structures, hence crash will be unable to analyse the vmcore. Signed-off-by: Aditya Gupta <adityag@linux.ibm.com> --- hw/ppc/pnv_sbe.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+)