diff mbox series

[v5,18/18] s390x: pv: Add dump support

Message ID 20220811121111.9878-19-frankja@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series dump: Add arch section and s390x PV dump | expand

Commit Message

Janosch Frank Aug. 11, 2022, 12:11 p.m. UTC
Sometimes dumping a guest from the outside is the only way to get the
data that is needed. This can be the case if a dumping mechanism like
KDUMP hasn't been configured or data needs to be fetched at a specific
point. Dumping a protected guest from the outside without help from
fw/hw doesn't yield sufficient data to be useful. Hence we now
introduce PV dump support.

The PV dump support works by integrating the firmware into the dump
process. New Ultravisor calls are used to initiate the dump process,
dump cpu data, dump memory state and lastly complete the dump process.
The UV calls are exposed by KVM via the new KVM_PV_DUMP command and
its subcommands. The guest's data is fully encrypted and can only be
decrypted by the entity that owns the customer communication key for
the dumped guest. Also dumping needs to be allowed via a flag in the
SE header.

On the QEMU side of things we store the PV dump data in the newly
introduced architecture ELF sections (storage state and completion
data) and the cpu notes (for cpu dump data).

Users can use the zgetdump tool to convert the encrypted QEMU dump to an
unencrypted one.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 dump/dump.c              |  12 +-
 include/sysemu/dump.h    |   5 +
 target/s390x/arch_dump.c | 242 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 227 insertions(+), 32 deletions(-)

Comments

Janosch Frank Aug. 11, 2022, 1:03 p.m. UTC | #1
On 8/11/22 14:11, Janosch Frank wrote:
> Sometimes dumping a guest from the outside is the only way to get the
> data that is needed. This can be the case if a dumping mechanism like
> KDUMP hasn't been configured or data needs to be fetched at a specific
> point. Dumping a protected guest from the outside without help from
> fw/hw doesn't yield sufficient data to be useful. Hence we now
> introduce PV dump support.
> 
> The PV dump support works by integrating the firmware into the dump
> process. New Ultravisor calls are used to initiate the dump process,
> dump cpu data, dump memory state and lastly complete the dump process.
> The UV calls are exposed by KVM via the new KVM_PV_DUMP command and
> its subcommands. The guest's data is fully encrypted and can only be
> decrypted by the entity that owns the customer communication key for
> the dumped guest. Also dumping needs to be allowed via a flag in the
> SE header.
> 
> On the QEMU side of things we store the PV dump data in the newly
> introduced architecture ELF sections (storage state and completion
> data) and the cpu notes (for cpu dump data).
> 
> Users can use the zgetdump tool to convert the encrypted QEMU dump to an
> unencrypted one.
> 
> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>


Seems like I forgot to amend this commit with the naming changes before 
sending:

diff --git i/target/s390x/arch_dump.c w/target/s390x/arch_dump.c
index 5e8e03d536..233f23c071 100644
--- i/target/s390x/arch_dump.c
+++ w/target/s390x/arch_dump.c
@@ -286,14 +286,14 @@ int 
s390_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
  }

  /* PV dump section size functions */
-static uint64_t get_dump_stor_state_size_from_len(uint64_t len)
+static uint64_t get_stor_state_size_from_len(uint64_t len)
  {
      return (len / (1 << 20)) * kvm_s390_pv_dmp_get_size_stor_state();
  }

  static uint64_t get_size_stor_state(DumpState *s)
  {
-    return get_dump_stor_state_size_from_len(s->total_size);
+    return get_stor_state_size_from_len(s->total_size);
  }

  static uint64_t get_size_complete(DumpState *s)
@@ -316,7 +316,8 @@ static int get_data_complete(DumpState *s, uint8_t 
*buff)
      return rc;
  }

-static int dump_mem(DumpState *s, uint64_t gaddr, uint8_t *buff, 
uint64_t buff_len)
+static int get_stor_state_block(DumpState *s, uint64_t gaddr, uint8_t 
*buff,
+                                uint64_t buff_len)
  {
      /* We need the gaddr + len and something to write to */
      if (!pv_dump_initialized) {
@@ -325,7 +326,7 @@ static int dump_mem(DumpState *s, uint64_t gaddr, 
uint8_t *buff, uint64_t buff_l
      return kvm_s390_dump_mem(gaddr, buff_len, buff);
  }

-static int get_data_mem(DumpState *s, uint8_t *buff)
+static int get_store_state(DumpState *s, uint8_t *buff)
  {
      int64_t memblock_size, memblock_start;
      GuestPhysBlock *block;
@@ -341,9 +342,9 @@ static int get_data_mem(DumpState *s, uint8_t *buff)
          memblock_size = dump_filtered_memblock_size(block, 
s->filter_area_begin,
 
s->filter_area_length);

-        off = get_dump_stor_state_size_from_len(block->target_start);
-        dump_mem(s, block->target_start, buff + off,
-                 get_dump_stor_state_size_from_len(memblock_size));
+        off = get_stor_state_size_from_len(block->target_start);
+        get_stor_state_block(s, block->target_start, buff + off,
+                             get_stor_state_size_from_len(memblock_size));
      }

      return 0;
@@ -354,7 +355,7 @@ struct sections {
      int (*sections_contents_func)(DumpState *s, uint8_t *buff);
      char sctn_str[12];
  } sections[] = {
-    { get_size_stor_state, get_data_mem, "pv_mem_meta"},
+    { get_size_stor_state, get_store_state, "pv_mem_meta"},
      { get_size_complete, get_data_complete, "pv_compl"},
      {NULL , NULL, ""}
  };
Steffen Eiden Aug. 23, 2022, 3:26 p.m. UTC | #2
On 8/11/22 14:11, Janosch Frank wrote:
> Sometimes dumping a guest from the outside is the only way to get the
> data that is needed. This can be the case if a dumping mechanism like
> KDUMP hasn't been configured or data needs to be fetched at a specific
> point. Dumping a protected guest from the outside without help from
> fw/hw doesn't yield sufficient data to be useful. Hence we now
> introduce PV dump support.
> 
> The PV dump support works by integrating the firmware into the dump
> process. New Ultravisor calls are used to initiate the dump process,
> dump cpu data, dump memory state and lastly complete the dump process.
> The UV calls are exposed by KVM via the new KVM_PV_DUMP command and
> its subcommands. The guest's data is fully encrypted and can only be
> decrypted by the entity that owns the customer communication key for
> the dumped guest. Also dumping needs to be allowed via a flag in the
> SE header.
> 
> On the QEMU side of things we store the PV dump data in the newly
> introduced architecture ELF sections (storage state and completion
> data) and the cpu notes (for cpu dump data).
> 
> Users can use the zgetdump tool to convert the encrypted QEMU dump to an
> unencrypted one.
> 
> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
> ---
>   dump/dump.c              |  12 +-
>   include/sysemu/dump.h    |   5 +
>   target/s390x/arch_dump.c | 242 ++++++++++++++++++++++++++++++++++-----
>   3 files changed, 227 insertions(+), 32 deletions(-)
> 
> diff --git a/dump/dump.c b/dump/dump.c
> index 65b18fc602..7cf5eb7c8b 100644
> --- a/dump/dump.c
> +++ b/dump/dump.c
> @@ -720,9 +720,9 @@ static void dump_begin(DumpState *s, Error **errp)
>       write_elf_notes(s, errp);
>   }
>   
> -static int64_t dump_filtered_memblock_size(GuestPhysBlock *block,
> -                                           int64_t filter_area_start,
> -                                           int64_t filter_area_length)
> +int64_t dump_filtered_memblock_size(GuestPhysBlock *block,
> +                                    int64_t filter_area_start,
> +                                    int64_t filter_area_length)
>   {
>       int64_t size, left, right;
>   
> @@ -740,9 +740,9 @@ static int64_t dump_filtered_memblock_size(GuestPhysBlock *block,
>       return size;
>   }
>   
> -static int64_t dump_filtered_memblock_start(GuestPhysBlock *block,
> -                                            int64_t filter_area_start,
> -                                            int64_t filter_area_length)
> +int64_t dump_filtered_memblock_start(GuestPhysBlock *block,
> +                                     int64_t filter_area_start,
> +                                     int64_t filter_area_length)
>   {
>       if (filter_area_length) {
>           /* return -1 if the block is not within filter area */
> diff --git a/include/sysemu/dump.h b/include/sysemu/dump.h
> index 358b038d47..245c26fbca 100644
> --- a/include/sysemu/dump.h
> +++ b/include/sysemu/dump.h
> @@ -216,4 +216,9 @@ typedef struct DumpState {
>   uint16_t cpu_to_dump16(DumpState *s, uint16_t val);
>   uint32_t cpu_to_dump32(DumpState *s, uint32_t val);
>   uint64_t cpu_to_dump64(DumpState *s, uint64_t val);
> +
> +int64_t dump_filtered_memblock_size(GuestPhysBlock *block, int64_t filter_area_start,
> +                                    int64_t filter_area_length);
> +int64_t dump_filtered_memblock_start(GuestPhysBlock *block, int64_t filter_area_start,
> +                                     int64_t filter_area_length);
>   #endif
> diff --git a/target/s390x/arch_dump.c b/target/s390x/arch_dump.c
> index f60a14920d..5e8e03d536 100644
> --- a/target/s390x/arch_dump.c
> +++ b/target/s390x/arch_dump.c
> @@ -16,7 +16,8 @@
>   #include "s390x-internal.h"
>   #include "elf.h"
>   #include "sysemu/dump.h"
> -
> +#include "hw/s390x/pv.h"
> +#include "kvm/kvm_s390x.h"
>   
>   struct S390xUserRegsStruct {
>       uint64_t psw[2];
> @@ -76,9 +77,16 @@ typedef struct noteStruct {
>           uint64_t todcmp;
>           uint32_t todpreg;
>           uint64_t ctrs[16];
> +        uint8_t dynamic[1];  /*
> +                              * Would be a flexible array member, if
> +                              * that was legal inside a union. Real
> +                              * size comes from PV info interface.
> +                              */
>       } contents;
>   } QEMU_PACKED Note;
>   
> +static bool pv_dump_initialized;
> +
>   static void s390x_write_elf64_prstatus(Note *note, S390CPU *cpu, int id)
>   {
>       int i;
> @@ -177,28 +185,39 @@ static void s390x_write_elf64_prefix(Note *note, S390CPU *cpu, int id)
>       note->contents.prefix = cpu_to_be32((uint32_t)(cpu->env.psa));
>   }
>   
> +static void s390x_write_elf64_pv(Note *note, S390CPU *cpu, int id)
> +{
> +    note->hdr.n_type = cpu_to_be32(NT_S390_PV_CPU_DATA);
> +    if (!pv_dump_initialized) {
> +        return;
> +    }
> +    kvm_s390_dump_cpu(cpu, &note->contents.dynamic);
> +}
>   
>   typedef struct NoteFuncDescStruct {
>       int contents_size;
> +    uint64_t (*note_size_func)(void); /* NULL for non-dynamic sized contents */
>       void (*note_contents_func)(Note *note, S390CPU *cpu, int id);
> +    bool pvonly;
>   } NoteFuncDesc;
>   
>   static const NoteFuncDesc note_core[] = {
> -    {sizeof_field(Note, contents.prstatus), s390x_write_elf64_prstatus},
> -    {sizeof_field(Note, contents.fpregset), s390x_write_elf64_fpregset},
> -    { 0, NULL}
> +    {sizeof_field(Note, contents.prstatus), NULL, s390x_write_elf64_prstatus, false},
> +    {sizeof_field(Note, contents.fpregset), NULL, s390x_write_elf64_fpregset, false}, > +    { 0, NULL, NULL}
You are missing a false here.
>   };
>   
>   static const NoteFuncDesc note_linux[] = {
> -    {sizeof_field(Note, contents.prefix),   s390x_write_elf64_prefix},
> -    {sizeof_field(Note, contents.ctrs),     s390x_write_elf64_ctrs},
> -    {sizeof_field(Note, contents.timer),    s390x_write_elf64_timer},
> -    {sizeof_field(Note, contents.todcmp),   s390x_write_elf64_todcmp},
> -    {sizeof_field(Note, contents.todpreg),  s390x_write_elf64_todpreg},
> -    {sizeof_field(Note, contents.vregslo),  s390x_write_elf64_vregslo},
> -    {sizeof_field(Note, contents.vregshi),  s390x_write_elf64_vregshi},
> -    {sizeof_field(Note, contents.gscb),     s390x_write_elf64_gscb},
> -    { 0, NULL}
> +    {sizeof_field(Note, contents.prefix),   NULL, s390x_write_elf64_prefix,  false},
> +    {sizeof_field(Note, contents.ctrs),     NULL, s390x_write_elf64_ctrs,    false},
> +    {sizeof_field(Note, contents.timer),    NULL, s390x_write_elf64_timer,   false},
> +    {sizeof_field(Note, contents.todcmp),   NULL, s390x_write_elf64_todcmp,  false},
> +    {sizeof_field(Note, contents.todpreg),  NULL, s390x_write_elf64_todpreg, false},
> +    {sizeof_field(Note, contents.vregslo),  NULL, s390x_write_elf64_vregslo, false},
> +    {sizeof_field(Note, contents.vregshi),  NULL, s390x_write_elf64_vregshi, false},
> +    {sizeof_field(Note, contents.gscb),     NULL, s390x_write_elf64_gscb,    false},
> +    {0, kvm_s390_pv_dmp_get_size_cpu,       s390x_write_elf64_pv, true} > +    { 0, NULL, NULL}
and here.
>   };
>   
>   static int s390x_write_elf64_notes(const char *note_name,
> @@ -207,22 +226,41 @@ static int s390x_write_elf64_notes(const char *note_name,
>                                          DumpState *s,
>                                          const NoteFuncDesc *funcs)
>   {
> -    Note note;
> +    Note note, *notep;
>       const NoteFuncDesc *nf;
> -    int note_size;
> +    int note_size, content_size;
>       int ret = -1;
>   
>       assert(strlen(note_name) < sizeof(note.name));
>   
>       for (nf = funcs; nf->note_contents_func; nf++) {
> -        memset(&note, 0, sizeof(note));
> -        note.hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
> -        note.hdr.n_descsz = cpu_to_be32(nf->contents_size);
> -        g_strlcpy(note.name, note_name, sizeof(note.name));
> -        (*nf->note_contents_func)(&note, cpu, id);
> +        notep = &note;
> +        if (nf->pvonly && !s390_is_pv()) {
> +            continue;
> +        }
>   
> -        note_size = sizeof(note) - sizeof(note.contents) + nf->contents_size;
> -        ret = f(&note, note_size, s);
> +        content_size = nf->contents_size ? nf->contents_size : nf->note_size_func();
> +        note_size = sizeof(note) - sizeof(notep->contents) + content_size;
> +
> +        /* Notes with dynamic sizes need to allocate a note */
> +        if (nf->note_size_func) {
> +            notep = g_malloc0(note_size);
> +        }
> +
> +        memset(notep, 0, sizeof(note));
> +
> +        /* Setup note header data */
> +        notep->hdr.n_descsz = cpu_to_be32(content_size);
> +        notep->hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
> +        g_strlcpy(notep->name, note_name, sizeof(notep->name));
> +
> +        /* Get contents and write them out */
> +        (*nf->note_contents_func)(notep, cpu, id);
> +        ret = f(notep, note_size, s);
> +
> +        if (nf->note_size_func) {
> +            g_free(notep);
> +        }
>   
>           if (ret < 0) {
>               return -1;
> @@ -247,12 +285,161 @@ int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
>       return s390x_write_elf64_notes("LINUX", f, cpu, cpuid, s, note_linux);
>   }
>   
> +/* PV dump section size functions */
> +static uint64_t get_dump_stor_state_size_from_len(uint64_t len)
> +{
> +    return (len / (1 << 20)) * kvm_s390_pv_dmp_get_size_stor_state();
> +}
> +
> +static uint64_t get_size_stor_state(DumpState *s)
> +{
> +    return get_dump_stor_state_size_from_len(s->total_size);
> +}
> +
> +static uint64_t get_size_complete(DumpState *s)
> +{
> +    return kvm_s390_pv_dmp_get_size_complete();
> +}
> +
> +/* PV dump section data functions*/
> +static int get_data_complete(DumpState *s, uint8_t *buff)
> +{
> +    int rc;
> +
> +    if (!pv_dump_initialized) {
> +        return 0;
> +    }
> +    rc = kvm_s390_dump_complete(buff);
> +    if (!rc) {
> +            pv_dump_initialized = false;
> +    }
> +    return rc;
> +}
> +
Similar to my comments in the last patch:
dump_mem_state
> +static int dump_mem(DumpState *s, uint64_t gaddr, uint8_t *buff, uint64_t buff_len)
> +{
> +    /* We need the gaddr + len and something to write to */
> +    if (!pv_dump_initialized) {
> +        return 0;
> +    }
> +    return kvm_s390_dump_mem(gaddr, buff_len, buff);
> +}
> +
get_data_mem_state
> +static int get_data_mem(DumpState *s, uint8_t *buff)
> +{
> +    int64_t memblock_size, memblock_start;
> +    GuestPhysBlock *block;
> +    uint64_t off;
> +
> +    QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) {
> +        memblock_start = dump_filtered_memblock_start(block, s->filter_area_begin,
> +                                                      s->filter_area_length);
> +        if (memblock_start == -1) {
> +            continue;
> +        }
> +
> +        memblock_size = dump_filtered_memblock_size(block, s->filter_area_begin,
> +                                                    s->filter_area_length);
> +
> +        off = get_dump_stor_state_size_from_len(block->target_start);
> +        dump_mem(s, block->target_start, buff + off,
> +                 get_dump_stor_state_size_from_len(memblock_size));
> +    }
> +
> +    return 0;
> +}
> +
> +struct sections {
> +    uint64_t (*sections_size_func)(DumpState *s);
> +    int (*sections_contents_func)(DumpState *s, uint8_t *buff);
> +    char sctn_str[12];
> +} sections[] = {
> +    { get_size_stor_state, get_data_mem, "pv_mem_meta"},
> +    { get_size_complete, get_data_complete, "pv_compl"},
> +    {NULL , NULL, ""}
> +};
> +
> +static uint64_t arch_sections_write_hdr(DumpState *s, uint8_t *buff)
> +{
> +    Elf64_Shdr *shdr = (void *)buff;
> +    struct sections *sctn = sections;
> +    uint64_t off = s->section_offset;
> +
> +    if (!s390_is_pv()) {
> +        return 0;
> +    }
> +
> +    for (; sctn->sections_size_func; off += shdr->sh_size, sctn++, shdr++) {
> +        memset(shdr, 0, sizeof(*shdr));
> +        shdr->sh_type = SHT_PROGBITS;
> +        shdr->sh_offset = off;
> +        shdr->sh_size = sctn->sections_size_func(s);
> +        shdr->sh_name = s->string_table_buf->len;
> +        g_array_append_vals(s->string_table_buf, sctn->sctn_str, sizeof(sctn->sctn_str));
> +    }
> +
> +    return (uintptr_t)shdr - (uintptr_t)buff;
> +}
> +
> +
> +/* Add arch specific number of sections and their respective sizes */
> +static void arch_sections_add(DumpState *s)
> +{
> +    struct sections *sctn = sections;
> +
> +    /*
> +     * We only do a PV dump if we are running a PV guest, KVM supports
> +     * the dump API and we got valid dump length information.
> +     */
> +    if (!s390_is_pv() || !kvm_s390_get_protected_dump() ||
> +        !kvm_s390_pv_info_basic_valid()) {
> +        return;
> +    }
> +
> +    /*
> +     * Start the UV dump process by doing the initialize dump call via
> +     * KVM as the proxy.
> +     */
> +    if (!kvm_s390_dump_init()) {
> +            pv_dump_initialized = true;
> +    }
> +
> +    for (; sctn->sections_size_func; sctn++) {
> +        s->shdr_num += 1;
> +        s->elf_section_data_size += sctn->sections_size_func(s);
> +    }
> +
> +    /* We use the string table to identify the sections */
> +    s->string_table_usage = true;
> +}
> +
> +/*
> + * After the PV dump has been initialized, the CPU data has been
> + * fetched and memory has been dumped, we need to grab the tweak data
> + * and the completion data.
> + */
> +static void arch_sections_write(DumpState *s, uint8_t *buff)
> +{
> +    struct sections *sctn = sections;
> +
> +    /* shdr_num should only have been set > 1 if we are protected */
> +    assert(s390_is_pv());
> +
> +    for (; sctn->sections_size_func; sctn++) {
> +        sctn->sections_contents_func(s, buff);
> +        buff += sctn->sections_size_func(s);
> +    }
> +}
> +
>   int cpu_get_dump_info(ArchDumpInfo *info,
>                         const struct GuestPhysBlockList *guest_phys_blocks)
>   {
>       info->d_machine = EM_S390;
>       info->d_endian = ELFDATA2MSB;
>       info->d_class = ELFCLASS64;
> +    info->arch_sections_add_fn = *arch_sections_add;
> +    info->arch_sections_write_hdr_fn = *arch_sections_write_hdr;
> +    info->arch_sections_write_fn = *arch_sections_write;
>   
>       return 0;
>   }
> @@ -261,7 +448,7 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
>   {
>       int name_size = 8; /* "LINUX" or "CORE" + pad */
>       size_t elf_note_size = 0;
> -    int note_head_size;
> +    int note_head_size, content_size;
>       const NoteFuncDesc *nf;
>   
>       assert(class == ELFCLASS64);
> @@ -270,12 +457,15 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
>       note_head_size = sizeof(Elf64_Nhdr);
>   
>       for (nf = note_core; nf->note_contents_func; nf++) {
> -        elf_note_size = elf_note_size + note_head_size + name_size +
> -                        nf->contents_size;
> +        elf_note_size = elf_note_size + note_head_size + name_size + nf->contents_size;
>       }
>       for (nf = note_linux; nf->note_contents_func; nf++) {
> +        if (nf->pvonly && !s390_is_pv()) {
> +            continue;
> +        }
> +        content_size = nf->contents_size ? nf->contents_size : nf->note_size_func();
>           elf_note_size = elf_note_size + note_head_size + name_size +
> -                        nf->contents_size;
> +                        content_size;
>       }
>   
>       return (elf_note_size) * nr_cpus;

LGTM, and I can confirm this produces a valid pv-dump that can be
decrypted.

Please have a look on my comments.

With the nits fixed:
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
Thomas Huth Aug. 29, 2022, 11:57 a.m. UTC | #3
On 11/08/2022 14.11, Janosch Frank wrote:
> Sometimes dumping a guest from the outside is the only way to get the
> data that is needed. This can be the case if a dumping mechanism like
> KDUMP hasn't been configured or data needs to be fetched at a specific
> point. Dumping a protected guest from the outside without help from
> fw/hw doesn't yield sufficient data to be useful. Hence we now
> introduce PV dump support.
> 
> The PV dump support works by integrating the firmware into the dump
> process. New Ultravisor calls are used to initiate the dump process,
> dump cpu data, dump memory state and lastly complete the dump process.
> The UV calls are exposed by KVM via the new KVM_PV_DUMP command and
> its subcommands. The guest's data is fully encrypted and can only be
> decrypted by the entity that owns the customer communication key for
> the dumped guest. Also dumping needs to be allowed via a flag in the
> SE header.
> 
> On the QEMU side of things we store the PV dump data in the newly
> introduced architecture ELF sections (storage state and completion
> data) and the cpu notes (for cpu dump data).
> 
> Users can use the zgetdump tool to convert the encrypted QEMU dump to an
> unencrypted one.
> 
> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
> ---
[...]
> @@ -207,22 +226,41 @@ static int s390x_write_elf64_notes(const char *note_name,
>                                          DumpState *s,
>                                          const NoteFuncDesc *funcs)
>   {
> -    Note note;
> +    Note note, *notep;
>       const NoteFuncDesc *nf;
> -    int note_size;
> +    int note_size, content_size;
>       int ret = -1;
>   
>       assert(strlen(note_name) < sizeof(note.name));
>   
>       for (nf = funcs; nf->note_contents_func; nf++) {
> -        memset(&note, 0, sizeof(note));
> -        note.hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
> -        note.hdr.n_descsz = cpu_to_be32(nf->contents_size);
> -        g_strlcpy(note.name, note_name, sizeof(note.name));
> -        (*nf->note_contents_func)(&note, cpu, id);
> +        notep = &note;
> +        if (nf->pvonly && !s390_is_pv()) {
> +            continue;
> +        }
>   
> -        note_size = sizeof(note) - sizeof(note.contents) + nf->contents_size;
> -        ret = f(&note, note_size, s);
> +        content_size = nf->contents_size ? nf->contents_size : nf->note_size_func();
> +        note_size = sizeof(note) - sizeof(notep->contents) + content_size;
> +
> +        /* Notes with dynamic sizes need to allocate a note */
> +        if (nf->note_size_func) {
> +            notep = g_malloc0(note_size);

Either use g_malloc() here (without the trailing "0") ...

> +        }
> +
> +        memset(notep, 0, sizeof(note));

... or put the memset() in an "else" block.

> +        /* Setup note header data */
> +        notep->hdr.n_descsz = cpu_to_be32(content_size);
> +        notep->hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
> +        g_strlcpy(notep->name, note_name, sizeof(notep->name));
> +
> +        /* Get contents and write them out */
> +        (*nf->note_contents_func)(notep, cpu, id);
> +        ret = f(notep, note_size, s);
> +
> +        if (nf->note_size_func) {
> +            g_free(notep);
> +        }
>   
>           if (ret < 0) {
>               return -1;
> @@ -247,12 +285,161 @@ int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
>       return s390x_write_elf64_notes("LINUX", f, cpu, cpuid, s, note_linux);
>   }
>   
> +/* PV dump section size functions */
> +static uint64_t get_dump_stor_state_size_from_len(uint64_t len)
> +{
> +    return (len / (1 << 20)) * kvm_s390_pv_dmp_get_size_stor_state();

Use "MiB" instead of "1 << 20" ?

> +}
> +
> +static uint64_t get_size_stor_state(DumpState *s)
> +{
> +    return get_dump_stor_state_size_from_len(s->total_size);
> +}

  Thomas
Janis Schoetterl-Glausch Sept. 1, 2022, 9:31 a.m. UTC | #4
On Thu, 2022-08-11 at 12:11 +0000, Janosch Frank wrote:
> Sometimes dumping a guest from the outside is the only way to get the
> data that is needed. This can be the case if a dumping mechanism like
> KDUMP hasn't been configured or data needs to be fetched at a specific
> point. Dumping a protected guest from the outside without help from
> fw/hw doesn't yield sufficient data to be useful. Hence we now
> introduce PV dump support.
> 
> The PV dump support works by integrating the firmware into the dump
> process. New Ultravisor calls are used to initiate the dump process,
> dump cpu data, dump memory state and lastly complete the dump process.
> The UV calls are exposed by KVM via the new KVM_PV_DUMP command and
> its subcommands. The guest's data is fully encrypted and can only be
> decrypted by the entity that owns the customer communication key for
> the dumped guest. Also dumping needs to be allowed via a flag in the
> SE header.
> 
> On the QEMU side of things we store the PV dump data in the newly
> introduced architecture ELF sections (storage state and completion
> data) and the cpu notes (for cpu dump data).
> 
> Users can use the zgetdump tool to convert the encrypted QEMU dump to an
> unencrypted one.

Does PV dump work when memory is being filtered? Are there any
constraints on the filter parameters, alignment or so?
> 
> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
> ---
>  dump/dump.c              |  12 +-
>  include/sysemu/dump.h    |   5 +
>  target/s390x/arch_dump.c | 242 ++++++++++++++++++++++++++++++++++-----
>  3 files changed, 227 insertions(+), 32 deletions(-)

[...]
>  
>  typedef struct NoteFuncDescStruct {
>      int contents_size;
> +    uint64_t (*note_size_func)(void); /* NULL for non-dynamic sized contents */
>      void (*note_contents_func)(Note *note, S390CPU *cpu, int id);
> +    bool pvonly;
>  } NoteFuncDesc;
>  
>  static const NoteFuncDesc note_core[] = {
> -    {sizeof_field(Note, contents.prstatus), s390x_write_elf64_prstatus},
> -    {sizeof_field(Note, contents.fpregset), s390x_write_elf64_fpregset},
> -    { 0, NULL}
> +    {sizeof_field(Note, contents.prstatus), NULL, s390x_write_elf64_prstatus, false},
> +    {sizeof_field(Note, contents.fpregset), NULL, s390x_write_elf64_fpregset, false},
> +    { 0, NULL, NULL}
>  };
>  
>  static const NoteFuncDesc note_linux[] = {
> -    {sizeof_field(Note, contents.prefix),   s390x_write_elf64_prefix},
> -    {sizeof_field(Note, contents.ctrs),     s390x_write_elf64_ctrs},
> -    {sizeof_field(Note, contents.timer),    s390x_write_elf64_timer},
> -    {sizeof_field(Note, contents.todcmp),   s390x_write_elf64_todcmp},
> -    {sizeof_field(Note, contents.todpreg),  s390x_write_elf64_todpreg},
> -    {sizeof_field(Note, contents.vregslo),  s390x_write_elf64_vregslo},
> -    {sizeof_field(Note, contents.vregshi),  s390x_write_elf64_vregshi},
> -    {sizeof_field(Note, contents.gscb),     s390x_write_elf64_gscb},
> -    { 0, NULL}
> +    {sizeof_field(Note, contents.prefix),   NULL, s390x_write_elf64_prefix,  false},
> +    {sizeof_field(Note, contents.ctrs),     NULL, s390x_write_elf64_ctrs,    false},
> +    {sizeof_field(Note, contents.timer),    NULL, s390x_write_elf64_timer,   false},
> +    {sizeof_field(Note, contents.todcmp),   NULL, s390x_write_elf64_todcmp,  false},
> +    {sizeof_field(Note, contents.todpreg),  NULL, s390x_write_elf64_todpreg, false},
> +    {sizeof_field(Note, contents.vregslo),  NULL, s390x_write_elf64_vregslo, false},
> +    {sizeof_field(Note, contents.vregshi),  NULL, s390x_write_elf64_vregshi, false},
> +    {sizeof_field(Note, contents.gscb),     NULL, s390x_write_elf64_gscb,    false},
> +    {0, kvm_s390_pv_dmp_get_size_cpu,       s390x_write_elf64_pv, true},
> +    { 0, NULL, NULL}
>  };
>  
>  static int s390x_write_elf64_notes(const char *note_name,
> @@ -207,22 +226,41 @@ static int s390x_write_elf64_notes(const char *note_name,
>                                         DumpState *s,
>                                         const NoteFuncDesc *funcs)
>  {
> -    Note note;
> +    Note note, *notep;
>      const NoteFuncDesc *nf;
> -    int note_size;
> +    int note_size, content_size;

Could make those size_t. I guess it's not necessary, but we're kind of
a dumb pipe for data from the ultravisor so there's something to be
said for not making assumptions.

>      int ret = -1;
>  
>      assert(strlen(note_name) < sizeof(note.name));
>  
>      for (nf = funcs; nf->note_contents_func; nf++) {
> -        memset(&note, 0, sizeof(note));
> -        note.hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
> -        note.hdr.n_descsz = cpu_to_be32(nf->contents_size);
> -        g_strlcpy(note.name, note_name, sizeof(note.name));
> -        (*nf->note_contents_func)(&note, cpu, id);
> +        notep = &note;
> +        if (nf->pvonly && !s390_is_pv()) {
> +            continue;
> +        }
>  
> -        note_size = sizeof(note) - sizeof(note.contents) + nf->contents_size;
> -        ret = f(&note, note_size, s);
> +        content_size = nf->contents_size ? nf->contents_size : nf->note_size_func();

Your comment above says
/* NULL for non-dynamic sized contents */
So it makes sense to condition on that, i.e.
+        content_size = nf->contents_size_func ? nf->note_size_func() : nf->contents_size;
And it makes it consistent with the ifs below

> +        note_size = sizeof(note) - sizeof(notep->contents) + content_size;
> +
> +        /* Notes with dynamic sizes need to allocate a note */
> +        if (nf->note_size_func) {
> +            notep = g_malloc0(note_size);
> +        }
> +
> +        memset(notep, 0, sizeof(note));
> +
> +        /* Setup note header data */
> +        notep->hdr.n_descsz = cpu_to_be32(content_size);
> +        notep->hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
> +        g_strlcpy(notep->name, note_name, sizeof(notep->name));
> +
> +        /* Get contents and write them out */
> +        (*nf->note_contents_func)(notep, cpu, id);
> +        ret = f(notep, note_size, s);
> +
> +        if (nf->note_size_func) {
> +            g_free(notep);
> +        }
>  
>          if (ret < 0) {
>              return -1;
> @@ -247,12 +285,161 @@ int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
>      return s390x_write_elf64_notes("LINUX", f, cpu, cpuid, s, note_linux);
>  }
>  
> +/* PV dump section size functions */
> +static uint64_t get_dump_stor_state_size_from_len(uint64_t len)
> +{
> +    return (len / (1 << 20)) * kvm_s390_pv_dmp_get_size_stor_state();
> +}
> +
> +static uint64_t get_size_stor_state(DumpState *s)
> +{
> +    return get_dump_stor_state_size_from_len(s->total_size);
> +}
> +
> +static uint64_t get_size_complete(DumpState *s)
> +{
> +    return kvm_s390_pv_dmp_get_size_complete();
> +}
> +
> +/* PV dump section data functions*/
> +static int get_data_complete(DumpState *s, uint8_t *buff)
> +{
> +    int rc;
> +
> +    if (!pv_dump_initialized) {
> +        return 0;
> +    }
> +    rc = kvm_s390_dump_complete(buff);
> +    if (!rc) {
> +            pv_dump_initialized = false;
> +    }
> +    return rc;
> +}
> +
> +static int dump_mem(DumpState *s, uint64_t gaddr, uint8_t *buff, uint64_t buff_len)

The DumpState arg is being ignored.

> +{
> +    /* We need the gaddr + len and something to write to */
> +    if (!pv_dump_initialized) {
> +        return 0;
> +    }
> +    return kvm_s390_dump_mem(gaddr, buff_len, buff);
> +}
> +
> +static int get_data_mem(DumpState *s, uint8_t *buff)
> +{
> +    int64_t memblock_size, memblock_start;
> +    GuestPhysBlock *block;
> +    uint64_t off;
> +
> +    QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) {
> +        memblock_start = dump_filtered_memblock_start(block, s->filter_area_begin,
> +                                                      s->filter_area_length);
> +        if (memblock_start == -1) {
> +            continue;
> +        }
> +
> +        memblock_size = dump_filtered_memblock_size(block, s->filter_area_begin,
> +                                                    s->filter_area_length);
> +
> +        off = get_dump_stor_state_size_from_len(block->target_start);
> +        dump_mem(s, block->target_start, buff + off,
> +                 get_dump_stor_state_size_from_len(memblock_size));

This ignores the return value, if this is intentional a comment
explaining it would be nice.

> +    }
> +
> +    return 0;
> +}
> +
> +struct sections {
> +    uint64_t (*sections_size_func)(DumpState *s);
> +    int (*sections_contents_func)(DumpState *s, uint8_t *buff);
> +    char sctn_str[12];
> +} sections[] = {
> +    { get_size_stor_state, get_data_mem, "pv_mem_meta"},
> +    { get_size_complete, get_data_complete, "pv_compl"},
> +    {NULL , NULL, ""}
> +};

Could be static right?

> +
> +static uint64_t arch_sections_write_hdr(DumpState *s, uint8_t *buff)
> +{
> +    Elf64_Shdr *shdr = (void *)buff;
> +    struct sections *sctn = sections;
> +    uint64_t off = s->section_offset;
> +
> +    if (!s390_is_pv()) {
> +        return 0;
> +    }
> +
> +    for (; sctn->sections_size_func; off += shdr->sh_size, sctn++, shdr++) {
> +        memset(shdr, 0, sizeof(*shdr));

This isn't necessary since the header mem is zero allocated, but you
might not want to make guarantees about that. Maybe consider doing a
normal malloc and memsetting just the special 0 index section header in
dump.c.

> +        shdr->sh_type = SHT_PROGBITS;
> +        shdr->sh_offset = off;
> +        shdr->sh_size = sctn->sections_size_func(s);
> +        shdr->sh_name = s->string_table_buf->len;
> +        g_array_append_vals(s->string_table_buf, sctn->sctn_str, sizeof(sctn->sctn_str));
> +    }
> +
> +    return (uintptr_t)shdr - (uintptr_t)buff;
> +}
> +
> +
> +/* Add arch specific number of sections and their respective sizes */
> +static void arch_sections_add(DumpState *s)
> +{
> +    struct sections *sctn = sections;
> +
> +    /*
> +     * We only do a PV dump if we are running a PV guest, KVM supports
> +     * the dump API and we got valid dump length information.
> +     */
> +    if (!s390_is_pv() || !kvm_s390_get_protected_dump() ||
> +        !kvm_s390_pv_info_basic_valid()) {
> +        return;
> +    }
> +
> +    /*
> +     * Start the UV dump process by doing the initialize dump call via
> +     * KVM as the proxy.
> +     */
> +    if (!kvm_s390_dump_init()) {
> +            pv_dump_initialized = true;
> +    }
> +
> +    for (; sctn->sections_size_func; sctn++) {
> +        s->shdr_num += 1;
> +        s->elf_section_data_size += sctn->sections_size_func(s);
> +    }
> +
> +    /* We use the string table to identify the sections */
> +    s->string_table_usage = true;

In dump.c this value is being set if shdr_num > 0, so this seems
redundant.

> +}
> +
> +/*
> + * After the PV dump has been initialized, the CPU data has been
> + * fetched and memory has been dumped, we need to grab the tweak data
> + * and the completion data.
> + */
> +static void arch_sections_write(DumpState *s, uint8_t *buff)
> +{
> +    struct sections *sctn = sections;
> +
> +    /* shdr_num should only have been set > 1 if we are protected */
> +    assert(s390_is_pv());
> +
> +    for (; sctn->sections_size_func; sctn++) {
> +        sctn->sections_contents_func(s, buff);

As above with regards to ignoring return values.

> +        buff += sctn->sections_size_func(s);
> +    }
> +}
> +
>  int cpu_get_dump_info(ArchDumpInfo *info,
>                        const struct GuestPhysBlockList *guest_phys_blocks)
>  {
>      info->d_machine = EM_S390;
>      info->d_endian = ELFDATA2MSB;
>      info->d_class = ELFCLASS64;
> +    info->arch_sections_add_fn = *arch_sections_add;
> +    info->arch_sections_write_hdr_fn = *arch_sections_write_hdr;
> +    info->arch_sections_write_fn = *arch_sections_write;
>  
>      return 0;
>  }
> @@ -261,7 +448,7 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
>  {
>      int name_size = 8; /* "LINUX" or "CORE" + pad */
>      size_t elf_note_size = 0;
> -    int note_head_size;
> +    int note_head_size, content_size;
>      const NoteFuncDesc *nf;
>  
>      assert(class == ELFCLASS64);
> @@ -270,12 +457,15 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
>      note_head_size = sizeof(Elf64_Nhdr);
>  
>      for (nf = note_core; nf->note_contents_func; nf++) {
> -        elf_note_size = elf_note_size + note_head_size + name_size +
> -                        nf->contents_size;
> +        elf_note_size = elf_note_size + note_head_size + name_size + nf->contents_size;
>      }
>      for (nf = note_linux; nf->note_contents_func; nf++) {
> +        if (nf->pvonly && !s390_is_pv()) {
> +            continue;
> +        }
> +        content_size = nf->contents_size ? nf->contents_size : nf->note_size_func();
>          elf_note_size = elf_note_size + note_head_size + name_size +
> -                        nf->contents_size;
> +                        content_size;
>      }
>  
>      return (elf_note_size) * nr_cpus;
diff mbox series

Patch

diff --git a/dump/dump.c b/dump/dump.c
index 65b18fc602..7cf5eb7c8b 100644
--- a/dump/dump.c
+++ b/dump/dump.c
@@ -720,9 +720,9 @@  static void dump_begin(DumpState *s, Error **errp)
     write_elf_notes(s, errp);
 }
 
-static int64_t dump_filtered_memblock_size(GuestPhysBlock *block,
-                                           int64_t filter_area_start,
-                                           int64_t filter_area_length)
+int64_t dump_filtered_memblock_size(GuestPhysBlock *block,
+                                    int64_t filter_area_start,
+                                    int64_t filter_area_length)
 {
     int64_t size, left, right;
 
@@ -740,9 +740,9 @@  static int64_t dump_filtered_memblock_size(GuestPhysBlock *block,
     return size;
 }
 
-static int64_t dump_filtered_memblock_start(GuestPhysBlock *block,
-                                            int64_t filter_area_start,
-                                            int64_t filter_area_length)
+int64_t dump_filtered_memblock_start(GuestPhysBlock *block,
+                                     int64_t filter_area_start,
+                                     int64_t filter_area_length)
 {
     if (filter_area_length) {
         /* return -1 if the block is not within filter area */
diff --git a/include/sysemu/dump.h b/include/sysemu/dump.h
index 358b038d47..245c26fbca 100644
--- a/include/sysemu/dump.h
+++ b/include/sysemu/dump.h
@@ -216,4 +216,9 @@  typedef struct DumpState {
 uint16_t cpu_to_dump16(DumpState *s, uint16_t val);
 uint32_t cpu_to_dump32(DumpState *s, uint32_t val);
 uint64_t cpu_to_dump64(DumpState *s, uint64_t val);
+
+int64_t dump_filtered_memblock_size(GuestPhysBlock *block, int64_t filter_area_start,
+                                    int64_t filter_area_length);
+int64_t dump_filtered_memblock_start(GuestPhysBlock *block, int64_t filter_area_start,
+                                     int64_t filter_area_length);
 #endif
diff --git a/target/s390x/arch_dump.c b/target/s390x/arch_dump.c
index f60a14920d..5e8e03d536 100644
--- a/target/s390x/arch_dump.c
+++ b/target/s390x/arch_dump.c
@@ -16,7 +16,8 @@ 
 #include "s390x-internal.h"
 #include "elf.h"
 #include "sysemu/dump.h"
-
+#include "hw/s390x/pv.h"
+#include "kvm/kvm_s390x.h"
 
 struct S390xUserRegsStruct {
     uint64_t psw[2];
@@ -76,9 +77,16 @@  typedef struct noteStruct {
         uint64_t todcmp;
         uint32_t todpreg;
         uint64_t ctrs[16];
+        uint8_t dynamic[1];  /*
+                              * Would be a flexible array member, if
+                              * that was legal inside a union. Real
+                              * size comes from PV info interface.
+                              */
     } contents;
 } QEMU_PACKED Note;
 
+static bool pv_dump_initialized;
+
 static void s390x_write_elf64_prstatus(Note *note, S390CPU *cpu, int id)
 {
     int i;
@@ -177,28 +185,39 @@  static void s390x_write_elf64_prefix(Note *note, S390CPU *cpu, int id)
     note->contents.prefix = cpu_to_be32((uint32_t)(cpu->env.psa));
 }
 
+static void s390x_write_elf64_pv(Note *note, S390CPU *cpu, int id)
+{
+    note->hdr.n_type = cpu_to_be32(NT_S390_PV_CPU_DATA);
+    if (!pv_dump_initialized) {
+        return;
+    }
+    kvm_s390_dump_cpu(cpu, &note->contents.dynamic);
+}
 
 typedef struct NoteFuncDescStruct {
     int contents_size;
+    uint64_t (*note_size_func)(void); /* NULL for non-dynamic sized contents */
     void (*note_contents_func)(Note *note, S390CPU *cpu, int id);
+    bool pvonly;
 } NoteFuncDesc;
 
 static const NoteFuncDesc note_core[] = {
-    {sizeof_field(Note, contents.prstatus), s390x_write_elf64_prstatus},
-    {sizeof_field(Note, contents.fpregset), s390x_write_elf64_fpregset},
-    { 0, NULL}
+    {sizeof_field(Note, contents.prstatus), NULL, s390x_write_elf64_prstatus, false},
+    {sizeof_field(Note, contents.fpregset), NULL, s390x_write_elf64_fpregset, false},
+    { 0, NULL, NULL}
 };
 
 static const NoteFuncDesc note_linux[] = {
-    {sizeof_field(Note, contents.prefix),   s390x_write_elf64_prefix},
-    {sizeof_field(Note, contents.ctrs),     s390x_write_elf64_ctrs},
-    {sizeof_field(Note, contents.timer),    s390x_write_elf64_timer},
-    {sizeof_field(Note, contents.todcmp),   s390x_write_elf64_todcmp},
-    {sizeof_field(Note, contents.todpreg),  s390x_write_elf64_todpreg},
-    {sizeof_field(Note, contents.vregslo),  s390x_write_elf64_vregslo},
-    {sizeof_field(Note, contents.vregshi),  s390x_write_elf64_vregshi},
-    {sizeof_field(Note, contents.gscb),     s390x_write_elf64_gscb},
-    { 0, NULL}
+    {sizeof_field(Note, contents.prefix),   NULL, s390x_write_elf64_prefix,  false},
+    {sizeof_field(Note, contents.ctrs),     NULL, s390x_write_elf64_ctrs,    false},
+    {sizeof_field(Note, contents.timer),    NULL, s390x_write_elf64_timer,   false},
+    {sizeof_field(Note, contents.todcmp),   NULL, s390x_write_elf64_todcmp,  false},
+    {sizeof_field(Note, contents.todpreg),  NULL, s390x_write_elf64_todpreg, false},
+    {sizeof_field(Note, contents.vregslo),  NULL, s390x_write_elf64_vregslo, false},
+    {sizeof_field(Note, contents.vregshi),  NULL, s390x_write_elf64_vregshi, false},
+    {sizeof_field(Note, contents.gscb),     NULL, s390x_write_elf64_gscb,    false},
+    {0, kvm_s390_pv_dmp_get_size_cpu,       s390x_write_elf64_pv, true},
+    { 0, NULL, NULL}
 };
 
 static int s390x_write_elf64_notes(const char *note_name,
@@ -207,22 +226,41 @@  static int s390x_write_elf64_notes(const char *note_name,
                                        DumpState *s,
                                        const NoteFuncDesc *funcs)
 {
-    Note note;
+    Note note, *notep;
     const NoteFuncDesc *nf;
-    int note_size;
+    int note_size, content_size;
     int ret = -1;
 
     assert(strlen(note_name) < sizeof(note.name));
 
     for (nf = funcs; nf->note_contents_func; nf++) {
-        memset(&note, 0, sizeof(note));
-        note.hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
-        note.hdr.n_descsz = cpu_to_be32(nf->contents_size);
-        g_strlcpy(note.name, note_name, sizeof(note.name));
-        (*nf->note_contents_func)(&note, cpu, id);
+        notep = &note;
+        if (nf->pvonly && !s390_is_pv()) {
+            continue;
+        }
 
-        note_size = sizeof(note) - sizeof(note.contents) + nf->contents_size;
-        ret = f(&note, note_size, s);
+        content_size = nf->contents_size ? nf->contents_size : nf->note_size_func();
+        note_size = sizeof(note) - sizeof(notep->contents) + content_size;
+
+        /* Notes with dynamic sizes need to allocate a note */
+        if (nf->note_size_func) {
+            notep = g_malloc0(note_size);
+        }
+
+        memset(notep, 0, sizeof(note));
+
+        /* Setup note header data */
+        notep->hdr.n_descsz = cpu_to_be32(content_size);
+        notep->hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
+        g_strlcpy(notep->name, note_name, sizeof(notep->name));
+
+        /* Get contents and write them out */
+        (*nf->note_contents_func)(notep, cpu, id);
+        ret = f(notep, note_size, s);
+
+        if (nf->note_size_func) {
+            g_free(notep);
+        }
 
         if (ret < 0) {
             return -1;
@@ -247,12 +285,161 @@  int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
     return s390x_write_elf64_notes("LINUX", f, cpu, cpuid, s, note_linux);
 }
 
+/* PV dump section size functions */
+static uint64_t get_dump_stor_state_size_from_len(uint64_t len)
+{
+    return (len / (1 << 20)) * kvm_s390_pv_dmp_get_size_stor_state();
+}
+
+static uint64_t get_size_stor_state(DumpState *s)
+{
+    return get_dump_stor_state_size_from_len(s->total_size);
+}
+
+static uint64_t get_size_complete(DumpState *s)
+{
+    return kvm_s390_pv_dmp_get_size_complete();
+}
+
+/* PV dump section data functions*/
+static int get_data_complete(DumpState *s, uint8_t *buff)
+{
+    int rc;
+
+    if (!pv_dump_initialized) {
+        return 0;
+    }
+    rc = kvm_s390_dump_complete(buff);
+    if (!rc) {
+            pv_dump_initialized = false;
+    }
+    return rc;
+}
+
+static int dump_mem(DumpState *s, uint64_t gaddr, uint8_t *buff, uint64_t buff_len)
+{
+    /* We need the gaddr + len and something to write to */
+    if (!pv_dump_initialized) {
+        return 0;
+    }
+    return kvm_s390_dump_mem(gaddr, buff_len, buff);
+}
+
+static int get_data_mem(DumpState *s, uint8_t *buff)
+{
+    int64_t memblock_size, memblock_start;
+    GuestPhysBlock *block;
+    uint64_t off;
+
+    QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) {
+        memblock_start = dump_filtered_memblock_start(block, s->filter_area_begin,
+                                                      s->filter_area_length);
+        if (memblock_start == -1) {
+            continue;
+        }
+
+        memblock_size = dump_filtered_memblock_size(block, s->filter_area_begin,
+                                                    s->filter_area_length);
+
+        off = get_dump_stor_state_size_from_len(block->target_start);
+        dump_mem(s, block->target_start, buff + off,
+                 get_dump_stor_state_size_from_len(memblock_size));
+    }
+
+    return 0;
+}
+
+struct sections {
+    uint64_t (*sections_size_func)(DumpState *s);
+    int (*sections_contents_func)(DumpState *s, uint8_t *buff);
+    char sctn_str[12];
+} sections[] = {
+    { get_size_stor_state, get_data_mem, "pv_mem_meta"},
+    { get_size_complete, get_data_complete, "pv_compl"},
+    {NULL , NULL, ""}
+};
+
+static uint64_t arch_sections_write_hdr(DumpState *s, uint8_t *buff)
+{
+    Elf64_Shdr *shdr = (void *)buff;
+    struct sections *sctn = sections;
+    uint64_t off = s->section_offset;
+
+    if (!s390_is_pv()) {
+        return 0;
+    }
+
+    for (; sctn->sections_size_func; off += shdr->sh_size, sctn++, shdr++) {
+        memset(shdr, 0, sizeof(*shdr));
+        shdr->sh_type = SHT_PROGBITS;
+        shdr->sh_offset = off;
+        shdr->sh_size = sctn->sections_size_func(s);
+        shdr->sh_name = s->string_table_buf->len;
+        g_array_append_vals(s->string_table_buf, sctn->sctn_str, sizeof(sctn->sctn_str));
+    }
+
+    return (uintptr_t)shdr - (uintptr_t)buff;
+}
+
+
+/* Add arch specific number of sections and their respective sizes */
+static void arch_sections_add(DumpState *s)
+{
+    struct sections *sctn = sections;
+
+    /*
+     * We only do a PV dump if we are running a PV guest, KVM supports
+     * the dump API and we got valid dump length information.
+     */
+    if (!s390_is_pv() || !kvm_s390_get_protected_dump() ||
+        !kvm_s390_pv_info_basic_valid()) {
+        return;
+    }
+
+    /*
+     * Start the UV dump process by doing the initialize dump call via
+     * KVM as the proxy.
+     */
+    if (!kvm_s390_dump_init()) {
+            pv_dump_initialized = true;
+    }
+
+    for (; sctn->sections_size_func; sctn++) {
+        s->shdr_num += 1;
+        s->elf_section_data_size += sctn->sections_size_func(s);
+    }
+
+    /* We use the string table to identify the sections */
+    s->string_table_usage = true;
+}
+
+/*
+ * After the PV dump has been initialized, the CPU data has been
+ * fetched and memory has been dumped, we need to grab the tweak data
+ * and the completion data.
+ */
+static void arch_sections_write(DumpState *s, uint8_t *buff)
+{
+    struct sections *sctn = sections;
+
+    /* shdr_num should only have been set > 1 if we are protected */
+    assert(s390_is_pv());
+
+    for (; sctn->sections_size_func; sctn++) {
+        sctn->sections_contents_func(s, buff);
+        buff += sctn->sections_size_func(s);
+    }
+}
+
 int cpu_get_dump_info(ArchDumpInfo *info,
                       const struct GuestPhysBlockList *guest_phys_blocks)
 {
     info->d_machine = EM_S390;
     info->d_endian = ELFDATA2MSB;
     info->d_class = ELFCLASS64;
+    info->arch_sections_add_fn = *arch_sections_add;
+    info->arch_sections_write_hdr_fn = *arch_sections_write_hdr;
+    info->arch_sections_write_fn = *arch_sections_write;
 
     return 0;
 }
@@ -261,7 +448,7 @@  ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
 {
     int name_size = 8; /* "LINUX" or "CORE" + pad */
     size_t elf_note_size = 0;
-    int note_head_size;
+    int note_head_size, content_size;
     const NoteFuncDesc *nf;
 
     assert(class == ELFCLASS64);
@@ -270,12 +457,15 @@  ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
     note_head_size = sizeof(Elf64_Nhdr);
 
     for (nf = note_core; nf->note_contents_func; nf++) {
-        elf_note_size = elf_note_size + note_head_size + name_size +
-                        nf->contents_size;
+        elf_note_size = elf_note_size + note_head_size + name_size + nf->contents_size;
     }
     for (nf = note_linux; nf->note_contents_func; nf++) {
+        if (nf->pvonly && !s390_is_pv()) {
+            continue;
+        }
+        content_size = nf->contents_size ? nf->contents_size : nf->note_size_func();
         elf_note_size = elf_note_size + note_head_size + name_size +
-                        nf->contents_size;
+                        content_size;
     }
 
     return (elf_note_size) * nr_cpus;