diff mbox series

[v3] block/nvme: introduce PMR support from NVMe 1.4 spec

Message ID 20200318200303.11322-1-andrzej.jakowski@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [v3] block/nvme: introduce PMR support from NVMe 1.4 spec | expand

Commit Message

Andrzej Jakowski March 18, 2020, 8:03 p.m. UTC
This patch introduces support for PMR that has been defined as part of NVMe 1.4
spec. User can now specify a pmrdev option that should point to HostMemoryBackend.
pmrdev memory region will subsequently be exposed as PCI BAR 2 in emulated NVMe
device. Guest OS can perform mmio read and writes to the PMR region that will stay
persistent across system reboot.

Signed-off-by: Andrzej Jakowski <andrzej.jakowski@linux.intel.com>
---
v2:
 - reworked PMR to use HostMemoryBackend instead of directly mapping PMR
   backend file into qemu [1] (Stefan)

v1:
 - provided support for Bit 1 from PMRWBM register instead of Bit 0 to ensure
   improved performance in virtualized environment [2] (Stefan)

 - added check if pmr size is power of two in size [3] (David)

 - addressed cross compilation build problems reported by CI environment

[1]: https://lore.kernel.org/qemu-devel/20200306223853.37958-1-andrzej.jakowski@linux.intel.com/
[2]: https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf
[3]: https://lore.kernel.org/qemu-devel/20200218224811.30050-1-andrzej.jakowski@linux.intel.com/
---
Persistent Memory Region (PMR) is a new optional feature provided in NVMe 1.4
specification. This patch implements initial support for it in NVMe driver.
---
 hw/block/nvme.c       | 117 +++++++++++++++++++++++++++-
 hw/block/nvme.h       |   2 +
 hw/block/trace-events |   5 ++
 include/block/nvme.h  | 172 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 294 insertions(+), 2 deletions(-)

Comments

Klaus Jensen March 19, 2020, 9:48 a.m. UTC | #1
On Mar 18 13:03, Andrzej Jakowski wrote:
> This patch introduces support for PMR that has been defined as part of NVMe 1.4
> spec. User can now specify a pmrdev option that should point to HostMemoryBackend.
> pmrdev memory region will subsequently be exposed as PCI BAR 2 in emulated NVMe
> device. Guest OS can perform mmio read and writes to the PMR region that will stay
> persistent across system reboot.
> 
> Signed-off-by: Andrzej Jakowski <andrzej.jakowski@linux.intel.com>
> ---
> v2:
>  - reworked PMR to use HostMemoryBackend instead of directly mapping PMR
>    backend file into qemu [1] (Stefan)
> 
> v1:
>  - provided support for Bit 1 from PMRWBM register instead of Bit 0 to ensure
>    improved performance in virtualized environment [2] (Stefan)
> 
>  - added check if pmr size is power of two in size [3] (David)
> 
>  - addressed cross compilation build problems reported by CI environment
> 
> [1]: https://lore.kernel.org/qemu-devel/20200306223853.37958-1-andrzej.jakowski@linux.intel.com/
> [2]: https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf
> [3]: https://lore.kernel.org/qemu-devel/20200218224811.30050-1-andrzej.jakowski@linux.intel.com/
> ---
> Persistent Memory Region (PMR) is a new optional feature provided in NVMe 1.4
> specification. This patch implements initial support for it in NVMe driver.
> ---
>  hw/block/nvme.c       | 117 +++++++++++++++++++++++++++-
>  hw/block/nvme.h       |   2 +
>  hw/block/trace-events |   5 ++
>  include/block/nvme.h  | 172 ++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 294 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index d28335cbf3..70fd09d293 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -19,10 +19,18 @@
>   *      -drive file=<file>,if=none,id=<drive_id>
>   *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
>   *              cmb_size_mb=<cmb_size_mb[optional]>, \
> + *              [pmrdev=<mem_backend_file_id>,] \
>   *              num_queues=<N[optional]>
>   *
>   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> + *
> + * Either cmb or pmr - due to limitation in available BAR indexes.
> + * pmr_file file needs to be power of two in size.
> + * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
> + * For example:
> + * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
> + *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
>   */
>  
>  #include "qemu/osdep.h"
> @@ -35,7 +43,9 @@
>  #include "sysemu/sysemu.h"
>  #include "qapi/error.h"
>  #include "qapi/visitor.h"
> +#include "sysemu/hostmem.h"
>  #include "sysemu/block-backend.h"
> +#include "exec/ramblock.h"
>  
>  #include "qemu/log.h"
>  #include "qemu/module.h"
> @@ -1141,6 +1151,26 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
>          NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
>                         "invalid write to read only CMBSZ, ignored");
>          return;
> +    case 0xE00: /* PMRCAP */
> +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
> +                       "invalid write to PMRCAP register, ignored");
> +        return;
> +    case 0xE04: /* TODO PMRCTL */
> +        break;
> +    case 0xE08: /* PMRSTS */
> +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
> +                       "invalid write to PMRSTS register, ignored");
> +        return;
> +    case 0xE0C: /* PMREBS */
> +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
> +                       "invalid write to PMREBS register, ignored");
> +        return;
> +    case 0xE10: /* PMRSWTP */
> +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
> +                       "invalid write to PMRSWTP register, ignored");
> +        return;
> +    case 0xE14: /* TODO PMRMSC */
> +         break;
>      default:
>          NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
>                         "invalid MMIO write,"
> @@ -1169,6 +1199,23 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
>      }
>  
>      if (addr < sizeof(n->bar)) {
> +        /*
> +         * When PMRWBM bit 1 is set then read from
> +         * from PMRSTS should ensure prior writes
> +         * made it to persistent media
> +         */
> +        if (addr == 0xE08 &&
> +            (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02) >> 1) {

Don't think that shift is needed.

> +            int status;
> +
> +            status = qemu_msync((void *)n->pmrdev->mr.ram_block->host,
> +                                n->pmrdev->size,
> +                                n->pmrdev->mr.ram_block->fd);
> +            if (!status) {
> +                NVME_GUEST_ERR(nvme_ub_mmiord_pmrread_barrier,
> +                               "error while persisting data");
> +            }
> +        }
>          memcpy(&val, ptr + addr, size);
>      } else {
>          NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
> @@ -1332,6 +1379,23 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>          error_setg(errp, "serial property not set");
>          return;
>      }
> +
> +    if (!n->cmb_size_mb && n->pmrdev) {
> +        if (host_memory_backend_is_mapped(n->pmrdev)) {
> +            char *path = object_get_canonical_path_component(OBJECT(n->pmrdev));
> +            error_setg(errp, "can't use already busy memdev: %s", path);
> +            g_free(path);
> +            return;
> +        }
> +
> +        if (!is_power_of_2(n->pmrdev->size)) {
> +            error_setg(errp, "pmr backend size needs to be power of 2 in size");
> +            return;
> +        }
> +
> +        host_memory_backend_set_mapped(n->pmrdev, true);
> +    }
> +
>      blkconf_blocksizes(&n->conf);
>      if (!blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
>                                         false, errp)) {
> @@ -1393,7 +1457,6 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>      n->bar.intmc = n->bar.intms = 0;
>  
>      if (n->cmb_size_mb) {
> -
>          NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
>          NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
>  
> @@ -1415,6 +1478,51 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>              PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
>              PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
>  
> +    } else if (n->pmrdev) {
> +        /* Controller Capabilities register */
> +        NVME_CAP_SET_PMRS(n->bar.cap, 1);
> +
> +        /* PMR Capabities register */
> +        n->bar.pmrcap = 0;
> +        NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
> +        NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0);
> +        NVME_PMRCAP_SET_BIR(n->bar.pmrcap, 2);
> +        NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0);
> +        /* Turn on bit 1 support */
> +        NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
> +        NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0);
> +        NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0);
> +
> +        /* PMR Control register */
> +        n->bar.pmrctl = 0;
> +        NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0);
> +
> +        /* PMR Status register */
> +        n->bar.pmrsts = 0;
> +        NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0);
> +        NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0);
> +        NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0);
> +        NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0);
> +
> +        /* PMR Elasticity Buffer Size register */
> +        n->bar.pmrebs = 0;
> +        NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0);
> +        NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0);
> +        NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0);
> +
> +        /* PMR Sustained Write Throughput register */
> +        n->bar.pmrswtp = 0;
> +        NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0);
> +        NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0);
> +
> +        /* PMR Memory Space Control register */
> +        n->bar.pmrmsc = 0;
> +        NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0);
> +        NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0);
> +
> +        pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
> +            PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
> +            PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
>      }
>  
>      for (i = 0; i < n->num_namespaces; i++) {
> @@ -1441,15 +1549,20 @@ static void nvme_exit(PCIDevice *pci_dev)
>      g_free(n->namespaces);
>      g_free(n->cq);
>      g_free(n->sq);
> -
>      if (n->cmb_size_mb) {
>          g_free(n->cmbuf);
>      }
> +
> +    if (n->pmrdev) {
> +        host_memory_backend_set_mapped(n->pmrdev, false);
> +    }
>      msix_uninit_exclusive_bar(pci_dev);
>  }
>  
>  static Property nvme_props[] = {
>      DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
> +    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND,
> +                     HostMemoryBackend *),
>      DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
>      DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, cmb_size_mb, 0),
>      DEFINE_PROP_UINT32("num_queues", NvmeCtrl, num_queues, 64),
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 557194ee19..6520a9f0be 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -83,6 +83,8 @@ typedef struct NvmeCtrl {
>      uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
>  
>      char            *serial;
> +    HostMemoryBackend *pmrdev;
> +
>      NvmeNamespace   *namespaces;
>      NvmeSQueue      **sq;
>      NvmeCQueue      **cq;
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index c03e80c2c9..233e094438 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -110,10 +110,15 @@ nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CA
>  nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
>  nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
>  nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
> +nvme_ub_mmiowr_pmrcap_readonly(void) "invalid write to read only PMRCAP, ignored"
> +nvme_ub_mmiowr_pmrsts_readonly(void) "invalid write to read only PMRSTS, ignored"
> +nvme_ub_mmiowr_pmrebs_readonly(void) "invalid write to read only PMREBS, ignored"
> +nvme_ub_mmiowr_pmrswtp_readonly(void) "invalid write to read only PMRSWTP, ignored"
>  nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
>  nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
>  nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
>  nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
> +nvme_ub_mmiord_pmrread_barrier(void) "failed to persists data"
>  nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
>  nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
>  nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 8fb941c653..374262d4b7 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -15,6 +15,13 @@ typedef struct NvmeBar {
>      uint64_t    acq;
>      uint32_t    cmbloc;
>      uint32_t    cmbsz;
> +    uint8_t     padding[3520]; /* not used by QEMU */
> +    uint32_t    pmrcap;
> +    uint32_t    pmrctl;
> +    uint32_t    pmrsts;
> +    uint32_t    pmrebs;
> +    uint32_t    pmrswtp;
> +    uint32_t    pmrmsc;
>  } NvmeBar;
>  
>  enum NvmeCapShift {
> @@ -27,6 +34,7 @@ enum NvmeCapShift {
>      CAP_CSS_SHIFT      = 37,
>      CAP_MPSMIN_SHIFT   = 48,
>      CAP_MPSMAX_SHIFT   = 52,
> +    CAP_PMR_SHIFT      = 56,
>  };
>  
>  enum NvmeCapMask {
> @@ -39,6 +47,7 @@ enum NvmeCapMask {
>      CAP_CSS_MASK       = 0xff,
>      CAP_MPSMIN_MASK    = 0xf,
>      CAP_MPSMAX_MASK    = 0xf,
> +    CAP_PMR_MASK       = 0x1,
>  };
>  
>  #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
> @@ -69,6 +78,8 @@ enum NvmeCapMask {
>                                                             << CAP_MPSMIN_SHIFT)
>  #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
>                                                              << CAP_MPSMAX_SHIFT)
> +#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
> +                                                            << CAP_PMR_SHIFT)
>  
>  enum NvmeCcShift {
>      CC_EN_SHIFT     = 0,
> @@ -205,6 +216,167 @@ enum NvmeCmbszMask {
>  #define NVME_CMBSZ_GETSIZE(cmbsz) \
>      (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
>  
> +enum NvmePmrcapShift {
> +    PMRCAP_RDS_SHIFT   = 3,
> +    PMRCAP_WDS_SHIFT   = 4,
> +    PMRCAP_BIR_SHIFT   = 5,
> +    PMRCAP_PMRTU_SHIFT   = 8,
> +    PMRCAP_PMRWBM_SHIFT   = 10,
> +    PMRCAP_PMRTO_SHIFT   = 16,
> +    PMRCAP_CMSS_SHIFT   = 24,
> +};

Can these be aligned?

> +
> +enum NvmePmrcapMask {
> +    PMRCAP_RDS_MASK   = 0x1,
> +    PMRCAP_WDS_MASK   = 0x1,
> +    PMRCAP_BIR_MASK   = 0x7,
> +    PMRCAP_PMRTU_MASK   = 0x3,
> +    PMRCAP_PMRWBM_MASK   = 0xf,
> +    PMRCAP_PMRTO_MASK   = 0xff,
> +    PMRCAP_CMSS_MASK   = 0x1,
> +};

Can these be aligned?

> +
> +#define NVME_PMRCAP_RDS(pmrcap)    \
> +    ((pmrcap >> PMRCAP_RDS_SHIFT)   & PMRCAP_RDS_MASK)
> +#define NVME_PMRCAP_WDS(pmrcap)    \
> +    ((pmrcap >> PMRCAP_WDS_SHIFT)   & PMRCAP_WDS_MASK)
> +#define NVME_PMRCAP_BIR(pmrcap)    \
> +    ((pmrcap >> PMRCAP_BIR_SHIFT)   & PMRCAP_BIR_MASK)
> +#define NVME_PMRCAP_PMRTU(pmrcap)    \
> +    ((pmrcap >> PMRCAP_PMRTU_SHIFT)   & PMRCAP_PMRTU_MASK)
> +#define NVME_PMRCAP_PMRWBM(pmrcap)    \
> +    ((pmrcap >> PMRCAP_PMRWBM_SHIFT)   & PMRCAP_PMRWBM_MASK)
> +#define NVME_PMRCAP_PMRTO(pmrcap)    \
> +    ((pmrcap >> PMRCAP_PMRTO_SHIFT)   & PMRCAP_PMRTO_MASK)
> +#define NVME_PMRCAP_CMSS(pmrcap)    \
> +    ((pmrcap >> PMRCAP_CMSS_SHIFT)   & PMRCAP_CMSS_MASK)
> +
> +#define NVME_PMRCAP_SET_RDS(pmrcap, val)   \
> +    (pmrcap |= (uint64_t)(val & PMRCAP_RDS_MASK) << PMRCAP_RDS_SHIFT)
> +#define NVME_PMRCAP_SET_WDS(pmrcap, val)   \
> +    (pmrcap |= (uint64_t)(val & PMRCAP_WDS_MASK) << PMRCAP_WDS_SHIFT)
> +#define NVME_PMRCAP_SET_BIR(pmrcap, val)   \
> +    (pmrcap |= (uint64_t)(val & PMRCAP_BIR_MASK) << PMRCAP_BIR_SHIFT)
> +#define NVME_PMRCAP_SET_PMRTU(pmrcap, val)   \
> +    (pmrcap |= (uint64_t)(val & PMRCAP_PMRTU_MASK) << PMRCAP_PMRTU_SHIFT)
> +#define NVME_PMRCAP_SET_PMRWBM(pmrcap, val)   \
> +    (pmrcap |= (uint64_t)(val & PMRCAP_PMRWBM_MASK) << PMRCAP_PMRWBM_SHIFT)
> +#define NVME_PMRCAP_SET_PMRTO(pmrcap, val)   \
> +    (pmrcap |= (uint64_t)(val & PMRCAP_PMRTO_MASK) << PMRCAP_PMRTO_SHIFT)
> +#define NVME_PMRCAP_SET_CMSS(pmrcap, val)   \
> +    (pmrcap |= (uint64_t)(val & PMRCAP_CMSS_MASK) << PMRCAP_CMSS_SHIFT)
> +
> +enum NvmePmrctlShift {
> +    PMRCTL_EN_SHIFT   = 0,
> +};
> +
> +enum NvmePmrctlMask {
> +    PMRCTL_EN_MASK   = 0x1,
> +};
> +
> +#define NVME_PMRCTL_EN(pmrctl)  ((pmrctl >> PMRCTL_EN_SHIFT)   & PMRCTL_EN_MASK)
> +
> +#define NVME_PMRCTL_SET_EN(pmrctl, val)   \
> +    (pmrctl |= (uint64_t)(val & PMRCTL_EN_MASK) << PMRCTL_EN_SHIFT)
> +
> +enum NvmePmrstsShift {
> +    PMRSTS_ERR_SHIFT   = 0,
> +    PMRSTS_NRDY_SHIFT   = 8,
> +    PMRSTS_HSTS_SHIFT   = 9,
> +    PMRSTS_CBAI_SHIFT   = 12,
> +};
> +
> +enum NvmePmrstsMask {
> +    PMRSTS_ERR_MASK   = 0xff,
> +    PMRSTS_NRDY_MASK   = 0x1,
> +    PMRSTS_HSTS_MASK   = 0x7,
> +    PMRSTS_CBAI_MASK   = 0x1,
> +};
> +
> +#define NVME_PMRSTS_ERR(pmrsts)     \
> +    ((pmrsts >> PMRSTS_ERR_SHIFT)   & PMRSTS_ERR_MASK)
> +#define NVME_PMRSTS_NRDY(pmrsts)    \
> +    ((pmrsts >> PMRSTS_NRDY_SHIFT)   & PMRSTS_NRDY_MASK)
> +#define NVME_PMRSTS_HSTS(pmrsts)    \
> +    ((pmrsts >> PMRSTS_HSTS_SHIFT)   & PMRSTS_HSTS_MASK)
> +#define NVME_PMRSTS_CBAI(pmrsts)    \
> +    ((pmrsts >> PMRSTS_CBAI_SHIFT)   & PMRSTS_CBAI_MASK)
> +
> +#define NVME_PMRSTS_SET_ERR(pmrsts, val)   \
> +    (pmrsts |= (uint64_t)(val & PMRSTS_ERR_MASK) << PMRSTS_ERR_SHIFT)
> +#define NVME_PMRSTS_SET_NRDY(pmrsts, val)   \
> +    (pmrsts |= (uint64_t)(val & PMRSTS_NRDY_MASK) << PMRSTS_NRDY_SHIFT)
> +#define NVME_PMRSTS_SET_HSTS(pmrsts, val)   \
> +    (pmrsts |= (uint64_t)(val & PMRSTS_HSTS_MASK) << PMRSTS_HSTS_SHIFT)
> +#define NVME_PMRSTS_SET_CBAI(pmrsts, val)   \
> +    (pmrsts |= (uint64_t)(val & PMRSTS_CBAI_MASK) << PMRSTS_CBAI_SHIFT)
> +
> +enum NvmePmrebsShift {
> +    PMREBS_PMRSZU_SHIFT   = 0,
> +    PMREBS_RBB_SHIFT   = 4,
> +    PMREBS_PMRWBZ_SHIFT   = 8,
> +};
> +
> +enum NvmePmrebsMask {
> +    PMREBS_PMRSZU_MASK   = 0xf,
> +    PMREBS_RBB_MASK   = 0x1,
> +    PMREBS_PMRWBZ_MASK   = 0xffffff,
> +};

Alignment.

> +
> +#define NVME_PMREBS_PMRSZU(pmrebs)  \
> +    ((pmrebs >> PMREBS_PMRSZU_SHIFT)   & PMREBS_PMRSZU_MASK)
> +#define NVME_PMREBS_RBB(pmrebs)     \
> +    ((pmrebs >> PMREBS_RBB_SHIFT)   & PMREBS_RBB_MASK)
> +#define NVME_PMREBS_PMRWBZ(pmrebs)  \
> +    ((pmrebs >> PMREBS_PMRWBZ_SHIFT)   & PMREBS_PMRWBZ_MASK)
> +
> +#define NVME_PMREBS_SET_PMRSZU(pmrebs, val)   \
> +    (pmrebs |= (uint64_t)(val & PMREBS_PMRSZU_MASK) << PMREBS_PMRSZU_SHIFT)
> +#define NVME_PMREBS_SET_RBB(pmrebs, val)   \
> +    (pmrebs |= (uint64_t)(val & PMREBS_RBB_MASK) << PMREBS_RBB_SHIFT)
> +#define NVME_PMREBS_SET_PMRWBZ(pmrebs, val)   \
> +    (pmrebs |= (uint64_t)(val & PMREBS_PMRWBZ_MASK) << PMREBS_PMRWBZ_SHIFT)
> +
> +enum NvmePmrswtpShift {
> +    PMRSWTP_PMRSWTU_SHIFT   = 0,
> +    PMRSWTP_PMRSWTV_SHIFT   = 8,
> +};
> +
> +enum NvmePmrswtpMask {
> +    PMRSWTP_PMRSWTU_MASK   = 0xf,
> +    PMRSWTP_PMRSWTV_MASK   = 0xffffff,
> +};
> +
> +#define NVME_PMRSWTP_PMRSWTU(pmrswtp)   \
> +    ((pmrswtp >> PMRSWTP_PMRSWTU_SHIFT)   & PMRSWTP_PMRSWTU_MASK)
> +#define NVME_PMRSWTP_PMRSWTV(pmrswtp)   \
> +    ((pmrswtp >> PMRSWTP_PMRSWTV_SHIFT)   & PMRSWTP_PMRSWTV_MASK)
> +
> +#define NVME_PMRSWTP_SET_PMRSWTU(pmrswtp, val)   \
> +    (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTU_MASK) << PMRSWTP_PMRSWTU_SHIFT)
> +#define NVME_PMRSWTP_SET_PMRSWTV(pmrswtp, val)   \
> +    (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTV_MASK) << PMRSWTP_PMRSWTV_SHIFT)
> +
> +enum NvmePmrmscShift {
> +    PMRMSC_CMSE_SHIFT   = 1,
> +    PMRMSC_CBA_SHIFT   = 12,
> +};
> +
> +enum NvmePmrmscMask {
> +    PMRMSC_CMSE_MASK   = 0x1,
> +    PMRMSC_CBA_MASK   = 0xfffffffffffff,
> +};

Alignment.

> +
> +#define NVME_PMRMSC_CMSE(pmrmsc)    \
> +    ((pmrmsc >> PMRMSC_CMSE_SHIFT)   & PMRMSC_CMSE_MASK)
> +#define NVME_PMRMSC_CBA(pmrmsc)     \
> +    ((pmrmsc >> PMRMSC_CBA_SHIFT)   & PMRMSC_CBA_MASK)
> +
> +#define NVME_PMRMSC_SET_CMSE(pmrmsc, val)   \
> +    (pmrmsc |= (uint64_t)(val & PMRMSC_CMSE_MASK) << PMRMSC_CMSE_SHIFT)
> +#define NVME_PMRMSC_SET_CBA(pmrmsc, val)   \
> +    (pmrmsc |= (uint64_t)(val & PMRMSC_CBA_MASK) << PMRMSC_CBA_SHIFT)
> +
>  typedef struct NvmeCmd {
>      uint8_t     opcode;
>      uint8_t     fuse;
> -- 
> 2.21.1
> 
> 

Tested and LGTM, just a couple of style fixes that I'd like to see.

I would like a small comment at the top of nvme.c that noted that any
configured CMB would take preference or have the device complain if both
are configured.

Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Stefan Hajnoczi March 20, 2020, 3:45 p.m. UTC | #2
On Wed, Mar 18, 2020 at 01:03:03PM -0700, Andrzej Jakowski wrote:
> This patch introduces support for PMR that has been defined as part of NVMe 1.4
> spec. User can now specify a pmrdev option that should point to HostMemoryBackend.
> pmrdev memory region will subsequently be exposed as PCI BAR 2 in emulated NVMe
> device. Guest OS can perform mmio read and writes to the PMR region that will stay
> persistent across system reboot.
> 
> Signed-off-by: Andrzej Jakowski <andrzej.jakowski@linux.intel.com>
> ---
> v2:
>  - reworked PMR to use HostMemoryBackend instead of directly mapping PMR
>    backend file into qemu [1] (Stefan)
> 
> v1:
>  - provided support for Bit 1 from PMRWBM register instead of Bit 0 to ensure
>    improved performance in virtualized environment [2] (Stefan)
> 
>  - added check if pmr size is power of two in size [3] (David)
> 
>  - addressed cross compilation build problems reported by CI environment
> 
> [1]: https://lore.kernel.org/qemu-devel/20200306223853.37958-1-andrzej.jakowski@linux.intel.com/
> [2]: https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf
> [3]: https://lore.kernel.org/qemu-devel/20200218224811.30050-1-andrzej.jakowski@linux.intel.com/
> ---
> Persistent Memory Region (PMR) is a new optional feature provided in NVMe 1.4
> specification. This patch implements initial support for it in NVMe driver.
> ---
>  hw/block/nvme.c       | 117 +++++++++++++++++++++++++++-
>  hw/block/nvme.h       |   2 +
>  hw/block/trace-events |   5 ++
>  include/block/nvme.h  | 172 ++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 294 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index d28335cbf3..70fd09d293 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -19,10 +19,18 @@
>   *      -drive file=<file>,if=none,id=<drive_id>
>   *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
>   *              cmb_size_mb=<cmb_size_mb[optional]>, \
> + *              [pmrdev=<mem_backend_file_id>,] \
>   *              num_queues=<N[optional]>
>   *
>   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> + *
> + * Either cmb or pmr - due to limitation in available BAR indexes.

This isn't 100% clear.  Does this mean "cmb_size_mb= and pmrdev= are
mutually exclusive due to limited availability of BARs"?  Please
rephrase.

> + * pmr_file file needs to be power of two in size.
> + * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
> + * For example:
> + * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
> + *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
>   */
>  
>  #include "qemu/osdep.h"
> @@ -35,7 +43,9 @@
>  #include "sysemu/sysemu.h"
>  #include "qapi/error.h"
>  #include "qapi/visitor.h"
> +#include "sysemu/hostmem.h"
>  #include "sysemu/block-backend.h"
> +#include "exec/ramblock.h"
>  
>  #include "qemu/log.h"
>  #include "qemu/module.h"
> @@ -1141,6 +1151,26 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
>          NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
>                         "invalid write to read only CMBSZ, ignored");
>          return;
> +    case 0xE00: /* PMRCAP */
> +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
> +                       "invalid write to PMRCAP register, ignored");
> +        return;
> +    case 0xE04: /* TODO PMRCTL */
> +        break;
> +    case 0xE08: /* PMRSTS */
> +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
> +                       "invalid write to PMRSTS register, ignored");
> +        return;
> +    case 0xE0C: /* PMREBS */
> +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
> +                       "invalid write to PMREBS register, ignored");
> +        return;
> +    case 0xE10: /* PMRSWTP */
> +        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
> +                       "invalid write to PMRSWTP register, ignored");
> +        return;
> +    case 0xE14: /* TODO PMRMSC */
> +         break;
>      default:
>          NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
>                         "invalid MMIO write,"
> @@ -1169,6 +1199,23 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
>      }
>  
>      if (addr < sizeof(n->bar)) {
> +        /*
> +         * When PMRWBM bit 1 is set then read from
> +         * from PMRSTS should ensure prior writes
> +         * made it to persistent media
> +         */
> +        if (addr == 0xE08 &&
> +            (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02) >> 1) {
> +            int status;
> +
> +            status = qemu_msync((void *)n->pmrdev->mr.ram_block->host,
> +                                n->pmrdev->size,
> +                                n->pmrdev->mr.ram_block->fd);

Please use qemu_ram_writeback() so that pmem_persist() and qemu_msync()
are used as appropriate.

Stefan
Andrzej Jakowski March 20, 2020, 5:48 p.m. UTC | #3
On 3/20/20 8:45 AM, Stefan Hajnoczi wrote:
> Please use qemu_ram_writeback() so that pmem_persist() and qemu_msync()
> are used as appropriate.

Thx!
qemu_ram_writeback() doesn't return any status. How can I know that actual msync succeds?

Also qemu_ram_writeback() requires me to include #include "exec/ram_addr.h". 
After including it when I compile code I'm getting following error:

In file included from hw/block/nvme.c:49:
/root/sources/pmr/qemu/include/exec/ram_addr.h:23:10: fatal error: cpu.h: No such file or directory
   23 | #include "cpu.h"
      |          ^~~~~~~
compilation terminated.
make: *** [/root/sources/pmr/qemu/rules.mak:69: hw/block/nvme.o] Error 1

Why this is happening and what should be changed.
Stefan Hajnoczi March 20, 2020, 7:24 p.m. UTC | #4
On Fri, Mar 20, 2020 at 5:48 PM Andrzej Jakowski
<andrzej.jakowski@linux.intel.com> wrote:
>
> On 3/20/20 8:45 AM, Stefan Hajnoczi wrote:
> > Please use qemu_ram_writeback() so that pmem_persist() and qemu_msync()
> > are used as appropriate.
>
> Thx!
> qemu_ram_writeback() doesn't return any status. How can I know that actual msync succeds?

If the warn_report() message that is already printed by
qemu_ram_writeback() is insufficient in terms of error reporting, I
suggest propagating the return value from qemu_ram_writeback() and
qemu_ram_block_writeback().

> Also qemu_ram_writeback() requires me to include #include "exec/ram_addr.h".
> After including it when I compile code I'm getting following error:
>
> In file included from hw/block/nvme.c:49:
> /root/sources/pmr/qemu/include/exec/ram_addr.h:23:10: fatal error: cpu.h: No such file or directory
>    23 | #include "cpu.h"
>       |          ^~~~~~~
> compilation terminated.
> make: *** [/root/sources/pmr/qemu/rules.mak:69: hw/block/nvme.o] Error 1
>
> Why this is happening and what should be changed.

Generally object files are built as part of common-obj-y in
Makefile.objs.  These object files are built only once across all QEMU
targets (e.g. qemu-system-x86_64, qemu-system-arm, ...).

Some code embeds target-specific information and is therefore not
suitable for common-obj-y.  These object files are built as part of
obj-y in Makefile.objs.

You can fix this compilation issue by changing hw/block/Makefile.objs
to like this:

diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
index 4b4a2b338d..12d5d5dac6 100644
--- a/hw/block/Makefile.objs
+++ b/hw/block/Makefile.objs
@@ -7,11 +7,11 @@ common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
 common-obj-$(CONFIG_XEN) += xen-block.o
 common-obj-$(CONFIG_ECC) += ecc.o
 common-obj-$(CONFIG_ONENAND) += onenand.o
-common-obj-$(CONFIG_NVME_PCI) += nvme.o
 common-obj-$(CONFIG_SWIM) += swim.o

 common-obj-$(CONFIG_SH4) += tc58128.o

+obj-$(CONFIG_NVME_PCI) += nvme.o
 obj-$(CONFIG_VIRTIO_BLK) += virtio-blk.o
 obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk.o

Stefan
diff mbox series

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf3..70fd09d293 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,10 +19,18 @@ 
  *      -drive file=<file>,if=none,id=<drive_id>
  *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
  *              cmb_size_mb=<cmb_size_mb[optional]>, \
+ *              [pmrdev=<mem_backend_file_id>,] \
  *              num_queues=<N[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ *
+ * Either cmb or pmr - due to limitation in available BAR indexes.
+ * pmr_file file needs to be power of two in size.
+ * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
+ * For example:
+ * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
+ *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
  */
 
 #include "qemu/osdep.h"
@@ -35,7 +43,9 @@ 
 #include "sysemu/sysemu.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
+#include "sysemu/hostmem.h"
 #include "sysemu/block-backend.h"
+#include "exec/ramblock.h"
 
 #include "qemu/log.h"
 #include "qemu/module.h"
@@ -1141,6 +1151,26 @@  static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
         NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
                        "invalid write to read only CMBSZ, ignored");
         return;
+    case 0xE00: /* PMRCAP */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
+                       "invalid write to PMRCAP register, ignored");
+        return;
+    case 0xE04: /* TODO PMRCTL */
+        break;
+    case 0xE08: /* PMRSTS */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
+                       "invalid write to PMRSTS register, ignored");
+        return;
+    case 0xE0C: /* PMREBS */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
+                       "invalid write to PMREBS register, ignored");
+        return;
+    case 0xE10: /* PMRSWTP */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
+                       "invalid write to PMRSWTP register, ignored");
+        return;
+    case 0xE14: /* TODO PMRMSC */
+         break;
     default:
         NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
                        "invalid MMIO write,"
@@ -1169,6 +1199,23 @@  static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
     }
 
     if (addr < sizeof(n->bar)) {
+        /*
+         * When PMRWBM bit 1 is set then read from
+         * from PMRSTS should ensure prior writes
+         * made it to persistent media
+         */
+        if (addr == 0xE08 &&
+            (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02) >> 1) {
+            int status;
+
+            status = qemu_msync((void *)n->pmrdev->mr.ram_block->host,
+                                n->pmrdev->size,
+                                n->pmrdev->mr.ram_block->fd);
+            if (!status) {
+                NVME_GUEST_ERR(nvme_ub_mmiord_pmrread_barrier,
+                               "error while persisting data");
+            }
+        }
         memcpy(&val, ptr + addr, size);
     } else {
         NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
@@ -1332,6 +1379,23 @@  static void nvme_realize(PCIDevice *pci_dev, Error **errp)
         error_setg(errp, "serial property not set");
         return;
     }
+
+    if (!n->cmb_size_mb && n->pmrdev) {
+        if (host_memory_backend_is_mapped(n->pmrdev)) {
+            char *path = object_get_canonical_path_component(OBJECT(n->pmrdev));
+            error_setg(errp, "can't use already busy memdev: %s", path);
+            g_free(path);
+            return;
+        }
+
+        if (!is_power_of_2(n->pmrdev->size)) {
+            error_setg(errp, "pmr backend size needs to be power of 2 in size");
+            return;
+        }
+
+        host_memory_backend_set_mapped(n->pmrdev, true);
+    }
+
     blkconf_blocksizes(&n->conf);
     if (!blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
                                        false, errp)) {
@@ -1393,7 +1457,6 @@  static void nvme_realize(PCIDevice *pci_dev, Error **errp)
     n->bar.intmc = n->bar.intms = 0;
 
     if (n->cmb_size_mb) {
-
         NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
         NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
 
@@ -1415,6 +1478,51 @@  static void nvme_realize(PCIDevice *pci_dev, Error **errp)
             PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
             PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
 
+    } else if (n->pmrdev) {
+        /* Controller Capabilities register */
+        NVME_CAP_SET_PMRS(n->bar.cap, 1);
+
+        /* PMR Capabities register */
+        n->bar.pmrcap = 0;
+        NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
+        NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0);
+        NVME_PMRCAP_SET_BIR(n->bar.pmrcap, 2);
+        NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0);
+        /* Turn on bit 1 support */
+        NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
+        NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0);
+        NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0);
+
+        /* PMR Control register */
+        n->bar.pmrctl = 0;
+        NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0);
+
+        /* PMR Status register */
+        n->bar.pmrsts = 0;
+        NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0);
+        NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0);
+        NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0);
+        NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0);
+
+        /* PMR Elasticity Buffer Size register */
+        n->bar.pmrebs = 0;
+        NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0);
+        NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0);
+        NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0);
+
+        /* PMR Sustained Write Throughput register */
+        n->bar.pmrswtp = 0;
+        NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0);
+        NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0);
+
+        /* PMR Memory Space Control register */
+        n->bar.pmrmsc = 0;
+        NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0);
+        NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0);
+
+        pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
+            PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
+            PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
     }
 
     for (i = 0; i < n->num_namespaces; i++) {
@@ -1441,15 +1549,20 @@  static void nvme_exit(PCIDevice *pci_dev)
     g_free(n->namespaces);
     g_free(n->cq);
     g_free(n->sq);
-
     if (n->cmb_size_mb) {
         g_free(n->cmbuf);
     }
+
+    if (n->pmrdev) {
+        host_memory_backend_set_mapped(n->pmrdev, false);
+    }
     msix_uninit_exclusive_bar(pci_dev);
 }
 
 static Property nvme_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
+    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND,
+                     HostMemoryBackend *),
     DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, cmb_size_mb, 0),
     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, num_queues, 64),
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 557194ee19..6520a9f0be 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -83,6 +83,8 @@  typedef struct NvmeCtrl {
     uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
 
     char            *serial;
+    HostMemoryBackend *pmrdev;
+
     NvmeNamespace   *namespaces;
     NvmeSQueue      **sq;
     NvmeCQueue      **cq;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index c03e80c2c9..233e094438 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -110,10 +110,15 @@  nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CA
 nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
 nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
 nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
+nvme_ub_mmiowr_pmrcap_readonly(void) "invalid write to read only PMRCAP, ignored"
+nvme_ub_mmiowr_pmrsts_readonly(void) "invalid write to read only PMRSTS, ignored"
+nvme_ub_mmiowr_pmrebs_readonly(void) "invalid write to read only PMREBS, ignored"
+nvme_ub_mmiowr_pmrswtp_readonly(void) "invalid write to read only PMRSWTP, ignored"
 nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
 nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
 nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
 nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
+nvme_ub_mmiord_pmrread_barrier(void) "failed to persists data"
 nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
 nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
 nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8fb941c653..374262d4b7 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -15,6 +15,13 @@  typedef struct NvmeBar {
     uint64_t    acq;
     uint32_t    cmbloc;
     uint32_t    cmbsz;
+    uint8_t     padding[3520]; /* not used by QEMU */
+    uint32_t    pmrcap;
+    uint32_t    pmrctl;
+    uint32_t    pmrsts;
+    uint32_t    pmrebs;
+    uint32_t    pmrswtp;
+    uint32_t    pmrmsc;
 } NvmeBar;
 
 enum NvmeCapShift {
@@ -27,6 +34,7 @@  enum NvmeCapShift {
     CAP_CSS_SHIFT      = 37,
     CAP_MPSMIN_SHIFT   = 48,
     CAP_MPSMAX_SHIFT   = 52,
+    CAP_PMR_SHIFT      = 56,
 };
 
 enum NvmeCapMask {
@@ -39,6 +47,7 @@  enum NvmeCapMask {
     CAP_CSS_MASK       = 0xff,
     CAP_MPSMIN_MASK    = 0xf,
     CAP_MPSMAX_MASK    = 0xf,
+    CAP_PMR_MASK       = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -69,6 +78,8 @@  enum NvmeCapMask {
                                                            << CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                             << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
+                                                            << CAP_PMR_SHIFT)
 
 enum NvmeCcShift {
     CC_EN_SHIFT     = 0,
@@ -205,6 +216,167 @@  enum NvmeCmbszMask {
 #define NVME_CMBSZ_GETSIZE(cmbsz) \
     (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
 
+enum NvmePmrcapShift {
+    PMRCAP_RDS_SHIFT   = 3,
+    PMRCAP_WDS_SHIFT   = 4,
+    PMRCAP_BIR_SHIFT   = 5,
+    PMRCAP_PMRTU_SHIFT   = 8,
+    PMRCAP_PMRWBM_SHIFT   = 10,
+    PMRCAP_PMRTO_SHIFT   = 16,
+    PMRCAP_CMSS_SHIFT   = 24,
+};
+
+enum NvmePmrcapMask {
+    PMRCAP_RDS_MASK   = 0x1,
+    PMRCAP_WDS_MASK   = 0x1,
+    PMRCAP_BIR_MASK   = 0x7,
+    PMRCAP_PMRTU_MASK   = 0x3,
+    PMRCAP_PMRWBM_MASK   = 0xf,
+    PMRCAP_PMRTO_MASK   = 0xff,
+    PMRCAP_CMSS_MASK   = 0x1,
+};
+
+#define NVME_PMRCAP_RDS(pmrcap)    \
+    ((pmrcap >> PMRCAP_RDS_SHIFT)   & PMRCAP_RDS_MASK)
+#define NVME_PMRCAP_WDS(pmrcap)    \
+    ((pmrcap >> PMRCAP_WDS_SHIFT)   & PMRCAP_WDS_MASK)
+#define NVME_PMRCAP_BIR(pmrcap)    \
+    ((pmrcap >> PMRCAP_BIR_SHIFT)   & PMRCAP_BIR_MASK)
+#define NVME_PMRCAP_PMRTU(pmrcap)    \
+    ((pmrcap >> PMRCAP_PMRTU_SHIFT)   & PMRCAP_PMRTU_MASK)
+#define NVME_PMRCAP_PMRWBM(pmrcap)    \
+    ((pmrcap >> PMRCAP_PMRWBM_SHIFT)   & PMRCAP_PMRWBM_MASK)
+#define NVME_PMRCAP_PMRTO(pmrcap)    \
+    ((pmrcap >> PMRCAP_PMRTO_SHIFT)   & PMRCAP_PMRTO_MASK)
+#define NVME_PMRCAP_CMSS(pmrcap)    \
+    ((pmrcap >> PMRCAP_CMSS_SHIFT)   & PMRCAP_CMSS_MASK)
+
+#define NVME_PMRCAP_SET_RDS(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_RDS_MASK) << PMRCAP_RDS_SHIFT)
+#define NVME_PMRCAP_SET_WDS(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_WDS_MASK) << PMRCAP_WDS_SHIFT)
+#define NVME_PMRCAP_SET_BIR(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_BIR_MASK) << PMRCAP_BIR_SHIFT)
+#define NVME_PMRCAP_SET_PMRTU(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_PMRTU_MASK) << PMRCAP_PMRTU_SHIFT)
+#define NVME_PMRCAP_SET_PMRWBM(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_PMRWBM_MASK) << PMRCAP_PMRWBM_SHIFT)
+#define NVME_PMRCAP_SET_PMRTO(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_PMRTO_MASK) << PMRCAP_PMRTO_SHIFT)
+#define NVME_PMRCAP_SET_CMSS(pmrcap, val)   \
+    (pmrcap |= (uint64_t)(val & PMRCAP_CMSS_MASK) << PMRCAP_CMSS_SHIFT)
+
+enum NvmePmrctlShift {
+    PMRCTL_EN_SHIFT   = 0,
+};
+
+enum NvmePmrctlMask {
+    PMRCTL_EN_MASK   = 0x1,
+};
+
+#define NVME_PMRCTL_EN(pmrctl)  ((pmrctl >> PMRCTL_EN_SHIFT)   & PMRCTL_EN_MASK)
+
+#define NVME_PMRCTL_SET_EN(pmrctl, val)   \
+    (pmrctl |= (uint64_t)(val & PMRCTL_EN_MASK) << PMRCTL_EN_SHIFT)
+
+enum NvmePmrstsShift {
+    PMRSTS_ERR_SHIFT   = 0,
+    PMRSTS_NRDY_SHIFT   = 8,
+    PMRSTS_HSTS_SHIFT   = 9,
+    PMRSTS_CBAI_SHIFT   = 12,
+};
+
+enum NvmePmrstsMask {
+    PMRSTS_ERR_MASK   = 0xff,
+    PMRSTS_NRDY_MASK   = 0x1,
+    PMRSTS_HSTS_MASK   = 0x7,
+    PMRSTS_CBAI_MASK   = 0x1,
+};
+
+#define NVME_PMRSTS_ERR(pmrsts)     \
+    ((pmrsts >> PMRSTS_ERR_SHIFT)   & PMRSTS_ERR_MASK)
+#define NVME_PMRSTS_NRDY(pmrsts)    \
+    ((pmrsts >> PMRSTS_NRDY_SHIFT)   & PMRSTS_NRDY_MASK)
+#define NVME_PMRSTS_HSTS(pmrsts)    \
+    ((pmrsts >> PMRSTS_HSTS_SHIFT)   & PMRSTS_HSTS_MASK)
+#define NVME_PMRSTS_CBAI(pmrsts)    \
+    ((pmrsts >> PMRSTS_CBAI_SHIFT)   & PMRSTS_CBAI_MASK)
+
+#define NVME_PMRSTS_SET_ERR(pmrsts, val)   \
+    (pmrsts |= (uint64_t)(val & PMRSTS_ERR_MASK) << PMRSTS_ERR_SHIFT)
+#define NVME_PMRSTS_SET_NRDY(pmrsts, val)   \
+    (pmrsts |= (uint64_t)(val & PMRSTS_NRDY_MASK) << PMRSTS_NRDY_SHIFT)
+#define NVME_PMRSTS_SET_HSTS(pmrsts, val)   \
+    (pmrsts |= (uint64_t)(val & PMRSTS_HSTS_MASK) << PMRSTS_HSTS_SHIFT)
+#define NVME_PMRSTS_SET_CBAI(pmrsts, val)   \
+    (pmrsts |= (uint64_t)(val & PMRSTS_CBAI_MASK) << PMRSTS_CBAI_SHIFT)
+
+enum NvmePmrebsShift {
+    PMREBS_PMRSZU_SHIFT   = 0,
+    PMREBS_RBB_SHIFT   = 4,
+    PMREBS_PMRWBZ_SHIFT   = 8,
+};
+
+enum NvmePmrebsMask {
+    PMREBS_PMRSZU_MASK   = 0xf,
+    PMREBS_RBB_MASK   = 0x1,
+    PMREBS_PMRWBZ_MASK   = 0xffffff,
+};
+
+#define NVME_PMREBS_PMRSZU(pmrebs)  \
+    ((pmrebs >> PMREBS_PMRSZU_SHIFT)   & PMREBS_PMRSZU_MASK)
+#define NVME_PMREBS_RBB(pmrebs)     \
+    ((pmrebs >> PMREBS_RBB_SHIFT)   & PMREBS_RBB_MASK)
+#define NVME_PMREBS_PMRWBZ(pmrebs)  \
+    ((pmrebs >> PMREBS_PMRWBZ_SHIFT)   & PMREBS_PMRWBZ_MASK)
+
+#define NVME_PMREBS_SET_PMRSZU(pmrebs, val)   \
+    (pmrebs |= (uint64_t)(val & PMREBS_PMRSZU_MASK) << PMREBS_PMRSZU_SHIFT)
+#define NVME_PMREBS_SET_RBB(pmrebs, val)   \
+    (pmrebs |= (uint64_t)(val & PMREBS_RBB_MASK) << PMREBS_RBB_SHIFT)
+#define NVME_PMREBS_SET_PMRWBZ(pmrebs, val)   \
+    (pmrebs |= (uint64_t)(val & PMREBS_PMRWBZ_MASK) << PMREBS_PMRWBZ_SHIFT)
+
+enum NvmePmrswtpShift {
+    PMRSWTP_PMRSWTU_SHIFT   = 0,
+    PMRSWTP_PMRSWTV_SHIFT   = 8,
+};
+
+enum NvmePmrswtpMask {
+    PMRSWTP_PMRSWTU_MASK   = 0xf,
+    PMRSWTP_PMRSWTV_MASK   = 0xffffff,
+};
+
+#define NVME_PMRSWTP_PMRSWTU(pmrswtp)   \
+    ((pmrswtp >> PMRSWTP_PMRSWTU_SHIFT)   & PMRSWTP_PMRSWTU_MASK)
+#define NVME_PMRSWTP_PMRSWTV(pmrswtp)   \
+    ((pmrswtp >> PMRSWTP_PMRSWTV_SHIFT)   & PMRSWTP_PMRSWTV_MASK)
+
+#define NVME_PMRSWTP_SET_PMRSWTU(pmrswtp, val)   \
+    (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTU_MASK) << PMRSWTP_PMRSWTU_SHIFT)
+#define NVME_PMRSWTP_SET_PMRSWTV(pmrswtp, val)   \
+    (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTV_MASK) << PMRSWTP_PMRSWTV_SHIFT)
+
+enum NvmePmrmscShift {
+    PMRMSC_CMSE_SHIFT   = 1,
+    PMRMSC_CBA_SHIFT   = 12,
+};
+
+enum NvmePmrmscMask {
+    PMRMSC_CMSE_MASK   = 0x1,
+    PMRMSC_CBA_MASK   = 0xfffffffffffff,
+};
+
+#define NVME_PMRMSC_CMSE(pmrmsc)    \
+    ((pmrmsc >> PMRMSC_CMSE_SHIFT)   & PMRMSC_CMSE_MASK)
+#define NVME_PMRMSC_CBA(pmrmsc)     \
+    ((pmrmsc >> PMRMSC_CBA_SHIFT)   & PMRMSC_CBA_MASK)
+
+#define NVME_PMRMSC_SET_CMSE(pmrmsc, val)   \
+    (pmrmsc |= (uint64_t)(val & PMRMSC_CMSE_MASK) << PMRMSC_CMSE_SHIFT)
+#define NVME_PMRMSC_SET_CBA(pmrmsc, val)   \
+    (pmrmsc |= (uint64_t)(val & PMRMSC_CBA_MASK) << PMRMSC_CBA_SHIFT)
+
 typedef struct NvmeCmd {
     uint8_t     opcode;
     uint8_t     fuse;