
[v6,36/42] nvme: add support for scatter gather lists

Message ID 20200316142928.153431-37-its@irrelevant.dk (mailing list archive)
State New, archived
Series nvme: support NVMe v1.3d, SGLs and multiple namespaces

Commit Message

Klaus Jensen March 16, 2020, 2:29 p.m. UTC
From: Klaus Jensen <k.jensen@samsung.com>

For now, support the Data Block, Segment and Last Segment descriptor
types.

See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").

Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
Acked-by: Keith Busch <kbusch@kernel.org>
---
 hw/block/nvme.c       | 310 +++++++++++++++++++++++++++++++++++-------
 hw/block/trace-events |   4 +
 2 files changed, 262 insertions(+), 52 deletions(-)

Comments

Maxim Levitsky March 25, 2020, 10:58 a.m. UTC | #1
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
> 
> For now, support the Data Block, Segment and Last Segment descriptor
> types.
> 
> See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> 
> Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> Acked-by: Keith Busch <kbusch@kernel.org>
> ---
>  hw/block/nvme.c       | 310 +++++++++++++++++++++++++++++++++++-------
>  hw/block/trace-events |   4 +
>  2 files changed, 262 insertions(+), 52 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 49d323566393..b89b96990f52 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -76,7 +76,12 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>  
>  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> +    hwaddr hi = addr + size;
> +    if (hi < addr) {
> +        return 1;
> +    }
> +
> +    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {

I would suggest to split this into a separate patch as well, since this contains not just one but 2 bugfixes
for this function and they are not related to sg lists.
Or at least move this to 'nvme: refactor nvme_addr_read' and rename this patch
to something like 'nvme: fix and refactor nvme_addr_read'
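
For reference, a minimal sketch of what the split-out fix could look like as a
standalone function (assuming the existing pci_dma_read() fallback; this is an
illustration, not the exact patch from the series):

    static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
    {
        hwaddr hi = addr + size;

        /* bugfix 1: reject transfers whose end wraps around the address space */
        if (hi < addr) {
            return 1;
        }

        /* bugfix 2: only take the CMB fast path if both the start and the end
         * of the transfer lie within the CMB */
        if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
            memcpy(buf, nvme_addr_to_cmb(n, addr), size);
            return 0;
        }

        return pci_dma_read(&n->parent_obj, addr, buf, size);
    }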


>          memcpy(buf, nvme_addr_to_cmb(n, addr), size);
>          return 0;
>      }
> @@ -328,13 +333,242 @@ unmap:
>      return status;
>  }
>  
> -static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> -                             uint64_t prp1, uint64_t prp2, DMADirection dir,
> +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
> +                                  QEMUIOVector *iov,
> +                                  NvmeSglDescriptor *segment, uint64_t nsgld,
> +                                  size_t *len, NvmeRequest *req)
> +{
> +    dma_addr_t addr, trans_len;
> +    uint32_t blk_len;
> +    uint16_t status;
> +
> +    for (int i = 0; i < nsgld; i++) {
> +        uint8_t type = NVME_SGL_TYPE(segment[i].type);
> +
> +        if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> +            switch (type) {
> +            case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
> +            case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
> +                return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
> +            default:
To be honest I don't like that 'default'
I would explicitly state which segment types remain 
(I think segment list and last segment list, and various reserved types)
In fact, for the reserved types you probably also want to return NVME_SGL_DESCR_TYPE_INVALID.

This function also really begs to have a description prior to it,
something like 'map an SGL segment, assuming that it only contains SGL data descriptors;
the caller has to ensure this'.
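
As an illustration, such a description could be as simple as this (a sketch only,
not the wording that ended up in the series):

    /*
     * Map a chunk of an SGL segment, assuming that it contains only Data
     * Block descriptors; any other descriptor type is rejected with an
     * appropriate status code. The caller is responsible for not passing
     * in trailing Segment/Last Segment descriptors.
     */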


> +                return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
> +            }
> +        }
> +
> +        if (*len == 0) {
> +            uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
Nitpick: I would add a small comment here as well describing
what this does (we reach this point if the SG list covers more than what
was specified in the command, and the NVME_CTRL_SGLS_EXCESS_LENGTH controller
capability indicates that we support just throwing the extra data away).

> +            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
> +                break;
> +            }
> +
> +            trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
> +            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> +        }
> +
> +        addr = le64_to_cpu(segment[i].addr);
> +        blk_len = le32_to_cpu(segment[i].len);
> +
> +        if (!blk_len) {
> +            continue;
> +        }
> +
> +        if (UINT64_MAX - addr < blk_len) {
> +            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> +        }
Good!
> +
> +        trans_len = MIN(*len, blk_len);
> +
> +        status = nvme_map_addr(n, qsg, iov, addr, trans_len);
> +        if (status) {
> +            return status;
> +        }
> +
> +        *len -= trans_len;
> +    }
> +
> +    return NVME_SUCCESS;
> +}
> +
> +static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
> +                             NvmeSglDescriptor sgl, size_t len,
>                               NvmeRequest *req)
> +{
> +    /*
> +     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
> +     * dynamically allocating a potentially large SGL. The spec allows the SGL
> +     * to be larger than the command transfer size, so it is not bounded by
> +     * MDTS.
> +     */
Now this is a very good comment!

However, I don't fully understand the note about the SGL. I assume that you mean
that the data the SGL covers should still be less than MDTS, but the actual SGL chain,
if assembled in a really inefficient way (like 1 byte per data descriptor), might be larger.


> +    const int SEG_CHUNK_SIZE = 256;
> +
> +    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
> +    uint64_t nsgld;
> +    uint32_t seg_len;
> +    uint16_t status;
> +    bool sgl_in_cmb = false;
> +    hwaddr addr;
> +    int ret;
> +
> +    sgld = &sgl;
> +    addr = le64_to_cpu(sgl.addr);
> +
> +    trace_nvme_dev_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), req->nlb,
> +                           len);
> +
> +    /*
> +     * If the entire transfer can be described with a single data block it can
> +     * be mapped directly.
> +     */
> +    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> +        status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
> +        if (status) {
> +            goto unmap;
> +        }
> +
> +        goto out;
> +    }
> +
> +    /*
> +     * If the segment is located in the CMB, the submission queue of the
> +     * request must also reside there.
> +     */
> +    if (nvme_addr_is_cmb(n, addr)) {
> +        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
> +            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
> +        }
> +
> +        sgl_in_cmb = true;
> +    }
> +
> +    for (;;) {
> +        seg_len = le32_to_cpu(sgld->len);
> +
> +        if (!seg_len || seg_len & 0xf) {
> +            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
> +        }
It might be worth noting here that we are dealing with an SGL (Last) Segment descriptor,
and its length indeed must be non-zero and a multiple of 16.
Without such a note, I confused this for a moment with the alignment requirements on the data itself.
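
A short comment could make that explicit (a sketch):

    /*
     * The length field of a Segment/Last Segment descriptor gives the size
     * of the segment it points to, so it must be a non-zero multiple of the
     * descriptor size (16 bytes). This is unrelated to any alignment
     * requirements on the data itself.
     */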

> +
> +        if (UINT64_MAX - addr < seg_len) {
> +            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> +        }
> +
> +        nsgld = seg_len / sizeof(NvmeSglDescriptor);
> +
> +        while (nsgld > SEG_CHUNK_SIZE) {
> +            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
> +                trace_nvme_dev_err_addr_read(addr);
> +                status = NVME_DATA_TRANSFER_ERROR;
> +                goto unmap;
> +            }
> +
> +            status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE,
> +                                       &len, req);
> +            if (status) {
> +                goto unmap;
> +            }
> +
> +            nsgld -= SEG_CHUNK_SIZE;
> +            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
> +        }
> +
> +        ret = nvme_addr_read(n, addr, segment, nsgld *
> +                             sizeof(NvmeSglDescriptor));
> +        if (ret) {
> +            trace_nvme_dev_err_addr_read(addr);
> +            status = NVME_DATA_TRANSFER_ERROR;
> +            goto unmap;
> +        }
> +
> +        last_sgld = &segment[nsgld - 1];
> +
> +        /* if the segment ends with a Data Block, then we are done */
> +        if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> +            status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req);
> +            if (status) {
> +                goto unmap;
> +            }
> +
> +            break;
> +        }
> +
> +        /* a Last Segment must end with a Data Block descriptor */
> +        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
> +            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
> +            goto unmap;
> +        }
> +
> +        sgld = last_sgld;
> +        addr = le64_to_cpu(sgld->addr);
> +
> +        /*
> +         * Do not map the last descriptor; it will be a Segment or Last Segment
> +         * descriptor instead and handled by the next iteration.
> +         */
> +        status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req);
> +        if (status) {
> +            goto unmap;
> +        }
> +
> +        /*
> +         * If the next segment is in the CMB, make sure that the sgl was
> +         * already located there.
> +         */
> +        if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
> +            status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
> +            goto unmap;
> +        }
> +    }
> +
> +out:
> +    /* if there is any residual left in len, the SGL was too short */
> +    if (len) {
> +        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> +        goto unmap;
> +    }
> +
> +    return NVME_SUCCESS;
> +
> +unmap:
> +    if (iov->iov) {
> +        qemu_iovec_destroy(iov);
> +    }
> +
> +    if (qsg->sg) {
> +        qemu_sglist_destroy(qsg);
> +    }
> +
> +    return status;
> +}
Looks OK, I might have missed something.

> +
> +static uint16_t nvme_map(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
> +                         size_t len, NvmeRequest *req)
> +{
> +    uint64_t prp1, prp2;
> +
> +    switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) {
> +    case PSDT_PRP:
> +        prp1 = le64_to_cpu(req->cmd.dptr.prp1);
> +        prp2 = le64_to_cpu(req->cmd.dptr.prp2);
> +
> +        return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
> +    case PSDT_SGL_MPTR_CONTIGUOUS:
> +    case PSDT_SGL_MPTR_SGL:
> +        /* SGLs shall not be used for Admin commands in NVMe over PCIe */
> +        if (!req->sq->sqid) {
> +            return NVME_INVALID_FIELD | NVME_DNR;
> +        }
> +
> +        return nvme_map_sgl(n, qsg, iov, req->cmd.dptr.sgl, len, req);
> +    default:
> +        return NVME_INVALID_FIELD;
> +    }
> +}
Looks OK
> +
> +static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> +                         DMADirection dir, NvmeRequest *req)
>  {
>      uint16_t status = NVME_SUCCESS;
>  
> -    status = nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
> +    status = nvme_map(n, &req->qsg, &req->iov, len, req);
>      if (status) {
>          return status;
>      }
> @@ -370,15 +604,6 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
>      return status;
>  }
>  
> -static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
> -                         QEMUIOVector *iov, size_t len, NvmeRequest *req)
> -{
> -    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> -    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> -
> -    return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
> -}
> -
>  static void nvme_aio_destroy(NvmeAIO *aio)
>  {
>      g_free(aio);
> @@ -846,7 +1071,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>          goto invalid;
>      }
>  
> -    status = nvme_map(n, cmd, &req->qsg, &req->iov, len, req);
> +    status = nvme_map(n, &req->qsg, &req->iov, len, req);
>      if (status) {
>          goto invalid;
>      }
> @@ -1013,8 +1238,6 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
>                                  uint32_t buf_len, uint64_t off,
>                                  NvmeRequest *req)
>  {
> -    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> -    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
>      uint32_t nsid = le32_to_cpu(cmd->nsid);
>  
>      uint32_t trans_len;
> @@ -1064,16 +1287,14 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
>          nvme_clear_events(n, NVME_AER_TYPE_SMART);
>      }
>  
> -    return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2,
> -                        DMA_DIRECTION_FROM_DEVICE, req);
> +    return nvme_dma(n, (uint8_t *) &smart + off, trans_len,
> +                    DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
>                                   uint64_t off, NvmeRequest *req)
>  {
>      uint32_t trans_len;
> -    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> -    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
>      NvmeFwSlotInfoLog fw_log;
>  
>      if (off > sizeof(fw_log)) {
> @@ -1084,8 +1305,8 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
>  
>      trans_len = MIN(sizeof(fw_log) - off, buf_len);
>  
> -    return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2,
> -                        DMA_DIRECTION_FROM_DEVICE, req);
> +    return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len,
> +                    DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> @@ -1093,8 +1314,6 @@ static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
>                                  NvmeRequest *req)
>  {
>      uint32_t trans_len;
> -    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> -    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
>      uint8_t errlog[64];
>  
>      if (!rae) {
> @@ -1109,8 +1328,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
>  
>      trans_len = MIN(sizeof(errlog) - off, buf_len);
>  
> -    return nvme_dma_prp(n, errlog, trans_len, prp1, prp2,
> -                        DMA_DIRECTION_FROM_DEVICE, req);
> +    return nvme_dma(n, errlog, trans_len, DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> @@ -1255,13 +1473,10 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
>  static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c,
>                                     NvmeRequest *req)
>  {
> -    uint64_t prp1 = le64_to_cpu(c->prp1);
> -    uint64_t prp2 = le64_to_cpu(c->prp2);
> -
>      trace_nvme_dev_identify_ctrl();
>  
> -    return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1,
> -                        prp2, DMA_DIRECTION_FROM_DEVICE, req);
> +    return nvme_dma(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
> +                    DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c,
> @@ -1269,8 +1484,6 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c,
>  {
>      NvmeNamespace *ns;
>      uint32_t nsid = le32_to_cpu(c->nsid);
> -    uint64_t prp1 = le64_to_cpu(c->prp1);
> -    uint64_t prp2 = le64_to_cpu(c->prp2);
>  
>      trace_nvme_dev_identify_ns(nsid);
>  
> @@ -1281,8 +1494,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c,
>  
>      ns = &n->namespaces[nsid - 1];
>  
> -    return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1,
> -                        prp2, DMA_DIRECTION_FROM_DEVICE, req);
> +    return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
> +                    DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c,
> @@ -1290,8 +1503,6 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c,
>  {
>      static const int data_len = NVME_IDENTIFY_DATA_SIZE;
>      uint32_t min_nsid = le32_to_cpu(c->nsid);
> -    uint64_t prp1 = le64_to_cpu(c->prp1);
> -    uint64_t prp2 = le64_to_cpu(c->prp2);
>      uint32_t *list;
>      uint16_t ret;
>      int i, j = 0;
> @@ -1308,8 +1519,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c,
>              break;
>          }
>      }
> -    ret = nvme_dma_prp(n, (uint8_t *)list, data_len, prp1, prp2,
> -                       DMA_DIRECTION_FROM_DEVICE, req);
> +    ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE,
> +                   req);
>      g_free(list);
>      return ret;
>  }
> @@ -1318,8 +1529,6 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c,
>                                              NvmeRequest *req)
>  {
>      uint32_t nsid = le32_to_cpu(c->nsid);
> -    uint64_t prp1 = le64_to_cpu(c->prp1);
> -    uint64_t prp2 = le64_to_cpu(c->prp2);
>  
>      void *list;
>      uint16_t ret;
> @@ -1345,8 +1554,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c,
>      ns_descr->nidl = NVME_NIDT_UUID_LEN;
>      stl_be_p(ns_descr + sizeof(*ns_descr), nsid);
>  
> -    ret = nvme_dma_prp(n, (uint8_t *) list, NVME_IDENTIFY_DATA_SIZE, prp1,
> -                       prp2, DMA_DIRECTION_FROM_DEVICE, req);
> +    ret = nvme_dma(n, (uint8_t *)list, NVME_IDENTIFY_DATA_SIZE,
> +                   DMA_DIRECTION_FROM_DEVICE, req);
>      g_free(list);
>      return ret;
>  }
> @@ -1425,13 +1634,10 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
>  static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd,
>                                             NvmeRequest *req)
>  {
> -    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> -    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> -
>      uint64_t timestamp = nvme_get_timestamp(n);
>  
> -    return nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
> -                        prp2, DMA_DIRECTION_FROM_DEVICE, req);
> +    return nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
> +                    DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> @@ -1514,11 +1720,9 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd,
>  {
>      uint16_t ret;
>      uint64_t timestamp;
> -    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> -    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
>  
> -    ret = nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
> -                       prp2, DMA_DIRECTION_TO_DEVICE, req);
> +    ret = nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
> +                   DMA_DIRECTION_TO_DEVICE, req);
>      if (ret != NVME_SUCCESS) {
>          return ret;
>      }
> @@ -2306,6 +2510,8 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>      id->nn = cpu_to_le32(n->num_namespaces);
>      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
>  
> +    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORTED_NO_ALIGNMENT);
> +
>      pstrcpy((char *) id->subnqn, sizeof(id->subnqn), "nqn.2019-08.org.qemu:");
>      pstrcat((char *) id->subnqn, sizeof(id->subnqn), n->params.serial);
>  
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index d51c09a4e454..70702cc67d5a 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -34,6 +34,7 @@ nvme_dev_irq_pin(void) "pulsing IRQ pin"
>  nvme_dev_irq_masked(void) "IRQ is masked"
>  nvme_dev_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
>  nvme_dev_map_prp(uint16_t cid, uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "cid %"PRIu16" trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64"
> num_prps %d"
> +nvme_dev_map_sgl(uint16_t cid, uint8_t typ, uint32_t nlb, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" nlb %"PRIu32" len %"PRIu64""
>  nvme_dev_req_register_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t count, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" count
> %"PRIu64" opc \"%s\" req %p"
>  nvme_dev_aio_cb(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p"
>  nvme_dev_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8""
> @@ -89,6 +90,9 @@ nvme_dev_err_prinfo(uint16_t cid, uint16_t ctrl) "cid %"PRIu16" ctrl %"PRIu16""
>  nvme_dev_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p
> status 0x%"PRIx16""
>  nvme_dev_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
>  nvme_dev_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
> +nvme_dev_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
> +nvme_dev_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
> +nvme_dev_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16""
>  nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
>  nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
>  nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""


Best regards,
	Maxim Levitsky
Klaus Jensen March 31, 2020, 5:48 a.m. UTC | #2
On Mar 25 12:58, Maxim Levitsky wrote:
> On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > From: Klaus Jensen <k.jensen@samsung.com>
> > 
> > For now, support the Data Block, Segment and Last Segment descriptor
> > types.
> > 
> > See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> > 
> > Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> > Acked-by: Keith Busch <kbusch@kernel.org>
> > ---
> >  hw/block/nvme.c       | 310 +++++++++++++++++++++++++++++++++++-------
> >  hw/block/trace-events |   4 +
> >  2 files changed, 262 insertions(+), 52 deletions(-)
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 49d323566393..b89b96990f52 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -76,7 +76,12 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> >  
> >  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> >  {
> > -    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> > +    hwaddr hi = addr + size;
> > +    if (hi < addr) {
> > +        return 1;
> > +    }
> > +
> > +    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
> 
> I would suggest to split this into a separate patch as well, since this contains not just one but 2 bugfixes
> for this function and they are not related to sg lists.
> Or at least move this to 'nvme: refactor nvme_addr_read' and rename this patch
> to something like 'nvme: fix and refactor nvme_addr_read'
> 

I've split it into a patch.

> 
> >          memcpy(buf, nvme_addr_to_cmb(n, addr), size);
> >          return 0;
> >      }
> > @@ -328,13 +333,242 @@ unmap:
> >      return status;
> >  }
> >  
> > -static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> > -                             uint64_t prp1, uint64_t prp2, DMADirection dir,
> > +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
> > +                                  QEMUIOVector *iov,
> > +                                  NvmeSglDescriptor *segment, uint64_t nsgld,
> > +                                  size_t *len, NvmeRequest *req)
> > +{
> > +    dma_addr_t addr, trans_len;
> > +    uint32_t blk_len;
> > +    uint16_t status;
> > +
> > +    for (int i = 0; i < nsgld; i++) {
> > +        uint8_t type = NVME_SGL_TYPE(segment[i].type);
> > +
> > +        if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> > +            switch (type) {
> > +            case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
> > +            case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
> > +                return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
> > +            default:
> To be honest I don't like that 'default'
> I would explicitly state which segment types remain 
> (I think segment list and last segment list, and various reserved types)
> In fact, for the reserved types you probably also want to return NVME_SGL_DESCR_TYPE_INVALID.
> 

I "negated" the logic which I think is more readable. I still really
want to keep the default; for instance, NVMe v1.4 adds a new type that
we do not support (the Transport SGL Data Block descriptor).
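
For illustration, one way to read "negated" is to dispatch on the type directly
inside the descriptor loop while keeping the catch-all (a sketch only; the exact
code in the next revision may differ):

    switch (type) {
    case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
        break;
    case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
    case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
        return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
    default:
        /* Segment, Last Segment, reserved and future (e.g. v1.4) types */
        return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
    }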

> This function also really begs to have a description prior to it,
> something like 'map an SGL segment, assuming that it only contains SGL data descriptors;
> the caller has to ensure this'.
> 

Done.

> 
> > +                return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
> > +            }
> > +        }
> > +
> > +        if (*len == 0) {
> > +            uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
> Nitpick: I would add a small comment here as well describing
> what this does (we reach this point if the SG list covers more than what
> was specified in the command, and the NVME_CTRL_SGLS_EXCESS_LENGTH controller
> capability indicates that we support just throwing the extra data away).
> 

Adding a comment. It's the other way around. The size as indicated by
NLB (or whatever, depending on the command) is the "authoritative" source
of information for the size of the payload. We will never accept an SGL
that is too short such that we lose or throw away data, but we might
accept ignoring parts of the SGL.
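
A comment in that spirit might read something like this (a sketch, not
necessarily the final wording):

    /*
     * The transfer size from the command (e.g. NLB) is authoritative and
     * has been fully consumed at this point. If the controller advertises
     * support for SGLs that describe more data than is transferred, simply
     * ignore the rest of the SGL; otherwise the SGL is invalid.
     */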

> > +            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
> > +                break;
> > +            }
> > +
> > +            trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
> > +            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> > +        }
> > +
> > +        addr = le64_to_cpu(segment[i].addr);
> > +        blk_len = le32_to_cpu(segment[i].len);
> > +
> > +        if (!blk_len) {
> > +            continue;
> > +        }
> > +
> > +        if (UINT64_MAX - addr < blk_len) {
> > +            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> > +        }
> Good!
> > +
> > +        trans_len = MIN(*len, blk_len);
> > +
> > +        status = nvme_map_addr(n, qsg, iov, addr, trans_len);
> > +        if (status) {
> > +            return status;
> > +        }
> > +
> > +        *len -= trans_len;
> > +    }
> > +
> > +    return NVME_SUCCESS;
> > +}
> > +
> > +static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
> > +                             NvmeSglDescriptor sgl, size_t len,
> >                               NvmeRequest *req)
> > +{
> > +    /*
> > +     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
> > +     * dynamically allocating a potentially large SGL. The spec allows the SGL
> > +     * to be larger than the command transfer size, so it is not bounded by
> > +     * MDTS.
> > +     */
> Now this is a very good comment!
> 
> However, I don't fully understand the note about the SGL. I assume that you mean
> that the data the SGL covers should still be less than MDTS, but the actual SGL chain,
> if assembled in a really inefficient way (like 1 byte per data descriptor), might be larger.
> 

Exactly. I'll rephrase.
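
A possible rephrasing of the note (a sketch):

    /*
     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
     * dynamically allocating a potentially large SGL. The data described by
     * the SGL is still bounded by MDTS, but the chain of descriptors itself
     * is not; e.g. a guest could use one Data Block descriptor per byte of
     * the transfer, making the SGL much larger than the data.
     */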

> 
> > +    const int SEG_CHUNK_SIZE = 256;
> > +
> > +    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
> > +    uint64_t nsgld;
> > +    uint32_t seg_len;
> > +    uint16_t status;
> > +    bool sgl_in_cmb = false;
> > +    hwaddr addr;
> > +    int ret;
> > +
> > +    sgld = &sgl;
> > +    addr = le64_to_cpu(sgl.addr);
> > +
> > +    trace_nvme_dev_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), req->nlb,
> > +                           len);
> > +
> > +    /*
> > +     * If the entire transfer can be described with a single data block it can
> > +     * be mapped directly.
> > +     */
> > +    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> > +        status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
> > +        if (status) {
> > +            goto unmap;
> > +        }
> > +
> > +        goto out;
> > +    }
> > +
> > +    /*
> > +     * If the segment is located in the CMB, the submission queue of the
> > +     * request must also reside there.
> > +     */
> > +    if (nvme_addr_is_cmb(n, addr)) {
> > +        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
> > +            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
> > +        }
> > +
> > +        sgl_in_cmb = true;
> > +    }
> > +
> > +    for (;;) {
> > +        seg_len = le32_to_cpu(sgld->len);
> > +
> > +        if (!seg_len || seg_len & 0xf) {
> > +            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
> > +        }
> It might be worth noting here that we are dealing with an SGL (Last) Segment descriptor,
> and its length indeed must be non-zero and a multiple of 16.
> Without such a note, I confused this for a moment with the alignment requirements on the data itself.
> 

Done.

> 
> Best regards,
> 	Maxim Levitsky
> 
> 
> 
> 
>
Maxim Levitsky March 31, 2020, 8:51 a.m. UTC | #3
On Tue, 2020-03-31 at 07:48 +0200, Klaus Birkelund Jensen wrote:
> On Mar 25 12:58, Maxim Levitsky wrote:
> > On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > > From: Klaus Jensen <k.jensen@samsung.com>
> > > 
> > > For now, support the Data Block, Segment and Last Segment descriptor
> > > types.
> > > 
> > > See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> > > 
> > > Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> > > Acked-by: Keith Busch <kbusch@kernel.org>
> > > ---
> > >  hw/block/nvme.c       | 310 +++++++++++++++++++++++++++++++++++-------
> > >  hw/block/trace-events |   4 +
> > >  2 files changed, 262 insertions(+), 52 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 49d323566393..b89b96990f52 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -76,7 +76,12 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> > >  
> > >  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> > >  {
> > > -    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> > > +    hwaddr hi = addr + size;
> > > +    if (hi < addr) {
> > > +        return 1;
> > > +    }
> > > +
> > > +    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
> > 
> > I would suggest to split this into a separate patch as well, since this contains not just one but 2 bugfixes
> > for this function and they are not related to sg lists.
> > Or at least move this to 'nvme: refactor nvme_addr_read' and rename this patch
> > to something like 'nvme: fix and refactor nvme_addr_read'
> > 
> 
> I've split it into a patch.
> 
> > 
> > >          memcpy(buf, nvme_addr_to_cmb(n, addr), size);
> > >          return 0;
> > >      }
> > > @@ -328,13 +333,242 @@ unmap:
> > >      return status;
> > >  }
> > >  
> > > -static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> > > -                             uint64_t prp1, uint64_t prp2, DMADirection dir,
> > > +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
> > > +                                  QEMUIOVector *iov,
> > > +                                  NvmeSglDescriptor *segment, uint64_t nsgld,
> > > +                                  size_t *len, NvmeRequest *req)
> > > +{
> > > +    dma_addr_t addr, trans_len;
> > > +    uint32_t blk_len;
> > > +    uint16_t status;
> > > +
> > > +    for (int i = 0; i < nsgld; i++) {
> > > +        uint8_t type = NVME_SGL_TYPE(segment[i].type);
> > > +
> > > +        if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> > > +            switch (type) {
> > > +            case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
> > > +            case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
> > > +                return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
> > > +            default:
> > 
> > To be honest I don't like that 'default'
> > I would explicitly state which segment types remain 
> > (I think segment list and last segment list, and various reserved types)
> > In fact, for the reserved types you probably also want to return NVME_SGL_DESCR_TYPE_INVALID.
> > 
> 
> I "negated" the logic which I think is more readable. I still really
> want to keep the default; for instance, NVMe v1.4 adds a new type that
> we do not support (the Transport SGL Data Block descriptor).
OK, I'll take a look at that in the next version of the patches.

> 
> > This function also really begs to have a description prior to it,
> > something like 'map an SGL segment, assuming that it only contains SGL data descriptors;
> > the caller has to ensure this'.
> > 
> 
> Done.
Thanks a lot!
> 
> > 
> > > +                return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
> > > +            }
> > > +        }
> > > +
> > > +        if (*len == 0) {
> > > +            uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
> > 
> > Nitpick: I would add a small comment here as well describing
> > what this does (we reach this point if the SG list covers more than what
> > was specified in the command, and the NVME_CTRL_SGLS_EXCESS_LENGTH controller
> > capability indicates that we support just throwing the extra data away).
> > 
> 
> Adding a comment. It's the other way around. The size as indicated by
> NLB (or whatever, depending on the command) is the "authoritative" source
> of information for the size of the payload. We will never accept an SGL
> that is too short such that we lose or throw away data, but we might
> accept ignoring parts of the SGL.
Yes, that is what I meant. Thanks!

> 
> > > +            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
> > > +                break;
> > > +            }
> > > +
> > > +            trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
> > > +            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> > > +        }
> > > +
> > > +        addr = le64_to_cpu(segment[i].addr);
> > > +        blk_len = le32_to_cpu(segment[i].len);
> > > +
> > > +        if (!blk_len) {
> > > +            continue;
> > > +        }
> > > +
> > > +        if (UINT64_MAX - addr < blk_len) {
> > > +            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> > > +        }
> > 
> > Good!
> > > +
> > > +        trans_len = MIN(*len, blk_len);
> > > +
> > > +        status = nvme_map_addr(n, qsg, iov, addr, trans_len);
> > > +        if (status) {
> > > +            return status;
> > > +        }
> > > +
> > > +        *len -= trans_len;
> > > +    }
> > > +
> > > +    return NVME_SUCCESS;
> > > +}
> > > +
> > > +static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
> > > +                             NvmeSglDescriptor sgl, size_t len,
> > >                               NvmeRequest *req)
> > > +{
> > > +    /*
> > > +     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
> > > +     * dynamically allocating a potentially large SGL. The spec allows the SGL
> > > +     * to be larger than the command transfer size, so it is not bounded by
> > > +     * MDTS.
> > > +     */
> > 
> > Now this is a very good comment!
> > 
> > However, I don't fully understand the note about the SGL. I assume that you mean
> > that the data the SGL covers should still be less than MDTS, but the actual SGL chain,
> > if assembled in a really inefficient way (like 1 byte per data descriptor), might be larger.
> > 
> 
> Exactly. I'll rephrase.
Thanks!
> 
> > 
> > > +    const int SEG_CHUNK_SIZE = 256;
> > > +
> > > +    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
> > > +    uint64_t nsgld;
> > > +    uint32_t seg_len;
> > > +    uint16_t status;
> > > +    bool sgl_in_cmb = false;
> > > +    hwaddr addr;
> > > +    int ret;
> > > +
> > > +    sgld = &sgl;
> > > +    addr = le64_to_cpu(sgl.addr);
> > > +
> > > +    trace_nvme_dev_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), req->nlb,
> > > +                           len);
> > > +
> > > +    /*
> > > +     * If the entire transfer can be described with a single data block it can
> > > +     * be mapped directly.
> > > +     */
> > > +    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> > > +        status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
> > > +        if (status) {
> > > +            goto unmap;
> > > +        }
> > > +
> > > +        goto out;
> > > +    }
> > > +
> > > +    /*
> > > +     * If the segment is located in the CMB, the submission queue of the
> > > +     * request must also reside there.
> > > +     */
> > > +    if (nvme_addr_is_cmb(n, addr)) {
> > > +        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
> > > +            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
> > > +        }
> > > +
> > > +        sgl_in_cmb = true;
> > > +    }
> > > +
> > > +    for (;;) {
> > > +        seg_len = le32_to_cpu(sgld->len);
> > > +
> > > +        if (!seg_len || seg_len & 0xf) {
> > > +            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
> > > +        }
> > 
> > It might be worth noting here that we are dealing with an SGL (Last) Segment descriptor,
> > and its length indeed must be non-zero and a multiple of 16.
> > Without such a note, I confused this for a moment with the alignment requirements on the data itself.
> > 
> 
> Done.
Thanks as well!
> 
> > 
> > Best regards,
> > 	Maxim Levitsky
> > 
> > 
> > 
> > 
> > 
> 

Best regards,
	Maxim Levitsky

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 49d323566393..b89b96990f52 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -76,7 +76,12 @@  static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 
 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
+    hwaddr hi = addr + size;
+    if (hi < addr) {
+        return 1;
+    }
+
+    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
         memcpy(buf, nvme_addr_to_cmb(n, addr), size);
         return 0;
     }
@@ -328,13 +333,242 @@  unmap:
     return status;
 }
 
-static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
-                             uint64_t prp1, uint64_t prp2, DMADirection dir,
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
+                                  QEMUIOVector *iov,
+                                  NvmeSglDescriptor *segment, uint64_t nsgld,
+                                  size_t *len, NvmeRequest *req)
+{
+    dma_addr_t addr, trans_len;
+    uint32_t blk_len;
+    uint16_t status;
+
+    for (int i = 0; i < nsgld; i++) {
+        uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
+        if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+            switch (type) {
+            case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
+            case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
+                return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+            default:
+                return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+            }
+        }
+
+        if (*len == 0) {
+            uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
+            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
+                break;
+            }
+
+            trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        addr = le64_to_cpu(segment[i].addr);
+        blk_len = le32_to_cpu(segment[i].len);
+
+        if (!blk_len) {
+            continue;
+        }
+
+        if (UINT64_MAX - addr < blk_len) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        trans_len = MIN(*len, blk_len);
+
+        status = nvme_map_addr(n, qsg, iov, addr, trans_len);
+        if (status) {
+            return status;
+        }
+
+        *len -= trans_len;
+    }
+
+    return NVME_SUCCESS;
+}
+
+static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+                             NvmeSglDescriptor sgl, size_t len,
                              NvmeRequest *req)
+{
+    /*
+     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
+     * dynamically allocating a potentially large SGL. The spec allows the SGL
+     * to be larger than the command transfer size, so it is not bounded by
+     * MDTS.
+     */
+    const int SEG_CHUNK_SIZE = 256;
+
+    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+    uint64_t nsgld;
+    uint32_t seg_len;
+    uint16_t status;
+    bool sgl_in_cmb = false;
+    hwaddr addr;
+    int ret;
+
+    sgld = &sgl;
+    addr = le64_to_cpu(sgl.addr);
+
+    trace_nvme_dev_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), req->nlb,
+                           len);
+
+    /*
+     * If the entire transfer can be described with a single data block it can
+     * be mapped directly.
+     */
+    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+        status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
+        if (status) {
+            goto unmap;
+        }
+
+        goto out;
+    }
+
+    /*
+     * If the segment is located in the CMB, the submission queue of the
+     * request must also reside there.
+     */
+    if (nvme_addr_is_cmb(n, addr)) {
+        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
+            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+        }
+
+        sgl_in_cmb = true;
+    }
+
+    for (;;) {
+        seg_len = le32_to_cpu(sgld->len);
+
+        if (!seg_len || seg_len & 0xf) {
+            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+        }
+
+        if (UINT64_MAX - addr < seg_len) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        nsgld = seg_len / sizeof(NvmeSglDescriptor);
+
+        while (nsgld > SEG_CHUNK_SIZE) {
+            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
+                trace_nvme_dev_err_addr_read(addr);
+                status = NVME_DATA_TRANSFER_ERROR;
+                goto unmap;
+            }
+
+            status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE,
+                                       &len, req);
+            if (status) {
+                goto unmap;
+            }
+
+            nsgld -= SEG_CHUNK_SIZE;
+            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
+        }
+
+        ret = nvme_addr_read(n, addr, segment, nsgld *
+                             sizeof(NvmeSglDescriptor));
+        if (ret) {
+            trace_nvme_dev_err_addr_read(addr);
+            status = NVME_DATA_TRANSFER_ERROR;
+            goto unmap;
+        }
+
+        last_sgld = &segment[nsgld - 1];
+
+        /* if the segment ends with a Data Block, then we are done */
+        if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+            status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req);
+            if (status) {
+                goto unmap;
+            }
+
+            break;
+        }
+
+        /* a Last Segment must end with a Data Block descriptor */
+        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
+            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+            goto unmap;
+        }
+
+        sgld = last_sgld;
+        addr = le64_to_cpu(sgld->addr);
+
+        /*
+         * Do not map the last descriptor; it will be a Segment or Last Segment
+         * descriptor instead and handled by the next iteration.
+         */
+        status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req);
+        if (status) {
+            goto unmap;
+        }
+
+        /*
+         * If the next segment is in the CMB, make sure that the sgl was
+         * already located there.
+         */
+        if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
+            status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
+            goto unmap;
+        }
+    }
+
+out:
+    /* if there is any residual left in len, the SGL was too short */
+    if (len) {
+        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        goto unmap;
+    }
+
+    return NVME_SUCCESS;
+
+unmap:
+    if (iov->iov) {
+        qemu_iovec_destroy(iov);
+    }
+
+    if (qsg->sg) {
+        qemu_sglist_destroy(qsg);
+    }
+
+    return status;
+}
+
+static uint16_t nvme_map(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+                         size_t len, NvmeRequest *req)
+{
+    uint64_t prp1, prp2;
+
+    switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) {
+    case PSDT_PRP:
+        prp1 = le64_to_cpu(req->cmd.dptr.prp1);
+        prp2 = le64_to_cpu(req->cmd.dptr.prp2);
+
+        return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
+    case PSDT_SGL_MPTR_CONTIGUOUS:
+    case PSDT_SGL_MPTR_SGL:
+        /* SGLs shall not be used for Admin commands in NVMe over PCIe */
+        if (!req->sq->sqid) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        return nvme_map_sgl(n, qsg, iov, req->cmd.dptr.sgl, len, req);
+    default:
+        return NVME_INVALID_FIELD;
+    }
+}
+
+static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+                         DMADirection dir, NvmeRequest *req)
 {
     uint16_t status = NVME_SUCCESS;
 
-    status = nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
+    status = nvme_map(n, &req->qsg, &req->iov, len, req);
     if (status) {
         return status;
     }
@@ -370,15 +604,6 @@  static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     return status;
 }
 
-static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
-                         QEMUIOVector *iov, size_t len, NvmeRequest *req)
-{
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
-
-    return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
-}
-
 static void nvme_aio_destroy(NvmeAIO *aio)
 {
     g_free(aio);
@@ -846,7 +1071,7 @@  static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         goto invalid;
     }
 
-    status = nvme_map(n, cmd, &req->qsg, &req->iov, len, req);
+    status = nvme_map(n, &req->qsg, &req->iov, len, req);
     if (status) {
         goto invalid;
     }
@@ -1013,8 +1238,6 @@  static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
                                 uint32_t buf_len, uint64_t off,
                                 NvmeRequest *req)
 {
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
     uint32_t nsid = le32_to_cpu(cmd->nsid);
 
     uint32_t trans_len;
@@ -1064,16 +1287,14 @@  static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
         nvme_clear_events(n, NVME_AER_TYPE_SMART);
     }
 
-    return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2,
-                        DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *) &smart + off, trans_len,
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
                                  uint64_t off, NvmeRequest *req)
 {
     uint32_t trans_len;
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
     NvmeFwSlotInfoLog fw_log;
 
     if (off > sizeof(fw_log)) {
@@ -1084,8 +1305,8 @@  static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
 
     trans_len = MIN(sizeof(fw_log) - off, buf_len);
 
-    return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2,
-                        DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len,
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
@@ -1093,8 +1314,6 @@  static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
                                 NvmeRequest *req)
 {
     uint32_t trans_len;
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
     uint8_t errlog[64];
 
     if (!rae) {
@@ -1109,8 +1328,7 @@  static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
 
     trans_len = MIN(sizeof(errlog) - off, buf_len);
 
-    return nvme_dma_prp(n, errlog, trans_len, prp1, prp2,
-                        DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, errlog, trans_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
@@ -1255,13 +1473,10 @@  static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c,
                                    NvmeRequest *req)
 {
-    uint64_t prp1 = le64_to_cpu(c->prp1);
-    uint64_t prp2 = le64_to_cpu(c->prp2);
-
     trace_nvme_dev_identify_ctrl();
 
-    return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1,
-                        prp2, DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c,
@@ -1269,8 +1484,6 @@  static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c,
 {
     NvmeNamespace *ns;
     uint32_t nsid = le32_to_cpu(c->nsid);
-    uint64_t prp1 = le64_to_cpu(c->prp1);
-    uint64_t prp2 = le64_to_cpu(c->prp2);
 
     trace_nvme_dev_identify_ns(nsid);
 
@@ -1281,8 +1494,8 @@  static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c,
 
     ns = &n->namespaces[nsid - 1];
 
-    return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1,
-                        prp2, DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c,
@@ -1290,8 +1503,6 @@  static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c,
 {
     static const int data_len = NVME_IDENTIFY_DATA_SIZE;
     uint32_t min_nsid = le32_to_cpu(c->nsid);
-    uint64_t prp1 = le64_to_cpu(c->prp1);
-    uint64_t prp2 = le64_to_cpu(c->prp2);
     uint32_t *list;
     uint16_t ret;
     int i, j = 0;
@@ -1308,8 +1519,8 @@  static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c,
             break;
         }
     }
-    ret = nvme_dma_prp(n, (uint8_t *)list, data_len, prp1, prp2,
-                       DMA_DIRECTION_FROM_DEVICE, req);
+    ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE,
+                   req);
     g_free(list);
     return ret;
 }
@@ -1318,8 +1529,6 @@  static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c,
                                             NvmeRequest *req)
 {
     uint32_t nsid = le32_to_cpu(c->nsid);
-    uint64_t prp1 = le64_to_cpu(c->prp1);
-    uint64_t prp2 = le64_to_cpu(c->prp2);
 
     void *list;
     uint16_t ret;
@@ -1345,8 +1554,8 @@  static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c,
     ns_descr->nidl = NVME_NIDT_UUID_LEN;
     stl_be_p(ns_descr + sizeof(*ns_descr), nsid);
 
-    ret = nvme_dma_prp(n, (uint8_t *) list, NVME_IDENTIFY_DATA_SIZE, prp1,
-                       prp2, DMA_DIRECTION_FROM_DEVICE, req);
+    ret = nvme_dma(n, (uint8_t *)list, NVME_IDENTIFY_DATA_SIZE,
+                   DMA_DIRECTION_FROM_DEVICE, req);
     g_free(list);
     return ret;
 }
@@ -1425,13 +1634,10 @@  static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd,
                                            NvmeRequest *req)
 {
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
-
     uint64_t timestamp = nvme_get_timestamp(n);
 
-    return nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
-                        prp2, DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
@@ -1514,11 +1720,9 @@  static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd,
 {
     uint16_t ret;
     uint64_t timestamp;
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
 
-    ret = nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
-                       prp2, DMA_DIRECTION_TO_DEVICE, req);
+    ret = nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
+                   DMA_DIRECTION_TO_DEVICE, req);
     if (ret != NVME_SUCCESS) {
         return ret;
     }
@@ -2306,6 +2510,8 @@  static void nvme_init_ctrl(NvmeCtrl *n)
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
 
+    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORTED_NO_ALIGNMENT);
+
     pstrcpy((char *) id->subnqn, sizeof(id->subnqn), "nqn.2019-08.org.qemu:");
     pstrcat((char *) id->subnqn, sizeof(id->subnqn), n->params.serial);
 
diff --git a/hw/block/trace-events b/hw/block/trace-events
index d51c09a4e454..70702cc67d5a 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -34,6 +34,7 @@  nvme_dev_irq_pin(void) "pulsing IRQ pin"
 nvme_dev_irq_masked(void) "IRQ is masked"
 nvme_dev_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
 nvme_dev_map_prp(uint16_t cid, uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "cid %"PRIu16" trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
+nvme_dev_map_sgl(uint16_t cid, uint8_t typ, uint32_t nlb, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" nlb %"PRIu32" len %"PRIu64""
 nvme_dev_req_register_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t count, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" count %"PRIu64" opc \"%s\" req %p"
 nvme_dev_aio_cb(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p"
 nvme_dev_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8""
@@ -89,6 +90,9 @@  nvme_dev_err_prinfo(uint16_t cid, uint16_t ctrl) "cid %"PRIu16" ctrl %"PRIu16""
 nvme_dev_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
 nvme_dev_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
 nvme_dev_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
+nvme_dev_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
+nvme_dev_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
+nvme_dev_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16""
 nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
 nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
 nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""