[08/17] hw/block/nvme: refactor aio submission

Message ID 20200904141956.576630-9-its@irrelevant.dk (mailing list archive)
State New, archived
Series hw/block/nvme: multiple namespaces support

Commit Message

Klaus Jensen Sept. 4, 2020, 2:19 p.m. UTC
From: Klaus Jensen <k.jensen@samsung.com>

This pulls block layer aio submission into a common function and introduces
the NvmeAIO structure to encapsulate it.

This adds more code with no immediate benefit, but is in preparation for
supporting multiple aios per request.
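
With this, each I/O handler reduces to describing the operation in an
NvmeAIO and handing it to the common submission path. For example, write
zeroes (taken from the patch below) becomes:

    /* describe the operation: opcode, block backend and offset */
    aio = nvme_aio_new(NVME_AIO_OPC_WRITE_ZEROES, n->conf.blk, offset);
    aio->len = count;

    /* attach the aio to the request and submit it via nvme_do_aio() */
    return nvme_aio_add(req, aio);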

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c       | 191 ++++++++++++++++++++++++++++++++----------
 hw/block/nvme.h       |  51 +++++++++++
 hw/block/trace-events |   3 +
 3 files changed, 203 insertions(+), 42 deletions(-)

Comments

Keith Busch Sept. 4, 2020, 7:47 p.m. UTC | #1
On Fri, Sep 04, 2020 at 04:19:47PM +0200, Klaus Jensen wrote:
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index bfac3385cb64..3e32f39c7c1d 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -110,6 +110,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
>  };
>  
>  static void nvme_process_sq(void *opaque);
> +static void nvme_aio_cb(void *opaque, int ret);

You don't need the forward declaration here. Just move the
implementation above where it's used. It looks safe: nvme_aio_cb()
doesn't have any circular dependencies.
Klaus Jensen Sept. 4, 2020, 8:38 p.m. UTC | #2
On Sep  4 12:47, Keith Busch wrote:
> On Fri, Sep 04, 2020 at 04:19:47PM +0200, Klaus Jensen wrote:
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index bfac3385cb64..3e32f39c7c1d 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -110,6 +110,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
> >  };
> >  
> >  static void nvme_process_sq(void *opaque);
> > +static void nvme_aio_cb(void *opaque, int ret);
> 
> You don't need the forward declaration here. Just move the
> implementation above where it's used. It looks safe: nvme_aio_cb()
> doesn't have any circular dependencies.

You are right, of course. But it is getting a circular dependency in a
later patch. I left it there to reduce code movement later.
Keith Busch Sept. 4, 2020, 9:15 p.m. UTC | #3
On Fri, Sep 04, 2020 at 10:38:39PM +0200, Klaus Jensen wrote:
> On Sep  4 12:47, Keith Busch wrote:
> > On Fri, Sep 04, 2020 at 04:19:47PM +0200, Klaus Jensen wrote:
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index bfac3385cb64..3e32f39c7c1d 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -110,6 +110,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
> > >  };
> > >  
> > >  static void nvme_process_sq(void *opaque);
> > > +static void nvme_aio_cb(void *opaque, int ret);
> > 
> > You don't need the forward declaration here. Just move the
> > implementation above where it's used. It looks safe: nvme_aio_cb()
> > doesn't have any circular dependencies.
> 
> You are right, of course. But it is getting a circular dependency in a
> later patch. I left it there to reduce code movement later.

Is that coming in a future patch? Not finding it in this series.

About the patch in general: are multiple aios per request coming in a
later patch as well? I didn't see any use for them here, and the overhead
of dynamically allocating and zeroing a new struct in the IO path is a bit
concerning for performance. I'd like to see your intended use for this.
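
For reference, the allocation being discussed is the one in nvme_aio_new()
in the patch below, one zeroed struct per submitted aio:

    NvmeAIO *aio = g_new0(NvmeAIO, 1);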
Klaus Jensen Sept. 4, 2020, 9:38 p.m. UTC | #4
On Sep  4 14:15, Keith Busch wrote:
> On Fri, Sep 04, 2020 at 10:38:39PM +0200, Klaus Jensen wrote:
> > On Sep  4 12:47, Keith Busch wrote:
> > > On Fri, Sep 04, 2020 at 04:19:47PM +0200, Klaus Jensen wrote:
> > > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > > index bfac3385cb64..3e32f39c7c1d 100644
> > > > --- a/hw/block/nvme.c
> > > > +++ b/hw/block/nvme.c
> > > > @@ -110,6 +110,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
> > > >  };
> > > >  
> > > >  static void nvme_process_sq(void *opaque);
> > > > +static void nvme_aio_cb(void *opaque, int ret);
> > > 
> > > You don't need the forward declaration here. Just move the
> > > implementation above where it's used. It looks safe: nvme_aio_cb()
> > > doesn't have any circular dependencies.
> > 
> > You are right, of course. But it is getting a circular dependency in a
> > later patch. I left it there to reduce code movement later.
> 
> Is that coming in a future patch? Not finding it in this series.
> 
> About the patch in general: are multiple aios per request coming in a
> later patch as well? I didn't see any use for them here, and the
> overhead of dynamically allocating and zeroing a new struct in the IO
> path is a bit concerning for performance. I'd like to see your intended
> use for this.

The intended use case was parallel aios. There are a lot of uses for
this: DSM, metadata, block allocation tracking, and ZNS zone info.

But I'll rip it out of the series and repost so we can focus on multiple
namespaces.
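
As a rough sketch of that direction (hypothetical; the function name, the
aio_pending counter and the offset parameters below are made up for
illustration and are not part of this series), a command needing both a
data and a metadata transfer could fan out into two aios on one request:

    /* Hypothetical sketch only: assumes NvmeRequest grows an aio_pending
     * counter, which this series does not add. Payload/len setup elided. */
    static uint16_t nvme_rw_with_meta(NvmeCtrl *n, NvmeRequest *req,
                                      uint64_t data_offset, uint64_t meta_offset)
    {
        NvmeAIO *data = nvme_aio_new(NVME_AIO_OPC_READ, n->conf.blk, data_offset);
        NvmeAIO *meta = nvme_aio_new(NVME_AIO_OPC_READ, n->conf.blk, meta_offset);

        req->aio_pending = 2;

        /* nvme_aio_cb() would then decrement aio_pending and only enqueue
         * the completion once it reaches zero */
        nvme_aio_add(req, data);
        nvme_aio_add(req, meta);

        return NVME_NO_COMPLETE;
    }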

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index bfac3385cb64..3e32f39c7c1d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -110,6 +110,7 @@  static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 };
 
 static void nvme_process_sq(void *opaque);
+static void nvme_aio_cb(void *opaque, int ret);
 
 static uint16_t nvme_cid(NvmeRequest *req)
 {
@@ -611,39 +612,151 @@  static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
     return NVME_SUCCESS;
 }
 
-static void nvme_rw_cb(void *opaque, int ret)
+static NvmeAIO *nvme_aio_new(NvmeAIOOp opc, BlockBackend *blk, int64_t offset)
 {
-    NvmeRequest *req = opaque;
-    NvmeSQueue *sq = req->sq;
-    NvmeCtrl *n = sq->ctrl;
-    NvmeCQueue *cq = n->cq[sq->cqid];
+    NvmeAIO *aio = g_new0(NvmeAIO, 1);
 
-    trace_pci_nvme_rw_cb(nvme_cid(req));
+    aio->opc = opc;
+    aio->blk = blk;
+    aio->offset = offset;
 
-    if (!ret) {
-        block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
-        req->status = NVME_SUCCESS;
-    } else {
-        block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
-        req->status = NVME_INTERNAL_DEV_ERROR;
+    return aio;
+}
+
+static void nvme_aio_destroy(NvmeAIO *aio)
+{
+    g_free(aio);
+}
+
+static uint16_t nvme_do_aio(NvmeAIO *aio)
+{
+    NvmeRequest *req = aio->req;
+
+    BlockBackend *blk = aio->blk;
+    BlockAcctCookie *acct = &req->acct;
+    BlockAcctStats *stats = blk_get_stats(blk);
+
+    bool is_write;
+
+    switch (aio->opc) {
+    case NVME_AIO_OPC_FLUSH:
+        block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
+        req->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
+        break;
+
+    case NVME_AIO_OPC_WRITE_ZEROES:
+        block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
+        req->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
+                                           BDRV_REQ_MAY_UNMAP, nvme_aio_cb,
+                                           aio);
+        break;
+
+    case NVME_AIO_OPC_READ:
+    case NVME_AIO_OPC_WRITE:
+        is_write = (aio->opc == NVME_AIO_OPC_WRITE);
+
+        block_acct_start(stats, acct, aio->len,
+                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+        if (aio->flags & NVME_AIO_DMA) {
+            QEMUSGList *qsg = (QEMUSGList *)aio->payload;
+
+            if (is_write) {
+                req->aiocb = dma_blk_write(blk, qsg, aio->offset,
+                                           BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+            } else {
+                req->aiocb = dma_blk_read(blk, qsg, aio->offset,
+                                          BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+            }
+        } else {
+            QEMUIOVector *iov = (QEMUIOVector *)aio->payload;
+
+            if (is_write) {
+                req->aiocb = blk_aio_pwritev(blk, aio->offset, iov, 0,
+                                             nvme_aio_cb, aio);
+            } else {
+                req->aiocb = blk_aio_preadv(blk, aio->offset, iov, 0,
+                                            nvme_aio_cb, aio);
+            }
+        }
+
+        break;
     }
 
-    nvme_enqueue_req_completion(cq, req);
+    return NVME_NO_COMPLETE;
+}
+
+static uint16_t nvme_aio_add(NvmeRequest *req, NvmeAIO *aio)
+{
+    aio->req = req;
+
+    trace_pci_nvme_aio_add(nvme_cid(req), aio, blk_name(aio->blk),
+                           aio->offset, aio->len, nvme_aio_opc_str(aio),
+                           req);
+
+    return nvme_do_aio(aio);
+}
+
+static void nvme_aio_cb(void *opaque, int ret)
+{
+    NvmeAIO *aio = opaque;
+    NvmeRequest *req = aio->req;
+
+    BlockBackend *blk = aio->blk;
+    BlockAcctCookie *acct = &req->acct;
+    BlockAcctStats *stats = blk_get_stats(blk);
+
+    Error *local_err = NULL;
+
+    trace_pci_nvme_aio_cb(nvme_cid(req), aio, blk_name(blk), aio->offset,
+                          aio->len, nvme_aio_opc_str(aio), req);
+
+    if (!ret) {
+        block_acct_done(stats, acct);
+        req->status = NVME_SUCCESS;
+    } else {
+        uint16_t status;
+
+        block_acct_failed(stats, acct);
+
+        switch (aio->opc) {
+        case NVME_AIO_OPC_READ:
+            status = NVME_UNRECOVERED_READ;
+            break;
+        case NVME_AIO_OPC_FLUSH:
+        case NVME_AIO_OPC_WRITE:
+        case NVME_AIO_OPC_WRITE_ZEROES:
+            status = NVME_WRITE_FAULT;
+            break;
+        default:
+            status = NVME_INTERNAL_DEV_ERROR;
+            break;
+        }
+
+        trace_pci_nvme_err_aio(nvme_cid(req), aio, blk_name(blk),
+                               aio->offset, nvme_aio_opc_str(aio), req,
+                               status);
+
+        error_setg_errno(&local_err, -ret, "aio failed");
+        error_report_err(local_err);
+
+        req->status = status;
+    }
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+    nvme_aio_destroy(aio);
 }
 
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
-    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
-                     BLOCK_ACCT_FLUSH);
-    req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
-
-    return NVME_NO_COMPLETE;
+    return nvme_aio_add(req, nvme_aio_new(NVME_AIO_OPC_FLUSH, n->conf.blk, 0));
 }
 
 static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
+    NvmeAIO *aio;
     uint64_t slba = le64_to_cpu(rw->slba);
     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
     uint64_t offset = nvme_l2b(ns, slba);
@@ -658,24 +771,23 @@  static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
         return status;
     }
 
-    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
-                     BLOCK_ACCT_WRITE);
-    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
-                                       BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
-    return NVME_NO_COMPLETE;
+    aio = nvme_aio_new(NVME_AIO_OPC_WRITE_ZEROES, n->conf.blk, offset);
+    aio->len = count;
+
+    return nvme_aio_add(req, aio);
 }
 
 static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
+    NvmeAIO *aio;
     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
     uint64_t slba = le64_to_cpu(rw->slba);
 
     uint64_t data_size = nvme_l2b(ns, nlb);
     uint64_t data_offset = nvme_l2b(ns, slba);
-    int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
-    enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+    bool is_write = nvme_req_is_write(req);
     uint16_t status;
 
     trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), nlb,
@@ -698,28 +810,23 @@  static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
-    if (req->qsg.nsg > 0) {
-        block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->qsg.size,
-                         acct);
-        req->aiocb = is_write ?
-            dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
-                          nvme_rw_cb, req) :
-            dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
-                         nvme_rw_cb, req);
+    aio = nvme_aio_new(is_write ? NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ,
+                       n->conf.blk, data_offset);
+
+    if (req->qsg.sg) {
+        aio->payload = &req->qsg;
+        aio->len = req->qsg.size;
+        aio->flags |= NVME_AIO_DMA;
     } else {
-        block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->iov.size,
-                         acct);
-        req->aiocb = is_write ?
-            blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
-                            req) :
-            blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
-                           req);
+        aio->payload = &req->iov;
+        aio->len = req->iov.size;
     }
 
-    return NVME_NO_COMPLETE;
+    return nvme_aio_add(req, aio);
 
 invalid:
-    block_acct_invalid(blk_get_stats(n->conf.blk), acct);
+    block_acct_invalid(blk_get_stats(n->conf.blk),
+                       is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
     return status;
 }
 
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index ce9e931420d7..7a11b0b37317 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -32,6 +32,17 @@  typedef struct NvmeRequest {
     QTAILQ_ENTRY(NvmeRequest)entry;
 } NvmeRequest;
 
+static inline bool nvme_req_is_write(NvmeRequest *req)
+{
+    switch (req->cmd.opcode) {
+    case NVME_CMD_WRITE:
+    case NVME_CMD_WRITE_ZEROES:
+        return true;
+    default:
+        return false;
+    }
+}
+
 static inline const char *nvme_adm_opc_str(uint8_t opc)
 {
     switch (opc) {
@@ -60,6 +71,38 @@  static inline const char *nvme_io_opc_str(uint8_t opc)
     }
 }
 
+typedef enum NvmeAIOOp {
+    NVME_AIO_OPC_FLUSH        = 0x1,
+    NVME_AIO_OPC_READ         = 0x2,
+    NVME_AIO_OPC_WRITE        = 0x3,
+    NVME_AIO_OPC_WRITE_ZEROES = 0x4,
+} NvmeAIOOp;
+
+typedef enum NvmeAIOFlags {
+    NVME_AIO_DMA = 1 << 0,
+} NvmeAIOFlags;
+
+typedef struct NvmeAIO {
+    NvmeAIOOp       opc;
+    NvmeRequest     *req;
+    BlockBackend    *blk;
+    int64_t         offset;
+    size_t          len;
+    int             flags;
+    void            *payload;
+} NvmeAIO;
+
+static inline const char *nvme_aio_opc_str(NvmeAIO *aio)
+{
+    switch (aio->opc) {
+    case NVME_AIO_OPC_FLUSH:        return "NVME_AIO_OPC_FLUSH";
+    case NVME_AIO_OPC_READ:         return "NVME_AIO_OPC_READ";
+    case NVME_AIO_OPC_WRITE:        return "NVME_AIO_OPC_WRITE";
+    case NVME_AIO_OPC_WRITE_ZEROES: return "NVME_AIO_OPC_WRITE_ZEROES";
+    default:                        return "NVME_AIO_OPC_UNKNOWN";
+    }
+}
+
 typedef struct NvmeSQueue {
     struct NvmeCtrl *ctrl;
     uint16_t    sqid;
@@ -171,4 +214,12 @@  static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
     return n->ns_size >> nvme_ns_lbads(ns);
 }
 
+static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
+{
+    NvmeSQueue *sq = req->sq;
+    NvmeCtrl *n = sq->ctrl;
+
+    return n->cq[sq->cqid];
+}
+
 #endif /* HW_NVME_H */
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 0823d0fb47c5..fb3bf1be5e07 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -39,6 +39,8 @@  pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2,
 pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname \"%s\""
 pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname \"%s\""
 pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" \"%s\" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
+pci_nvme_aio_add(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t len, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" len %"PRIu64" opc \"%s\" req %p"
+pci_nvme_aio_cb(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t len, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" len %"PRIu64" opc \"%s\" req %p"
 pci_nvme_rw_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_write_zeroes(uint16_t cid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" slba %"PRIu64" nlb %"PRIu32""
 pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
@@ -88,6 +90,7 @@  pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
 pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu"
 pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
+pci_nvme_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
 pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
 pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
 pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""