Message ID | 20191219130921.309264-20-k.jensen@samsung.com (mailing list archive)
---|---
State | New, archived
Series | nvme: support NVMe v1.3d, SGLs and multiple namespaces
Hi Klaus,

On Thu, 19 Dec 2019 at 13:09, Klaus Jensen <k.jensen@samsung.com> wrote:
>
> Handling DMA errors gracefully is required for the device to pass the
> block/011 test ("disable PCI device while doing I/O") in the blktests
> suite.
>
> With this patch the device passes the test by retrying "critical"
> transfers (posting of completion entries and processing of submission
> queue entries).
>
> If DMA errors occur at any other point in the execution of the command
> (say, while mapping the PRPs), the command is aborted with a Data
> Transfer Error status code.
>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> ---
>  hw/block/nvme.c       | 37 +++++++++++++++++++++++++++++--------
>  hw/block/trace-events |  2 ++
>  include/block/nvme.h  |  2 +-
>  3 files changed, 32 insertions(+), 9 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 56659bbe263a..f6591285b504 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -71,14 +71,14 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>      return addr >= low && addr < hi;
>  }
>
> -static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> +static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
>      if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
>          memcpy(buf, (void *) &n->cmbuf[addr - n->ctrl_mem.addr], size);
> -        return;
> +        return 0;
>      }
>
> -    pci_dma_read(&n->parent_obj, addr, buf, size);
> +    return pci_dma_read(&n->parent_obj, addr, buf, size);
>  }
>
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
> @@ -216,7 +216,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
>
>          nents = (len + n->page_size - 1) >> n->page_bits;
>          prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
> -        nvme_addr_read(n, prp2, (void *) prp_list, prp_trans);
> +        if (nvme_addr_read(n, prp2, (void *) prp_list, prp_trans)) {
> +            trace_nvme_dev_err_addr_read(prp2);
> +            status = NVME_DATA_TRANSFER_ERROR;
> +            goto unmap;
> +        }
>          while (len != 0) {
>              uint64_t prp_ent = le64_to_cpu(prp_list[i]);
>
> @@ -235,7 +239,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
>                  i = 0;
>                  nents = (len + n->page_size - 1) >> n->page_bits;
>                  prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
> -                nvme_addr_read(n, prp_ent, (void *) prp_list, prp_trans);
> +                if (nvme_addr_read(n, prp_ent, (void *) prp_list, prp_trans)) {
> +                    trace_nvme_dev_err_addr_read(prp_ent);
> +                    status = NVME_DATA_TRANSFER_ERROR;
> +                    goto unmap;
> +                }
>                  prp_ent = le64_to_cpu(prp_list[i]);
>              }
>
> @@ -456,6 +464,7 @@ static void nvme_post_cqes(void *opaque)
>      NvmeCQueue *cq = opaque;
>      NvmeCtrl *n = cq->ctrl;
>      NvmeRequest *req, *next;
> +    int ret;
>
>      QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
>          NvmeSQueue *sq;
> @@ -471,9 +480,16 @@ static void nvme_post_cqes(void *opaque)
>          req->cqe.sq_id = cpu_to_le16(sq->sqid);
>          req->cqe.sq_head = cpu_to_le16(sq->head);
>          addr = cq->dma_addr + cq->tail * n->cqe_size;
> -        nvme_inc_cq_tail(cq);
> -        pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
> +        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
>              sizeof(req->cqe));
> +        if (ret) {
> +            trace_nvme_dev_err_addr_write(addr);
> +            QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
> +            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
> +                100 * SCALE_MS);
> +            break;
> +        }
> +        nvme_inc_cq_tail(cq);
>          QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
>      }
>      if (cq->tail != cq->head) {
> @@ -1595,7 +1611,12 @@ static void nvme_process_sq(void *opaque)
>
>      while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
>          addr = sq->dma_addr + sq->head * n->sqe_size;
> -        nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
> +        if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
> +            trace_nvme_dev_err_addr_read(addr);
> +            timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
> +                100 * SCALE_MS);
> +            break;
> +        }

Is there a chance we will end up repeatedly triggering the read error here
as this will come back to the same memory location each time (the sq->head
is not moving here)?

BR
Beata

>          nvme_inc_sq_head(sq);
>
>          req = QTAILQ_FIRST(&sq->req_list);
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 90a57fb6099a..09bfb3782dd0 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -83,6 +83,8 @@ nvme_dev_mmio_shutdown_cleared(void) "shutdown bit cleared"
>  nvme_dev_err_mdts(uint16_t cid, size_t mdts, size_t len) "cid %"PRIu16" mdts %"PRIu64" len %"PRIu64""
>  nvme_dev_err_prinfo(uint16_t cid, uint16_t ctrl) "cid %"PRIu16" ctrl %"PRIu16""
>  nvme_dev_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
> +nvme_dev_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
> +nvme_dev_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
>  nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
>  nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
>  nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index c1de92179596..a873776d98b8 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -418,7 +418,7 @@ enum NvmeStatusCodes {
>      NVME_INVALID_OPCODE         = 0x0001,
>      NVME_INVALID_FIELD          = 0x0002,
>      NVME_CID_CONFLICT           = 0x0003,
> -    NVME_DATA_TRAS_ERROR        = 0x0004,
> +    NVME_DATA_TRANSFER_ERROR    = 0x0004,
>      NVME_POWER_LOSS_ABORT       = 0x0005,
>      NVME_INTERNAL_DEV_ERROR     = 0x0006,
>      NVME_CMD_ABORT_REQ          = 0x0007,
> --
> 2.24.1
>
On Jan 9 11:35, Beata Michalska wrote:
> Hi Klaus,
>

Hi Beata,

Your reviews are, as always, much appreciated! Thanks!

> On Thu, 19 Dec 2019 at 13:09, Klaus Jensen <k.jensen@samsung.com> wrote:
> > @@ -1595,7 +1611,12 @@ static void nvme_process_sq(void *opaque)
> >
> >      while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
> >          addr = sq->dma_addr + sq->head * n->sqe_size;
> > -        nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
> > +        if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
> > +            trace_nvme_dev_err_addr_read(addr);
> > +            timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
> > +                100 * SCALE_MS);
> > +            break;
> > +        }
>
> Is there a chance we will end up repeatedly triggering the read error here
> as this will come back to the same memory location each time (the sq->head
> is not moving here)?
>

Absolutely, and that was the point. Not being able to read the submission
queue is pretty bad, so the device just keeps retrying every 100 ms. This
is the same for when writing to the completion queue fails.

But... it would probably be prudent to track how long it has been since a
successful DMA transfer was done and time out, shutting down the device.
Say maybe after 60 seconds. I'll try to add something like that.

Thanks,
Klaus
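A minimal sketch of what such a watchdog could look like, assuming a hypothetical last_dma_ok_ns field added to NvmeCtrl and initialised on controller reset (the field and the timeout constant are assumptions, not part of the posted patch; NVME_CSTS_FAILED, qemu_clock_get_ns() and NANOSECONDS_PER_SECOND are existing QEMU symbols):

    /* Sketch only: give up retrying after 60 seconds of failed DMA by
     * setting CSTS.CFS, signalling a fatal controller error to the host.
     * Assumes n->last_dma_ok_ns (int64_t) is set to the current time on
     * controller start/reset so the very first failure is not fatal. */
    #define NVME_DMA_RETRY_TIMEOUT_NS (60 * NANOSECONDS_PER_SECOND)

    static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
    {
        int64_t now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
        int ret;

        if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
            memcpy(buf, (void *) &n->cmbuf[addr - n->ctrl_mem.addr], size);
            return 0;
        }

        ret = pci_dma_read(&n->parent_obj, addr, buf, size);
        if (!ret) {
            /* remember when DMA last succeeded */
            n->last_dma_ok_ns = now;
        } else if (now - n->last_dma_ok_ns > NVME_DMA_RETRY_TIMEOUT_NS) {
            /* retries have been failing for too long; fail the controller */
            n->bar.csts |= NVME_CSTS_FAILED;
        }

        return ret;
    }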
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 56659bbe263a..f6591285b504 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -71,14 +71,14 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
     return addr >= low && addr < hi;
 }
 
-static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
+static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
     if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
         memcpy(buf, (void *) &n->cmbuf[addr - n->ctrl_mem.addr], size);
-        return;
+        return 0;
     }
 
-    pci_dma_read(&n->parent_obj, addr, buf, size);
+    return pci_dma_read(&n->parent_obj, addr, buf, size);
 }
 
 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
@@ -216,7 +216,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
 
         nents = (len + n->page_size - 1) >> n->page_bits;
         prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
-        nvme_addr_read(n, prp2, (void *) prp_list, prp_trans);
+        if (nvme_addr_read(n, prp2, (void *) prp_list, prp_trans)) {
+            trace_nvme_dev_err_addr_read(prp2);
+            status = NVME_DATA_TRANSFER_ERROR;
+            goto unmap;
+        }
         while (len != 0) {
             uint64_t prp_ent = le64_to_cpu(prp_list[i]);
 
@@ -235,7 +239,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
                 i = 0;
                 nents = (len + n->page_size - 1) >> n->page_bits;
                 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
-                nvme_addr_read(n, prp_ent, (void *) prp_list, prp_trans);
+                if (nvme_addr_read(n, prp_ent, (void *) prp_list, prp_trans)) {
+                    trace_nvme_dev_err_addr_read(prp_ent);
+                    status = NVME_DATA_TRANSFER_ERROR;
+                    goto unmap;
+                }
                 prp_ent = le64_to_cpu(prp_list[i]);
             }
 
@@ -456,6 +464,7 @@ static void nvme_post_cqes(void *opaque)
     NvmeCQueue *cq = opaque;
     NvmeCtrl *n = cq->ctrl;
     NvmeRequest *req, *next;
+    int ret;
 
     QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
         NvmeSQueue *sq;
@@ -471,9 +480,16 @@ static void nvme_post_cqes(void *opaque)
         req->cqe.sq_id = cpu_to_le16(sq->sqid);
         req->cqe.sq_head = cpu_to_le16(sq->head);
         addr = cq->dma_addr + cq->tail * n->cqe_size;
-        nvme_inc_cq_tail(cq);
-        pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
+        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
             sizeof(req->cqe));
+        if (ret) {
+            trace_nvme_dev_err_addr_write(addr);
+            QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
+            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
+                100 * SCALE_MS);
+            break;
+        }
+        nvme_inc_cq_tail(cq);
         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
     }
     if (cq->tail != cq->head) {
@@ -1595,7 +1611,12 @@ static void nvme_process_sq(void *opaque)
 
     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
         addr = sq->dma_addr + sq->head * n->sqe_size;
-        nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
+        if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
+            trace_nvme_dev_err_addr_read(addr);
+            timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
+                100 * SCALE_MS);
+            break;
+        }
         nvme_inc_sq_head(sq);
 
         req = QTAILQ_FIRST(&sq->req_list);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 90a57fb6099a..09bfb3782dd0 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -83,6 +83,8 @@ nvme_dev_mmio_shutdown_cleared(void) "shutdown bit cleared"
 nvme_dev_err_mdts(uint16_t cid, size_t mdts, size_t len) "cid %"PRIu16" mdts %"PRIu64" len %"PRIu64""
 nvme_dev_err_prinfo(uint16_t cid, uint16_t ctrl) "cid %"PRIu16" ctrl %"PRIu16""
 nvme_dev_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
+nvme_dev_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
+nvme_dev_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
 nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
 nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
 nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index c1de92179596..a873776d98b8 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -418,7 +418,7 @@ enum NvmeStatusCodes {
     NVME_INVALID_OPCODE         = 0x0001,
     NVME_INVALID_FIELD          = 0x0002,
     NVME_CID_CONFLICT           = 0x0003,
-    NVME_DATA_TRAS_ERROR        = 0x0004,
+    NVME_DATA_TRANSFER_ERROR    = 0x0004,
     NVME_POWER_LOSS_ABORT       = 0x0005,
     NVME_INTERNAL_DEV_ERROR     = 0x0006,
     NVME_CMD_ABORT_REQ          = 0x0007,
Handling DMA errors gracefully is required for the device to pass the
block/011 test ("disable PCI device while doing I/O") in the blktests
suite.

With this patch the device passes the test by retrying "critical"
transfers (posting of completion entries and processing of submission
queue entries).

If DMA errors occur at any other point in the execution of the command
(say, while mapping the PRPs), the command is aborted with a Data
Transfer Error status code.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c       | 37 +++++++++++++++++++++++++++++--------
 hw/block/trace-events |  2 ++
 include/block/nvme.h  |  2 +-
 3 files changed, 32 insertions(+), 9 deletions(-)
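The heart of the retry mechanism, condensed from nvme_post_cqes() in the
diff above (a sketch for illustration only; the enclosing loop and
unrelated paths are omitted): when the DMA write of a completion entry
fails, the request is put back on the completion queue's list and the
timer is re-armed, so the same posting is attempted again 100 ms later
instead of being dropped.

    /* Condensed from nvme_post_cqes() above, for illustration. */
    ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
                        sizeof(req->cqe));
    if (ret) {
        trace_nvme_dev_err_addr_write(addr);
        /* re-queue the request and retry the posting in 100 ms */
        QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
        timer_mod(cq->timer,
                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 100 * SCALE_MS);
        break;
    }
    nvme_inc_cq_tail(cq);  /* only advance the tail once the write succeeded */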