diff mbox

[4/4] scsi-block: always use SG_IO

Message ID 1462966873-30473-5-git-send-email-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Paolo Bonzini May 11, 2016, 11:41 a.m. UTC
Using pread/pwrite or io_submit has the advantage of eliminating the
bounce buffer, but drops the SCSI status.  This keeps the guest from
seeing unit attention codes, as well as statuses such as RESERVATION
CONFLICT.  Because we know scsi-block operates on an SBC device we can
still use the DMA helpers with SG_IO; just remember to patch the CDBs
if the transfer is split into multiple segments.

This means that scsi-block will always use the thread-pool unfortunately,
instead of respecting aio=native.

Reported-by: Vadim Rozenfeld <vrozenfe@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
        This version of the patch has an overflow if, for example, a
        READ(12) command straddles the LBA 2^32 boundary and is split after
        the LBA 2^32 boundary.  In this case, the second part needs to use
        a READ(16) command.  I'll fix this before posting the final version.

 hw/scsi/scsi-disk.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 114 insertions(+), 15 deletions(-)
diff mbox

Patch

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 3ecfc46..9374a1a 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2599,6 +2599,97 @@  static void scsi_block_realize(SCSIDevice *dev, Error **errp)
     scsi_generic_read_device_identification(&s->qdev);
 }
 
+typedef struct SCSIBlockReq {
+    SCSIDiskReq req;
+    sg_io_hdr_t io_header;
+    uint8_t cdb[SCSI_CMD_BUF_SIZE];
+} SCSIBlockReq;
+
+static BlockAIOCB *scsi_block_do_sgio(SCSIBlockReq *req,
+                                      int64_t sector_num, QEMUIOVector *iov,
+                                      int nb_sectors, int direction,
+                                      BlockCompletionFunc *cb, void *opaque)
+{
+    sg_io_hdr_t *io_header = &req->io_header;
+    SCSIDiskReq *r = &req->req;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    int nb_logical_blocks;
+    uint32_t word0;
+    uint64_t lba;
+    BlockAIOCB *aiocb;
+
+    io_header->interface_id = 'S';
+
+    /* The data transfer comes from the QEMUIOVector.  */
+    io_header->dxfer_direction = direction;
+    io_header->dxfer_len = nb_sectors * BDRV_SECTOR_SIZE;
+    io_header->dxferp = (void *)iov->iov;
+    io_header->iovec_count = iov->niov;
+    assert(io_header->iovec_count == iov->niov); /* no overflow! */
+
+    /* The CDB needs to have the LBA and length patched in, in case
+     * DMA helpers split the transfer in multiple segments.
+     */
+    io_header->cmdp = req->cdb;
+    io_header->cmd_len = r->req.cmd.len;
+    lba = sector_num / (s->qdev.blocksize / BDRV_SECTOR_SIZE);
+    nb_logical_blocks = io_header->dxfer_len / s->qdev.blocksize;
+    switch(req->cdb[0] >> 5) {
+    case 0:
+        /* 6-byte CDB */
+        word0 = deposit32(ldl_be_p(&req->cdb[0]), 0, 21, lba);
+        stl_be_p(&req->cdb[0], lba | word0);
+        req->cdb[4] = nb_logical_blocks;
+    case 1:
+    case 2:
+        /* 10-byte CDB */
+        stl_be_p(&req->cdb[2], lba);
+        stw_be_p(&req->cdb[7], nb_logical_blocks);
+        break;
+    case 4:
+        /* 16-byte CDB */
+        stq_be_p(&req->cdb[2], lba);
+        stl_be_p(&req->cdb[10], nb_logical_blocks);
+        break;
+    case 5:
+        /* 12-byte CDB */
+        stl_be_p(&req->cdb[2], lba);
+        stl_be_p(&req->cdb[6], nb_logical_blocks);
+        break;
+    }
+
+    /* The rest is as in scsi-generic.c.  */
+    io_header->mx_sb_len = sizeof(r->req.sense);
+    io_header->sbp = r->req.sense;
+    io_header->timeout = UINT_MAX;
+    io_header->usr_ptr = r;
+    io_header->flags |= SG_FLAG_DIRECT_IO;
+
+    aiocb = blk_aio_ioctl(s->qdev.conf.blk, SG_IO, io_header, cb, opaque);
+    assert(aiocb != NULL);
+    return aiocb;
+}
+
+static BlockAIOCB *scsi_block_dma_readv(int64_t sector_num,
+                                        QEMUIOVector *iov, int nb_sectors,
+                                        BlockCompletionFunc *cb, void *cb_opaque,
+                                        void *opaque)
+{
+    SCSIBlockReq *r = opaque;
+    return scsi_block_do_sgio(r, sector_num, iov, nb_sectors,
+                              SG_DXFER_FROM_DEV, cb, cb_opaque);
+}
+
+static BlockAIOCB *scsi_block_dma_writev(int64_t sector_num,
+                                         QEMUIOVector *iov, int nb_sectors,
+                                         BlockCompletionFunc *cb, void *cb_opaque,
+                                         void *opaque)
+{
+    SCSIBlockReq *r = opaque;
+    return scsi_block_do_sgio(r, sector_num, iov, nb_sectors,
+                              SG_DXFER_TO_DEV, cb, cb_opaque);
+}
+
 static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
 {
     switch (buf[0]) {
@@ -2616,21 +2707,8 @@  static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
     case WRITE_VERIFY_10:
     case WRITE_VERIFY_12:
     case WRITE_VERIFY_16:
-        /* If we are not using O_DIRECT, we might read stale data from the
-         * host cache if writes were made using other commands than these
-         * ones (such as WRITE SAME or EXTENDED COPY, etc.).  So, without
-         * O_DIRECT everything must go through SG_IO.
-         */
-        if (!(blk_get_flags(s->qdev.conf.blk) & BDRV_O_NOCACHE)) {
-            break;
-        }
-
-        /* MMC writing cannot be done via pread/pwrite, because it sometimes
+        /* MMC writing cannot be done via DMA helpers, because it sometimes
          * involves writing beyond the maximum LBA or to negative LBA (lead-in).
-         * And once you do these writes, reading from the block device is
-         * unreliable, too.  It is even possible that reads deliver random data
-         * from the host page cache (this is probably a Linux bug).
-         *
          * We might use scsi_disk_dma_reqops as long as no writing commands are
          * seen, but performance usually isn't paramount on optical media.  So,
          * just make scsi-block operate the same as scsi-generic for them.
@@ -2648,6 +2726,24 @@  static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
 }
 
 
+static int32_t scsi_block_dma_command(SCSIRequest *req, uint8_t *buf)
+{
+    SCSIBlockReq *r = (SCSIBlockReq *)req;
+    memcpy(r->cdb, req->cmd.buf, sizeof(r->cdb));
+    return scsi_disk_dma_command(req, buf);
+}
+
+static const SCSIReqOps scsi_block_dma_reqops = {
+    .size         = sizeof(SCSIBlockReq),
+    .free_req     = scsi_free_request,
+    .send_command = scsi_block_dma_command,
+    .read_data    = scsi_read_data,
+    .write_data   = scsi_write_data,
+    .get_buf      = scsi_get_buf,
+    .load_request = scsi_disk_load_request,
+    .save_request = scsi_disk_save_request,
+};
+
 static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag,
                                            uint32_t lun, uint8_t *buf,
                                            void *hba_private)
@@ -2658,7 +2754,7 @@  static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag,
         return scsi_req_alloc(&scsi_generic_req_ops, &s->qdev, tag, lun,
                               hba_private);
     } else {
-        return scsi_req_alloc(&scsi_disk_dma_reqops, &s->qdev, tag, lun,
+        return scsi_req_alloc(&scsi_block_dma_reqops, &s->qdev, tag, lun,
                               hba_private);
     }
 }
@@ -2817,10 +2913,13 @@  static void scsi_block_class_initfn(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
     SCSIDeviceClass *sc = SCSI_DEVICE_CLASS(klass);
+    SCSIDiskClass *sdc = SCSI_DISK_BASE_CLASS(klass);
 
     sc->realize      = scsi_block_realize;
     sc->alloc_req    = scsi_block_new_request;
     sc->parse_cdb    = scsi_block_parse_cdb;
+    sdc->dma_readv   = scsi_block_dma_readv;
+    sdc->dma_writev  = scsi_block_dma_writev;
     dc->desc = "SCSI block device passthrough";
     dc->props = scsi_block_properties;
     dc->vmsd  = &vmstate_scsi_disk_state;