diff mbox

[RFC,v2] virtio-blk: simple multithreaded MQ implementation for bdrv_raw

Message ID 1466782326-24704-1-git-send-email-roman.penyaev@profitbricks.com (mailing list archive)
State New, archived
Headers show

Commit Message

Roman Pen June 24, 2016, 3:32 p.m. UTC
v2:
  * Rebased onto latest v4 Stefan's series:
    [PATCH v4 0/7] virtio-blk: multiqueue support

Hello, all.

This is RFC because mostly this patch is a quick attempt to get true
multithreaded multiqueue support for a block device with native AIO.
The goal is to squeeze everything possible on lockless IO path from
MQ block on a guest to MQ block on a host.

To avoid any locks in qemu backend and not to introduce thread safety
into qemu block-layer I open same backend device several times, one
device per one MQ.  e.g. the following is the stack for a virtio-blk
with num-queues=2:

            VirtIOBlock
               /   \
     VirtQueue#0   VirtQueue#1
      IOThread#0    IOThread#1
         BH#0          BH#1
      Backend#0     Backend#1
               \   /
             /dev/null0

To group all objects related to one vq new structure is introduced:

    typedef struct VirtQueueCtx {
        BlockBackend *blk;
        struct VirtIOBlock *s;
        VirtQueue *vq;
        void *rq;
        QEMUBH *bh;
        IOThread *iothread;
    } VirtQueueCtx;

And VirtIOBlock includes an array of these contexts:

     typedef struct VirtIOBlock {
         VirtIODevice parent_obj;
    +    VirtQueueCtx mq[VIRTIO_QUEUE_MAX];
     ...

This patch is based on Stefan's series: "virtio-blk: multiqueue support",
with minor difference: I reverted "virtio-blk: multiqueue batch notify",
which does not make a lot sense when each VQ is handled by it's own
iothread.

The qemu configuration stays the same, i.e. put num-queues=N and N
iothreads will be started on demand and N drives will be opened:

    qemu -device virtio-blk-pci,num-queues=8

My configuration is the following:

host:
    Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz,
    8 CPUs,
    /dev/nullb0 as backend with the following parameters:
      $ cat /sys/module/null_blk/parameters/submit_queues
      8
      $ cat /sys/module/null_blk/parameters/irqmode
      1

guest:
    8 VCPUs

qemu:
    -drive if=none,id=d0,file=/dev/nullb0,format=raw,snapshot=off,cache=none,aio=native \
    -device virtio-blk-pci,num-queues=$N,drive=d0,disable-modern=off,disable-legacy=on

    where $N varies during the tests.

fio:
    [global]
    description=Emulation of Storage Server Access Pattern
    bssplit=512/20:1k/16:2k/9:4k/12:8k/19:16k/10:32k/8:64k/4
    fadvise_hint=0
    rw=randrw:2
    direct=1

    ioengine=libaio
    iodepth=64
    iodepth_batch_submit=64
    iodepth_batch_complete=64
    numjobs=8
    gtod_reduce=1
    group_reporting=1

    time_based=1
    runtime=30

    [job]
    filename=/dev/vda

Results:
    num-queues   RD bw      WR bw
    ----------   -----      -----

    * with 1 iothread *

    1 thr 1 mq   1242MB/s   1238MB/s
    1 thr 2 mq   1632MB/s   1627MB/s
    1 thr 4 mq   1714MB/s   1710MB/s
    1 thr 8 mq   1728MB/s   1723MB/s

    * with N iothreads *

    2 thr 2 mq   1962MB/s   1958MB/s
    4 thr 4 mq   2385MB/s   2382MB/s
    8 thr 8 mq   1227MB/s   1223MB/s

Obviously, 8 iothreads + 8 vcpu threads is too much for my machine
with 8 CPUs, but 4 iothreads show quite good result.

This patch will work only for raw block device or for non-expandable
raw image files, because it is quite clear what will happen if any
expandable image is opened and accessed from many threads :)  Also
any alignment attempts on write or zeroing blocks or what ever lead
to corruptions.

So these options 'snapshot=off,cache=none,aio=native' are specified in
a hope that it is enough to stop qemu from doing smart things and will
force it simply to forward IO request from guest to host immediately.
Fio verification test and couple of filesystem tests show that nothing
terrible has happend.

I also did an attempt to assign several AIO contexts to one BlockBackend
device in order to access one BlockBackend from many iothreads, but I
failed to finish this part in reasonable time.  Of course, my qemu block
layer experience is rather shallow and I do not see obvious ways how
to make this IO path thread-safe without changing everything all over
the place.

--
Roman

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: qemu-devel@nongnu.org
---
 hw/block/dataplane/virtio-blk.c | 112 ++++++++----------
 hw/block/dataplane/virtio-blk.h |   2 +-
 hw/block/virtio-blk.c           | 256 ++++++++++++++++++++++++++++------------
 include/hw/virtio/virtio-blk.h  |  38 +++++-
 4 files changed, 264 insertions(+), 144 deletions(-)
diff mbox

Patch

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index cebeed4..ec2967d 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -32,49 +32,23 @@  struct VirtIOBlockDataPlane {
 
     VirtIOBlkConf *conf;
     VirtIODevice *vdev;
-    QEMUBH *bh;                     /* bh for guest notification */
-    unsigned long *batch_notify_vqs;
-
-    /* Note that these EventNotifiers are assigned by value.  This is
-     * fine as long as you do not call event_notifier_cleanup on them
-     * (because you don't own the file descriptor or handle; you just
-     * use it).
-     */
-    IOThread *iothread;
-    AioContext *ctx;
 };
 
 /* Raise an interrupt to signal guest, if necessary */
-void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq)
+void virtio_blk_data_plane_notify(VirtQueueCtx *vq_ctx)
 {
-    set_bit(virtio_get_queue_index(vq), s->batch_notify_vqs);
-    qemu_bh_schedule(s->bh);
+    qemu_bh_schedule(vq_ctx->bh);
 }
 
 static void notify_guest_bh(void *opaque)
 {
-    VirtIOBlockDataPlane *s = opaque;
-    unsigned nvqs = s->conf->num_queues;
-    unsigned long bitmap[BITS_TO_LONGS(nvqs)];
-    unsigned j;
-
-    memcpy(bitmap, s->batch_notify_vqs, sizeof(bitmap));
-    memset(s->batch_notify_vqs, 0, sizeof(bitmap));
-
-    for (j = 0; j < nvqs; j += BITS_PER_LONG) {
-        unsigned long bits = bitmap[j];
-
-        while (bits != 0) {
-            unsigned i = j + ctzl(bits);
-            VirtQueue *vq = virtio_get_queue(s->vdev, i);
+    VirtQueueCtx *vq_ctx = opaque;
 
-            if (virtio_should_notify(s->vdev, vq)) {
-                event_notifier_set(virtio_queue_get_guest_notifier(vq));
-            }
-
-            bits &= bits - 1; /* clear right-most bit */
-        }
+    if (!virtio_should_notify(VIRTIO_DEVICE(vq_ctx->s), vq_ctx->vq)) {
+        return;
     }
+
+    event_notifier_set(virtio_queue_get_guest_notifier(vq_ctx->vq));
 }
 
 /* Context: QEMU global mutex held */
@@ -83,6 +57,10 @@  void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
                                   Error **errp)
 {
     VirtIOBlockDataPlane *s;
+    VirtQueueCtx *vq_ctx;
+    VirtIOBlock *vblk = (VirtIOBlock *)vdev;
+    /* Use first block, others should be the same */
+    BlockBackend *blk0 = vblk->mq[0].blk;
     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
 
@@ -103,7 +81,7 @@  void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
     /* If dataplane is (re-)enabled while the guest is running there could be
      * block jobs that can conflict.
      */
-    if (blk_op_is_blocked(conf->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) {
+    if (blk_op_is_blocked(blk0, BLOCK_OP_TYPE_DATAPLANE, errp)) {
         error_prepend(errp, "cannot start dataplane thread: ");
         return;
     }
@@ -112,13 +90,12 @@  void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
     s->vdev = vdev;
     s->conf = conf;
 
-    if (conf->iothread) {
-        s->iothread = conf->iothread;
-        object_ref(OBJECT(s->iothread));
+    for_each_valid_vq_ctx(vblk, vq_ctx) {
+        object_ref(OBJECT(vq_ctx->iothread));
+        vq_ctx->bh = aio_bh_new(iothread_get_aio_context(vq_ctx->iothread),
+                                notify_guest_bh, vq_ctx);
+        assert(vq_ctx->bh);
     }
-    s->ctx = iothread_get_aio_context(s->iothread);
-    s->bh = aio_bh_new(s->ctx, notify_guest_bh, s);
-    s->batch_notify_vqs = bitmap_new(conf->num_queues);
 
     *dataplane = s;
 }
@@ -126,14 +103,18 @@  void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
 /* Context: QEMU global mutex held */
 void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
 {
+    VirtQueueCtx *vq_ctx;
+    VirtIOBlock *vblk = (VirtIOBlock *)s->vdev;
+
     if (!s) {
         return;
     }
 
     virtio_blk_data_plane_stop(s);
-    g_free(s->batch_notify_vqs);
-    qemu_bh_delete(s->bh);
-    object_unref(OBJECT(s->iothread));
+    for_each_valid_vq_ctx(vblk, vq_ctx) {
+        qemu_bh_delete(vq_ctx->bh);
+        object_unref(OBJECT(vq_ctx->iothread));
+    }
     g_free(s);
 }
 
@@ -154,9 +135,9 @@  void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s->vdev)));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
     VirtIOBlock *vblk = VIRTIO_BLK(s->vdev);
-    unsigned i;
     unsigned nvqs = s->conf->num_queues;
-    int r;
+    VirtQueueCtx *vq_ctx;
+    int r, i;
 
     if (vblk->dataplane_started || s->starting) {
         return;
@@ -188,24 +169,24 @@  void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
     vblk->dataplane_started = true;
     trace_virtio_blk_data_plane_start(s);
 
-    blk_set_aio_context(s->conf->conf.blk, s->ctx);
-
     /* Kick right away to begin processing requests already in vring */
-    for (i = 0; i < nvqs; i++) {
-        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+    for_each_valid_vq_ctx(vblk, vq_ctx) {
+        AioContext *ctx = iothread_get_aio_context(vq_ctx->iothread);
 
-        event_notifier_set(virtio_queue_get_host_notifier(vq));
+        blk_set_aio_context(vq_ctx->blk, ctx);
+        event_notifier_set(virtio_queue_get_host_notifier(vq_ctx->vq));
     }
 
     /* Get this show started by hooking up our callbacks */
-    aio_context_acquire(s->ctx);
-    for (i = 0; i < nvqs; i++) {
-        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+    for_each_valid_vq_ctx(vblk, vq_ctx) {
+        AioContext *ctx = iothread_get_aio_context(vq_ctx->iothread);
+        VirtQueue *vq = vq_ctx->vq;
 
-        virtio_queue_aio_set_host_notifier_handler(vq, s->ctx,
+        aio_context_acquire(ctx);
+        virtio_queue_aio_set_host_notifier_handler(vq, ctx,
                 virtio_blk_data_plane_handle_output);
+        aio_context_release(ctx);
     }
-    aio_context_release(s->ctx);
     return;
 
   fail_guest_notifiers:
@@ -220,8 +201,9 @@  void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s->vdev)));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
     VirtIOBlock *vblk = VIRTIO_BLK(s->vdev);
-    unsigned i;
     unsigned nvqs = s->conf->num_queues;
+    unsigned i;
+    VirtQueueCtx *vq_ctx;
 
     if (!vblk->dataplane_started || s->stopping) {
         return;
@@ -236,19 +218,19 @@  void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
     s->stopping = true;
     trace_virtio_blk_data_plane_stop(s);
 
-    aio_context_acquire(s->ctx);
-
     /* Stop notifications for new requests from guest */
-    for (i = 0; i < nvqs; i++) {
-        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+    for_each_valid_vq_ctx(vblk, vq_ctx) {
+        AioContext *ctx = iothread_get_aio_context(vq_ctx->iothread);
+        VirtQueue *vq = vq_ctx->vq;
 
-        virtio_queue_aio_set_host_notifier_handler(vq, s->ctx, NULL);
-    }
+        aio_context_acquire(ctx);
+        virtio_queue_aio_set_host_notifier_handler(vq, ctx, NULL);
 
-    /* Drain and switch bs back to the QEMU main loop */
-    blk_set_aio_context(s->conf->conf.blk, qemu_get_aio_context());
+        /* Drain and switch bs back to the QEMU main loop */
+        blk_set_aio_context(vq_ctx->blk, qemu_get_aio_context());
 
-    aio_context_release(s->ctx);
+        aio_context_release(ctx);
+    }
 
     for (i = 0; i < nvqs; i++) {
         k->set_host_notifier(qbus->parent, i, false);
diff --git a/hw/block/dataplane/virtio-blk.h b/hw/block/dataplane/virtio-blk.h
index b1f0b95..a29deed 100644
--- a/hw/block/dataplane/virtio-blk.h
+++ b/hw/block/dataplane/virtio-blk.h
@@ -26,6 +26,6 @@  void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s);
 void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s);
 void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s);
 void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s);
-void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq);
+void virtio_blk_data_plane_notify(VirtQueueCtx *vq_ctx);
 
 #endif /* HW_DATAPLANE_VIRTIO_BLK_H */
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 6bdbca4..bd6a80b 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -28,6 +28,8 @@ 
 #endif
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
+#include "qom/object_interfaces.h"
+#include "block/block_int.h"
 
 void virtio_blk_init_request(VirtIOBlock *s, VirtQueue *vq,
                              VirtIOBlockReq *req)
@@ -50,6 +52,7 @@  void virtio_blk_free_request(VirtIOBlockReq *req)
 static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
 {
     VirtIOBlock *s = req->dev;
+    VirtQueueCtx *vq_ctx = virtio_req_get_mq_ctx(req);
     VirtIODevice *vdev = VIRTIO_DEVICE(s);
 
     trace_virtio_blk_req_complete(req, status);
@@ -57,7 +60,7 @@  static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
     stb_p(&req->in->status, status);
     virtqueue_push(req->vq, &req->elem, req->in_len);
     if (s->dataplane_started && !s->dataplane_disabled) {
-        virtio_blk_data_plane_notify(s->dataplane, req->vq);
+        virtio_blk_data_plane_notify(vq_ctx);
     } else {
         virtio_notify(vdev, req->vq);
     }
@@ -66,23 +69,23 @@  static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
     bool is_read)
 {
-    BlockErrorAction action = blk_get_error_action(req->dev->blk,
+    VirtQueueCtx *vq_ctx = virtio_req_get_mq_ctx(req);
+    BlockErrorAction action = blk_get_error_action(vq_ctx->blk,
                                                    is_read, error);
-    VirtIOBlock *s = req->dev;
 
     if (action == BLOCK_ERROR_ACTION_STOP) {
         /* Break the link as the next request is going to be parsed from the
          * ring again. Otherwise we may end up doing a double completion! */
         req->mr_next = NULL;
-        req->next = s->rq;
-        s->rq = req;
+        req->next = vq_ctx->rq;
+        vq_ctx->rq = req;
     } else if (action == BLOCK_ERROR_ACTION_REPORT) {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        block_acct_failed(blk_get_stats(s->blk), &req->acct);
+        block_acct_failed(blk_get_stats(vq_ctx->blk), &req->acct);
         virtio_blk_free_request(req);
     }
 
-    blk_error_action(s->blk, action, is_read, error);
+    blk_error_action(vq_ctx->blk, action, is_read, error);
     return action != BLOCK_ERROR_ACTION_IGNORE;
 }
 
@@ -92,6 +95,8 @@  static void virtio_blk_rw_complete(void *opaque, int ret)
 
     while (next) {
         VirtIOBlockReq *req = next;
+        VirtQueueCtx *vq_ctx = virtio_req_get_mq_ctx(req);
+
         next = req->mr_next;
         trace_virtio_blk_rw_complete(req, ret);
 
@@ -119,7 +124,7 @@  static void virtio_blk_rw_complete(void *opaque, int ret)
         }
 
         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
-        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
+        block_acct_done(blk_get_stats(vq_ctx->blk), &req->acct);
         virtio_blk_free_request(req);
     }
 }
@@ -127,6 +132,7 @@  static void virtio_blk_rw_complete(void *opaque, int ret)
 static void virtio_blk_flush_complete(void *opaque, int ret)
 {
     VirtIOBlockReq *req = opaque;
+    VirtQueueCtx *vq_ctx = virtio_req_get_mq_ctx(req);
 
     if (ret) {
         if (virtio_blk_handle_rw_error(req, -ret, 0)) {
@@ -135,7 +141,7 @@  static void virtio_blk_flush_complete(void *opaque, int ret)
     }
 
     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
-    block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
+    block_acct_done(blk_get_stats(vq_ctx->blk), &req->acct);
     virtio_blk_free_request(req);
 }
 
@@ -206,6 +212,7 @@  static int virtio_blk_handle_scsi_req(VirtIOBlockReq *req)
     VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
     VirtQueueElement *elem = &req->elem;
     VirtIOBlock *blk = req->dev;
+    VirtQueueCtx *vq_ctx = virtio_req_get_mq_ctx(req);
 
 #ifdef __linux__
     int i;
@@ -288,7 +295,7 @@  static int virtio_blk_handle_scsi_req(VirtIOBlockReq *req)
     ioctl_req->hdr.sbp = elem->in_sg[elem->in_num - 3].iov_base;
     ioctl_req->hdr.mx_sb_len = elem->in_sg[elem->in_num - 3].iov_len;
 
-    acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->hdr,
+    acb = blk_aio_ioctl(vq_ctx->blk, SG_IO, &ioctl_req->hdr,
                         virtio_blk_ioctl_complete, ioctl_req);
     if (!acb) {
         g_free(ioctl_req);
@@ -386,6 +393,7 @@  void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
     int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
     int max_xfer_len = 0;
     int64_t sector_num = 0;
+    VirtQueueCtx *vq_ctx;
 
     if (mrb->num_reqs == 1) {
         submit_requests(blk, mrb, 0, 1, -1);
@@ -393,7 +401,8 @@  void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
         return;
     }
 
-    max_xfer_len = blk_get_max_transfer_length(mrb->reqs[0]->dev->blk);
+    vq_ctx = virtio_req_get_mq_ctx(mrb->reqs[0]);
+    max_xfer_len = blk_get_max_transfer_length(vq_ctx->blk);
     max_xfer_len = MIN_NON_ZERO(max_xfer_len, BDRV_REQUEST_MAX_SECTORS);
 
     qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
@@ -434,16 +443,18 @@  void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
 
 static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
-    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, 0,
+    VirtQueueCtx *vq_ctx = virtio_req_get_mq_ctx(req);
+
+    block_acct_start(blk_get_stats(vq_ctx->blk), &req->acct, 0,
                      BLOCK_ACCT_FLUSH);
 
     /*
      * Make sure all outstanding writes are posted to the backing device.
      */
     if (mrb->is_write && mrb->num_reqs > 0) {
-        virtio_blk_submit_multireq(req->dev->blk, mrb);
+        virtio_blk_submit_multireq(vq_ctx->blk, mrb);
     }
-    blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
+    blk_aio_flush(vq_ctx->blk, virtio_blk_flush_complete, req);
 }
 
 static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
@@ -451,6 +462,8 @@  static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
 {
     uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
     uint64_t total_sectors;
+    /* Use first block, others should be the same */
+    BlockBackend *blk0 = dev->mq[0].blk;
 
     if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return false;
@@ -461,7 +474,7 @@  static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
     if (size % dev->conf.conf.logical_block_size) {
         return false;
     }
-    blk_get_geometry(dev->blk, &total_sectors);
+    blk_get_geometry(blk0, &total_sectors);
     if (sector > total_sectors || nb_sectors > total_sectors - sector) {
         return false;
     }
@@ -475,6 +488,7 @@  void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     struct iovec *iov = req->elem.out_sg;
     unsigned in_num = req->elem.in_num;
     unsigned out_num = req->elem.out_num;
+    VirtQueueCtx *vq_ctx = virtio_req_get_mq_ctx(req);
 
     if (req->elem.out_num < 1 || req->elem.in_num < 1) {
         error_report("virtio-blk missing headers");
@@ -526,13 +540,13 @@  void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         if (!virtio_blk_sect_range_ok(req->dev, req->sector_num,
                                       req->qiov.size)) {
             virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-            block_acct_invalid(blk_get_stats(req->dev->blk),
+            block_acct_invalid(blk_get_stats(vq_ctx->blk),
                                is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
             virtio_blk_free_request(req);
             return;
         }
 
-        block_acct_start(blk_get_stats(req->dev->blk),
+        block_acct_start(blk_get_stats(vq_ctx->blk),
                          &req->acct, req->qiov.size,
                          is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
 
@@ -541,7 +555,7 @@  void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
                                   is_write != mrb->is_write ||
                                   !req->dev->conf.request_merging)) {
-            virtio_blk_submit_multireq(req->dev->blk, mrb);
+            virtio_blk_submit_multireq(vq_ctx->blk, mrb);
         }
 
         assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS);
@@ -580,20 +594,21 @@  void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 
 void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
 {
+    VirtQueueCtx *vq_ctx = virtio_vq_get_mq_ctx(s, vq);
     VirtIOBlockReq *req;
     MultiReqBuffer mrb = {};
 
-    blk_io_plug(s->blk);
+    blk_io_plug(vq_ctx->blk);
 
     while ((req = virtio_blk_get_request(s, vq))) {
         virtio_blk_handle_request(req, &mrb);
     }
 
     if (mrb.num_reqs) {
-        virtio_blk_submit_multireq(s->blk, &mrb);
+        virtio_blk_submit_multireq(vq_ctx->blk, &mrb);
     }
 
-    blk_io_unplug(s->blk);
+    blk_io_unplug(vq_ctx->blk);
 }
 
 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
@@ -614,14 +629,14 @@  static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 
 static void virtio_blk_dma_restart_bh(void *opaque)
 {
-    VirtIOBlock *s = opaque;
-    VirtIOBlockReq *req = s->rq;
+    VirtQueueCtx *vq_ctx = opaque;
+    VirtIOBlockReq *req = vq_ctx->rq;
     MultiReqBuffer mrb = {};
 
-    qemu_bh_delete(s->bh);
-    s->bh = NULL;
+    qemu_bh_delete(vq_ctx->bh);
+    vq_ctx->bh = NULL;
 
-    s->rq = NULL;
+    vq_ctx->rq = NULL;
 
     while (req) {
         VirtIOBlockReq *next = req->next;
@@ -630,7 +645,7 @@  static void virtio_blk_dma_restart_bh(void *opaque)
     }
 
     if (mrb.num_reqs) {
-        virtio_blk_submit_multireq(s->blk, &mrb);
+        virtio_blk_submit_multireq(vq_ctx->blk, &mrb);
     }
 }
 
@@ -638,15 +653,20 @@  static void virtio_blk_dma_restart_cb(void *opaque, int running,
                                       RunState state)
 {
     VirtIOBlock *s = opaque;
+    VirtQueueCtx *vq_ctx;
 
     if (!running) {
         return;
     }
 
-    if (!s->bh) {
-        s->bh = aio_bh_new(blk_get_aio_context(s->conf.conf.blk),
-                           virtio_blk_dma_restart_bh, s);
-        qemu_bh_schedule(s->bh);
+    for_each_valid_vq_ctx(s, vq_ctx) {
+        if (vq_ctx->bh)
+            continue;
+
+        vq_ctx->bh = aio_bh_new(blk_get_aio_context(vq_ctx->blk),
+                                virtio_blk_dma_restart_bh, vq_ctx);
+        assert(vq_ctx->bh);
+        qemu_bh_schedule(vq_ctx->bh);
     }
 }
 
@@ -654,21 +674,27 @@  static void virtio_blk_reset(VirtIODevice *vdev)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     AioContext *ctx;
+    VirtQueueCtx *vq_ctx;
 
-    /*
-     * This should cancel pending requests, but can't do nicely until there
-     * are per-device request lists.
-     */
-    ctx = blk_get_aio_context(s->blk);
-    aio_context_acquire(ctx);
-    blk_drain(s->blk);
+    /* Acquire all aio contexts and drain all underlying backends */
+    for_each_valid_vq_ctx(s, vq_ctx) {
+        ctx = blk_get_aio_context(vq_ctx->blk);
+        aio_context_acquire(ctx);
+        blk_drain(vq_ctx->blk);
+    }
 
+    /* Stop dataplane if any */
     if (s->dataplane) {
         virtio_blk_data_plane_stop(s->dataplane);
     }
-    aio_context_release(ctx);
 
-    blk_set_enable_write_cache(s->blk, s->original_wce);
+    /* Release aio contexts */
+    for_each_valid_vq_ctx(s, vq_ctx) {
+        ctx = blk_get_aio_context(vq_ctx->blk);
+        aio_context_release(ctx);
+
+        blk_set_enable_write_cache(vq_ctx->blk, s->original_wce);
+    }
 }
 
 /* coalesce internal state, copy to pci i/o region 0
@@ -680,8 +706,10 @@  static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
     struct virtio_blk_config blkcfg;
     uint64_t capacity;
     int blk_size = conf->logical_block_size;
+    /* Use first block, others should be the same */
+    BlockBackend *blk0 = s->mq[0].blk;
 
-    blk_get_geometry(s->blk, &capacity);
+    blk_get_geometry(blk0, &capacity);
     memset(&blkcfg, 0, sizeof(blkcfg));
     virtio_stq_p(vdev, &blkcfg.capacity, capacity);
     virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
@@ -701,7 +729,7 @@  static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
      * divided by 512 - instead it is the amount of blk_size blocks
      * per track (cylinder).
      */
-    if (blk_getlength(s->blk) /  conf->heads / conf->secs % blk_size) {
+    if (blk_getlength(blk0) /  conf->heads / conf->secs % blk_size) {
         blkcfg.geometry.sectors = conf->secs & ~s->sector_mask;
     } else {
         blkcfg.geometry.sectors = conf->secs;
@@ -709,7 +737,7 @@  static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
     blkcfg.size_max = 0;
     blkcfg.physical_block_exp = get_physical_block_exp(conf);
     blkcfg.alignment_offset = 0;
-    blkcfg.wce = blk_enable_write_cache(s->blk);
+    blkcfg.wce = blk_enable_write_cache(blk0);
     virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
     memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
 }
@@ -718,18 +746,23 @@  static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     struct virtio_blk_config blkcfg;
+    VirtQueueCtx *vq_ctx;
 
     memcpy(&blkcfg, config, sizeof(blkcfg));
 
-    aio_context_acquire(blk_get_aio_context(s->blk));
-    blk_set_enable_write_cache(s->blk, blkcfg.wce != 0);
-    aio_context_release(blk_get_aio_context(s->blk));
+    for_each_valid_vq_ctx(s, vq_ctx) {
+        aio_context_acquire(blk_get_aio_context(vq_ctx->blk));
+        blk_set_enable_write_cache(vq_ctx->blk, blkcfg.wce != 0);
+        aio_context_release(blk_get_aio_context(vq_ctx->blk));
+    }
 }
 
 static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
                                         Error **errp)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
+    /* Use first block, others should be the same */
+    BlockBackend *blk0 = s->mq[0].blk;
 
     virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
     virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
@@ -748,10 +781,10 @@  static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
     if (s->conf.config_wce) {
         virtio_add_feature(&features, VIRTIO_BLK_F_CONFIG_WCE);
     }
-    if (blk_enable_write_cache(s->blk)) {
+    if (blk_enable_write_cache(blk0)) {
         virtio_add_feature(&features, VIRTIO_BLK_F_WCE);
     }
-    if (blk_is_read_only(s->blk)) {
+    if (blk_is_read_only(blk0)) {
         virtio_add_feature(&features, VIRTIO_BLK_F_RO);
     }
     if (s->conf.num_queues > 1) {
@@ -787,14 +820,18 @@  static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
      *     Guest writes 1 to the WCE configuration field (writeback mode)
      *     Guest sets DRIVER_OK bit in status field
      *
-     * s->blk would erroneously be placed in writethrough mode.
+     * vq_ctx->blk would erroneously be placed in writethrough mode.
      */
     if (!virtio_vdev_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) {
-        aio_context_acquire(blk_get_aio_context(s->blk));
-        blk_set_enable_write_cache(s->blk,
-                                   virtio_vdev_has_feature(vdev,
+        VirtQueueCtx *vq_ctx;
+
+        for_each_valid_vq_ctx(s, vq_ctx) {
+            aio_context_acquire(blk_get_aio_context(vq_ctx->blk));
+            blk_set_enable_write_cache(vq_ctx->blk,
+                                       virtio_vdev_has_feature(vdev,
                                                            VIRTIO_BLK_F_WCE));
-        aio_context_release(blk_get_aio_context(s->blk));
+            aio_context_release(blk_get_aio_context(vq_ctx->blk));
+        }
     }
 }
 
@@ -809,21 +846,26 @@  static void virtio_blk_save(QEMUFile *f, void *opaque)
 
     virtio_save(vdev, f);
 }
-    
+
 static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
-    VirtIOBlockReq *req = s->rq;
+    VirtIOBlockReq *req;
+    VirtQueueCtx *vq_ctx;
 
-    while (req) {
-        qemu_put_sbyte(f, 1);
+    for_each_valid_vq_ctx(s, vq_ctx) {
+        req = vq_ctx->rq;
 
-        if (s->conf.num_queues > 1) {
-            qemu_put_be32(f, virtio_get_queue_index(req->vq));
-        }
+        while (req) {
+            qemu_put_sbyte(f, 1);
 
-        qemu_put_virtqueue_element(f, &req->elem);
-        req = req->next;
+            if (s->conf.num_queues > 1) {
+                qemu_put_be32(f, virtio_get_queue_index(req->vq));
+            }
+
+            qemu_put_virtqueue_element(f, &req->elem);
+            req = req->next;
+        }
     }
     qemu_put_sbyte(f, 0);
 }
@@ -843,6 +885,7 @@  static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
                                   int version_id)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
+    VirtQueueCtx *vq_ctx;
 
     while (qemu_get_sbyte(f)) {
         unsigned nvqs = s->conf.num_queues;
@@ -861,8 +904,9 @@  static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
 
         req = qemu_get_virtqueue_element(f, sizeof(VirtIOBlockReq));
         virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req);
-        req->next = s->rq;
-        s->rq = req;
+        vq_ctx = &s->mq[vq_idx];
+        req->next = vq_ctx->rq;
+        vq_ctx->rq = req;
     }
 
     return 0;
@@ -879,20 +923,73 @@  static const BlockDevOps virtio_block_ops = {
     .resize_cb = virtio_blk_resize,
 };
 
+static bool virtio_blk_dup_iothreads_and_drives(VirtIOBlock *s,
+                                                Error **errp)
+{
+    BlockDriverState *bs = blk_bs(s->conf.conf.blk);
+    Error *local_err = NULL;
+    int i;
+
+    s->conf.drives_arr[0] = s->conf.conf.blk;
+
+    if (s->conf.num_queues == 1)
+        return true;
+    if (bs->drv != &bdrv_raw) {
+        error_setg(errp, "multiqueues are supported only for raw block device\n");
+        return false;
+    }
+    if (bs->open_flags & BDRV_O_SNAPSHOT) {
+        error_setg(errp, "multiqueues do not work with snapshots\n");
+        return false;
+    }
+    for (i = !!s->conf.iothread; i < s->conf.num_queues; i++) {
+        IOThread *t;
+        char id[32];
+
+        /* Make unique id for a thread object */
+        snprintf(id, sizeof(id), "%p_%d", s, i);
+        t = (IOThread *)object_new_with_props(TYPE_IOTHREAD,
+                                              object_get_objects_root(),
+                                              id, &local_err, NULL);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return false;
+        }
+        s->conf.iothreads_arr[i] = t;
+    }
+    for (i = 1; i < s->conf.num_queues; i++) {
+        BlockBackend *blk;
+
+        blk = blk_new_open(bs->filename, NULL,
+                           qdict_clone_shallow(bs->full_open_options),
+                           bs->open_flags, &local_err);
+        if (!blk) {
+            error_propagate(errp, local_err);
+            return false;
+        }
+        s->conf.drives_arr[i] = blk;
+    }
+
+    return true;
+}
+
 static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBlock *s = VIRTIO_BLK(dev);
     VirtIOBlkConf *conf = &s->conf;
+    /* Use first block, others should be the same */
+    BlockBackend *blk0 = s->conf.conf.blk;
+    VirtQueueCtx *vq_ctx;
     Error *err = NULL;
     static int virtio_blk_id;
     unsigned i;
 
-    if (!conf->conf.blk) {
+    if (!blk0) {
         error_setg(errp, "drive property not set");
         return;
     }
-    if (!blk_is_inserted(conf->conf.blk)) {
+    if (!blk_is_inserted(blk0)) {
         error_setg(errp, "Device needs media, but drive is empty");
         return;
     }
@@ -900,9 +997,12 @@  static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
         error_setg(errp, "num-queues property must be larger than 0");
         return;
     }
+    if (!virtio_blk_dup_iothreads_and_drives(s, errp)) {
+        return;
+    }
 
     blkconf_serial(&conf->conf, &conf->serial);
-    s->original_wce = blk_enable_write_cache(conf->conf.blk);
+    s->original_wce = blk_enable_write_cache(blk0);
     blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, &err);
     if (err) {
         error_propagate(errp, err);
@@ -913,12 +1013,16 @@  static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
     virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
                 sizeof(struct virtio_blk_config));
 
-    s->blk = conf->conf.blk;
-    s->rq = NULL;
     s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
 
+    /* Init MQ contexts */
     for (i = 0; i < conf->num_queues; i++) {
-        virtio_add_queue(vdev, 128, virtio_blk_handle_output);
+        vq_ctx = &s->mq[i];
+
+        vq_ctx->blk = s->conf.drives_arr[i];
+        vq_ctx->s   = s;
+        vq_ctx->vq  = virtio_add_queue(vdev, 128, virtio_blk_handle_output);
+        vq_ctx->iothread = s->conf.iothreads_arr[i];
     }
     virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
     if (err != NULL) {
@@ -930,22 +1034,26 @@  static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
     s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
     register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
                     virtio_blk_save, virtio_blk_load, s);
-    blk_set_dev_ops(s->blk, &virtio_block_ops, s);
-    blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);
+    for_each_valid_vq_ctx(s, vq_ctx) {
+        blk_set_dev_ops(vq_ctx->blk, &virtio_block_ops, s);
+        blk_set_guest_block_size(vq_ctx->blk, s->conf.conf.logical_block_size);
 
-    blk_iostatus_enable(s->blk);
+        blk_iostatus_enable(vq_ctx->blk);
+    }
 }
 
 static void virtio_blk_device_unrealize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBlock *s = VIRTIO_BLK(dev);
+    VirtQueueCtx *vq_ctx;
 
     virtio_blk_data_plane_destroy(s->dataplane);
     s->dataplane = NULL;
     qemu_del_vm_change_state_handler(s->change);
     unregister_savevm(dev, "virtio-blk", s);
-    blockdev_mark_auto_del(s->blk);
+    for_each_valid_vq_ctx(s, vq_ctx)
+        blockdev_mark_auto_del(vq_ctx->blk);
     virtio_cleanup(vdev);
 }
 
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index e9bf463..0542c23 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -33,7 +33,11 @@  struct virtio_blk_inhdr
 struct VirtIOBlkConf
 {
     BlockConf conf;
-    IOThread *iothread;
+    union {
+        IOThread *iothread; /* kept for compatibility */
+        IOThread *iothreads_arr[VIRTIO_QUEUE_MAX];
+    };
+    BlockBackend *drives_arr[VIRTIO_QUEUE_MAX];
     char *serial;
     uint32_t scsi;
     uint32_t config_wce;
@@ -41,14 +45,21 @@  struct VirtIOBlkConf
     uint16_t num_queues;
 };
 
+typedef struct VirtQueueCtx {
+    BlockBackend *blk;
+    struct VirtIOBlock *s;
+    VirtQueue *vq;
+    void *rq;
+    QEMUBH *bh;
+    IOThread *iothread;
+} VirtQueueCtx;
+
 struct VirtIOBlockDataPlane;
 
 struct VirtIOBlockReq;
 typedef struct VirtIOBlock {
     VirtIODevice parent_obj;
-    BlockBackend *blk;
-    void *rq;
-    QEMUBH *bh;
+    VirtQueueCtx mq[VIRTIO_QUEUE_MAX];
     VirtIOBlkConf conf;
     unsigned short sector_mask;
     bool original_wce;
@@ -58,6 +69,11 @@  typedef struct VirtIOBlock {
     struct VirtIOBlockDataPlane *dataplane;
 } VirtIOBlock;
 
+#define for_each_valid_vq_ctx(s, vq_ctx)                    \
+    for (vq_ctx = &s->mq[0];                                \
+         vq_ctx < s->mq + ARRAY_SIZE(s->mq) && vq_ctx->blk; \
+         vq_ctx++)                                          \
+
 typedef struct VirtIOBlockReq {
     VirtQueueElement elem;
     int64_t sector_num;
@@ -72,6 +88,20 @@  typedef struct VirtIOBlockReq {
     BlockAcctCookie acct;
 } VirtIOBlockReq;
 
+static inline VirtQueueCtx *virtio_vq_get_mq_ctx(VirtIOBlock *s, VirtQueue *vq)
+{
+    uint16_t qi = virtio_get_queue_index(vq);
+
+    return &s->mq[qi];
+}
+
+static inline VirtQueueCtx *virtio_req_get_mq_ctx(VirtIOBlockReq *req)
+{
+    uint16_t qi = virtio_get_queue_index(req->vq);
+
+    return &req->dev->mq[qi];
+}
+
 #define VIRTIO_BLK_MAX_MERGE_REQS 32
 
 typedef struct MultiReqBuffer {