new file mode 100644
@@ -0,0 +1,79 @@
+#ifndef EVENT_POLL_H
+#define EVENT_POLL_H
+
+#include <sys/epoll.h>
+#include "event_notifier.h"
+
+/* An event source paired with the function event_poll() runs when it fires */
+typedef struct EventHandler EventHandler;
+
+/* Callback signature; receives the handler whose notifier became readable */
+typedef void EventCallback(EventHandler *handler);
+
+struct EventHandler {
+    /* Notifier (eventfd) whose file descriptor is registered with epoll */
+    EventNotifier *notifier;
+    /* Function invoked after the notifier has been cleared */
+    EventCallback *callback;
+};
+
+/* Poller state: wraps the epoll instance that handlers are registered with */
+typedef struct {
+    /* Descriptor from epoll_create1(); set to -1 by event_poll_cleanup() */
+    int epoll_fd;
+} EventPoll;
+
+/* Prepare @poll for use by creating its epoll instance.
+ *
+ * The descriptor is opened close-on-exec.  Failure is fatal: a message is
+ * printed to stderr and the process exits with status 1.
+ */
+static void event_poll_init(EventPoll *poll)
+{
+    int fd = epoll_create1(EPOLL_CLOEXEC);
+
+    poll->epoll_fd = fd;
+    if (fd >= 0) {
+        return;
+    }
+
+    fprintf(stderr, "epoll_create1 failed: %m\n");
+    exit(1);
+}
+
+/* Release the epoll instance owned by @poll.
+ *
+ * The stored descriptor is set to -1 once it has been closed.
+ */
+static void event_poll_cleanup(EventPoll *poll)
+{
+    int *fdp = &poll->epoll_fd;
+
+    close(*fdp);
+    *fdp = -1;
+}
+
+/* Register @notifier with @poll and arrange for @callback to run on events.
+ *
+ * @handler is caller-provided storage that is filled in here and handed to
+ * epoll as the event's user data; event_poll() dereferences it later, so it
+ * must stay valid for as long as the notifier remains registered.
+ * Registration failure is fatal: a message is printed and the process exits.
+ */
+static void event_poll_add(EventPoll *poll, EventHandler *handler,
+                           EventNotifier *notifier, EventCallback *callback)
+{
+    struct epoll_event ev = { .events = EPOLLIN };
+    int fd;
+
+    ev.data.ptr = handler;
+
+    handler->notifier = notifier;
+    handler->callback = callback;
+
+    fd = event_notifier_get_fd(notifier);
+    if (epoll_ctl(poll->epoll_fd, EPOLL_CTL_ADD, fd, &ev) == 0) {
+        return;
+    }
+
+    fprintf(stderr, "failed to add event handler to epoll: %m\n");
+    exit(1);
+}
+
+/* Wait for a single event and dispatch it.
+ *
+ * Blocks without a timeout in epoll_wait(), then clears the notifier that
+ * fired and calls the callback registered for it.  Only one event is handled
+ * per call to keep the function simple; this could be changed later.
+ *
+ * Signals must be masked in the calling thread so that EINTR never happens.
+ * This is true for QEMU threads.  Any epoll_wait() result other than exactly
+ * one event is fatal.
+ */
+static void event_poll(EventPoll *poll)
+{
+    struct epoll_event fired;
+    EventHandler *h;
+
+    if (unlikely(epoll_wait(poll->epoll_fd, &fired, 1, -1) != 1)) {
+        fprintf(stderr, "epoll_wait failed: %m\n");
+        exit(1); /* should never happen */
+    }
+
+    /* The user data stored by event_poll_add() identifies the handler */
+    h = fired.data.ptr;
+
+    /* Clear the eventfd before running the callback */
+    event_notifier_test_and_clear(h->notifier);
+    h->callback(h);
+}
+
+#endif /* EVENT_POLL_H */
new file mode 100644
@@ -0,0 +1,191 @@
+#ifndef VRING_H
+#define VRING_H
+
+#include <linux/virtio_ring.h>
+#include "qemu-common.h"
+
+/* Host-side view of one guest virtqueue ring */
+typedef struct {
+    /* Host pointer corresponding to guest physical address zero */
+    void *phys_mem_zero_host_ptr;
+    /* Virtqueue vring (desc/avail/used) mapped to host memory */
+    struct vring vr;
+    /* Avail ring index that vring_pop() will consume next */
+    __u16 last_avail_idx;
+    /* Used ring index that vring_push() will fill next */
+    __u16 last_used_idx;
+} Vring;
+
+/* Return the ring size, i.e. the num field of the mapped vring */
+static inline unsigned int vring_get_num(Vring *vring)
+{
+    const struct vring *ring = &vring->vr;
+
+    return ring->num;
+}
+
+/* Map target physical address to host address.
+ *
+ * Addresses at or above 4 GB are shifted down by the size of the
+ * 0xe0000000-4 GB PCI memory range before being added to the host base
+ * pointer.  An address that falls inside that range is fatal: a message is
+ * printed to stderr and the process exits with status 1.
+ */
+static inline void *phys_to_host(Vring *vring, target_phys_addr_t phys)
+{
+    const unsigned long long pci_hole_start = 0xe0000000ULL;
+    const unsigned long long pci_hole_end = 0x100000000ULL;
+
+    if (phys >= pci_hole_end) {
+        phys -= pci_hole_end - pci_hole_start;
+    } else if (phys >= pci_hole_start) {
+        /* Cast so the conversion specifier is right for any address width */
+        fprintf(stderr, "phys_to_host bad physical address in PCI range %#llx\n",
+                (unsigned long long)phys);
+        exit(1);
+    }
+
+    /* Byte arithmetic on char * rather than the GNU void * extension */
+    return (char *)vring->phys_mem_zero_host_ptr + phys;
+}
+
+/* Establish the host base pointer used by phys_to_host().
+ *
+ * Only a 4096-byte mapping at guest physical address 0 is requested; the
+ * pointer obtained is stored as the base for later address conversion.
+ * This is a hack for direct access to guest memory, we're not really allowed
+ * to do this.  Failure to map is fatal.
+ */
+static void setup_phys_to_host(Vring *vring)
+{
+    /* RAM is really much larger but we cheat */
+    target_phys_addr_t len = 4096;
+    void *base = cpu_physical_memory_map(0, &len, 0);
+
+    vring->phys_mem_zero_host_ptr = base;
+    if (base) {
+        return;
+    }
+
+    fprintf(stderr, "setup_phys_to_host failed\n");
+    exit(1);
+}
+
+/* Map the guest's vring to host memory.
+ *
+ * Looks up queue @n of @vdev, lays out @vring->vr over the host address of
+ * the ring (4096 is passed to vring_init() as the alignment), and starts the
+ * private avail/used indices from the values currently in the ring.  The
+ * resulting addresses are printed to stderr.
+ *
+ * This is not allowed but we know the ring won't move.
+ */
+static void vring_setup(Vring *vring, VirtIODevice *vdev, int n)
+{
+    target_phys_addr_t ring_addr;
+
+    setup_phys_to_host(vring);
+
+    /* Fetch the guest physical ring address once; it is mapped and logged */
+    ring_addr = virtio_queue_get_ring_addr(vdev, n);
+    vring_init(&vring->vr, virtio_queue_get_num(vdev, n),
+               phys_to_host(vring, ring_addr), 4096);
+
+    vring->last_avail_idx = vring->vr.avail->idx;
+    vring->last_used_idx = vring->vr.used->idx;
+
+    /* Casts make the arguments match %llx and %p exactly */
+    fprintf(stderr, "vring physical=%#llx desc=%p avail=%p used=%p\n",
+            (unsigned long long)ring_addr,
+            (void *)vring->vr.desc, (void *)vring->vr.avail,
+            (void *)vring->vr.used);
+}
+
+/* Fetch the next available buffer from the virtqueue as an iovec array.
+ *
+ * The descriptor chain is walked starting at the head index the guest
+ * published in the avail ring.  Every descriptor becomes one element of
+ * @iov, mapped to a host address.  Since a chain consists of some number of
+ * output descriptors followed by some number of input descriptors, it is
+ * really two iovecs; they are packed back to back into @iov and the number
+ * of each is reported separately.
+ *
+ * @vring:    ring to pop from
+ * @iov:      array that receives one entry per descriptor
+ * @iov_size: number of elements available in @iov
+ * @out_num:  set to the number of descriptors without VRING_DESC_F_WRITE
+ * @in_num:   set to the number of descriptors with VRING_DESC_F_WRITE
+ *
+ * Returns the head descriptor number, or the ring size (which is never a
+ * valid descriptor number) if nothing new is available; in that case
+ * @out_num and @in_num are left untouched.
+ *
+ * There is no error return.  Inconsistent ring data from the guest, an
+ * indirect descriptor, or a chain with more than @iov_size descriptors is
+ * fatal: a message is printed to stderr and the process exits with status 1.
+ *
+ * Stolen from linux-2.6/drivers/vhost/vhost.c.
+ */
+static unsigned int vring_pop(Vring *vring,
+                              struct iovec iov[], unsigned int iov_size,
+                              unsigned int *out_num, unsigned int *in_num)
+{
+    struct vring_desc desc;
+    unsigned int i, head, found = 0, num = vring->vr.num;
+    __u16 avail_idx, last_avail_idx;
+
+    /* Check it isn't doing very strange things with descriptor numbers. */
+    last_avail_idx = vring->last_avail_idx;
+    avail_idx = vring->vr.avail->idx;
+
+    if (unlikely((__u16)(avail_idx - last_avail_idx) > num)) {
+        fprintf(stderr, "Guest moved avail index from %u to %u\n",
+                last_avail_idx, avail_idx);
+        exit(1);
+    }
+
+    /* If there's nothing new since last we looked, return invalid. */
+    if (avail_idx == last_avail_idx) {
+        return num;
+    }
+
+    /* Only get avail ring entries after they have been exposed by guest. */
+    __sync_synchronize(); /* smp_rmb() */
+
+    /* Grab the next descriptor number they're advertising.  The index we've
+     * seen is only incremented once the whole chain has been accepted. */
+    head = vring->vr.avail->ring[last_avail_idx % num];
+
+    /* If their number is silly, that's an error. */
+    if (unlikely(head >= num)) {
+        fprintf(stderr, "Guest says index %u > %u is available\n",
+                head, num);
+        exit(1);
+    }
+
+    /* When we start there are none of either input nor output. */
+    *out_num = *in_num = 0;
+
+    i = head;
+    do {
+        if (unlikely(i >= num)) {
+            fprintf(stderr, "Desc index is %u > %u, head = %u\n",
+                    i, num, head);
+            exit(1);
+        }
+        if (unlikely(++found > num)) {
+            fprintf(stderr, "Loop detected: last one at %u "
+                    "vq size %u head %u\n",
+                    i, num, head);
+            exit(1);
+        }
+        desc = vring->vr.desc[i];
+        if (desc.flags & VRING_DESC_F_INDIRECT) {
+            /* Indirect descriptor tables are not implemented here */
+            fprintf(stderr, "virtio-blk indirect vring not supported\n");
+            exit(1);
+        }
+
+        /* found counts this descriptor too; never write past the end of
+         * the caller's array. */
+        if (unlikely(found > iov_size)) {
+            fprintf(stderr, "Descriptor chain too long for %u iovecs, "
+                    "head = %u\n",
+                    iov_size, head);
+            exit(1);
+        }
+
+        iov->iov_base = phys_to_host(vring, desc.addr);
+        iov->iov_len = desc.len;
+        iov++;
+
+        if (desc.flags & VRING_DESC_F_WRITE) {
+            /* If this is an input descriptor,
+             * increment that count. */
+            *in_num += 1;
+        } else {
+            /* If it's an output descriptor, they're all supposed
+             * to come before any input descriptors. */
+            if (unlikely(*in_num)) {
+                fprintf(stderr, "Descriptor has out after in: "
+                        "idx %u\n", i);
+                exit(1);
+            }
+            *out_num += 1;
+        }
+        i = desc.next;
+    } while (desc.flags & VRING_DESC_F_NEXT);
+
+    /* On success, increment avail index. */
+    vring->last_avail_idx++;
+    return head;
+}
+
+/* Tell the guest that one of its buffers has been used.
+ *
+ * @head is stored as the id of the next used ring element and @len as its
+ * length; the used index is then advanced to publish the element.
+ *
+ * Stolen from linux-2.6/drivers/vhost/vhost.c.
+ */
+static __attribute__((unused)) void vring_push(Vring *vring, unsigned int head, int len)
+{
+    struct vring_used *ring = vring->vr.used;
+    unsigned int slot = vring->last_used_idx % vring->vr.num;
+
+    /* Fill in the next entry of the used ring */
+    ring->ring[slot].id = head;
+    ring->ring[slot].len = len;
+
+    /* The element must be written before the index that exposes it */
+    __sync_synchronize(); /* smp_wmb() */
+
+    vring->last_used_idx++;
+    ring->idx = vring->last_used_idx;
+}
+
+#endif /* VRING_H */
@@ -11,26 +11,21 @@
*
*/
-#include <sys/epoll.h>
-#include <sys/eventfd.h>
#include <libaio.h>
-#include <linux/virtio_ring.h>
#include "qemu-common.h"
#include "qemu-thread.h"
#include "qemu-error.h"
#include "blockdev.h"
#include "virtio-blk.h"
+#include "hw/dataplane/event-poll.h"
+#include "hw/dataplane/vring.h"
+#include "kvm.h"
enum {
- SEG_MAX = 126, /* maximum number of I/O segments */
+ SEG_MAX = 126, /* maximum number of I/O segments */
+ VRING_MAX = SEG_MAX + 2, /* maximum number of vring descriptors */
};
-typedef struct
-{
- EventNotifier *notifier; /* eventfd */
- void (*handler)(void); /* handler function */
-} EventHandler;
-
typedef struct VirtIOBlock
{
VirtIODevice vdev;
@@ -44,15 +39,13 @@ typedef struct VirtIOBlock
bool data_plane_started;
QemuThread data_plane_thread;
- struct vring vring;
+ Vring vring; /* virtqueue vring */
- int epoll_fd; /* epoll(2) file descriptor */
+ EventPoll event_poll; /* event poller */
io_context_t io_ctx; /* Linux AIO context */
EventNotifier io_notifier; /* Linux AIO eventfd */
EventHandler io_handler; /* Linux AIO completion handler */
EventHandler notify_handler; /* virtqueue notify handler */
-
- void *phys_mem_zero_host_ptr; /* host pointer to guest RAM */
} VirtIOBlock;
static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
@@ -60,138 +53,64 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
return (VirtIOBlock *)vdev;
}
-/* Map target physical address to host address
- */
-static inline void *phys_to_host(VirtIOBlock *s, target_phys_addr_t phys)
+static void handle_io(EventHandler *handler)
{
- /* Adjust for 3.6-4 GB PCI memory range */
- if (phys >= 0x100000000) {
- phys -= 0x100000000 - 0xe0000000;
- } else if (phys >= 0xe0000000) {
- fprintf(stderr, "phys_to_host bad physical address in PCI range %#lx\n", phys);
- exit(1);
- }
- return s->phys_mem_zero_host_ptr + phys;
+ fprintf(stderr, "io completion happened\n");
}
-/* Setup for cheap target physical to host address conversion
- *
- * This is a hack for direct access to guest memory, we're not really allowed
- * to do this.
- */
-static void setup_phys_to_host(VirtIOBlock *s)
+static void handle_notify(EventHandler *handler)
{
- target_phys_addr_t len = 4096; /* RAM is really much larger but we cheat */
- s->phys_mem_zero_host_ptr = cpu_physical_memory_map(0, &len, 0);
- if (!s->phys_mem_zero_host_ptr) {
- fprintf(stderr, "setup_phys_to_host failed\n");
- exit(1);
+ VirtIOBlock *s = container_of(handler, VirtIOBlock, notify_handler);
+ struct iovec iov[VRING_MAX];
+ unsigned int out_num, in_num;
+ int head;
+
+ head = vring_pop(&s->vring, iov, ARRAY_SIZE(iov), &out_num, &in_num);
+ if (unlikely(head >= vring_get_num(&s->vring))) {
+ fprintf(stderr, "false alarm, nothing on vring\n");
+ return;
}
-}
-/* Map the guest's vring to host memory
- *
- * This is not allowed but we know the ring won't move.
- */
-static void map_vring(struct vring *vring, VirtIOBlock *s, VirtIODevice *vdev, int n)
-{
- vring->num = virtio_queue_get_num(vdev, n);
- vring->desc = phys_to_host(s, virtio_queue_get_desc_addr(vdev, n));
- vring->avail = phys_to_host(s, virtio_queue_get_avail_addr(vdev, n));
- vring->used = phys_to_host(s, virtio_queue_get_used_addr(vdev, n));
-
- fprintf(stderr, "virtio-blk vring physical=%#lx desc=%p avail=%p used=%p\n",
- virtio_queue_get_ring_addr(vdev, n),
- vring->desc, vring->avail, vring->used);
-}
-
-static void handle_io(void)
-{
- fprintf(stderr, "io completion happened\n");
-}
-
-static void handle_notify(void)
-{
- fprintf(stderr, "virtqueue notify happened\n");
+ fprintf(stderr, "head=%u out_num=%u in_num=%u\n", head, out_num, in_num);
}
static void *data_plane_thread(void *opaque)
{
VirtIOBlock *s = opaque;
- struct epoll_event event;
- int nevents;
- EventHandler *event_handler;
-
- /* Signals are masked, EINTR should never happen */
for (;;) {
- /* Wait for the next event. Only do one event per call to keep the
- * function simple, this could be changed later. */
- nevents = epoll_wait(s->epoll_fd, &event, 1, -1);
- if (unlikely(nevents != 1)) {
- fprintf(stderr, "epoll_wait failed: %m\n");
- continue; /* should never happen */
- }
-
- /* Find out which event handler has become active */
- event_handler = event.data.ptr;
-
- /* Clear the eventfd */
- event_notifier_test_and_clear(event_handler->notifier);
-
- /* Handle the event */
- event_handler->handler();
+ event_poll(&s->event_poll);
}
return NULL;
}
-static void add_event_handler(int epoll_fd, EventHandler *event_handler)
-{
- struct epoll_event event = {
- .events = EPOLLIN,
- .data.ptr = event_handler,
- };
- if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event_notifier_get_fd(event_handler->notifier), &event) != 0) {
- fprintf(stderr, "virtio-blk failed to add event handler to epoll: %m\n");
- exit(1);
- }
-}
-
static void data_plane_start(VirtIOBlock *s)
{
- setup_phys_to_host(s);
- map_vring(&s->vring, s, &s->vdev, 0);
-
- /* Create epoll file descriptor */
- s->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
- if (s->epoll_fd < 0) {
- fprintf(stderr, "epoll_create1 failed: %m\n");
- return; /* TODO error handling */
- }
+ vring_setup(&s->vring, &s->vdev, 0);
+
+ event_poll_init(&s->event_poll);
if (s->vdev.binding->set_host_notifier(s->vdev.binding_opaque, 0, true) != 0) {
- fprintf(stderr, "virtio-blk failed to set host notifier\n");
- return; /* TODO error handling */
+ fprintf(stderr, "virtio-blk failed to set host notifier, ensure -enable-kvm is set\n");
+ exit(1);
}
- s->notify_handler.notifier = virtio_queue_get_host_notifier(s->vq),
- s->notify_handler.handler = handle_notify;
- add_event_handler(s->epoll_fd, &s->notify_handler);
+ event_poll_add(&s->event_poll, &s->notify_handler,
+ virtio_queue_get_host_notifier(s->vq),
+ handle_notify);
/* Create aio context */
if (io_setup(SEG_MAX, &s->io_ctx) != 0) {
fprintf(stderr, "virtio-blk io_setup failed\n");
- return; /* TODO error handling */
+ exit(1);
}
if (event_notifier_init(&s->io_notifier, 0) != 0) {
fprintf(stderr, "virtio-blk io event notifier creation failed\n");
- return; /* TODO error handling */
+ exit(1);
}
- s->io_handler.notifier = &s->io_notifier;
- s->io_handler.handler = handle_io;
- add_event_handler(s->epoll_fd, &s->io_handler);
+ event_poll_add(&s->event_poll, &s->io_handler, &s->io_notifier, handle_io);
qemu_thread_create(&s->data_plane_thread, data_plane_thread, s, QEMU_THREAD_JOINABLE);
@@ -209,7 +128,7 @@ static void data_plane_stop(VirtIOBlock *s)
s->vdev.binding->set_host_notifier(s->vdev.binding_opaque, 0, false);
- close(s->epoll_fd);
+ event_poll_cleanup(&s->event_poll);
}
static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t val)
@@ -317,7 +236,7 @@ VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf,
s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1;
bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);
- s->vq = virtio_add_queue(&s->vdev, SEG_MAX + 2, virtio_blk_handle_output);
+ s->vq = virtio_add_queue(&s->vdev, VRING_MAX, virtio_blk_handle_output);
s->data_plane_started = false;
s->qdev = dev;
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> --- hw/dataplane/event-poll.h | 79 +++++++++++++++++++ hw/dataplane/vring.h | 191 +++++++++++++++++++++++++++++++++++++++++++++ hw/virtio-blk.c | 149 ++++++++--------------------------- 3 files changed, 304 insertions(+), 115 deletions(-) create mode 100644 hw/dataplane/event-poll.h create mode 100644 hw/dataplane/vring.h