diff mbox series

[PULL,v2,10/20] libvduse: Add VDUSE (vDPA Device in Userspace) library

Message ID 20220624154103.185902-11-kwolf@redhat.com (mailing list archive)
State New, archived
Headers show
Series [PULL,v2,01/20] block: drop unused bdrv_co_drain() API | expand

Commit Message

Kevin Wolf June 24, 2022, 3:40 p.m. UTC
From: Xie Yongji <xieyongji@bytedance.com>

VDUSE [1] is a linux framework that makes it possible to implement
software-emulated vDPA devices in userspace. This adds a library
as a subproject to help implementing VDUSE backends in QEMU.

[1] https://www.kernel.org/doc/html/latest/userspace-api/vduse.html

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20220523084611.91-6-xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 meson_options.txt                           |    2 +
 subprojects/libvduse/include/atomic.h       |    1 +
 subprojects/libvduse/include/compiler.h     |    1 +
 subprojects/libvduse/libvduse.h             |  235 ++++
 subprojects/libvduse/libvduse.c             | 1150 +++++++++++++++++++
 MAINTAINERS                                 |    5 +
 meson.build                                 |   15 +
 scripts/meson-buildoptions.sh               |    3 +
 subprojects/libvduse/linux-headers/linux    |    1 +
 subprojects/libvduse/meson.build            |   10 +
 subprojects/libvduse/standard-headers/linux |    1 +
 11 files changed, 1424 insertions(+)
 create mode 120000 subprojects/libvduse/include/atomic.h
 create mode 120000 subprojects/libvduse/include/compiler.h
 create mode 100644 subprojects/libvduse/libvduse.h
 create mode 100644 subprojects/libvduse/libvduse.c
 create mode 120000 subprojects/libvduse/linux-headers/linux
 create mode 100644 subprojects/libvduse/meson.build
 create mode 120000 subprojects/libvduse/standard-headers/linux

Comments

Markus Armbruster June 27, 2022, 4:45 a.m. UTC | #1
Kevin Wolf <kwolf@redhat.com> writes:

> From: Xie Yongji <xieyongji@bytedance.com>
>
> VDUSE [1] is a linux framework that makes it possible to implement
> software-emulated vDPA devices in userspace. This adds a library
> as a subproject to help implementing VDUSE backends in QEMU.
>
> [1] https://www.kernel.org/doc/html/latest/userspace-api/vduse.html
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> Message-Id: <20220523084611.91-6-xieyongji@bytedance.com>
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  meson_options.txt                           |    2 +
>  subprojects/libvduse/include/atomic.h       |    1 +
>  subprojects/libvduse/include/compiler.h     |    1 +
>  subprojects/libvduse/libvduse.h             |  235 ++++
>  subprojects/libvduse/libvduse.c             | 1150 +++++++++++++++++++
>  MAINTAINERS                                 |    5 +
>  meson.build                                 |   15 +
>  scripts/meson-buildoptions.sh               |    3 +
>  subprojects/libvduse/linux-headers/linux    |    1 +
>  subprojects/libvduse/meson.build            |   10 +
>  subprojects/libvduse/standard-headers/linux |    1 +
>  11 files changed, 1424 insertions(+)
>  create mode 120000 subprojects/libvduse/include/atomic.h
>  create mode 120000 subprojects/libvduse/include/compiler.h
>  create mode 100644 subprojects/libvduse/libvduse.h
>  create mode 100644 subprojects/libvduse/libvduse.c
>  create mode 120000 subprojects/libvduse/linux-headers/linux
>  create mode 100644 subprojects/libvduse/meson.build
>  create mode 120000 subprojects/libvduse/standard-headers/linux
>
> diff --git a/meson_options.txt b/meson_options.txt
> index f3e2f22c1e..23a9f440f7 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -257,6 +257,8 @@ option('virtfs', type: 'feature', value: 'auto',
>         description: 'virtio-9p support')
>  option('virtiofsd', type: 'feature', value: 'auto',
>         description: 'build virtiofs daemon (virtiofsd)')
> +option('libvduse', type: 'feature', value: 'auto',
> +       description: 'build VDUSE Library')
>  
>  option('capstone', type: 'feature', value: 'auto',
>         description: 'Whether and how to find the capstone library')
> diff --git a/subprojects/libvduse/include/atomic.h b/subprojects/libvduse/include/atomic.h
> new file mode 120000
> index 0000000000..8c2be64f7b
> --- /dev/null
> +++ b/subprojects/libvduse/include/atomic.h
> @@ -0,0 +1 @@
> +../../../include/qemu/atomic.h
> \ No newline at end of file
> diff --git a/subprojects/libvduse/include/compiler.h b/subprojects/libvduse/include/compiler.h
> new file mode 120000
> index 0000000000..de7b70697c
> --- /dev/null
> +++ b/subprojects/libvduse/include/compiler.h
> @@ -0,0 +1 @@
> +../../../include/qemu/compiler.h
> \ No newline at end of file
> diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h
> new file mode 100644
> index 0000000000..6c2fe98213
> --- /dev/null
> +++ b/subprojects/libvduse/libvduse.h
> @@ -0,0 +1,235 @@
> +/*
> + * VDUSE (vDPA Device in Userspace) library
> + *
> + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
> + *
> + * Author:
> + *   Xie Yongji <xieyongji@bytedance.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef LIBVDUSE_H
> +#define LIBVDUSE_H
> +
> +#include <stdint.h>
> +#include <sys/uio.h>
> +
> +#define VIRTQUEUE_MAX_SIZE 1024
> +
> +/* VDUSE device structure */
> +typedef struct VduseDev VduseDev;
> +
> +/* Virtqueue structure */
> +typedef struct VduseVirtq VduseVirtq;
> +
> +/* Some operation of VDUSE backend */
> +typedef struct VduseOps {
> +    /* Called when virtqueue can be processed */
> +    void (*enable_queue)(VduseDev *dev, VduseVirtq *vq);
> +    /* Called when virtqueue processing should be stopped */
> +    void (*disable_queue)(VduseDev *dev, VduseVirtq *vq);
> +} VduseOps;
> +
> +/* Describing elements of the I/O buffer */
> +typedef struct VduseVirtqElement {
> +    /* Descriptor table index */
> +    unsigned int index;
> +    /* Number of physically-contiguous device-readable descriptors */
> +    unsigned int out_num;
> +    /* Number of physically-contiguous device-writable descriptors */
> +    unsigned int in_num;
> +    /* Array to store physically-contiguous device-writable descriptors */
> +    struct iovec *in_sg;
> +    /* Array to store physically-contiguous device-readable descriptors */
> +    struct iovec *out_sg;
> +} VduseVirtqElement;
> +
> +
> +/**
> + * vduse_get_virtio_features:
> + *
> + * Get supported virtio features
> + *
> + * Returns: supported feature bits
> + */
> +uint64_t vduse_get_virtio_features(void);
> +
> +/**
> + * vduse_queue_get_dev:
> + * @vq: specified virtqueue
> + *
> + * Get corresponding VDUSE device from the virtqueue.
> + *
> + * Returns: a pointer to VDUSE device on success, NULL on failure.
> + */
> +VduseDev *vduse_queue_get_dev(VduseVirtq *vq);
> +
> +/**
> + * vduse_queue_get_fd:
> + * @vq: specified virtqueue
> + *
> + * Get the kick fd for the virtqueue.
> + *
> + * Returns: file descriptor on success, -1 on failure.
> + */
> +int vduse_queue_get_fd(VduseVirtq *vq);
> +
> +/**
> + * vduse_queue_pop:
> + * @vq: specified virtqueue
> + * @sz: the size of struct to return (must be >= VduseVirtqElement)
> + *
> + * Pop an element from virtqueue available ring.
> + *
> + * Returns: a pointer to a structure containing VduseVirtqElement on success,
> + * NULL on failure.
> + */
> +void *vduse_queue_pop(VduseVirtq *vq, size_t sz);
> +
> +/**
> + * vduse_queue_push:
> + * @vq: specified virtqueue
> + * @elem: pointer to VduseVirtqElement returned by vduse_queue_pop()
> + * @len: length in bytes to write
> + *
> + * Push an element to virtqueue used ring.
> + */
> +void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
> +                      unsigned int len);
> +/**
> + * vduse_queue_notify:
> + * @vq: specified virtqueue
> + *
> + * Request to notify the queue.
> + */
> +void vduse_queue_notify(VduseVirtq *vq);
> +
> +/**
> + * vduse_dev_get_priv:
> + * @dev: VDUSE device
> + *
> + * Get the private pointer passed to vduse_dev_create().
> + *
> + * Returns: private pointer on success, NULL on failure.
> + */
> +void *vduse_dev_get_priv(VduseDev *dev);
> +
> +/**
> + * vduse_dev_get_queue:
> + * @dev: VDUSE device
> + * @index: virtqueue index
> + *
> + * Get the specified virtqueue.
> + *
> + * Returns: a pointer to the virtqueue on success, NULL on failure.
> + */
> +VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index);
> +
> +/**
> + * vduse_dev_get_fd:
> + * @dev: VDUSE device
> + *
> + * Get the control message fd for the VDUSE device.
> + *
> + * Returns: file descriptor on success, -1 on failure.
> + */
> +int vduse_dev_get_fd(VduseDev *dev);
> +
> +/**
> + * vduse_dev_handler:
> + * @dev: VDUSE device
> + *
> + * Used to process the control message.
> + *
> + * Returns: file descriptor on success, -errno on failure.
> + */
> +int vduse_dev_handler(VduseDev *dev);
> +
> +/**
> + * vduse_dev_update_config:
> + * @dev: VDUSE device
> + * @size: the size to write to configuration space
> + * @offset: the offset from the beginning of configuration space
> + * @buffer: the buffer used to write from
> + *
> + * Update device configuration space and inject a config interrupt.
> + *
> + * Returns: 0 on success, -errno on failure.
> + */
> +int vduse_dev_update_config(VduseDev *dev, uint32_t size,
> +                            uint32_t offset, char *buffer);
> +
> +/**
> + * vduse_dev_setup_queue:
> + * @dev: VDUSE device
> + * @index: virtqueue index
> + * @max_size: the max size of virtqueue
> + *
> + * Setup the specified virtqueue.
> + *
> + * Returns: 0 on success, -errno on failure.
> + */
> +int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size);
> +
> +/**
> + * vduse_dev_create_by_fd:
> + * @fd: passed file descriptor
> + * @num_queues: the number of virtqueues
> + * @ops: the operation of VDUSE backend
> + * @priv: private pointer
> + *
> + * Create VDUSE device from a passed file descriptor.
> + *
> + * Returns: pointer to VDUSE device on success, NULL on failure.
> + */
> +VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
> +                                 const VduseOps *ops, void *priv);
> +
> +/**
> + * vduse_dev_create_by_name:
> + * @name: VDUSE device name
> + * @num_queues: the number of virtqueues
> + * @ops: the operation of VDUSE backend
> + * @priv: private pointer
> + *
> + * Create VDUSE device on /dev/vduse/$NAME.
> + *
> + * Returns: pointer to VDUSE device on success, NULL on failure.
> + */
> +VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
> +                                   const VduseOps *ops, void *priv);
> +
> +/**
> + * vduse_dev_create:
> + * @name: VDUSE device name
> + * @device_id: virtio device id
> + * @vendor_id: virtio vendor id
> + * @features: virtio features
> + * @num_queues: the number of virtqueues
> + * @config_size: the size of the configuration space
> + * @config: the buffer of the configuration space
> + * @ops: the operation of VDUSE backend
> + * @priv: private pointer
> + *
> + * Create VDUSE device.
> + *
> + * Returns: pointer to VDUSE device on success, NULL on failure.
> + */
> +VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
> +                           uint32_t vendor_id, uint64_t features,
> +                           uint16_t num_queues, uint32_t config_size,
> +                           char *config, const VduseOps *ops, void *priv);
> +
> +/**
> + * vduse_dev_destroy:
> + * @dev: VDUSE device
> + *
> + * Destroy the VDUSE device.
> + *
> + * Returns: 0 on success, -errno on failure.
> + */
> +int vduse_dev_destroy(VduseDev *dev);
> +
> +#endif
> diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c
> new file mode 100644
> index 0000000000..78e1e5cf90
> --- /dev/null
> +++ b/subprojects/libvduse/libvduse.c
> @@ -0,0 +1,1150 @@
> +/*
> + * VDUSE (vDPA Device in Userspace) library
> + *
> + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
> + *   Portions of codes and concepts borrowed from libvhost-user.c, so:
> + *     Copyright IBM, Corp. 2007
> + *     Copyright (c) 2016 Red Hat, Inc.
> + *
> + * Author:
> + *   Xie Yongji <xieyongji@bytedance.com>
> + *   Anthony Liguori <aliguori@us.ibm.com>
> + *   Marc-André Lureau <mlureau@redhat.com>
> + *   Victor Kaplansky <victork@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <stdbool.h>
> +#include <stddef.h>
> +#include <errno.h>
> +#include <string.h>
> +#include <assert.h>
> +#include <endian.h>
> +#include <unistd.h>
> +#include <limits.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +
> +#include <sys/ioctl.h>
> +#include <sys/eventfd.h>
> +#include <sys/mman.h>
> +
> +#include "include/atomic.h"
> +#include "linux-headers/linux/virtio_ring.h"
> +#include "linux-headers/linux/virtio_config.h"
> +#include "linux-headers/linux/vduse.h"
> +#include "libvduse.h"
> +
> +#define VDUSE_VQ_ALIGN 4096
> +#define MAX_IOVA_REGIONS 256
> +
> +/* Round number down to multiple */
> +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> +
> +/* Round number up to multiple */
> +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> +
> +#ifndef unlikely
> +#define unlikely(x)   __builtin_expect(!!(x), 0)
> +#endif
> +
> +typedef struct VduseRing {
> +    unsigned int num;
> +    uint64_t desc_addr;
> +    uint64_t avail_addr;
> +    uint64_t used_addr;
> +    struct vring_desc *desc;
> +    struct vring_avail *avail;
> +    struct vring_used *used;
> +} VduseRing;
> +
> +struct VduseVirtq {
> +    VduseRing vring;
> +    uint16_t last_avail_idx;
> +    uint16_t shadow_avail_idx;
> +    uint16_t used_idx;
> +    uint16_t signalled_used;
> +    bool signalled_used_valid;
> +    int index;
> +    int inuse;
> +    bool ready;
> +    int fd;
> +    VduseDev *dev;
> +};
> +
> +typedef struct VduseIovaRegion {
> +    uint64_t iova;
> +    uint64_t size;
> +    uint64_t mmap_offset;
> +    uint64_t mmap_addr;
> +} VduseIovaRegion;
> +
> +struct VduseDev {
> +    VduseVirtq *vqs;
> +    VduseIovaRegion regions[MAX_IOVA_REGIONS];
> +    int num_regions;
> +    char *name;
> +    uint32_t device_id;
> +    uint32_t vendor_id;
> +    uint16_t num_queues;
> +    uint16_t queue_size;
> +    uint64_t features;
> +    const VduseOps *ops;
> +    int fd;
> +    int ctrl_fd;
> +    void *priv;
> +};
> +
> +static inline bool has_feature(uint64_t features, unsigned int fbit)
> +{
> +    assert(fbit < 64);
> +    return !!(features & (1ULL << fbit));
> +}
> +
> +static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
> +{
> +    return has_feature(dev->features, fbit);
> +}
> +
> +uint64_t vduse_get_virtio_features(void)
> +{
> +    return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> +           (1ULL << VIRTIO_F_VERSION_1) |
> +           (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
> +           (1ULL << VIRTIO_RING_F_EVENT_IDX) |
> +           (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
> +}
> +
> +VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
> +{
> +    return vq->dev;
> +}
> +
> +int vduse_queue_get_fd(VduseVirtq *vq)
> +{
> +    return vq->fd;
> +}
> +
> +void *vduse_dev_get_priv(VduseDev *dev)
> +{
> +    return dev->priv;
> +}
> +
> +VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
> +{
> +    return &dev->vqs[index];
> +}
> +
> +int vduse_dev_get_fd(VduseDev *dev)
> +{
> +    return dev->fd;
> +}
> +
> +static int vduse_inject_irq(VduseDev *dev, int index)
> +{
> +    return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
> +}
> +
> +static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
> +                                     uint64_t last)
> +{
> +    int i;
> +
> +    if (last == start) {
> +        return;
> +    }
> +
> +    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
> +        if (!dev->regions[i].mmap_addr) {
> +            continue;
> +        }
> +
> +        if (start <= dev->regions[i].iova &&
> +            last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
> +            munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
> +                   dev->regions[i].mmap_offset + dev->regions[i].size);
> +            dev->regions[i].mmap_addr = 0;
> +            dev->num_regions--;
> +        }
> +    }
> +}
> +
> +static int vduse_iova_add_region(VduseDev *dev, int fd,
> +                                 uint64_t offset, uint64_t start,
> +                                 uint64_t last, int prot)
> +{
> +    int i;
> +    uint64_t size = last - start + 1;
> +    void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
> +
> +    if (mmap_addr == MAP_FAILED) {
> +        close(fd);
> +        return -EINVAL;
> +    }
> +
> +    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
> +        if (!dev->regions[i].mmap_addr) {
> +            dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
> +            dev->regions[i].mmap_offset = offset;
> +            dev->regions[i].iova = start;
> +            dev->regions[i].size = size;
> +            dev->num_regions++;
> +            break;
> +        }
> +    }
> +    assert(i < MAX_IOVA_REGIONS);
> +    close(fd);
> +
> +    return 0;
> +}
> +
> +static int perm_to_prot(uint8_t perm)
> +{
> +    int prot = 0;
> +
> +    switch (perm) {
> +    case VDUSE_ACCESS_WO:
> +        prot |= PROT_WRITE;
> +        break;
> +    case VDUSE_ACCESS_RO:
> +        prot |= PROT_READ;
> +        break;
> +    case VDUSE_ACCESS_RW:
> +        prot |= PROT_READ | PROT_WRITE;
> +        break;
> +    default:
> +        break;
> +    }
> +
> +    return prot;
> +}
> +
> +static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
> +{
> +    int i, ret;
> +    struct vduse_iotlb_entry entry;
> +
> +    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
> +        VduseIovaRegion *r = &dev->regions[i];
> +
> +        if (!r->mmap_addr) {
> +            continue;
> +        }
> +
> +        if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
> +            if ((iova + *plen) > (r->iova + r->size)) {
> +                *plen = r->iova + r->size - iova;
> +            }
> +            return (void *)(uintptr_t)(iova - r->iova +
> +                   r->mmap_addr + r->mmap_offset);
> +        }
> +    }
> +
> +    entry.start = iova;
> +    entry.last = iova + 1;
> +    ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
> +    if (ret < 0) {
> +        return NULL;
> +    }
> +
> +    if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
> +                               entry.last, perm_to_prot(entry.perm))) {
> +        return iova_to_va(dev, plen, iova);
> +    }
> +
> +    return NULL;
> +}
> +
> +static inline uint16_t vring_avail_flags(VduseVirtq *vq)
> +{
> +    return le16toh(vq->vring.avail->flags);
> +}
> +
> +static inline uint16_t vring_avail_idx(VduseVirtq *vq)
> +{
> +    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
> +
> +    return vq->shadow_avail_idx;
> +}
> +
> +static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
> +{
> +    return le16toh(vq->vring.avail->ring[i]);
> +}
> +
> +static inline uint16_t vring_get_used_event(VduseVirtq *vq)
> +{
> +    return vring_avail_ring(vq, vq->vring.num);
> +}
> +
> +static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
> +                                 unsigned int *head)
> +{
> +    /*
> +     * Grab the next descriptor number they're advertising, and increment
> +     * the index we've seen.
> +     */
> +    *head = vring_avail_ring(vq, idx % vq->vring.num);
> +
> +    /* If their number is silly, that's a fatal mistake. */
> +    if (*head >= vq->vring.num) {
> +        fprintf(stderr, "Guest says index %u is available\n", *head);
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +static int
> +vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
> +                               uint64_t addr, size_t len)
> +{
> +    struct vring_desc *ori_desc;
> +    uint64_t read_len;
> +
> +    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
> +        return -1;
> +    }
> +
> +    if (len == 0) {
> +        return -1;
> +    }
> +
> +    while (len) {
> +        read_len = len;
> +        ori_desc = iova_to_va(dev, &read_len, addr);
> +        if (!ori_desc) {
> +            return -1;
> +        }
> +
> +        memcpy(desc, ori_desc, read_len);
> +        len -= read_len;
> +        addr += read_len;
> +        desc += read_len;
> +    }
> +
> +    return 0;
> +}
> +
> +enum {
> +    VIRTQUEUE_READ_DESC_ERROR = -1,
> +    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
> +    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
> +};
> +
> +static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
> +                                      unsigned int max, unsigned int *next)
> +{
> +    /* If this descriptor says it doesn't chain, we're done. */
> +    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
> +        return VIRTQUEUE_READ_DESC_DONE;
> +    }
> +
> +    /* Check they're not leading us off end of descriptors. */
> +    *next = desc[i].next;
> +    /* Make sure compiler knows to grab that: we don't want it changing! */
> +    smp_wmb();
> +
> +    if (*next >= max) {
> +        fprintf(stderr, "Desc next is %u\n", *next);
> +        return VIRTQUEUE_READ_DESC_ERROR;
> +    }
> +
> +    return VIRTQUEUE_READ_DESC_MORE;
> +}
> +
> +/*
> + * Fetch avail_idx from VQ memory only when we really need to know if
> + * guest has added some buffers.
> + */
> +static bool vduse_queue_empty(VduseVirtq *vq)
> +{
> +    if (unlikely(!vq->vring.avail)) {
> +        return true;
> +    }
> +
> +    if (vq->shadow_avail_idx != vq->last_avail_idx) {
> +        return false;
> +    }
> +
> +    return vring_avail_idx(vq) == vq->last_avail_idx;
> +}
> +
> +static bool vduse_queue_should_notify(VduseVirtq *vq)
> +{
> +    VduseDev *dev = vq->dev;
> +    uint16_t old, new;
> +    bool v;
> +
> +    /* We need to expose used array entries before checking used event. */
> +    smp_mb();
> +
> +    /* Always notify when queue is empty (when feature acknowledge) */
> +    if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
> +        !vq->inuse && vduse_queue_empty(vq)) {
> +        return true;
> +    }
> +
> +    if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
> +        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
> +    }
> +
> +    v = vq->signalled_used_valid;
> +    vq->signalled_used_valid = true;
> +    old = vq->signalled_used;
> +    new = vq->signalled_used = vq->used_idx;
> +    return !v || vring_need_event(vring_get_used_event(vq), new, old);
> +}
> +
> +void vduse_queue_notify(VduseVirtq *vq)
> +{
> +    VduseDev *dev = vq->dev;
> +
> +    if (unlikely(!vq->vring.avail)) {
> +        return;
> +    }
> +
> +    if (!vduse_queue_should_notify(vq)) {
> +        return;
> +    }
> +
> +    if (vduse_inject_irq(dev, vq->index) < 0) {
> +        fprintf(stderr, "Error inject irq for vq %d: %s\n",
> +                vq->index, strerror(errno));
> +    }
> +}
> +
> +static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
> +{
> +    *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val);
> +}
> +
> +static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
> +                                   struct iovec *iov, unsigned int max_num_sg,
> +                                   bool is_write, uint64_t pa, size_t sz)
> +{
> +    unsigned num_sg = *p_num_sg;
> +    VduseDev *dev = vq->dev;
> +
> +    assert(num_sg <= max_num_sg);
> +
> +    if (!sz) {
> +        fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
> +        return false;
> +    }
> +
> +    while (sz) {
> +        uint64_t len = sz;
> +
> +        if (num_sg == max_num_sg) {
> +            fprintf(stderr,
> +                    "virtio: too many descriptors in indirect table\n");
> +            return false;
> +        }
> +
> +        iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
> +        if (iov[num_sg].iov_base == NULL) {
> +            fprintf(stderr, "virtio: invalid address for buffers\n");
> +            return false;
> +        }
> +        iov[num_sg++].iov_len = len;
> +        sz -= len;
> +        pa += len;
> +    }
> +
> +    *p_num_sg = num_sg;
> +    return true;
> +}
> +
> +static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
> +                                       unsigned in_num)
> +{
> +    VduseVirtqElement *elem;
> +    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
> +    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
> +    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
> +
> +    assert(sz >= sizeof(VduseVirtqElement));
> +    elem = malloc(out_sg_end);
> +    if (!elem) {
> +        return NULL;
> +    }
> +    elem->out_num = out_num;
> +    elem->in_num = in_num;
> +    elem->in_sg = (void *)elem + in_sg_ofs;
> +    elem->out_sg = (void *)elem + out_sg_ofs;
> +    return elem;
> +}
> +
> +static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
> +{
> +    struct vring_desc *desc = vq->vring.desc;
> +    VduseDev *dev = vq->dev;
> +    uint64_t desc_addr, read_len;
> +    unsigned int desc_len;
> +    unsigned int max = vq->vring.num;
> +    unsigned int i = idx;
> +    VduseVirtqElement *elem;
> +    struct iovec iov[VIRTQUEUE_MAX_SIZE];
> +    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
> +    unsigned int out_num = 0, in_num = 0;
> +    int rc;
> +
> +    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
> +        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
> +            fprintf(stderr, "Invalid size for indirect buffer table\n");
> +            return NULL;
> +        }
> +
> +        /* loop over the indirect descriptor table */
> +        desc_addr = le64toh(desc[i].addr);
> +        desc_len = le32toh(desc[i].len);
> +        max = desc_len / sizeof(struct vring_desc);
> +        read_len = desc_len;
> +        desc = iova_to_va(dev, &read_len, desc_addr);
> +        if (unlikely(desc && read_len != desc_len)) {
> +            /* Failed to use zero copy */
> +            desc = NULL;
> +            if (!vduse_queue_read_indirect_desc(dev, desc_buf,
> +                                                desc_addr,
> +                                                desc_len)) {
> +                desc = desc_buf;
> +            }
> +        }
> +        if (!desc) {
> +            fprintf(stderr, "Invalid indirect buffer table\n");
> +            return NULL;
> +        }
> +        i = 0;
> +    }
> +
> +    /* Collect all the descriptors */
> +    do {
> +        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
> +            if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
> +                                             VIRTQUEUE_MAX_SIZE - out_num,
> +                                             true, le64toh(desc[i].addr),
> +                                             le32toh(desc[i].len))) {
> +                return NULL;
> +            }
> +        } else {
> +            if (in_num) {
> +                fprintf(stderr, "Incorrect order for descriptors\n");
> +                return NULL;
> +            }
> +            if (!vduse_queue_map_single_desc(vq, &out_num, iov,
> +                                             VIRTQUEUE_MAX_SIZE, false,
> +                                             le64toh(desc[i].addr),
> +                                             le32toh(desc[i].len))) {
> +                return NULL;
> +            }
> +        }
> +
> +        /* If we've got too many, that implies a descriptor loop. */
> +        if ((in_num + out_num) > max) {
> +            fprintf(stderr, "Looped descriptor\n");
> +            return NULL;
> +        }
> +        rc = vduse_queue_read_next_desc(desc, i, max, &i);
> +    } while (rc == VIRTQUEUE_READ_DESC_MORE);
> +
> +    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
> +        fprintf(stderr, "read descriptor error\n");
> +        return NULL;
> +    }
> +
> +    /* Now copy what we have collected and mapped */
> +    elem = vduse_queue_alloc_element(sz, out_num, in_num);
> +    if (!elem) {
> +        fprintf(stderr, "read descriptor error\n");
> +        return NULL;
> +    }
> +    elem->index = idx;
> +    for (i = 0; i < out_num; i++) {
> +        elem->out_sg[i] = iov[i];
> +    }
> +    for (i = 0; i < in_num; i++) {
> +        elem->in_sg[i] = iov[out_num + i];
> +    }
> +
> +    return elem;
> +}
> +
> +void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
> +{
> +    unsigned int head;
> +    VduseVirtqElement *elem;
> +    VduseDev *dev = vq->dev;
> +
> +    if (unlikely(!vq->vring.avail)) {
> +        return NULL;
> +    }
> +
> +    if (vduse_queue_empty(vq)) {
> +        return NULL;
> +    }
> +    /* Needed after virtio_queue_empty() */
> +    smp_rmb();
> +
> +    if (vq->inuse >= vq->vring.num) {
> +        fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
> +        return NULL;
> +    }
> +
> +    if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
> +        return NULL;
> +    }
> +
> +    if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
> +        vring_set_avail_event(vq, vq->last_avail_idx);
> +    }
> +
> +    elem = vduse_queue_map_desc(vq, head, sz);
> +
> +    if (!elem) {
> +        return NULL;
> +    }
> +
> +    vq->inuse++;
> +
> +    return elem;
> +}
> +
> +static inline void vring_used_write(VduseVirtq *vq,
> +                                    struct vring_used_elem *uelem, int i)
> +{
> +    struct vring_used *used = vq->vring.used;
> +
> +    used->ring[i] = *uelem;
> +}
> +
> +static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
> +                             unsigned int len, unsigned int idx)
> +{
> +    struct vring_used_elem uelem;
> +
> +    if (unlikely(!vq->vring.used)) {
> +        return;
> +    }
> +
> +    idx = (idx + vq->used_idx) % vq->vring.num;
> +
> +    uelem.id = htole32(elem->index);
> +    uelem.len = htole32(len);
> +    vring_used_write(vq, &uelem, idx);
> +}
> +
> +static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
> +{
> +    vq->vring.used->idx = htole16(val);
> +    vq->used_idx = val;
> +}
> +
> +static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
> +{
> +    uint16_t old, new;
> +
> +    if (unlikely(!vq->vring.used)) {
> +        return;
> +    }
> +
> +    /* Make sure buffer is written before we update index. */
> +    smp_wmb();
> +
> +    old = vq->used_idx;
> +    new = old + count;
> +    vring_used_idx_set(vq, new);
> +    vq->inuse -= count;
> +    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
> +        vq->signalled_used_valid = false;
> +    }
> +}
> +
> +void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
> +                      unsigned int len)
> +{
> +    vduse_queue_fill(vq, elem, len, 0);
> +    vduse_queue_flush(vq, 1);
> +}
> +
> +static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
> +                                    uint64_t avail_addr, uint64_t used_addr)
> +{
> +    struct VduseDev *dev = vq->dev;
> +    uint64_t len;
> +
> +    len = sizeof(struct vring_desc);
> +    vq->vring.desc = iova_to_va(dev, &len, desc_addr);
> +    if (len != sizeof(struct vring_desc)) {
> +        return -EINVAL;
> +    }
> +
> +    len = sizeof(struct vring_avail);
> +    vq->vring.avail = iova_to_va(dev, &len, avail_addr);
> +    if (len != sizeof(struct vring_avail)) {
> +        return -EINVAL;
> +    }
> +
> +    len = sizeof(struct vring_used);
> +    vq->vring.used = iova_to_va(dev, &len, used_addr);
> +    if (len != sizeof(struct vring_used)) {
> +        return -EINVAL;
> +    }
> +
> +    if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
> +        fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static void vduse_queue_enable(VduseVirtq *vq)
> +{
> +    struct VduseDev *dev = vq->dev;
> +    struct vduse_vq_info vq_info;
> +    struct vduse_vq_eventfd vq_eventfd;
> +    int fd;
> +
> +    vq_info.index = vq->index;
> +    if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
> +        fprintf(stderr, "Failed to get vq[%d] info: %s\n",
> +                vq->index, strerror(errno));
> +        return;
> +    }
> +
> +    if (!vq_info.ready) {
> +        return;
> +    }
> +
> +    vq->vring.num = vq_info.num;
> +    vq->vring.desc_addr = vq_info.desc_addr;
> +    vq->vring.avail_addr = vq_info.driver_addr;
> +    vq->vring.used_addr = vq_info.device_addr;
> +
> +    if (vduse_queue_update_vring(vq, vq_info.desc_addr,
> +                                 vq_info.driver_addr, vq_info.device_addr)) {
> +        fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
> +        return;
> +    }
> +
> +    fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> +    if (fd < 0) {
> +        fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
> +        return;
> +    }
> +
> +    vq_eventfd.index = vq->index;
> +    vq_eventfd.fd = fd;
> +    if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
> +        fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
> +        close(fd);
> +        return;
> +    }
> +
> +    vq->fd = fd;
> +    vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
> +    vq->inuse = 0;
> +    vq->used_idx = 0;
> +    vq->signalled_used_valid = false;
> +    vq->ready = true;
> +
> +    dev->ops->enable_queue(dev, vq);
> +}
> +
> +static void vduse_queue_disable(VduseVirtq *vq)
> +{
> +    struct VduseDev *dev = vq->dev;
> +    struct vduse_vq_eventfd eventfd;
> +
> +    if (!vq->ready) {
> +        return;
> +    }
> +
> +    dev->ops->disable_queue(dev, vq);
> +
> +    eventfd.index = vq->index;
> +    eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
> +    ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
> +    close(vq->fd);
> +
> +    assert(vq->inuse == 0);
> +
> +    vq->vring.num = 0;
> +    vq->vring.desc_addr = 0;
> +    vq->vring.avail_addr = 0;
> +    vq->vring.used_addr = 0;
> +    vq->vring.desc = 0;
> +    vq->vring.avail = 0;
> +    vq->vring.used = 0;
> +    vq->ready = false;
> +    vq->fd = -1;
> +}
> +
> +static void vduse_dev_start_dataplane(VduseDev *dev)
> +{
> +    int i;
> +
> +    if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
> +        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
> +        return;
> +    }
> +    assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));
> +
> +    for (i = 0; i < dev->num_queues; i++) {
> +        vduse_queue_enable(&dev->vqs[i]);
> +    }
> +}
> +
> +static void vduse_dev_stop_dataplane(VduseDev *dev)
> +{
> +    int i;
> +
> +    for (i = 0; i < dev->num_queues; i++) {
> +        vduse_queue_disable(&dev->vqs[i]);
> +    }
> +    dev->features = 0;
> +    vduse_iova_remove_region(dev, 0, ULONG_MAX);
> +}
> +
> +int vduse_dev_handler(VduseDev *dev)
> +{
> +    struct vduse_dev_request req;
> +    struct vduse_dev_response resp = { 0 };
> +    VduseVirtq *vq;
> +    int i, ret;
> +
> +    ret = read(dev->fd, &req, sizeof(req));
> +    if (ret != sizeof(req)) {
> +        fprintf(stderr, "Read request error [%d]: %s\n",
> +                ret, strerror(errno));
> +        return -errno;
> +    }
> +    resp.request_id = req.request_id;
> +
> +    switch (req.type) {
> +    case VDUSE_GET_VQ_STATE:
> +        vq = &dev->vqs[req.vq_state.index];
> +        resp.vq_state.split.avail_index = vq->last_avail_idx;
> +        resp.result = VDUSE_REQ_RESULT_OK;
> +        break;
> +    case VDUSE_SET_STATUS:
> +        if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
> +            vduse_dev_start_dataplane(dev);
> +        } else if (req.s.status == 0) {
> +            vduse_dev_stop_dataplane(dev);
> +        }
> +        resp.result = VDUSE_REQ_RESULT_OK;
> +        break;
> +    case VDUSE_UPDATE_IOTLB:
> +        /* The iova will be updated by iova_to_va() later, so just remove it */
> +        vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
> +        for (i = 0; i < dev->num_queues; i++) {
> +            VduseVirtq *vq = &dev->vqs[i];
> +            if (vq->ready) {
> +                if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
> +                                             vq->vring.avail_addr,
> +                                             vq->vring.used_addr)) {
> +                    fprintf(stderr, "Failed to update vring for vq[%d]\n",
> +                            vq->index);
> +                }
> +            }
> +        }
> +        resp.result = VDUSE_REQ_RESULT_OK;
> +        break;
> +    default:
> +        resp.result = VDUSE_REQ_RESULT_FAILED;
> +        break;
> +    }
> +
> +    ret = write(dev->fd, &resp, sizeof(resp));
> +    if (ret != sizeof(resp)) {
> +        fprintf(stderr, "Write request %d error [%d]: %s\n",
> +                req.type, ret, strerror(errno));
> +        return -errno;
> +    }
> +    return 0;
> +}
> +
> +int vduse_dev_update_config(VduseDev *dev, uint32_t size,
> +                            uint32_t offset, char *buffer)
> +{
> +    int ret;
> +    struct vduse_config_data *data;
> +
> +    data = malloc(offsetof(struct vduse_config_data, buffer) + size);
> +    if (!data) {
> +        return -ENOMEM;
> +    }
> +
> +    data->offset = offset;
> +    data->length = size;
> +    memcpy(data->buffer, buffer, size);
> +
> +    ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
> +    free(data);
> +
> +    if (ret) {
> +        return -errno;
> +    }
> +
> +    if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
> +        return -errno;
> +    }
> +
> +    return 0;
> +}
> +
> +int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
> +{
> +    VduseVirtq *vq = &dev->vqs[index];
> +    struct vduse_vq_config vq_config = { 0 };
> +
> +    if (max_size > VIRTQUEUE_MAX_SIZE) {
> +        return -EINVAL;
> +    }
> +
> +    vq_config.index = vq->index;
> +    vq_config.max_size = max_size;
> +
> +    if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
> +        return -errno;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
> +{
> +    VduseVirtq *vqs;
> +    int i;
> +
> +    vqs = calloc(sizeof(VduseVirtq), num_queues);
> +    if (!vqs) {
> +        return -ENOMEM;
> +    }
> +
> +    for (i = 0; i < num_queues; i++) {
> +        vqs[i].index = i;
> +        vqs[i].dev = dev;
> +        vqs[i].fd = -1;
> +    }
> +    dev->vqs = vqs;
> +
> +    return 0;
> +}
> +
> +static int vduse_dev_init(VduseDev *dev, const char *name,
> +                          uint16_t num_queues, const VduseOps *ops,
> +                          void *priv)
> +{
> +    char *dev_path, *dev_name;
> +    int ret, fd;
> +
> +    dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
> +    if (!dev_path) {
> +        return -ENOMEM;
> +    }
> +    sprintf(dev_path, "/dev/vduse/%s", name);
> +
> +    fd = open(dev_path, O_RDWR);
> +    free(dev_path);
> +    if (fd < 0) {
> +        fprintf(stderr, "Failed to open vduse dev %s: %s\n",
> +                name, strerror(errno));
> +        return -errno;
> +    }
> +
> +    dev_name = strdup(name);
> +    if (!dev_name) {
> +        close(fd);
> +        return -ENOMEM;
> +    }
> +
> +    ret = vduse_dev_init_vqs(dev, num_queues);
> +    if (ret) {
> +        free(dev_name);
> +        close(fd);
> +        return ret;
> +    }
> +
> +    dev->name = dev_name;
> +    dev->num_queues = num_queues;
> +    dev->fd = fd;
> +    dev->ops = ops;
> +    dev->priv = priv;
> +
> +    return 0;
> +}
> +
> +static inline bool vduse_name_is_valid(const char *name)
> +{
> +    return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
> +}
> +
> +VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
> +                                 const VduseOps *ops, void *priv)
> +{
> +    VduseDev *dev;
> +    int ret;
> +
> +    if (!ops || !ops->enable_queue || !ops->disable_queue) {
> +        fprintf(stderr, "Invalid parameter for vduse\n");
> +        return NULL;
> +    }
> +
> +    dev = calloc(sizeof(VduseDev), 1);
> +    if (!dev) {
> +        fprintf(stderr, "Failed to allocate vduse device\n");
> +        return NULL;
> +    }
> +
> +    ret = vduse_dev_init_vqs(dev, num_queues);
> +    if (ret) {
> +        fprintf(stderr, "Failed to init vqs\n");
> +        free(dev);
> +        return NULL;
> +    }
> +
> +    dev->num_queues = num_queues;
> +    dev->fd = fd;
> +    dev->ops = ops;
> +    dev->priv = priv;
> +
> +    return dev;
> +}
> +
> +VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
> +                                   const VduseOps *ops, void *priv)
> +{
> +    VduseDev *dev;
> +    int ret;
> +
> +    if (!name || vduse_name_is_valid(name) || !ops ||
> +        !ops->enable_queue || !ops->disable_queue) {
> +        fprintf(stderr, "Invalid parameter for vduse\n");
> +        return NULL;
> +    }
> +
> +    dev = calloc(sizeof(VduseDev), 1);
> +    if (!dev) {
> +        fprintf(stderr, "Failed to allocate vduse device\n");
> +        return NULL;
> +    }
> +
> +    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
> +    if (ret < 0) {
> +        fprintf(stderr, "Failed to init vduse device %s: %s\n",
> +                name, strerror(ret));

Must be strerror(-ret).  Spotted by Coverity, tracked as CID 1490226.

> +        free(dev);
> +        return NULL;
> +    }
> +
> +    return dev;
> +}
> +
> +VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
> +                           uint32_t vendor_id, uint64_t features,
> +                           uint16_t num_queues, uint32_t config_size,
> +                           char *config, const VduseOps *ops, void *priv)
> +{
> +    VduseDev *dev;
> +    int ret, ctrl_fd;
> +    uint64_t version;
> +    struct vduse_dev_config *dev_config;
> +    size_t size = offsetof(struct vduse_dev_config, config);
> +
> +    if (!name || vduse_name_is_valid(name) ||
> +        !has_feature(features,  VIRTIO_F_VERSION_1) || !config ||
> +        !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
> +        fprintf(stderr, "Invalid parameter for vduse\n");
> +        return NULL;
> +    }
> +
> +    dev = calloc(sizeof(VduseDev), 1);
> +    if (!dev) {
> +        fprintf(stderr, "Failed to allocate vduse device\n");
> +        return NULL;
> +    }
> +
> +    ctrl_fd = open("/dev/vduse/control", O_RDWR);
> +    if (ctrl_fd < 0) {
> +        fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
> +                strerror(errno));
> +        goto err_ctrl;
> +    }
> +
> +    version = VDUSE_API_VERSION;
> +    if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
> +        fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
> +                version, strerror(errno));
> +        goto err_dev;
> +    }
> +
> +    dev_config = calloc(size + config_size, 1);
> +    if (!dev_config) {
> +        fprintf(stderr, "Failed to allocate config space\n");
> +        goto err_dev;
> +    }
> +
> +    strcpy(dev_config->name, name);

What ensures @name fits into dev->config->name?

Coverity CID 1490224.

> +    dev_config->device_id = device_id;
> +    dev_config->vendor_id = vendor_id;
> +    dev_config->features = features;
> +    dev_config->vq_num = num_queues;
> +    dev_config->vq_align = VDUSE_VQ_ALIGN;
> +    dev_config->config_size = config_size;
> +    memcpy(dev_config->config, config, config_size);
> +
> +    ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
> +    free(dev_config);
> +    if (ret < 0) {
> +        fprintf(stderr, "Failed to create vduse device %s: %s\n",
> +                name, strerror(errno));
> +        goto err_dev;
> +    }
> +    dev->ctrl_fd = ctrl_fd;
> +
> +    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
> +    if (ret < 0) {
> +        fprintf(stderr, "Failed to init vduse device %s: %s\n",
> +                name, strerror(ret));

Must be strerror(-ret).  Spotted by Coverity, tracked as CID 1490223.

> +        goto err;
> +    }
> +
> +    return dev;
> +err:
> +    ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
> +err_dev:
> +    close(ctrl_fd);
> +err_ctrl:
> +    free(dev);
> +
> +    return NULL;
> +}
> +
> +int vduse_dev_destroy(VduseDev *dev)
> +{
> +    int ret = 0;
> +
> +    free(dev->vqs);
> +    if (dev->fd >= 0) {
> +        close(dev->fd);
> +        dev->fd = -1;
> +    }
> +    if (dev->ctrl_fd >= 0) {
> +        if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
> +            ret = -errno;
> +        }
> +        close(dev->ctrl_fd);
> +        dev->ctrl_fd = -1;
> +    }
> +    free(dev->name);
> +    free(dev);
> +
> +    return ret;
> +}

[...]
Yongji Xie June 27, 2022, 8:34 a.m. UTC | #2
On Mon, Jun 27, 2022 at 12:45 PM Markus Armbruster <armbru@redhat.com> wrote:
>
> Kevin Wolf <kwolf@redhat.com> writes:
>
> > From: Xie Yongji <xieyongji@bytedance.com>
> >
> > VDUSE [1] is a linux framework that makes it possible to implement
> > software-emulated vDPA devices in userspace. This adds a library
> > as a subproject to help implementing VDUSE backends in QEMU.
> >
> > [1] https://www.kernel.org/doc/html/latest/userspace-api/vduse.html
> >
> > Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> > Message-Id: <20220523084611.91-6-xieyongji@bytedance.com>
> > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> > ---
> >  meson_options.txt                           |    2 +
> >  subprojects/libvduse/include/atomic.h       |    1 +
> >  subprojects/libvduse/include/compiler.h     |    1 +
> >  subprojects/libvduse/libvduse.h             |  235 ++++
> >  subprojects/libvduse/libvduse.c             | 1150 +++++++++++++++++++
> >  MAINTAINERS                                 |    5 +
> >  meson.build                                 |   15 +
> >  scripts/meson-buildoptions.sh               |    3 +
> >  subprojects/libvduse/linux-headers/linux    |    1 +
> >  subprojects/libvduse/meson.build            |   10 +
> >  subprojects/libvduse/standard-headers/linux |    1 +
> >  11 files changed, 1424 insertions(+)
> >  create mode 120000 subprojects/libvduse/include/atomic.h
> >  create mode 120000 subprojects/libvduse/include/compiler.h
> >  create mode 100644 subprojects/libvduse/libvduse.h
> >  create mode 100644 subprojects/libvduse/libvduse.c
> >  create mode 120000 subprojects/libvduse/linux-headers/linux
> >  create mode 100644 subprojects/libvduse/meson.build
> >  create mode 120000 subprojects/libvduse/standard-headers/linux
> >
> > diff --git a/meson_options.txt b/meson_options.txt
> > index f3e2f22c1e..23a9f440f7 100644
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > @@ -257,6 +257,8 @@ option('virtfs', type: 'feature', value: 'auto',
> >         description: 'virtio-9p support')
> >  option('virtiofsd', type: 'feature', value: 'auto',
> >         description: 'build virtiofs daemon (virtiofsd)')
> > +option('libvduse', type: 'feature', value: 'auto',
> > +       description: 'build VDUSE Library')
> >
> >  option('capstone', type: 'feature', value: 'auto',
> >         description: 'Whether and how to find the capstone library')
> > diff --git a/subprojects/libvduse/include/atomic.h b/subprojects/libvduse/include/atomic.h
> > new file mode 120000
> > index 0000000000..8c2be64f7b
> > --- /dev/null
> > +++ b/subprojects/libvduse/include/atomic.h
> > @@ -0,0 +1 @@
> > +../../../include/qemu/atomic.h
> > \ No newline at end of file
> > diff --git a/subprojects/libvduse/include/compiler.h b/subprojects/libvduse/include/compiler.h
> > new file mode 120000
> > index 0000000000..de7b70697c
> > --- /dev/null
> > +++ b/subprojects/libvduse/include/compiler.h
> > @@ -0,0 +1 @@
> > +../../../include/qemu/compiler.h
> > \ No newline at end of file
> > diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h
> > new file mode 100644
> > index 0000000000..6c2fe98213
> > --- /dev/null
> > +++ b/subprojects/libvduse/libvduse.h
> > @@ -0,0 +1,235 @@
> > +/*
> > + * VDUSE (vDPA Device in Userspace) library
> > + *
> > + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
> > + *
> > + * Author:
> > + *   Xie Yongji <xieyongji@bytedance.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or
> > + * later.  See the COPYING file in the top-level directory.
> > + */
> > +
> > +#ifndef LIBVDUSE_H
> > +#define LIBVDUSE_H
> > +
> > +#include <stdint.h>
> > +#include <sys/uio.h>
> > +
> > +#define VIRTQUEUE_MAX_SIZE 1024
> > +
> > +/* VDUSE device structure */
> > +typedef struct VduseDev VduseDev;
> > +
> > +/* Virtqueue structure */
> > +typedef struct VduseVirtq VduseVirtq;
> > +
> > +/* Some operation of VDUSE backend */
> > +typedef struct VduseOps {
> > +    /* Called when virtqueue can be processed */
> > +    void (*enable_queue)(VduseDev *dev, VduseVirtq *vq);
> > +    /* Called when virtqueue processing should be stopped */
> > +    void (*disable_queue)(VduseDev *dev, VduseVirtq *vq);
> > +} VduseOps;
> > +
> > +/* Describing elements of the I/O buffer */
> > +typedef struct VduseVirtqElement {
> > +    /* Descriptor table index */
> > +    unsigned int index;
> > +    /* Number of physically-contiguous device-readable descriptors */
> > +    unsigned int out_num;
> > +    /* Number of physically-contiguous device-writable descriptors */
> > +    unsigned int in_num;
> > +    /* Array to store physically-contiguous device-writable descriptors */
> > +    struct iovec *in_sg;
> > +    /* Array to store physically-contiguous device-readable descriptors */
> > +    struct iovec *out_sg;
> > +} VduseVirtqElement;
> > +
> > +
> > +/**
> > + * vduse_get_virtio_features:
> > + *
> > + * Get supported virtio features
> > + *
> > + * Returns: supported feature bits
> > + */
> > +uint64_t vduse_get_virtio_features(void);
> > +
> > +/**
> > + * vduse_queue_get_dev:
> > + * @vq: specified virtqueue
> > + *
> > + * Get corresponding VDUSE device from the virtqueue.
> > + *
> > + * Returns: a pointer to VDUSE device on success, NULL on failure.
> > + */
> > +VduseDev *vduse_queue_get_dev(VduseVirtq *vq);
> > +
> > +/**
> > + * vduse_queue_get_fd:
> > + * @vq: specified virtqueue
> > + *
> > + * Get the kick fd for the virtqueue.
> > + *
> > + * Returns: file descriptor on success, -1 on failure.
> > + */
> > +int vduse_queue_get_fd(VduseVirtq *vq);
> > +
> > +/**
> > + * vduse_queue_pop:
> > + * @vq: specified virtqueue
> > + * @sz: the size of struct to return (must be >= VduseVirtqElement)
> > + *
> > + * Pop an element from virtqueue available ring.
> > + *
> > + * Returns: a pointer to a structure containing VduseVirtqElement on success,
> > + * NULL on failure.
> > + */
> > +void *vduse_queue_pop(VduseVirtq *vq, size_t sz);
> > +
> > +/**
> > + * vduse_queue_push:
> > + * @vq: specified virtqueue
> > + * @elem: pointer to VduseVirtqElement returned by vduse_queue_pop()
> > + * @len: length in bytes to write
> > + *
> > + * Push an element to virtqueue used ring.
> > + */
> > +void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
> > +                      unsigned int len);
> > +/**
> > + * vduse_queue_notify:
> > + * @vq: specified virtqueue
> > + *
> > + * Request to notify the queue.
> > + */
> > +void vduse_queue_notify(VduseVirtq *vq);
> > +
> > +/**
> > + * vduse_dev_get_priv:
> > + * @dev: VDUSE device
> > + *
> > + * Get the private pointer passed to vduse_dev_create().
> > + *
> > + * Returns: private pointer on success, NULL on failure.
> > + */
> > +void *vduse_dev_get_priv(VduseDev *dev);
> > +
> > +/**
> > + * vduse_dev_get_queue:
> > + * @dev: VDUSE device
> > + * @index: virtqueue index
> > + *
> > + * Get the specified virtqueue.
> > + *
> > + * Returns: a pointer to the virtqueue on success, NULL on failure.
> > + */
> > +VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index);
> > +
> > +/**
> > + * vduse_dev_get_fd:
> > + * @dev: VDUSE device
> > + *
> > + * Get the control message fd for the VDUSE device.
> > + *
> > + * Returns: file descriptor on success, -1 on failure.
> > + */
> > +int vduse_dev_get_fd(VduseDev *dev);
> > +
> > +/**
> > + * vduse_dev_handler:
> > + * @dev: VDUSE device
> > + *
> > + * Used to process the control message.
> > + *
> > + * Returns: file descriptor on success, -errno on failure.
> > + */
> > +int vduse_dev_handler(VduseDev *dev);
> > +
> > +/**
> > + * vduse_dev_update_config:
> > + * @dev: VDUSE device
> > + * @size: the size to write to configuration space
> > + * @offset: the offset from the beginning of configuration space
> > + * @buffer: the buffer used to write from
> > + *
> > + * Update device configuration space and inject a config interrupt.
> > + *
> > + * Returns: 0 on success, -errno on failure.
> > + */
> > +int vduse_dev_update_config(VduseDev *dev, uint32_t size,
> > +                            uint32_t offset, char *buffer);
> > +
> > +/**
> > + * vduse_dev_setup_queue:
> > + * @dev: VDUSE device
> > + * @index: virtqueue index
> > + * @max_size: the max size of virtqueue
> > + *
> > + * Setup the specified virtqueue.
> > + *
> > + * Returns: 0 on success, -errno on failure.
> > + */
> > +int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size);
> > +
> > +/**
> > + * vduse_dev_create_by_fd:
> > + * @fd: passed file descriptor
> > + * @num_queues: the number of virtqueues
> > + * @ops: the operation of VDUSE backend
> > + * @priv: private pointer
> > + *
> > + * Create VDUSE device from a passed file descriptor.
> > + *
> > + * Returns: pointer to VDUSE device on success, NULL on failure.
> > + */
> > +VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
> > +                                 const VduseOps *ops, void *priv);
> > +
> > +/**
> > + * vduse_dev_create_by_name:
> > + * @name: VDUSE device name
> > + * @num_queues: the number of virtqueues
> > + * @ops: the operation of VDUSE backend
> > + * @priv: private pointer
> > + *
> > + * Create VDUSE device on /dev/vduse/$NAME.
> > + *
> > + * Returns: pointer to VDUSE device on success, NULL on failure.
> > + */
> > +VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
> > +                                   const VduseOps *ops, void *priv);
> > +
> > +/**
> > + * vduse_dev_create:
> > + * @name: VDUSE device name
> > + * @device_id: virtio device id
> > + * @vendor_id: virtio vendor id
> > + * @features: virtio features
> > + * @num_queues: the number of virtqueues
> > + * @config_size: the size of the configuration space
> > + * @config: the buffer of the configuration space
> > + * @ops: the operation of VDUSE backend
> > + * @priv: private pointer
> > + *
> > + * Create VDUSE device.
> > + *
> > + * Returns: pointer to VDUSE device on success, NULL on failure.
> > + */
> > +VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
> > +                           uint32_t vendor_id, uint64_t features,
> > +                           uint16_t num_queues, uint32_t config_size,
> > +                           char *config, const VduseOps *ops, void *priv);
> > +
> > +/**
> > + * vduse_dev_destroy:
> > + * @dev: VDUSE device
> > + *
> > + * Destroy the VDUSE device.
> > + *
> > + * Returns: 0 on success, -errno on failure.
> > + */
> > +int vduse_dev_destroy(VduseDev *dev);
> > +
> > +#endif
> > diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c
> > new file mode 100644
> > index 0000000000..78e1e5cf90
> > --- /dev/null
> > +++ b/subprojects/libvduse/libvduse.c
> > @@ -0,0 +1,1150 @@
> > +/*
> > + * VDUSE (vDPA Device in Userspace) library
> > + *
> > + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
> > + *   Portions of codes and concepts borrowed from libvhost-user.c, so:
> > + *     Copyright IBM, Corp. 2007
> > + *     Copyright (c) 2016 Red Hat, Inc.
> > + *
> > + * Author:
> > + *   Xie Yongji <xieyongji@bytedance.com>
> > + *   Anthony Liguori <aliguori@us.ibm.com>
> > + *   Marc-André Lureau <mlureau@redhat.com>
> > + *   Victor Kaplansky <victork@redhat.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or
> > + * later.  See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include <stdlib.h>
> > +#include <stdio.h>
> > +#include <stdbool.h>
> > +#include <stddef.h>
> > +#include <errno.h>
> > +#include <string.h>
> > +#include <assert.h>
> > +#include <endian.h>
> > +#include <unistd.h>
> > +#include <limits.h>
> > +#include <fcntl.h>
> > +#include <inttypes.h>
> > +
> > +#include <sys/ioctl.h>
> > +#include <sys/eventfd.h>
> > +#include <sys/mman.h>
> > +
> > +#include "include/atomic.h"
> > +#include "linux-headers/linux/virtio_ring.h"
> > +#include "linux-headers/linux/virtio_config.h"
> > +#include "linux-headers/linux/vduse.h"
> > +#include "libvduse.h"
> > +
> > +#define VDUSE_VQ_ALIGN 4096
> > +#define MAX_IOVA_REGIONS 256
> > +
> > +/* Round number down to multiple */
> > +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> > +
> > +/* Round number up to multiple */
> > +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> > +
> > +#ifndef unlikely
> > +#define unlikely(x)   __builtin_expect(!!(x), 0)
> > +#endif
> > +
> > +typedef struct VduseRing {
> > +    unsigned int num;
> > +    uint64_t desc_addr;
> > +    uint64_t avail_addr;
> > +    uint64_t used_addr;
> > +    struct vring_desc *desc;
> > +    struct vring_avail *avail;
> > +    struct vring_used *used;
> > +} VduseRing;
> > +
> > +struct VduseVirtq {
> > +    VduseRing vring;
> > +    uint16_t last_avail_idx;
> > +    uint16_t shadow_avail_idx;
> > +    uint16_t used_idx;
> > +    uint16_t signalled_used;
> > +    bool signalled_used_valid;
> > +    int index;
> > +    int inuse;
> > +    bool ready;
> > +    int fd;
> > +    VduseDev *dev;
> > +};
> > +
> > +typedef struct VduseIovaRegion {
> > +    uint64_t iova;
> > +    uint64_t size;
> > +    uint64_t mmap_offset;
> > +    uint64_t mmap_addr;
> > +} VduseIovaRegion;
> > +
> > +struct VduseDev {
> > +    VduseVirtq *vqs;
> > +    VduseIovaRegion regions[MAX_IOVA_REGIONS];
> > +    int num_regions;
> > +    char *name;
> > +    uint32_t device_id;
> > +    uint32_t vendor_id;
> > +    uint16_t num_queues;
> > +    uint16_t queue_size;
> > +    uint64_t features;
> > +    const VduseOps *ops;
> > +    int fd;
> > +    int ctrl_fd;
> > +    void *priv;
> > +};
> > +
> > +static inline bool has_feature(uint64_t features, unsigned int fbit)
> > +{
> > +    assert(fbit < 64);
> > +    return !!(features & (1ULL << fbit));
> > +}
> > +
> > +static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
> > +{
> > +    return has_feature(dev->features, fbit);
> > +}
> > +
> > +uint64_t vduse_get_virtio_features(void)
> > +{
> > +    return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> > +           (1ULL << VIRTIO_F_VERSION_1) |
> > +           (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
> > +           (1ULL << VIRTIO_RING_F_EVENT_IDX) |
> > +           (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
> > +}
> > +
> > +VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
> > +{
> > +    return vq->dev;
> > +}
> > +
> > +int vduse_queue_get_fd(VduseVirtq *vq)
> > +{
> > +    return vq->fd;
> > +}
> > +
> > +void *vduse_dev_get_priv(VduseDev *dev)
> > +{
> > +    return dev->priv;
> > +}
> > +
> > +VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
> > +{
> > +    return &dev->vqs[index];
> > +}
> > +
> > +int vduse_dev_get_fd(VduseDev *dev)
> > +{
> > +    return dev->fd;
> > +}
> > +
> > +static int vduse_inject_irq(VduseDev *dev, int index)
> > +{
> > +    return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
> > +}
> > +
> > +static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
> > +                                     uint64_t last)
> > +{
> > +    int i;
> > +
> > +    if (last == start) {
> > +        return;
> > +    }
> > +
> > +    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
> > +        if (!dev->regions[i].mmap_addr) {
> > +            continue;
> > +        }
> > +
> > +        if (start <= dev->regions[i].iova &&
> > +            last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
> > +            munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
> > +                   dev->regions[i].mmap_offset + dev->regions[i].size);
> > +            dev->regions[i].mmap_addr = 0;
> > +            dev->num_regions--;
> > +        }
> > +    }
> > +}
> > +
> > +static int vduse_iova_add_region(VduseDev *dev, int fd,
> > +                                 uint64_t offset, uint64_t start,
> > +                                 uint64_t last, int prot)
> > +{
> > +    int i;
> > +    uint64_t size = last - start + 1;
> > +    void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
> > +
> > +    if (mmap_addr == MAP_FAILED) {
> > +        close(fd);
> > +        return -EINVAL;
> > +    }
> > +
> > +    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
> > +        if (!dev->regions[i].mmap_addr) {
> > +            dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
> > +            dev->regions[i].mmap_offset = offset;
> > +            dev->regions[i].iova = start;
> > +            dev->regions[i].size = size;
> > +            dev->num_regions++;
> > +            break;
> > +        }
> > +    }
> > +    assert(i < MAX_IOVA_REGIONS);
> > +    close(fd);
> > +
> > +    return 0;
> > +}
> > +
> > +static int perm_to_prot(uint8_t perm)
> > +{
> > +    int prot = 0;
> > +
> > +    switch (perm) {
> > +    case VDUSE_ACCESS_WO:
> > +        prot |= PROT_WRITE;
> > +        break;
> > +    case VDUSE_ACCESS_RO:
> > +        prot |= PROT_READ;
> > +        break;
> > +    case VDUSE_ACCESS_RW:
> > +        prot |= PROT_READ | PROT_WRITE;
> > +        break;
> > +    default:
> > +        break;
> > +    }
> > +
> > +    return prot;
> > +}
> > +
> > +static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
> > +{
> > +    int i, ret;
> > +    struct vduse_iotlb_entry entry;
> > +
> > +    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
> > +        VduseIovaRegion *r = &dev->regions[i];
> > +
> > +        if (!r->mmap_addr) {
> > +            continue;
> > +        }
> > +
> > +        if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
> > +            if ((iova + *plen) > (r->iova + r->size)) {
> > +                *plen = r->iova + r->size - iova;
> > +            }
> > +            return (void *)(uintptr_t)(iova - r->iova +
> > +                   r->mmap_addr + r->mmap_offset);
> > +        }
> > +    }
> > +
> > +    entry.start = iova;
> > +    entry.last = iova + 1;
> > +    ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
> > +    if (ret < 0) {
> > +        return NULL;
> > +    }
> > +
> > +    if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
> > +                               entry.last, perm_to_prot(entry.perm))) {
> > +        return iova_to_va(dev, plen, iova);
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +static inline uint16_t vring_avail_flags(VduseVirtq *vq)
> > +{
> > +    return le16toh(vq->vring.avail->flags);
> > +}
> > +
> > +static inline uint16_t vring_avail_idx(VduseVirtq *vq)
> > +{
> > +    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
> > +
> > +    return vq->shadow_avail_idx;
> > +}
> > +
> > +static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
> > +{
> > +    return le16toh(vq->vring.avail->ring[i]);
> > +}
> > +
> > +static inline uint16_t vring_get_used_event(VduseVirtq *vq)
> > +{
> > +    return vring_avail_ring(vq, vq->vring.num);
> > +}
> > +
> > +static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
> > +                                 unsigned int *head)
> > +{
> > +    /*
> > +     * Grab the next descriptor number they're advertising, and increment
> > +     * the index we've seen.
> > +     */
> > +    *head = vring_avail_ring(vq, idx % vq->vring.num);
> > +
> > +    /* If their number is silly, that's a fatal mistake. */
> > +    if (*head >= vq->vring.num) {
> > +        fprintf(stderr, "Guest says index %u is available\n", *head);
> > +        return false;
> > +    }
> > +
> > +    return true;
> > +}
> > +
> > +static int
> > +vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
> > +                               uint64_t addr, size_t len)
> > +{
> > +    struct vring_desc *ori_desc;
> > +    uint64_t read_len;
> > +
> > +    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
> > +        return -1;
> > +    }
> > +
> > +    if (len == 0) {
> > +        return -1;
> > +    }
> > +
> > +    while (len) {
> > +        read_len = len;
> > +        ori_desc = iova_to_va(dev, &read_len, addr);
> > +        if (!ori_desc) {
> > +            return -1;
> > +        }
> > +
> > +        memcpy(desc, ori_desc, read_len);
> > +        len -= read_len;
> > +        addr += read_len;
> > +        desc += read_len;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +enum {
> > +    VIRTQUEUE_READ_DESC_ERROR = -1,
> > +    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
> > +    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
> > +};
> > +
> > +static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
> > +                                      unsigned int max, unsigned int *next)
> > +{
> > +    /* If this descriptor says it doesn't chain, we're done. */
> > +    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
> > +        return VIRTQUEUE_READ_DESC_DONE;
> > +    }
> > +
> > +    /* Check they're not leading us off end of descriptors. */
> > +    *next = desc[i].next;
> > +    /* Make sure compiler knows to grab that: we don't want it changing! */
> > +    smp_wmb();
> > +
> > +    if (*next >= max) {
> > +        fprintf(stderr, "Desc next is %u\n", *next);
> > +        return VIRTQUEUE_READ_DESC_ERROR;
> > +    }
> > +
> > +    return VIRTQUEUE_READ_DESC_MORE;
> > +}
> > +
> > +/*
> > + * Fetch avail_idx from VQ memory only when we really need to know if
> > + * guest has added some buffers.
> > + */
> > +static bool vduse_queue_empty(VduseVirtq *vq)
> > +{
> > +    if (unlikely(!vq->vring.avail)) {
> > +        return true;
> > +    }
> > +
> > +    if (vq->shadow_avail_idx != vq->last_avail_idx) {
> > +        return false;
> > +    }
> > +
> > +    return vring_avail_idx(vq) == vq->last_avail_idx;
> > +}
> > +
> > +static bool vduse_queue_should_notify(VduseVirtq *vq)
> > +{
> > +    VduseDev *dev = vq->dev;
> > +    uint16_t old, new;
> > +    bool v;
> > +
> > +    /* We need to expose used array entries before checking used event. */
> > +    smp_mb();
> > +
> > +    /* Always notify when queue is empty (when feature acknowledge) */
> > +    if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
> > +        !vq->inuse && vduse_queue_empty(vq)) {
> > +        return true;
> > +    }
> > +
> > +    if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
> > +        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
> > +    }
> > +
> > +    v = vq->signalled_used_valid;
> > +    vq->signalled_used_valid = true;
> > +    old = vq->signalled_used;
> > +    new = vq->signalled_used = vq->used_idx;
> > +    return !v || vring_need_event(vring_get_used_event(vq), new, old);
> > +}
> > +
> > +void vduse_queue_notify(VduseVirtq *vq)
> > +{
> > +    VduseDev *dev = vq->dev;
> > +
> > +    if (unlikely(!vq->vring.avail)) {
> > +        return;
> > +    }
> > +
> > +    if (!vduse_queue_should_notify(vq)) {
> > +        return;
> > +    }
> > +
> > +    if (vduse_inject_irq(dev, vq->index) < 0) {
> > +        fprintf(stderr, "Error inject irq for vq %d: %s\n",
> > +                vq->index, strerror(errno));
> > +    }
> > +}
> > +
> > +static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
> > +{
> > +    *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val);
> > +}
> > +
> > +static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
> > +                                   struct iovec *iov, unsigned int max_num_sg,
> > +                                   bool is_write, uint64_t pa, size_t sz)
> > +{
> > +    unsigned num_sg = *p_num_sg;
> > +    VduseDev *dev = vq->dev;
> > +
> > +    assert(num_sg <= max_num_sg);
> > +
> > +    if (!sz) {
> > +        fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
> > +        return false;
> > +    }
> > +
> > +    while (sz) {
> > +        uint64_t len = sz;
> > +
> > +        if (num_sg == max_num_sg) {
> > +            fprintf(stderr,
> > +                    "virtio: too many descriptors in indirect table\n");
> > +            return false;
> > +        }
> > +
> > +        iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
> > +        if (iov[num_sg].iov_base == NULL) {
> > +            fprintf(stderr, "virtio: invalid address for buffers\n");
> > +            return false;
> > +        }
> > +        iov[num_sg++].iov_len = len;
> > +        sz -= len;
> > +        pa += len;
> > +    }
> > +
> > +    *p_num_sg = num_sg;
> > +    return true;
> > +}
> > +
> > +static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
> > +                                       unsigned in_num)
> > +{
> > +    VduseVirtqElement *elem;
> > +    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
> > +    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
> > +    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
> > +
> > +    assert(sz >= sizeof(VduseVirtqElement));
> > +    elem = malloc(out_sg_end);
> > +    if (!elem) {
> > +        return NULL;
> > +    }
> > +    elem->out_num = out_num;
> > +    elem->in_num = in_num;
> > +    elem->in_sg = (void *)elem + in_sg_ofs;
> > +    elem->out_sg = (void *)elem + out_sg_ofs;
> > +    return elem;
> > +}
> > +
> > +static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
> > +{
> > +    struct vring_desc *desc = vq->vring.desc;
> > +    VduseDev *dev = vq->dev;
> > +    uint64_t desc_addr, read_len;
> > +    unsigned int desc_len;
> > +    unsigned int max = vq->vring.num;
> > +    unsigned int i = idx;
> > +    VduseVirtqElement *elem;
> > +    struct iovec iov[VIRTQUEUE_MAX_SIZE];
> > +    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
> > +    unsigned int out_num = 0, in_num = 0;
> > +    int rc;
> > +
> > +    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
> > +        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
> > +            fprintf(stderr, "Invalid size for indirect buffer table\n");
> > +            return NULL;
> > +        }
> > +
> > +        /* loop over the indirect descriptor table */
> > +        desc_addr = le64toh(desc[i].addr);
> > +        desc_len = le32toh(desc[i].len);
> > +        max = desc_len / sizeof(struct vring_desc);
> > +        read_len = desc_len;
> > +        desc = iova_to_va(dev, &read_len, desc_addr);
> > +        if (unlikely(desc && read_len != desc_len)) {
> > +            /* Failed to use zero copy */
> > +            desc = NULL;
> > +            if (!vduse_queue_read_indirect_desc(dev, desc_buf,
> > +                                                desc_addr,
> > +                                                desc_len)) {
> > +                desc = desc_buf;
> > +            }
> > +        }
> > +        if (!desc) {
> > +            fprintf(stderr, "Invalid indirect buffer table\n");
> > +            return NULL;
> > +        }
> > +        i = 0;
> > +    }
> > +
> > +    /* Collect all the descriptors */
> > +    do {
> > +        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
> > +            if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
> > +                                             VIRTQUEUE_MAX_SIZE - out_num,
> > +                                             true, le64toh(desc[i].addr),
> > +                                             le32toh(desc[i].len))) {
> > +                return NULL;
> > +            }
> > +        } else {
> > +            if (in_num) {
> > +                fprintf(stderr, "Incorrect order for descriptors\n");
> > +                return NULL;
> > +            }
> > +            if (!vduse_queue_map_single_desc(vq, &out_num, iov,
> > +                                             VIRTQUEUE_MAX_SIZE, false,
> > +                                             le64toh(desc[i].addr),
> > +                                             le32toh(desc[i].len))) {
> > +                return NULL;
> > +            }
> > +        }
> > +
> > +        /* If we've got too many, that implies a descriptor loop. */
> > +        if ((in_num + out_num) > max) {
> > +            fprintf(stderr, "Looped descriptor\n");
> > +            return NULL;
> > +        }
> > +        rc = vduse_queue_read_next_desc(desc, i, max, &i);
> > +    } while (rc == VIRTQUEUE_READ_DESC_MORE);
> > +
> > +    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
> > +        fprintf(stderr, "read descriptor error\n");
> > +        return NULL;
> > +    }
> > +
> > +    /* Now copy what we have collected and mapped */
> > +    elem = vduse_queue_alloc_element(sz, out_num, in_num);
> > +    if (!elem) {
> > +        fprintf(stderr, "read descriptor error\n");
> > +        return NULL;
> > +    }
> > +    elem->index = idx;
> > +    for (i = 0; i < out_num; i++) {
> > +        elem->out_sg[i] = iov[i];
> > +    }
> > +    for (i = 0; i < in_num; i++) {
> > +        elem->in_sg[i] = iov[out_num + i];
> > +    }
> > +
> > +    return elem;
> > +}
> > +
> > +void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
> > +{
> > +    unsigned int head;
> > +    VduseVirtqElement *elem;
> > +    VduseDev *dev = vq->dev;
> > +
> > +    if (unlikely(!vq->vring.avail)) {
> > +        return NULL;
> > +    }
> > +
> > +    if (vduse_queue_empty(vq)) {
> > +        return NULL;
> > +    }
> > +    /* Needed after virtio_queue_empty() */
> > +    smp_rmb();
> > +
> > +    if (vq->inuse >= vq->vring.num) {
> > +        fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
> > +        return NULL;
> > +    }
> > +
> > +    if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
> > +        return NULL;
> > +    }
> > +
> > +    if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
> > +        vring_set_avail_event(vq, vq->last_avail_idx);
> > +    }
> > +
> > +    elem = vduse_queue_map_desc(vq, head, sz);
> > +
> > +    if (!elem) {
> > +        return NULL;
> > +    }
> > +
> > +    vq->inuse++;
> > +
> > +    return elem;
> > +}
> > +
> > +static inline void vring_used_write(VduseVirtq *vq,
> > +                                    struct vring_used_elem *uelem, int i)
> > +{
> > +    struct vring_used *used = vq->vring.used;
> > +
> > +    used->ring[i] = *uelem;
> > +}
> > +
> > +static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
> > +                             unsigned int len, unsigned int idx)
> > +{
> > +    struct vring_used_elem uelem;
> > +
> > +    if (unlikely(!vq->vring.used)) {
> > +        return;
> > +    }
> > +
> > +    idx = (idx + vq->used_idx) % vq->vring.num;
> > +
> > +    uelem.id = htole32(elem->index);
> > +    uelem.len = htole32(len);
> > +    vring_used_write(vq, &uelem, idx);
> > +}
> > +
> > +static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
> > +{
> > +    vq->vring.used->idx = htole16(val);
> > +    vq->used_idx = val;
> > +}
> > +
> > +static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
> > +{
> > +    uint16_t old, new;
> > +
> > +    if (unlikely(!vq->vring.used)) {
> > +        return;
> > +    }
> > +
> > +    /* Make sure buffer is written before we update index. */
> > +    smp_wmb();
> > +
> > +    old = vq->used_idx;
> > +    new = old + count;
> > +    vring_used_idx_set(vq, new);
> > +    vq->inuse -= count;
> > +    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
> > +        vq->signalled_used_valid = false;
> > +    }
> > +}
> > +
> > +void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
> > +                      unsigned int len)
> > +{
> > +    vduse_queue_fill(vq, elem, len, 0);
> > +    vduse_queue_flush(vq, 1);
> > +}
> > +
> > +static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
> > +                                    uint64_t avail_addr, uint64_t used_addr)
> > +{
> > +    struct VduseDev *dev = vq->dev;
> > +    uint64_t len;
> > +
> > +    len = sizeof(struct vring_desc);
> > +    vq->vring.desc = iova_to_va(dev, &len, desc_addr);
> > +    if (len != sizeof(struct vring_desc)) {
> > +        return -EINVAL;
> > +    }
> > +
> > +    len = sizeof(struct vring_avail);
> > +    vq->vring.avail = iova_to_va(dev, &len, avail_addr);
> > +    if (len != sizeof(struct vring_avail)) {
> > +        return -EINVAL;
> > +    }
> > +
> > +    len = sizeof(struct vring_used);
> > +    vq->vring.used = iova_to_va(dev, &len, used_addr);
> > +    if (len != sizeof(struct vring_used)) {
> > +        return -EINVAL;
> > +    }
> > +
> > +    if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
> > +        fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
> > +        return -EINVAL;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static void vduse_queue_enable(VduseVirtq *vq)
> > +{
> > +    struct VduseDev *dev = vq->dev;
> > +    struct vduse_vq_info vq_info;
> > +    struct vduse_vq_eventfd vq_eventfd;
> > +    int fd;
> > +
> > +    vq_info.index = vq->index;
> > +    if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
> > +        fprintf(stderr, "Failed to get vq[%d] info: %s\n",
> > +                vq->index, strerror(errno));
> > +        return;
> > +    }
> > +
> > +    if (!vq_info.ready) {
> > +        return;
> > +    }
> > +
> > +    vq->vring.num = vq_info.num;
> > +    vq->vring.desc_addr = vq_info.desc_addr;
> > +    vq->vring.avail_addr = vq_info.driver_addr;
> > +    vq->vring.used_addr = vq_info.device_addr;
> > +
> > +    if (vduse_queue_update_vring(vq, vq_info.desc_addr,
> > +                                 vq_info.driver_addr, vq_info.device_addr)) {
> > +        fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
> > +        return;
> > +    }
> > +
> > +    fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > +    if (fd < 0) {
> > +        fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
> > +        return;
> > +    }
> > +
> > +    vq_eventfd.index = vq->index;
> > +    vq_eventfd.fd = fd;
> > +    if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
> > +        fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
> > +        close(fd);
> > +        return;
> > +    }
> > +
> > +    vq->fd = fd;
> > +    vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
> > +    vq->inuse = 0;
> > +    vq->used_idx = 0;
> > +    vq->signalled_used_valid = false;
> > +    vq->ready = true;
> > +
> > +    dev->ops->enable_queue(dev, vq);
> > +}
> > +
> > +static void vduse_queue_disable(VduseVirtq *vq)
> > +{
> > +    struct VduseDev *dev = vq->dev;
> > +    struct vduse_vq_eventfd eventfd;
> > +
> > +    if (!vq->ready) {
> > +        return;
> > +    }
> > +
> > +    dev->ops->disable_queue(dev, vq);
> > +
> > +    eventfd.index = vq->index;
> > +    eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
> > +    ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
> > +    close(vq->fd);
> > +
> > +    assert(vq->inuse == 0);
> > +
> > +    vq->vring.num = 0;
> > +    vq->vring.desc_addr = 0;
> > +    vq->vring.avail_addr = 0;
> > +    vq->vring.used_addr = 0;
> > +    vq->vring.desc = 0;
> > +    vq->vring.avail = 0;
> > +    vq->vring.used = 0;
> > +    vq->ready = false;
> > +    vq->fd = -1;
> > +}
> > +
> > +static void vduse_dev_start_dataplane(VduseDev *dev)
> > +{
> > +    int i;
> > +
> > +    if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
> > +        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
> > +        return;
> > +    }
> > +    assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));
> > +
> > +    for (i = 0; i < dev->num_queues; i++) {
> > +        vduse_queue_enable(&dev->vqs[i]);
> > +    }
> > +}
> > +
> > +static void vduse_dev_stop_dataplane(VduseDev *dev)
> > +{
> > +    int i;
> > +
> > +    for (i = 0; i < dev->num_queues; i++) {
> > +        vduse_queue_disable(&dev->vqs[i]);
> > +    }
> > +    dev->features = 0;
> > +    vduse_iova_remove_region(dev, 0, ULONG_MAX);
> > +}
> > +
> > +int vduse_dev_handler(VduseDev *dev)
> > +{
> > +    struct vduse_dev_request req;
> > +    struct vduse_dev_response resp = { 0 };
> > +    VduseVirtq *vq;
> > +    int i, ret;
> > +
> > +    ret = read(dev->fd, &req, sizeof(req));
> > +    if (ret != sizeof(req)) {
> > +        fprintf(stderr, "Read request error [%d]: %s\n",
> > +                ret, strerror(errno));
> > +        return -errno;
> > +    }
> > +    resp.request_id = req.request_id;
> > +
> > +    switch (req.type) {
> > +    case VDUSE_GET_VQ_STATE:
> > +        vq = &dev->vqs[req.vq_state.index];
> > +        resp.vq_state.split.avail_index = vq->last_avail_idx;
> > +        resp.result = VDUSE_REQ_RESULT_OK;
> > +        break;
> > +    case VDUSE_SET_STATUS:
> > +        if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
> > +            vduse_dev_start_dataplane(dev);
> > +        } else if (req.s.status == 0) {
> > +            vduse_dev_stop_dataplane(dev);
> > +        }
> > +        resp.result = VDUSE_REQ_RESULT_OK;
> > +        break;
> > +    case VDUSE_UPDATE_IOTLB:
> > +        /* The iova will be updated by iova_to_va() later, so just remove it */
> > +        vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
> > +        for (i = 0; i < dev->num_queues; i++) {
> > +            VduseVirtq *vq = &dev->vqs[i];
> > +            if (vq->ready) {
> > +                if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
> > +                                             vq->vring.avail_addr,
> > +                                             vq->vring.used_addr)) {
> > +                    fprintf(stderr, "Failed to update vring for vq[%d]\n",
> > +                            vq->index);
> > +                }
> > +            }
> > +        }
> > +        resp.result = VDUSE_REQ_RESULT_OK;
> > +        break;
> > +    default:
> > +        resp.result = VDUSE_REQ_RESULT_FAILED;
> > +        break;
> > +    }
> > +
> > +    ret = write(dev->fd, &resp, sizeof(resp));
> > +    if (ret != sizeof(resp)) {
> > +        fprintf(stderr, "Write request %d error [%d]: %s\n",
> > +                req.type, ret, strerror(errno));
> > +        return -errno;
> > +    }
> > +    return 0;
> > +}
> > +
> > +int vduse_dev_update_config(VduseDev *dev, uint32_t size,
> > +                            uint32_t offset, char *buffer)
> > +{
> > +    int ret;
> > +    struct vduse_config_data *data;
> > +
> > +    data = malloc(offsetof(struct vduse_config_data, buffer) + size);
> > +    if (!data) {
> > +        return -ENOMEM;
> > +    }
> > +
> > +    data->offset = offset;
> > +    data->length = size;
> > +    memcpy(data->buffer, buffer, size);
> > +
> > +    ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
> > +    free(data);
> > +
> > +    if (ret) {
> > +        return -errno;
> > +    }
> > +
> > +    if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
> > +        return -errno;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
> > +{
> > +    VduseVirtq *vq = &dev->vqs[index];
> > +    struct vduse_vq_config vq_config = { 0 };
> > +
> > +    if (max_size > VIRTQUEUE_MAX_SIZE) {
> > +        return -EINVAL;
> > +    }
> > +
> > +    vq_config.index = vq->index;
> > +    vq_config.max_size = max_size;
> > +
> > +    if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
> > +        return -errno;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
> > +{
> > +    VduseVirtq *vqs;
> > +    int i;
> > +
> > +    vqs = calloc(sizeof(VduseVirtq), num_queues);
> > +    if (!vqs) {
> > +        return -ENOMEM;
> > +    }
> > +
> > +    for (i = 0; i < num_queues; i++) {
> > +        vqs[i].index = i;
> > +        vqs[i].dev = dev;
> > +        vqs[i].fd = -1;
> > +    }
> > +    dev->vqs = vqs;
> > +
> > +    return 0;
> > +}
> > +
> > +static int vduse_dev_init(VduseDev *dev, const char *name,
> > +                          uint16_t num_queues, const VduseOps *ops,
> > +                          void *priv)
> > +{
> > +    char *dev_path, *dev_name;
> > +    int ret, fd;
> > +
> > +    dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
> > +    if (!dev_path) {
> > +        return -ENOMEM;
> > +    }
> > +    sprintf(dev_path, "/dev/vduse/%s", name);
> > +
> > +    fd = open(dev_path, O_RDWR);
> > +    free(dev_path);
> > +    if (fd < 0) {
> > +        fprintf(stderr, "Failed to open vduse dev %s: %s\n",
> > +                name, strerror(errno));
> > +        return -errno;
> > +    }
> > +
> > +    dev_name = strdup(name);
> > +    if (!dev_name) {
> > +        close(fd);
> > +        return -ENOMEM;
> > +    }
> > +
> > +    ret = vduse_dev_init_vqs(dev, num_queues);
> > +    if (ret) {
> > +        free(dev_name);
> > +        close(fd);
> > +        return ret;
> > +    }
> > +
> > +    dev->name = dev_name;
> > +    dev->num_queues = num_queues;
> > +    dev->fd = fd;
> > +    dev->ops = ops;
> > +    dev->priv = priv;
> > +
> > +    return 0;
> > +}
> > +
> > +static inline bool vduse_name_is_valid(const char *name)
> > +{
> > +    return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
> > +}
> > +
> > +VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
> > +                                 const VduseOps *ops, void *priv)
> > +{
> > +    VduseDev *dev;
> > +    int ret;
> > +
> > +    if (!ops || !ops->enable_queue || !ops->disable_queue) {
> > +        fprintf(stderr, "Invalid parameter for vduse\n");
> > +        return NULL;
> > +    }
> > +
> > +    dev = calloc(sizeof(VduseDev), 1);
> > +    if (!dev) {
> > +        fprintf(stderr, "Failed to allocate vduse device\n");
> > +        return NULL;
> > +    }
> > +
> > +    ret = vduse_dev_init_vqs(dev, num_queues);
> > +    if (ret) {
> > +        fprintf(stderr, "Failed to init vqs\n");
> > +        free(dev);
> > +        return NULL;
> > +    }
> > +
> > +    dev->num_queues = num_queues;
> > +    dev->fd = fd;
> > +    dev->ops = ops;
> > +    dev->priv = priv;
> > +
> > +    return dev;
> > +}
> > +
> > +VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
> > +                                   const VduseOps *ops, void *priv)
> > +{
> > +    VduseDev *dev;
> > +    int ret;
> > +
> > +    if (!name || vduse_name_is_valid(name) || !ops ||
> > +        !ops->enable_queue || !ops->disable_queue) {
> > +        fprintf(stderr, "Invalid parameter for vduse\n");
> > +        return NULL;
> > +    }
> > +
> > +    dev = calloc(sizeof(VduseDev), 1);
> > +    if (!dev) {
> > +        fprintf(stderr, "Failed to allocate vduse device\n");
> > +        return NULL;
> > +    }
> > +
> > +    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "Failed to init vduse device %s: %s\n",
> > +                name, strerror(ret));
>
> Must be strerror(-ret).  Spotted by Coverity, tracked as CID 1490226.
>

OK.

> > +        free(dev);
> > +        return NULL;
> > +    }
> > +
> > +    return dev;
> > +}
> > +
> > +VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
> > +                           uint32_t vendor_id, uint64_t features,
> > +                           uint16_t num_queues, uint32_t config_size,
> > +                           char *config, const VduseOps *ops, void *priv)
> > +{
> > +    VduseDev *dev;
> > +    int ret, ctrl_fd;
> > +    uint64_t version;
> > +    struct vduse_dev_config *dev_config;
> > +    size_t size = offsetof(struct vduse_dev_config, config);
> > +
> > +    if (!name || vduse_name_is_valid(name) ||
> > +        !has_feature(features,  VIRTIO_F_VERSION_1) || !config ||
> > +        !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
> > +        fprintf(stderr, "Invalid parameter for vduse\n");
> > +        return NULL;
> > +    }
> > +
> > +    dev = calloc(sizeof(VduseDev), 1);
> > +    if (!dev) {
> > +        fprintf(stderr, "Failed to allocate vduse device\n");
> > +        return NULL;
> > +    }
> > +
> > +    ctrl_fd = open("/dev/vduse/control", O_RDWR);
> > +    if (ctrl_fd < 0) {
> > +        fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
> > +                strerror(errno));
> > +        goto err_ctrl;
> > +    }
> > +
> > +    version = VDUSE_API_VERSION;
> > +    if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
> > +        fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
> > +                version, strerror(errno));
> > +        goto err_dev;
> > +    }
> > +
> > +    dev_config = calloc(size + config_size, 1);
> > +    if (!dev_config) {
> > +        fprintf(stderr, "Failed to allocate config space\n");
> > +        goto err_dev;
> > +    }
> > +
> > +    strcpy(dev_config->name, name);
>
> What ensures @name fits into dev->config->name?
>
> Coverity CID 1490224.
>

This should be false postive since we already checked the length of
"name" in vduse_name_is_invalid(). But anyway, I will replace strcpy()
with strncpy() to fix the coverity complaint.

> > +    dev_config->device_id = device_id;
> > +    dev_config->vendor_id = vendor_id;
> > +    dev_config->features = features;
> > +    dev_config->vq_num = num_queues;
> > +    dev_config->vq_align = VDUSE_VQ_ALIGN;
> > +    dev_config->config_size = config_size;
> > +    memcpy(dev_config->config, config, config_size);
> > +
> > +    ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
> > +    free(dev_config);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "Failed to create vduse device %s: %s\n",
> > +                name, strerror(errno));
> > +        goto err_dev;
> > +    }
> > +    dev->ctrl_fd = ctrl_fd;
> > +
> > +    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "Failed to init vduse device %s: %s\n",
> > +                name, strerror(ret));
>
> Must be strerror(-ret).  Spotted by Coverity, tracked as CID 1490223.
>

Thanks for pointing out these issues. I will send a fix soon.

Thanks,
Yongji
diff mbox series

Patch

diff --git a/meson_options.txt b/meson_options.txt
index f3e2f22c1e..23a9f440f7 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -257,6 +257,8 @@  option('virtfs', type: 'feature', value: 'auto',
        description: 'virtio-9p support')
 option('virtiofsd', type: 'feature', value: 'auto',
        description: 'build virtiofs daemon (virtiofsd)')
+option('libvduse', type: 'feature', value: 'auto',
+       description: 'build VDUSE Library')
 
 option('capstone', type: 'feature', value: 'auto',
        description: 'Whether and how to find the capstone library')
diff --git a/subprojects/libvduse/include/atomic.h b/subprojects/libvduse/include/atomic.h
new file mode 120000
index 0000000000..8c2be64f7b
--- /dev/null
+++ b/subprojects/libvduse/include/atomic.h
@@ -0,0 +1 @@ 
+../../../include/qemu/atomic.h
\ No newline at end of file
diff --git a/subprojects/libvduse/include/compiler.h b/subprojects/libvduse/include/compiler.h
new file mode 120000
index 0000000000..de7b70697c
--- /dev/null
+++ b/subprojects/libvduse/include/compiler.h
@@ -0,0 +1 @@ 
+../../../include/qemu/compiler.h
\ No newline at end of file
diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h
new file mode 100644
index 0000000000..6c2fe98213
--- /dev/null
+++ b/subprojects/libvduse/libvduse.h
@@ -0,0 +1,235 @@ 
+/*
+ * VDUSE (vDPA Device in Userspace) library
+ *
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author:
+ *   Xie Yongji <xieyongji@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef LIBVDUSE_H
+#define LIBVDUSE_H
+
+#include <stdint.h>
+#include <sys/uio.h>
+
+#define VIRTQUEUE_MAX_SIZE 1024
+
+/* VDUSE device structure */
+typedef struct VduseDev VduseDev;
+
+/* Virtqueue structure */
+typedef struct VduseVirtq VduseVirtq;
+
+/* Some operation of VDUSE backend */
+typedef struct VduseOps {
+    /* Called when virtqueue can be processed */
+    void (*enable_queue)(VduseDev *dev, VduseVirtq *vq);
+    /* Called when virtqueue processing should be stopped */
+    void (*disable_queue)(VduseDev *dev, VduseVirtq *vq);
+} VduseOps;
+
+/* Describing elements of the I/O buffer */
+typedef struct VduseVirtqElement {
+    /* Descriptor table index */
+    unsigned int index;
+    /* Number of physically-contiguous device-readable descriptors */
+    unsigned int out_num;
+    /* Number of physically-contiguous device-writable descriptors */
+    unsigned int in_num;
+    /* Array to store physically-contiguous device-writable descriptors */
+    struct iovec *in_sg;
+    /* Array to store physically-contiguous device-readable descriptors */
+    struct iovec *out_sg;
+} VduseVirtqElement;
+
+
+/**
+ * vduse_get_virtio_features:
+ *
+ * Get supported virtio features
+ *
+ * Returns: supported feature bits
+ */
+uint64_t vduse_get_virtio_features(void);
+
+/**
+ * vduse_queue_get_dev:
+ * @vq: specified virtqueue
+ *
+ * Get corresponding VDUSE device from the virtqueue.
+ *
+ * Returns: a pointer to VDUSE device on success, NULL on failure.
+ */
+VduseDev *vduse_queue_get_dev(VduseVirtq *vq);
+
+/**
+ * vduse_queue_get_fd:
+ * @vq: specified virtqueue
+ *
+ * Get the kick fd for the virtqueue.
+ *
+ * Returns: file descriptor on success, -1 on failure.
+ */
+int vduse_queue_get_fd(VduseVirtq *vq);
+
+/**
+ * vduse_queue_pop:
+ * @vq: specified virtqueue
+ * @sz: the size of struct to return (must be >= VduseVirtqElement)
+ *
+ * Pop an element from virtqueue available ring.
+ *
+ * Returns: a pointer to a structure containing VduseVirtqElement on success,
+ * NULL on failure.
+ */
+void *vduse_queue_pop(VduseVirtq *vq, size_t sz);
+
+/**
+ * vduse_queue_push:
+ * @vq: specified virtqueue
+ * @elem: pointer to VduseVirtqElement returned by vduse_queue_pop()
+ * @len: length in bytes to write
+ *
+ * Push an element to virtqueue used ring.
+ */
+void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
+                      unsigned int len);
+/**
+ * vduse_queue_notify:
+ * @vq: specified virtqueue
+ *
+ * Request to notify the queue.
+ */
+void vduse_queue_notify(VduseVirtq *vq);
+
+/**
+ * vduse_dev_get_priv:
+ * @dev: VDUSE device
+ *
+ * Get the private pointer passed to vduse_dev_create().
+ *
+ * Returns: private pointer on success, NULL on failure.
+ */
+void *vduse_dev_get_priv(VduseDev *dev);
+
+/**
+ * vduse_dev_get_queue:
+ * @dev: VDUSE device
+ * @index: virtqueue index
+ *
+ * Get the specified virtqueue.
+ *
+ * Returns: a pointer to the virtqueue on success, NULL on failure.
+ */
+VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index);
+
+/**
+ * vduse_dev_get_fd:
+ * @dev: VDUSE device
+ *
+ * Get the control message fd for the VDUSE device.
+ *
+ * Returns: file descriptor on success, -1 on failure.
+ */
+int vduse_dev_get_fd(VduseDev *dev);
+
+/**
+ * vduse_dev_handler:
+ * @dev: VDUSE device
+ *
+ * Used to process the control message.
+ *
+ * Returns: file descriptor on success, -errno on failure.
+ */
+int vduse_dev_handler(VduseDev *dev);
+
+/**
+ * vduse_dev_update_config:
+ * @dev: VDUSE device
+ * @size: the size to write to configuration space
+ * @offset: the offset from the beginning of configuration space
+ * @buffer: the buffer used to write from
+ *
+ * Update device configuration space and inject a config interrupt.
+ *
+ * Returns: 0 on success, -errno on failure.
+ */
+int vduse_dev_update_config(VduseDev *dev, uint32_t size,
+                            uint32_t offset, char *buffer);
+
+/**
+ * vduse_dev_setup_queue:
+ * @dev: VDUSE device
+ * @index: virtqueue index
+ * @max_size: the max size of virtqueue
+ *
+ * Setup the specified virtqueue.
+ *
+ * Returns: 0 on success, -errno on failure.
+ */
+int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size);
+
+/**
+ * vduse_dev_create_by_fd:
+ * @fd: passed file descriptor
+ * @num_queues: the number of virtqueues
+ * @ops: the operation of VDUSE backend
+ * @priv: private pointer
+ *
+ * Create VDUSE device from a passed file descriptor.
+ *
+ * Returns: pointer to VDUSE device on success, NULL on failure.
+ */
+VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
+                                 const VduseOps *ops, void *priv);
+
+/**
+ * vduse_dev_create_by_name:
+ * @name: VDUSE device name
+ * @num_queues: the number of virtqueues
+ * @ops: the operation of VDUSE backend
+ * @priv: private pointer
+ *
+ * Create VDUSE device on /dev/vduse/$NAME.
+ *
+ * Returns: pointer to VDUSE device on success, NULL on failure.
+ */
+VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
+                                   const VduseOps *ops, void *priv);
+
+/**
+ * vduse_dev_create:
+ * @name: VDUSE device name
+ * @device_id: virtio device id
+ * @vendor_id: virtio vendor id
+ * @features: virtio features
+ * @num_queues: the number of virtqueues
+ * @config_size: the size of the configuration space
+ * @config: the buffer of the configuration space
+ * @ops: the operation of VDUSE backend
+ * @priv: private pointer
+ *
+ * Create VDUSE device.
+ *
+ * Returns: pointer to VDUSE device on success, NULL on failure.
+ */
+VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
+                           uint32_t vendor_id, uint64_t features,
+                           uint16_t num_queues, uint32_t config_size,
+                           char *config, const VduseOps *ops, void *priv);
+
+/**
+ * vduse_dev_destroy:
+ * @dev: VDUSE device
+ *
+ * Destroy the VDUSE device.
+ *
+ * Returns: 0 on success, -errno on failure.
+ */
+int vduse_dev_destroy(VduseDev *dev);
+
+#endif
diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c
new file mode 100644
index 0000000000..78e1e5cf90
--- /dev/null
+++ b/subprojects/libvduse/libvduse.c
@@ -0,0 +1,1150 @@ 
+/*
+ * VDUSE (vDPA Device in Userspace) library
+ *
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *   Portions of codes and concepts borrowed from libvhost-user.c, so:
+ *     Copyright IBM, Corp. 2007
+ *     Copyright (c) 2016 Red Hat, Inc.
+ *
+ * Author:
+ *   Xie Yongji <xieyongji@bytedance.com>
+ *   Anthony Liguori <aliguori@us.ibm.com>
+ *   Marc-André Lureau <mlureau@redhat.com>
+ *   Victor Kaplansky <victork@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <endian.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <inttypes.h>
+
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+
+#include "include/atomic.h"
+#include "linux-headers/linux/virtio_ring.h"
+#include "linux-headers/linux/virtio_config.h"
+#include "linux-headers/linux/vduse.h"
+#include "libvduse.h"
+
+#define VDUSE_VQ_ALIGN 4096
+#define MAX_IOVA_REGIONS 256
+
+/* Round number down to multiple */
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple */
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
+#ifndef unlikely
+#define unlikely(x)   __builtin_expect(!!(x), 0)
+#endif
+
+typedef struct VduseRing {
+    unsigned int num;
+    uint64_t desc_addr;
+    uint64_t avail_addr;
+    uint64_t used_addr;
+    struct vring_desc *desc;
+    struct vring_avail *avail;
+    struct vring_used *used;
+} VduseRing;
+
+struct VduseVirtq {
+    VduseRing vring;
+    uint16_t last_avail_idx;
+    uint16_t shadow_avail_idx;
+    uint16_t used_idx;
+    uint16_t signalled_used;
+    bool signalled_used_valid;
+    int index;
+    int inuse;
+    bool ready;
+    int fd;
+    VduseDev *dev;
+};
+
+typedef struct VduseIovaRegion {
+    uint64_t iova;
+    uint64_t size;
+    uint64_t mmap_offset;
+    uint64_t mmap_addr;
+} VduseIovaRegion;
+
+struct VduseDev {
+    VduseVirtq *vqs;
+    VduseIovaRegion regions[MAX_IOVA_REGIONS];
+    int num_regions;
+    char *name;
+    uint32_t device_id;
+    uint32_t vendor_id;
+    uint16_t num_queues;
+    uint16_t queue_size;
+    uint64_t features;
+    const VduseOps *ops;
+    int fd;
+    int ctrl_fd;
+    void *priv;
+};
+
+static inline bool has_feature(uint64_t features, unsigned int fbit)
+{
+    assert(fbit < 64);
+    return !!(features & (1ULL << fbit));
+}
+
+static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
+{
+    return has_feature(dev->features, fbit);
+}
+
+uint64_t vduse_get_virtio_features(void)
+{
+    return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+           (1ULL << VIRTIO_F_VERSION_1) |
+           (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
+           (1ULL << VIRTIO_RING_F_EVENT_IDX) |
+           (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
+}
+
+VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
+{
+    return vq->dev;
+}
+
+int vduse_queue_get_fd(VduseVirtq *vq)
+{
+    return vq->fd;
+}
+
+void *vduse_dev_get_priv(VduseDev *dev)
+{
+    return dev->priv;
+}
+
+VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
+{
+    return &dev->vqs[index];
+}
+
+int vduse_dev_get_fd(VduseDev *dev)
+{
+    return dev->fd;
+}
+
+static int vduse_inject_irq(VduseDev *dev, int index)
+{
+    return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
+}
+
+static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
+                                     uint64_t last)
+{
+    int i;
+
+    if (last == start) {
+        return;
+    }
+
+    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
+        if (!dev->regions[i].mmap_addr) {
+            continue;
+        }
+
+        if (start <= dev->regions[i].iova &&
+            last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
+            munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
+                   dev->regions[i].mmap_offset + dev->regions[i].size);
+            dev->regions[i].mmap_addr = 0;
+            dev->num_regions--;
+        }
+    }
+}
+
+static int vduse_iova_add_region(VduseDev *dev, int fd,
+                                 uint64_t offset, uint64_t start,
+                                 uint64_t last, int prot)
+{
+    int i;
+    uint64_t size = last - start + 1;
+    void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
+
+    if (mmap_addr == MAP_FAILED) {
+        close(fd);
+        return -EINVAL;
+    }
+
+    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
+        if (!dev->regions[i].mmap_addr) {
+            dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
+            dev->regions[i].mmap_offset = offset;
+            dev->regions[i].iova = start;
+            dev->regions[i].size = size;
+            dev->num_regions++;
+            break;
+        }
+    }
+    assert(i < MAX_IOVA_REGIONS);
+    close(fd);
+
+    return 0;
+}
+
+static int perm_to_prot(uint8_t perm)
+{
+    int prot = 0;
+
+    switch (perm) {
+    case VDUSE_ACCESS_WO:
+        prot |= PROT_WRITE;
+        break;
+    case VDUSE_ACCESS_RO:
+        prot |= PROT_READ;
+        break;
+    case VDUSE_ACCESS_RW:
+        prot |= PROT_READ | PROT_WRITE;
+        break;
+    default:
+        break;
+    }
+
+    return prot;
+}
+
+static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
+{
+    int i, ret;
+    struct vduse_iotlb_entry entry;
+
+    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
+        VduseIovaRegion *r = &dev->regions[i];
+
+        if (!r->mmap_addr) {
+            continue;
+        }
+
+        if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
+            if ((iova + *plen) > (r->iova + r->size)) {
+                *plen = r->iova + r->size - iova;
+            }
+            return (void *)(uintptr_t)(iova - r->iova +
+                   r->mmap_addr + r->mmap_offset);
+        }
+    }
+
+    entry.start = iova;
+    entry.last = iova + 1;
+    ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
+    if (ret < 0) {
+        return NULL;
+    }
+
+    if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
+                               entry.last, perm_to_prot(entry.perm))) {
+        return iova_to_va(dev, plen, iova);
+    }
+
+    return NULL;
+}
+
+static inline uint16_t vring_avail_flags(VduseVirtq *vq)
+{
+    return le16toh(vq->vring.avail->flags);
+}
+
+static inline uint16_t vring_avail_idx(VduseVirtq *vq)
+{
+    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
+
+    return vq->shadow_avail_idx;
+}
+
+static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
+{
+    return le16toh(vq->vring.avail->ring[i]);
+}
+
+static inline uint16_t vring_get_used_event(VduseVirtq *vq)
+{
+    return vring_avail_ring(vq, vq->vring.num);
+}
+
+static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
+                                 unsigned int *head)
+{
+    /*
+     * Grab the next descriptor number they're advertising, and increment
+     * the index we've seen.
+     */
+    *head = vring_avail_ring(vq, idx % vq->vring.num);
+
+    /* If their number is silly, that's a fatal mistake. */
+    if (*head >= vq->vring.num) {
+        fprintf(stderr, "Guest says index %u is available\n", *head);
+        return false;
+    }
+
+    return true;
+}
+
+static int
+vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
+                               uint64_t addr, size_t len)
+{
+    struct vring_desc *ori_desc;
+    uint64_t read_len;
+
+    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
+        return -1;
+    }
+
+    if (len == 0) {
+        return -1;
+    }
+
+    while (len) {
+        read_len = len;
+        ori_desc = iova_to_va(dev, &read_len, addr);
+        if (!ori_desc) {
+            return -1;
+        }
+
+        memcpy(desc, ori_desc, read_len);
+        len -= read_len;
+        addr += read_len;
+        desc += read_len;
+    }
+
+    return 0;
+}
+
+enum {
+    VIRTQUEUE_READ_DESC_ERROR = -1,
+    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
+    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
+};
+
+static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
+                                      unsigned int max, unsigned int *next)
+{
+    /* If this descriptor says it doesn't chain, we're done. */
+    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
+        return VIRTQUEUE_READ_DESC_DONE;
+    }
+
+    /* Check they're not leading us off end of descriptors. */
+    *next = desc[i].next;
+    /* Make sure compiler knows to grab that: we don't want it changing! */
+    smp_wmb();
+
+    if (*next >= max) {
+        fprintf(stderr, "Desc next is %u\n", *next);
+        return VIRTQUEUE_READ_DESC_ERROR;
+    }
+
+    return VIRTQUEUE_READ_DESC_MORE;
+}
+
+/*
+ * Fetch avail_idx from VQ memory only when we really need to know if
+ * guest has added some buffers.
+ */
+static bool vduse_queue_empty(VduseVirtq *vq)
+{
+    if (unlikely(!vq->vring.avail)) {
+        return true;
+    }
+
+    if (vq->shadow_avail_idx != vq->last_avail_idx) {
+        return false;
+    }
+
+    return vring_avail_idx(vq) == vq->last_avail_idx;
+}
+
+static bool vduse_queue_should_notify(VduseVirtq *vq)
+{
+    VduseDev *dev = vq->dev;
+    uint16_t old, new;
+    bool v;
+
+    /* We need to expose used array entries before checking used event. */
+    smp_mb();
+
+    /* Always notify when queue is empty (when feature acknowledge) */
+    if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
+        !vq->inuse && vduse_queue_empty(vq)) {
+        return true;
+    }
+
+    if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
+    }
+
+    v = vq->signalled_used_valid;
+    vq->signalled_used_valid = true;
+    old = vq->signalled_used;
+    new = vq->signalled_used = vq->used_idx;
+    return !v || vring_need_event(vring_get_used_event(vq), new, old);
+}
+
+void vduse_queue_notify(VduseVirtq *vq)
+{
+    VduseDev *dev = vq->dev;
+
+    if (unlikely(!vq->vring.avail)) {
+        return;
+    }
+
+    if (!vduse_queue_should_notify(vq)) {
+        return;
+    }
+
+    if (vduse_inject_irq(dev, vq->index) < 0) {
+        fprintf(stderr, "Error inject irq for vq %d: %s\n",
+                vq->index, strerror(errno));
+    }
+}
+
+static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
+{
+    *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val);
+}
+
+static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
+                                   struct iovec *iov, unsigned int max_num_sg,
+                                   bool is_write, uint64_t pa, size_t sz)
+{
+    unsigned num_sg = *p_num_sg;
+    VduseDev *dev = vq->dev;
+
+    assert(num_sg <= max_num_sg);
+
+    if (!sz) {
+        fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
+        return false;
+    }
+
+    while (sz) {
+        uint64_t len = sz;
+
+        if (num_sg == max_num_sg) {
+            fprintf(stderr,
+                    "virtio: too many descriptors in indirect table\n");
+            return false;
+        }
+
+        iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
+        if (iov[num_sg].iov_base == NULL) {
+            fprintf(stderr, "virtio: invalid address for buffers\n");
+            return false;
+        }
+        iov[num_sg++].iov_len = len;
+        sz -= len;
+        pa += len;
+    }
+
+    *p_num_sg = num_sg;
+    return true;
+}
+
+static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
+                                       unsigned in_num)
+{
+    VduseVirtqElement *elem;
+    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
+    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
+    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
+
+    assert(sz >= sizeof(VduseVirtqElement));
+    elem = malloc(out_sg_end);
+    if (!elem) {
+        return NULL;
+    }
+    elem->out_num = out_num;
+    elem->in_num = in_num;
+    elem->in_sg = (void *)elem + in_sg_ofs;
+    elem->out_sg = (void *)elem + out_sg_ofs;
+    return elem;
+}
+
+static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
+{
+    struct vring_desc *desc = vq->vring.desc;
+    VduseDev *dev = vq->dev;
+    uint64_t desc_addr, read_len;
+    unsigned int desc_len;
+    unsigned int max = vq->vring.num;
+    unsigned int i = idx;
+    VduseVirtqElement *elem;
+    struct iovec iov[VIRTQUEUE_MAX_SIZE];
+    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
+    unsigned int out_num = 0, in_num = 0;
+    int rc;
+
+    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
+        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
+            fprintf(stderr, "Invalid size for indirect buffer table\n");
+            return NULL;
+        }
+
+        /* loop over the indirect descriptor table */
+        desc_addr = le64toh(desc[i].addr);
+        desc_len = le32toh(desc[i].len);
+        max = desc_len / sizeof(struct vring_desc);
+        read_len = desc_len;
+        desc = iova_to_va(dev, &read_len, desc_addr);
+        if (unlikely(desc && read_len != desc_len)) {
+            /* Failed to use zero copy */
+            desc = NULL;
+            if (!vduse_queue_read_indirect_desc(dev, desc_buf,
+                                                desc_addr,
+                                                desc_len)) {
+                desc = desc_buf;
+            }
+        }
+        if (!desc) {
+            fprintf(stderr, "Invalid indirect buffer table\n");
+            return NULL;
+        }
+        i = 0;
+    }
+
+    /* Collect all the descriptors */
+    do {
+        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
+            if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
+                                             VIRTQUEUE_MAX_SIZE - out_num,
+                                             true, le64toh(desc[i].addr),
+                                             le32toh(desc[i].len))) {
+                return NULL;
+            }
+        } else {
+            if (in_num) {
+                fprintf(stderr, "Incorrect order for descriptors\n");
+                return NULL;
+            }
+            if (!vduse_queue_map_single_desc(vq, &out_num, iov,
+                                             VIRTQUEUE_MAX_SIZE, false,
+                                             le64toh(desc[i].addr),
+                                             le32toh(desc[i].len))) {
+                return NULL;
+            }
+        }
+
+        /* If we've got too many, that implies a descriptor loop. */
+        if ((in_num + out_num) > max) {
+            fprintf(stderr, "Looped descriptor\n");
+            return NULL;
+        }
+        rc = vduse_queue_read_next_desc(desc, i, max, &i);
+    } while (rc == VIRTQUEUE_READ_DESC_MORE);
+
+    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
+        fprintf(stderr, "read descriptor error\n");
+        return NULL;
+    }
+
+    /* Now copy what we have collected and mapped */
+    elem = vduse_queue_alloc_element(sz, out_num, in_num);
+    if (!elem) {
+        fprintf(stderr, "read descriptor error\n");
+        return NULL;
+    }
+    elem->index = idx;
+    for (i = 0; i < out_num; i++) {
+        elem->out_sg[i] = iov[i];
+    }
+    for (i = 0; i < in_num; i++) {
+        elem->in_sg[i] = iov[out_num + i];
+    }
+
+    return elem;
+}
+
+void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
+{
+    unsigned int head;
+    VduseVirtqElement *elem;
+    VduseDev *dev = vq->dev;
+
+    if (unlikely(!vq->vring.avail)) {
+        return NULL;
+    }
+
+    if (vduse_queue_empty(vq)) {
+        return NULL;
+    }
+    /* Needed after virtio_queue_empty() */
+    smp_rmb();
+
+    if (vq->inuse >= vq->vring.num) {
+        fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
+        return NULL;
+    }
+
+    if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
+        return NULL;
+    }
+
+    if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+        vring_set_avail_event(vq, vq->last_avail_idx);
+    }
+
+    elem = vduse_queue_map_desc(vq, head, sz);
+
+    if (!elem) {
+        return NULL;
+    }
+
+    vq->inuse++;
+
+    return elem;
+}
+
+static inline void vring_used_write(VduseVirtq *vq,
+                                    struct vring_used_elem *uelem, int i)
+{
+    struct vring_used *used = vq->vring.used;
+
+    used->ring[i] = *uelem;
+}
+
+static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
+                             unsigned int len, unsigned int idx)
+{
+    struct vring_used_elem uelem;
+
+    if (unlikely(!vq->vring.used)) {
+        return;
+    }
+
+    idx = (idx + vq->used_idx) % vq->vring.num;
+
+    uelem.id = htole32(elem->index);
+    uelem.len = htole32(len);
+    vring_used_write(vq, &uelem, idx);
+}
+
+static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
+{
+    vq->vring.used->idx = htole16(val);
+    vq->used_idx = val;
+}
+
+static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
+{
+    uint16_t old, new;
+
+    if (unlikely(!vq->vring.used)) {
+        return;
+    }
+
+    /* Make sure buffer is written before we update index. */
+    smp_wmb();
+
+    old = vq->used_idx;
+    new = old + count;
+    vring_used_idx_set(vq, new);
+    vq->inuse -= count;
+    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
+        vq->signalled_used_valid = false;
+    }
+}
+
+void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
+                      unsigned int len)
+{
+    vduse_queue_fill(vq, elem, len, 0);
+    vduse_queue_flush(vq, 1);
+}
+
+static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
+                                    uint64_t avail_addr, uint64_t used_addr)
+{
+    struct VduseDev *dev = vq->dev;
+    uint64_t len;
+
+    len = sizeof(struct vring_desc);
+    vq->vring.desc = iova_to_va(dev, &len, desc_addr);
+    if (len != sizeof(struct vring_desc)) {
+        return -EINVAL;
+    }
+
+    len = sizeof(struct vring_avail);
+    vq->vring.avail = iova_to_va(dev, &len, avail_addr);
+    if (len != sizeof(struct vring_avail)) {
+        return -EINVAL;
+    }
+
+    len = sizeof(struct vring_used);
+    vq->vring.used = iova_to_va(dev, &len, used_addr);
+    if (len != sizeof(struct vring_used)) {
+        return -EINVAL;
+    }
+
+    if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
+        fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static void vduse_queue_enable(VduseVirtq *vq)
+{
+    struct VduseDev *dev = vq->dev;
+    struct vduse_vq_info vq_info;
+    struct vduse_vq_eventfd vq_eventfd;
+    int fd;
+
+    vq_info.index = vq->index;
+    if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
+        fprintf(stderr, "Failed to get vq[%d] info: %s\n",
+                vq->index, strerror(errno));
+        return;
+    }
+
+    if (!vq_info.ready) {
+        return;
+    }
+
+    vq->vring.num = vq_info.num;
+    vq->vring.desc_addr = vq_info.desc_addr;
+    vq->vring.avail_addr = vq_info.driver_addr;
+    vq->vring.used_addr = vq_info.device_addr;
+
+    if (vduse_queue_update_vring(vq, vq_info.desc_addr,
+                                 vq_info.driver_addr, vq_info.device_addr)) {
+        fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
+        return;
+    }
+
+    fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+    if (fd < 0) {
+        fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
+        return;
+    }
+
+    vq_eventfd.index = vq->index;
+    vq_eventfd.fd = fd;
+    if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
+        fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
+        close(fd);
+        return;
+    }
+
+    vq->fd = fd;
+    vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
+    vq->inuse = 0;
+    vq->used_idx = 0;
+    vq->signalled_used_valid = false;
+    vq->ready = true;
+
+    dev->ops->enable_queue(dev, vq);
+}
+
+static void vduse_queue_disable(VduseVirtq *vq)
+{
+    struct VduseDev *dev = vq->dev;
+    struct vduse_vq_eventfd eventfd;
+
+    if (!vq->ready) {
+        return;
+    }
+
+    dev->ops->disable_queue(dev, vq);
+
+    eventfd.index = vq->index;
+    eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
+    ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
+    close(vq->fd);
+
+    assert(vq->inuse == 0);
+
+    vq->vring.num = 0;
+    vq->vring.desc_addr = 0;
+    vq->vring.avail_addr = 0;
+    vq->vring.used_addr = 0;
+    vq->vring.desc = 0;
+    vq->vring.avail = 0;
+    vq->vring.used = 0;
+    vq->ready = false;
+    vq->fd = -1;
+}
+
+static void vduse_dev_start_dataplane(VduseDev *dev)
+{
+    int i;
+
+    if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
+        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
+        return;
+    }
+    assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));
+
+    for (i = 0; i < dev->num_queues; i++) {
+        vduse_queue_enable(&dev->vqs[i]);
+    }
+}
+
+static void vduse_dev_stop_dataplane(VduseDev *dev)
+{
+    int i;
+
+    for (i = 0; i < dev->num_queues; i++) {
+        vduse_queue_disable(&dev->vqs[i]);
+    }
+    dev->features = 0;
+    vduse_iova_remove_region(dev, 0, ULONG_MAX);
+}
+
+int vduse_dev_handler(VduseDev *dev)
+{
+    struct vduse_dev_request req;
+    struct vduse_dev_response resp = { 0 };
+    VduseVirtq *vq;
+    int i, ret;
+
+    ret = read(dev->fd, &req, sizeof(req));
+    if (ret != sizeof(req)) {
+        fprintf(stderr, "Read request error [%d]: %s\n",
+                ret, strerror(errno));
+        return -errno;
+    }
+    resp.request_id = req.request_id;
+
+    switch (req.type) {
+    case VDUSE_GET_VQ_STATE:
+        vq = &dev->vqs[req.vq_state.index];
+        resp.vq_state.split.avail_index = vq->last_avail_idx;
+        resp.result = VDUSE_REQ_RESULT_OK;
+        break;
+    case VDUSE_SET_STATUS:
+        if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
+            vduse_dev_start_dataplane(dev);
+        } else if (req.s.status == 0) {
+            vduse_dev_stop_dataplane(dev);
+        }
+        resp.result = VDUSE_REQ_RESULT_OK;
+        break;
+    case VDUSE_UPDATE_IOTLB:
+        /* The iova will be updated by iova_to_va() later, so just remove it */
+        vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
+        for (i = 0; i < dev->num_queues; i++) {
+            VduseVirtq *vq = &dev->vqs[i];
+            if (vq->ready) {
+                if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
+                                             vq->vring.avail_addr,
+                                             vq->vring.used_addr)) {
+                    fprintf(stderr, "Failed to update vring for vq[%d]\n",
+                            vq->index);
+                }
+            }
+        }
+        resp.result = VDUSE_REQ_RESULT_OK;
+        break;
+    default:
+        resp.result = VDUSE_REQ_RESULT_FAILED;
+        break;
+    }
+
+    ret = write(dev->fd, &resp, sizeof(resp));
+    if (ret != sizeof(resp)) {
+        fprintf(stderr, "Write request %d error [%d]: %s\n",
+                req.type, ret, strerror(errno));
+        return -errno;
+    }
+    return 0;
+}
+
+int vduse_dev_update_config(VduseDev *dev, uint32_t size,
+                            uint32_t offset, char *buffer)
+{
+    int ret;
+    struct vduse_config_data *data;
+
+    data = malloc(offsetof(struct vduse_config_data, buffer) + size);
+    if (!data) {
+        return -ENOMEM;
+    }
+
+    data->offset = offset;
+    data->length = size;
+    memcpy(data->buffer, buffer, size);
+
+    ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
+    free(data);
+
+    if (ret) {
+        return -errno;
+    }
+
+    if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
+        return -errno;
+    }
+
+    return 0;
+}
+
+int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
+{
+    VduseVirtq *vq = &dev->vqs[index];
+    struct vduse_vq_config vq_config = { 0 };
+
+    if (max_size > VIRTQUEUE_MAX_SIZE) {
+        return -EINVAL;
+    }
+
+    vq_config.index = vq->index;
+    vq_config.max_size = max_size;
+
+    if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
+        return -errno;
+    }
+
+    return 0;
+}
+
+static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
+{
+    VduseVirtq *vqs;
+    int i;
+
+    vqs = calloc(sizeof(VduseVirtq), num_queues);
+    if (!vqs) {
+        return -ENOMEM;
+    }
+
+    for (i = 0; i < num_queues; i++) {
+        vqs[i].index = i;
+        vqs[i].dev = dev;
+        vqs[i].fd = -1;
+    }
+    dev->vqs = vqs;
+
+    return 0;
+}
+
+static int vduse_dev_init(VduseDev *dev, const char *name,
+                          uint16_t num_queues, const VduseOps *ops,
+                          void *priv)
+{
+    char *dev_path, *dev_name;
+    int ret, fd;
+
+    dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
+    if (!dev_path) {
+        return -ENOMEM;
+    }
+    sprintf(dev_path, "/dev/vduse/%s", name);
+
+    fd = open(dev_path, O_RDWR);
+    free(dev_path);
+    if (fd < 0) {
+        fprintf(stderr, "Failed to open vduse dev %s: %s\n",
+                name, strerror(errno));
+        return -errno;
+    }
+
+    dev_name = strdup(name);
+    if (!dev_name) {
+        close(fd);
+        return -ENOMEM;
+    }
+
+    ret = vduse_dev_init_vqs(dev, num_queues);
+    if (ret) {
+        free(dev_name);
+        close(fd);
+        return ret;
+    }
+
+    dev->name = dev_name;
+    dev->num_queues = num_queues;
+    dev->fd = fd;
+    dev->ops = ops;
+    dev->priv = priv;
+
+    return 0;
+}
+
+static inline bool vduse_name_is_valid(const char *name)
+{
+    return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
+}
+
+VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
+                                 const VduseOps *ops, void *priv)
+{
+    VduseDev *dev;
+    int ret;
+
+    if (!ops || !ops->enable_queue || !ops->disable_queue) {
+        fprintf(stderr, "Invalid parameter for vduse\n");
+        return NULL;
+    }
+
+    dev = calloc(sizeof(VduseDev), 1);
+    if (!dev) {
+        fprintf(stderr, "Failed to allocate vduse device\n");
+        return NULL;
+    }
+
+    ret = vduse_dev_init_vqs(dev, num_queues);
+    if (ret) {
+        fprintf(stderr, "Failed to init vqs\n");
+        free(dev);
+        return NULL;
+    }
+
+    dev->num_queues = num_queues;
+    dev->fd = fd;
+    dev->ops = ops;
+    dev->priv = priv;
+
+    return dev;
+}
+
+VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
+                                   const VduseOps *ops, void *priv)
+{
+    VduseDev *dev;
+    int ret;
+
+    if (!name || vduse_name_is_valid(name) || !ops ||
+        !ops->enable_queue || !ops->disable_queue) {
+        fprintf(stderr, "Invalid parameter for vduse\n");
+        return NULL;
+    }
+
+    dev = calloc(sizeof(VduseDev), 1);
+    if (!dev) {
+        fprintf(stderr, "Failed to allocate vduse device\n");
+        return NULL;
+    }
+
+    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
+    if (ret < 0) {
+        fprintf(stderr, "Failed to init vduse device %s: %s\n",
+                name, strerror(ret));
+        free(dev);
+        return NULL;
+    }
+
+    return dev;
+}
+
+VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
+                           uint32_t vendor_id, uint64_t features,
+                           uint16_t num_queues, uint32_t config_size,
+                           char *config, const VduseOps *ops, void *priv)
+{
+    VduseDev *dev;
+    int ret, ctrl_fd;
+    uint64_t version;
+    struct vduse_dev_config *dev_config;
+    size_t size = offsetof(struct vduse_dev_config, config);
+
+    if (!name || vduse_name_is_valid(name) ||
+        !has_feature(features,  VIRTIO_F_VERSION_1) || !config ||
+        !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
+        fprintf(stderr, "Invalid parameter for vduse\n");
+        return NULL;
+    }
+
+    dev = calloc(sizeof(VduseDev), 1);
+    if (!dev) {
+        fprintf(stderr, "Failed to allocate vduse device\n");
+        return NULL;
+    }
+
+    ctrl_fd = open("/dev/vduse/control", O_RDWR);
+    if (ctrl_fd < 0) {
+        fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
+                strerror(errno));
+        goto err_ctrl;
+    }
+
+    version = VDUSE_API_VERSION;
+    if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
+        fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
+                version, strerror(errno));
+        goto err_dev;
+    }
+
+    dev_config = calloc(size + config_size, 1);
+    if (!dev_config) {
+        fprintf(stderr, "Failed to allocate config space\n");
+        goto err_dev;
+    }
+
+    strcpy(dev_config->name, name);
+    dev_config->device_id = device_id;
+    dev_config->vendor_id = vendor_id;
+    dev_config->features = features;
+    dev_config->vq_num = num_queues;
+    dev_config->vq_align = VDUSE_VQ_ALIGN;
+    dev_config->config_size = config_size;
+    memcpy(dev_config->config, config, config_size);
+
+    ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
+    free(dev_config);
+    if (ret < 0) {
+        fprintf(stderr, "Failed to create vduse device %s: %s\n",
+                name, strerror(errno));
+        goto err_dev;
+    }
+    dev->ctrl_fd = ctrl_fd;
+
+    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
+    if (ret < 0) {
+        fprintf(stderr, "Failed to init vduse device %s: %s\n",
+                name, strerror(ret));
+        goto err;
+    }
+
+    return dev;
+err:
+    ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
+err_dev:
+    close(ctrl_fd);
+err_ctrl:
+    free(dev);
+
+    return NULL;
+}
+
+int vduse_dev_destroy(VduseDev *dev)
+{
+    int ret = 0;
+
+    free(dev->vqs);
+    if (dev->fd >= 0) {
+        close(dev->fd);
+        dev->fd = -1;
+    }
+    if (dev->ctrl_fd >= 0) {
+        if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
+            ret = -errno;
+        }
+        close(dev->ctrl_fd);
+        dev->ctrl_fd = -1;
+    }
+    free(dev->name);
+    free(dev);
+
+    return ret;
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index 51e9ff2dd2..b14891898c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3594,6 +3594,11 @@  L: qemu-block@nongnu.org
 S: Supported
 F: block/export/fuse.c
 
+VDUSE library
+M: Xie Yongji <xieyongji@bytedance.com>
+S: Maintained
+F: subprojects/libvduse/
+
 Replication
 M: Wen Congyang <wencongyang2@huawei.com>
 M: Xie Changlong <xiechanglong.d@gmail.com>
diff --git a/meson.build b/meson.build
index 9efcb175d1..ecfe31ca87 100644
--- a/meson.build
+++ b/meson.build
@@ -1541,6 +1541,15 @@  if get_option('fuse_lseek').allowed()
   endif
 endif
 
+have_libvduse = (targetos == 'linux')
+if get_option('libvduse').enabled()
+    if targetos != 'linux'
+        error('libvduse requires linux')
+    endif
+elif get_option('libvduse').disabled()
+    have_libvduse = false
+endif
+
 # libbpf
 libbpf = dependency('libbpf', required: get_option('bpf'), method: 'pkg-config')
 if libbpf.found() and not cc.links('''
@@ -2986,6 +2995,12 @@  if targetos == 'linux' and have_vhost_user
   vhost_user = libvhost_user.get_variable('vhost_user_dep')
 endif
 
+libvduse = not_found
+if have_libvduse
+  libvduse_proj = subproject('libvduse')
+  libvduse = libvduse_proj.get_variable('libvduse_dep')
+endif
+
 # NOTE: the trace/ subdirectory needs the qapi_trace_events variable
 # that is filled in by qapi/.
 subdir('qapi')
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 24eb5f35ea..66d3f372a0 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -110,6 +110,7 @@  meson_options_help() {
   printf "%s\n" '  libssh          ssh block device support'
   printf "%s\n" '  libudev         Use libudev to enumerate host devices'
   printf "%s\n" '  libusb          libusb support for USB passthrough'
+  printf "%s\n" '  libvduse        build VDUSE Library'
   printf "%s\n" '  linux-aio       Linux AIO support'
   printf "%s\n" '  linux-io-uring  Linux io_uring support'
   printf "%s\n" '  live-block-migration'
@@ -307,6 +308,8 @@  _meson_option_parse() {
     --disable-libudev) printf "%s" -Dlibudev=disabled ;;
     --enable-libusb) printf "%s" -Dlibusb=enabled ;;
     --disable-libusb) printf "%s" -Dlibusb=disabled ;;
+    --enable-libvduse) printf "%s" -Dlibvduse=enabled ;;
+    --disable-libvduse) printf "%s" -Dlibvduse=disabled ;;
     --enable-linux-aio) printf "%s" -Dlinux_aio=enabled ;;
     --disable-linux-aio) printf "%s" -Dlinux_aio=disabled ;;
     --enable-linux-io-uring) printf "%s" -Dlinux_io_uring=enabled ;;
diff --git a/subprojects/libvduse/linux-headers/linux b/subprojects/libvduse/linux-headers/linux
new file mode 120000
index 0000000000..04f3304f79
--- /dev/null
+++ b/subprojects/libvduse/linux-headers/linux
@@ -0,0 +1 @@ 
+../../../linux-headers/linux/
\ No newline at end of file
diff --git a/subprojects/libvduse/meson.build b/subprojects/libvduse/meson.build
new file mode 100644
index 0000000000..ba08f5ee1a
--- /dev/null
+++ b/subprojects/libvduse/meson.build
@@ -0,0 +1,10 @@ 
+project('libvduse', 'c',
+        license: 'GPL-2.0-or-later',
+        default_options: ['c_std=gnu99'])
+
+libvduse = static_library('vduse',
+                          files('libvduse.c'),
+                          c_args: '-D_GNU_SOURCE')
+
+libvduse_dep = declare_dependency(link_with: libvduse,
+                                  include_directories: include_directories('.'))
diff --git a/subprojects/libvduse/standard-headers/linux b/subprojects/libvduse/standard-headers/linux
new file mode 120000
index 0000000000..c416f068ac
--- /dev/null
+++ b/subprojects/libvduse/standard-headers/linux
@@ -0,0 +1 @@ 
+../../../include/standard-headers/linux/
\ No newline at end of file