[v4,RFC] block/vxhs: Initial commit to add Veritas HyperScale VxHS block device support

Message ID 1471935390-126476-1-git-send-email-ashish.mittal@veritas.com (mailing list archive)
State New, archived

Commit Message

Ashish Mittal Aug. 23, 2016, 6:56 a.m. UTC
This patch adds support for a new block device type called "vxhs".
Source code for the library that this code loads can be downloaded from:
https://github.com/MittalAshish/libqnio.git

Sample command line using JSON syntax:
./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"},{"host":"172.172.17.2","port":"9999"}]}'

Sample command line using URI syntax:
qemu-img convert -f raw -O raw -n /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D

Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com>
---
v3 changelog:
=============
(1) Reworked QAPI/JSON parsing.
(2) Reworked URI parsing as suggested by Kevin.
(3) Fixes per review comments from Stefan on v1.
(4) Fixes per review comments from Daniel on v3.

Comments

Jeff Cody Aug. 30, 2016, 5:35 p.m. UTC | #1
First-pass-over-the-code review:

On Mon, Aug 22, 2016 at 11:56:30PM -0700, Ashish Mittal wrote:
> This patch adds support for a new block device type called "vxhs".
> Source code for the library that this code loads can be downloaded from:
> https://github.com/MittalAshish/libqnio.git
> 
> Sample command line using JSON syntax:
> ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"},{"host":"172.172.17.2","port":"9999"}]}'
> 
> Sample command line using URI syntax:
> qemu-img convert -f raw -O raw -n /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D
> 

Is there a reference specification for vxhs available?  If so, the commit
message would be a good place to drop a link.

> Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com>
> ---
> v3 changelog:
> =============
> (1) Reworked QAPI/JSON parsing.
> (2) Reworked URI parsing as suggested by Kevin.
> (3) Fixes per review comments from Stefan on v1.
> (4) Fixes per review comments from Daniel on v3.
> 
>  block/Makefile.objs |    2 +
>  block/trace-events  |   41 ++
>  block/vxhs.c        | 1304 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/vxhs.h        |  237 ++++++++++
>  configure           |   50 ++
>  5 files changed, 1634 insertions(+)
>  create mode 100644 block/vxhs.c
>  create mode 100644 block/vxhs.h
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 2593a2f..5f83305 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -20,6 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o
>  block-obj-$(CONFIG_GLUSTERFS) += gluster.o
>  block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
>  block-obj-$(CONFIG_LIBSSH2) += ssh.o
> +block-obj-$(CONFIG_VXHS) += vxhs.o
>  block-obj-y += accounting.o dirty-bitmap.o
>  block-obj-y += write-threshold.o
>  
> @@ -43,3 +44,4 @@ block-obj-m        += dmg.o
>  dmg.o-libs         := $(BZIP2_LIBS)
>  qcow.o-libs        := -lz
>  linux-aio.o-libs   := -laio
> +vxhs.o-libs        := $(VXHS_LIBS)
> diff --git a/block/trace-events b/block/trace-events
> index 05fa13c..06c6d8c 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -114,3 +114,44 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
>  qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>  qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>  qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
> +
> +# block/vxhs.c
> +vxhs_bdrv_init(const char c) "Registering VxHS AIO driver%c"
> +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, reason %d"
> +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p"
> +vxhs_setup_qnio_nwerror(char c) "Could not initialize the network channel. Bailing out%c"
> +vxhs_iio_callback_iofail(int err, int reason, void *acb, int seg) "Read/Write failed: error %d, reason %d, acb %p, segment %d"
> +vxhs_iio_callback_retry(char *guid, void *acb) "vDisk %s, added acb %p to retry queue (5)"
> +vxhs_iio_callback_chnlfail(int error) "QNIO channel failed, no i/o (%d)"
> +vxhs_iio_callback_fail(int r, void *acb, int seg, uint64_t size, int err) " ALERT: reason = %d , acb = %p, acb->segments = %d, acb->size = %lu Error = %d"
> +vxhs_fail_aio(char * guid, void *acb) "vDisk %s, failing acb %p"
> +vxhs_iio_callback_ready(char *vd, int err) "async vxhs_iio_callback: IRP_VDISK_CHECK_IO_FAILOVER_READY completed for vdisk %s with error %d"
> +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d"
> +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d"
> +vxhs_open_fail(int ret) "Could not open the device. Error = %d"
> +vxhs_open_epipe(char c) "Could not create a pipe for device. Bailing out%c"
> +vxhs_aio_rw(char *guid, int iodir, uint64_t size, uint64_t offset) "vDisk %s, vDisk device is in failed state iodir = %d size = %lu offset = %lu"
> +vxhs_aio_rw_retry(char *guid, void *acb, int queue) "vDisk %s, added acb %p to retry queue(%d)"
> +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d"
> +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int seg, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %lu offset = %lu ACB = %p Segments = %d. Error = %d, errno = %d"
> +vxhs_co_flush(char *guid, int ret, int err) "vDisk (%s) Flush ioctl failed ret = %d errno = %d"
> +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d"
> +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %lu"
> +vxhs_switch_storage_agent(char *ip, char *guid) "Query host %s for vdisk %s"
> +vxhs_switch_storage_agent_failed(char *ip, char *guid, int res, int err) "Query to host %s for vdisk %s failed, res = %d, errno = %d"
> +vxhs_failover_ioctl_cb(char *ip, char *guid) "Switched to storage server host-IP %s for vdisk %s"
> +vxhs_failover_ioctl_cb_retry(char *guid) "failover_ioctl_cb: keep looking for io target for vdisk %s"
> +vxhs_failover_io(char *vdisk) "I/O Failover starting for vDisk %s"
> +vxhs_reopen_vdisk(char *ip) "Failed to connect to storage agent on host-ip %s"
> +vxhs_reopen_vdisk_openfail(char *fname) "Failed to open vdisk device: %s"
> +vxhs_handle_queued_ios(void *acb, int res) "Restarted acb %p res %d"
> +vxhs_restart_aio(int dir, int res, int err) "IO ERROR FOR: Read/Write = %d Error = %d, errno = %d"
> +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld"
> +vxhs_aio_rw_iofail(char *guid) "vDisk %s, I/O operation failed."
> +vxhs_aio_rw_devfail(char *guid, int dir, uint64_t size, uint64_t off) "vDisk %s, vDisk device failed iodir = %d size = %lu offset = %lu"
> +vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s"
> +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s"
> +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = %d"
> +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP %s, Port %d"
> +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState"
> +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s"
> diff --git a/block/vxhs.c b/block/vxhs.c
> new file mode 100644
> index 0000000..32c37c5
> --- /dev/null
> +++ b/block/vxhs.c
> @@ -0,0 +1,1304 @@
> +/*
> + * QEMU Block driver for Veritas HyperScale (VxHS)
> + *

Another good place for a link to the specification, if available.


> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "vxhs.h"
> +#include <qnio/qnio_api.h>
> +#include "trace.h"
> +#include "qapi/qmp/qerror.h"
> +#include "qapi/qmp/qdict.h"
> +#include "qapi/qmp/qstring.h"
> +
> +#define VXHS_OPT_FILENAME        "filename"
> +#define VXHS_OPT_VDISK_ID        "vdisk_id"
> +#define VXHS_OPT_SERVER          "server."
> +#define VXHS_OPT_HOST            "host"
> +#define VXHS_OPT_PORT            "port"
> +
> +/* qnio client ioapi_ctx */
> +static void *global_qnio_ctx;
> +
> +/* vdisk prefix to pass to qnio */
> +static const char vdisk_prefix[] = "/dev/of/vdisk";
> +
> +void vxhs_inc_acb_segment_count(void *ptr, int count)
> +{
> +    VXHSAIOCB *acb = ptr;
> +    BDRVVXHSState *s = acb->common.bs->opaque;
> +
> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> +    acb->segments += count;
> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> +}
> +
> +void vxhs_dec_acb_segment_count(void *ptr, int count)
> +{
> +    VXHSAIOCB *acb = ptr;
> +    BDRVVXHSState *s = acb->common.bs->opaque;
> +
> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> +    acb->segments -= count;
> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> +}
> +
> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count)
> +{
> +    VXHSAIOCB *acb = ptr;
> +    BDRVVXHSState *s = acb->common.bs->opaque;
> +    int segcount = 0;
> +
> +
> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> +    acb->segments -= count;
> +    segcount = acb->segments;
> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> +
> +    return segcount;
> +}

Unused function?

> +
> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> +{
> +    VXHSAIOCB *acb = ptr;
> +
> +    acb->buffer = buffer;
> +}
> +

Unused function?

(There are several unused functions - are there other patches that use these
functions?  Maybe I missed them on the list.  If not, and they are for
future work, it would probably be best to introduce these functions in the
patch series that actually uses them.  That would cut down on the amount of
code to be reviewed, and since unused code is not exercised it can't really
be verified anyway).

> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t count)
> +{
> +    BDRVVXHSState *s = ptr;
> +
> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> +    s->vdisk_aio_count += count;
> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +}
> +
> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t count)
> +{
> +    BDRVVXHSState *s = ptr;
> +
> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> +    s->vdisk_aio_count -= count;
> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +}
> +
> +uint32_t vxhs_get_vdisk_iocount(void *ptr)
> +{
> +    BDRVVXHSState *s = ptr;
> +    uint32_t count = 0;
> +
> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> +    count = s->vdisk_aio_count;
> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +
> +    return count;
> +}

Function only used by another function that is not called from anywhere.


> +
> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m)
> +{
> +    VXHSAIOCB *acb = NULL;
> +    BDRVVXHSState *s = NULL;
> +    int rv = 0;
> +    int segcount = 0;
> +    uint32_t error = 0;
> +    uint32_t opcode = 0;
> +
> +    assert(m);
> +    if (m) {
> +        /* TODO: need common get message attrs, not two separate lib calls */
> +        error = qemu_iio_extract_msg_error(m);
> +        opcode = qemu_iio_extract_msg_opcode(m);
> +    }
> +    switch (opcode) {
> +    case IRP_READ_REQUEST:
> +    case IRP_WRITE_REQUEST:
> +
> +    /*
> +     * ctx is VXHSAIOCB*
> +     * ctx is NULL if error is VXERROR_CHANNEL_HUP or reason is IIO_REASON_HUP
> +     */
> +    if (ctx) {
> +        acb = ctx;
> +        s = acb->common.bs->opaque;
> +    } else {
> +        trace_vxhs_iio_callback(error, reason);
> +        goto out;
> +    }
> +
> +    if (error) {
> +        trace_vxhs_iio_callback_iofail(error, reason, acb, acb->segments);
> +
> +        if (reason == IIO_REASON_DONE || reason == IIO_REASON_EVENT) {
> +            /*
> +             * Storage agent failed while I/O was in progress
> +             * Fail over only if the qnio channel dropped, indicating
> +             * storage agent failure. Don't fail over in response to other
> +             * I/O errors such as disk failure.
> +             */
> +            if (error == VXERROR_RETRY_ON_SOURCE || error == VXERROR_HUP ||
> +                error == VXERROR_CHANNEL_HUP || error == -1) {
> +                /*
> +                 * Start vDisk IO failover once callback is
> +                 * called against all the pending IOs.
> +                 * If vDisk has no redundancy enabled
> +                 * then IO failover routine will mark
> +                 * the vDisk failed and fail all the
> +                 * AIOs without retry (stateless vDisk)
> +                 */
> +                VXHS_SPIN_LOCK(s->vdisk_lock);
> +                if (!OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
> +                    OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
> +                }
> +                /*
> +                 * Check if this acb is already queued before.
> +                 * It is possible in case if I/Os are submitted
> +                 * in multiple segments (QNIO_MAX_IO_SIZE).
> +                 */
> +                VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> +                if (!OF_AIOCB_FLAGS_QUEUED(acb)) {
> +                    QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
> +                                         acb, retry_entry);
> +                    OF_AIOCB_FLAGS_SET_QUEUED(acb);
> +                    s->vdisk_aio_retry_qd++;
> +                    trace_vxhs_iio_callback_retry(s->vdisk_guid, acb);
> +                }
> +                segcount = --acb->segments;
> +                VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> +                /*
> +                 * Decrement AIO count only when callback is called
> +                 * against all the segments of aiocb.
> +                 */
> +                if (segcount == 0 && --s->vdisk_aio_count == 0) {
> +                    /*
> +                     * Start vDisk I/O failover
> +                     */
> +                    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +                    /*
> +                     * TODO:
> +                     * Need to explore further if it is possible to optimize
> +                     * the failover operation on Virtual-Machine (global)
> +                     * specific rather vDisk specific.
> +                     */
> +                    vxhs_failover_io(s);
> +                    goto out;
> +                }
> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +                goto out;
> +            }
> +        } else if (reason == IIO_REASON_HUP) {
> +            /*
> +             * Channel failed, spontaneous notification,
> +             * not in response to I/O
> +             */
> +            trace_vxhs_iio_callback_chnlfail(error);
> +            /*
> +             * TODO: Start channel failover when no I/O is outstanding
> +             */
> +            goto out;
> +        } else {
> +            trace_vxhs_iio_callback_fail(reason, acb, acb->segments,
> +                                         acb->size, error);
> +        }
> +    }
> +    /*
> +     * Set error into acb if not set. In case if acb is being
> +     * submitted in multiple segments then need to set the error
> +     * only once.
> +     *
> +     * Once acb done callback is called for the last segment
> +     * then acb->ret return status will be sent back to the
> +     * caller.
> +     */
> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> +    if (error && !acb->ret) {
> +        acb->ret = error;
> +    }
> +    --acb->segments;
> +    segcount = acb->segments;
> +    assert(segcount >= 0);
> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> +    /*
> +     * Check if all the outstanding I/Os are done against acb.
> +     * If yes then send signal for AIO completion.
> +     */
> +    if (segcount == 0) {
> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
> +        if (rv != sizeof(acb)) {
> +            error_report("VXHS AIO completion failed: %s", strerror(errno));
> +            abort();
> +        }
> +    }
> +    break;
> +
> +    case IRP_VDISK_CHECK_IO_FAILOVER_READY:
> +        /* ctx is BDRVVXHSState* */
> +        assert(ctx);
> +        trace_vxhs_iio_callback_ready(((BDRVVXHSState *)ctx)->vdisk_guid,
> +                                      error);
> +        vxhs_failover_ioctl_cb(error, ctx);
> +        break;
> +
> +    default:
> +        if (reason == IIO_REASON_HUP) {
> +            /*
> +             * Channel failed, spontaneous notification,
> +             * not in response to I/O
> +             */
> +            trace_vxhs_iio_callback_chnfail(error, errno);
> +            /*
> +             * TODO: Start channel failover when no I/O is outstanding
> +             */
> +        } else {
> +            trace_vxhs_iio_callback_unknwn(opcode, error);
> +        }
> +        break;
> +    }
> +out:
> +    return;
> +}
> +
> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s)
> +{
> +    BlockCompletionFunc *cb = acb->common.cb;
> +    void *opaque = acb->common.opaque;
> +    int ret = 0;
> +
> +    if (acb->ret != 0) {
> +        trace_vxhs_complete_aio(acb, acb->ret);
> +    /*
> +     * We mask all the IO errors generically as EIO for upper layers
> +     * Right now our IO Manager uses non standard error codes. Instead
> +     * of confusing upper layers with incorrect interpretation we are
> +     * doing this workaround.
> +     */
> +        ret = (-EIO);
> +    }
> +    /*
> +     * Copy back contents from stablization buffer into original iovector
> +     * before returning the IO
> +     */
> +    if (acb->buffer != NULL) {
> +        qemu_iovec_from_buf(acb->qiov, 0, acb->buffer, acb->qiov->size);
> +        free(acb->buffer);
> +        acb->buffer = NULL;
> +    }
> +    vxhs_dec_vdisk_iocount(s, 1);
> +    acb->aio_done = VXHS_IO_COMPLETED;
> +    qemu_aio_unref(acb);
> +    cb(opaque, ret);
> +}
> +
> +/*
> + * This is the HyperScale event handler registered to QEMU.
> + * It is invoked when any IO gets completed and written on pipe
> + * by callback called from QNIO thread context. Then it marks
> + * the AIO as completed, and releases HyperScale AIO callbacks.
> + */
> +void vxhs_aio_event_reader(void *opaque)
> +{
> +    BDRVVXHSState *s = opaque;
> +    ssize_t ret;
> +
> +    do {
> +        char *p = (char *)&s->qnio_event_acb;
> +
> +        ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos,
> +                   sizeof(s->qnio_event_acb) - s->event_reader_pos);
> +        if (ret > 0) {
> +            s->event_reader_pos += ret;
> +            if (s->event_reader_pos == sizeof(s->qnio_event_acb)) {
> +                s->event_reader_pos = 0;
> +                vxhs_complete_aio(s->qnio_event_acb, s);
> +            }
> +        }
> +    } while (ret < 0 && errno == EINTR);
> +}
> +
> +/*
> + * QEMU calls this to check if there are any pending IO on vDisk.
> + * It will wait in a loop until all the AIOs are completed.
> + */
> +int vxhs_aio_flush_cb(void *opaque)
> +{
> +    BDRVVXHSState *s = opaque;
> +
> +    return vxhs_get_vdisk_iocount(s);
> +}

Unused function.


> +
> +/*
> + * This will be called by QEMU while booting for each vDisk.
> + * bs->opaque will be allocated by QEMU upper block layer before
> + * calling open. It will load all the QNIO operations from
> + * qemuqnio library and call QNIO operation to create channel to
> + * do IO on vDisk. It parses the URI, gets the hostname, vDisk
> + * path and then sets HyperScale event handler to QEMU.
> + */
> +void *vxhs_setup_qnio(void)
> +{
> +    void *qnio_ctx = NULL;
> +
> +    qnio_ctx = qemu_iio_init(vxhs_iio_callback);

One concern with this callback function - when does it get called?  I am
assuming that qemu_iio_init() is running threads / processes in the
background, so this callback could happen at any time.

(I'll discuss my specific concern later, in vxhs_failover_ioctl_cb())



An aside:

It is odd that the qnio library is using a qemu_ prefix for functions.
Seems like a good way to run into issues later, especially with relatively
generic function names such as 'qemu_ck_spin_lock()'. Consider that most
developers / maintainers are probably not going to compile in vxhs given the
external library dependence without a specific reason, so name collisions
could sneak in.

'nm -g libqnioshim.so | grep qemu_' shows a few that give me pause:

0000000000004d07 T qemu_calculate_iovec_size
000000000000663d T qemu_ck_destroy_lock
00000000000065cf T qemu_ck_initialize_lock
00000000000065f7 T qemu_ck_spin_lock
000000000000661a T qemu_ck_spin_unlock
0000000000004dfb T qemu_convert_iovector_to_buffer
0000000000004d63 T qemu_copy_iov_to_buffer
0000000000005f2f T qemu_extract_flush_response
0000000000005c89 T qemu_extract_geometry_from_json
0000000000005a78 T qemu_extract_size_from_json
0000000000004f40 T qemu_is_iovector_read_aligned



> +
> +    if (qnio_ctx != NULL) {
> +        trace_vxhs_setup_qnio(qnio_ctx);
> +    } else {
> +        trace_vxhs_setup_qnio_nwerror('.');
> +    }
> +
> +    return qnio_ctx;
> +}
> +
> +static QemuOptsList runtime_opts = {
> +    .name = "vxhs",
> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> +    .desc = {
> +        {
> +            .name = VXHS_OPT_FILENAME,
> +            .type = QEMU_OPT_STRING,
> +            .help = "URI to the Veritas HyperScale image",
> +        },
> +        {
> +            .name = VXHS_OPT_VDISK_ID,
> +            .type = QEMU_OPT_STRING,
> +            .help = "UUID of the VxHS vdisk",
> +        },
> +        { /* end of list */ }
> +    },
> +};
> +
> +static QemuOptsList runtime_tcp_opts = {
> +    .name = "vxhs_tcp",
> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head),
> +    .desc = {
> +        {
> +            .name = VXHS_OPT_HOST,
> +            .type = QEMU_OPT_STRING,
> +            .help = "host address (ipv4 addresses)",
> +        },
> +        {
> +            .name = VXHS_OPT_PORT,
> +            .type = QEMU_OPT_NUMBER,
> +            .help = "port number on which VxHSD is listening (default 9999)",
> +            .def_value_str = "9999"
> +        },
> +        {
> +            .name = "to",
> +            .type = QEMU_OPT_NUMBER,
> +            .help = "max port number, not supported by VxHS",
> +        },
> +        {
> +            .name = "ipv4",
> +            .type = QEMU_OPT_BOOL,
> +            .help = "ipv4 bool value, not supported by VxHS",
> +        },
> +        {
> +            .name = "ipv6",
> +            .type = QEMU_OPT_BOOL,
> +            .help = "ipv6 bool value, not supported by VxHS",
> +        },
> +        { /* end of list */ }
> +    },
> +};
> +
> +/*
> + * Parse the incoming URI and populate *options with all the host(s)
> + * information. Host at index 0 is local storage agent.
> + * Remaining are the reflection target storage agents. The local storage agent
> + * ip is the efficient internal address in the uri, e.g. 192.168.0.2.
> + * The local storage agent address is stored at index 0. The reflection target
> + * ips, are the E-W data network addresses of the reflection node agents, also
> + * extracted from the uri.
> + */
> +static int vxhs_parse_uri(const char *filename, QDict *options)
> +{
> +    gchar **target_list;
> +    URI *uri = NULL;
> +    char *hoststr, *portstr;
> +    char *port;
> +    int ret = 0;
> +    int i = 0;
> +
> +    trace_vxhs_parse_uri_filename(filename);
> +    target_list = g_strsplit(filename, "%7D", 0);
> +    assert(target_list != NULL && target_list[0] != NULL);
> +
> +    for (i = 0; target_list[i] != NULL && *target_list[i]; i++) {
> +        uri = uri_parse(target_list[i]);
> +        if (!uri || !uri->server) {
> +            uri_free(uri);
> +            ret = -EINVAL;
> +            break;
> +        }
> +
> +        hoststr = g_strdup_printf(VXHS_OPT_SERVER"%d.host", i);
> +        qdict_put(options, hoststr, qstring_from_str(uri->server));
> +
> +        portstr = g_strdup_printf(VXHS_OPT_SERVER"%d.port", i);
> +        if (uri->port) {
> +            port = g_strdup_printf("%d", uri->port);
> +            qdict_put(options, portstr, qstring_from_str(port));
> +            g_free(port);
> +        }
> +
> +        if (i == 0 && (strstr(uri->path, "vxhs") == NULL)) {
> +            qdict_put(options, "vdisk_id", qstring_from_str(uri->path));
> +        }
> +
> +        trace_vxhs_parse_uri_hostinfo(i + 1, uri->server, uri->port);
> +        g_free(hoststr);
> +        g_free(portstr);
> +        uri_free(uri);
> +    }
> +
> +    g_strfreev(target_list);
> +    return ret;
> +}
> +
> +static void vxhs_parse_filename(const char *filename, QDict *options,
> +                               Error **errp)
> +{
> +    if (qdict_haskey(options, "host")
> +        || qdict_haskey(options, "port")
> +        || qdict_haskey(options, "path"))
> +    {
> +        error_setg(errp, "host/port/path and a file name may not be specified "
> +                         "at the same time");
> +        return;
> +    }
> +
> +    if (strstr(filename, "://")) {
> +        int ret = vxhs_parse_uri(filename, options);
> +        if (ret < 0) {
> +            error_setg(errp, "Invalid URI. URI should be of the form "
> +                       "  vxhs://<host_ip>:<port>/{<vdisk_id>}");
> +        }
> +    }
> +}
> +
> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
> +                              int *cfd, int *rfd, Error **errp)
> +{
> +    QDict *backing_options = NULL;
> +    const char *vxhs_filename;
> +    char *of_vsa_addr = NULL;
> +    Error *local_err = NULL;
> +    const char *ptr = NULL;

ptr does not need to be initialized to NULL here.  I also recommend using a
more descriptive name like 'vdisk_id_opt' over something generic like 'ptr'.
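For example (just to illustrate the rename; 'vdisk_id_opt' is only a
suggestion, pick whatever reads best):

    const char *vdisk_id_opt;

    vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
    if (!vdisk_id_opt) {
        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
        ret = -EINVAL;
        goto out;
    }
    s->vdisk_guid = g_strdup(vdisk_id_opt);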

> +    char *file_name = NULL;
> +    size_t num_servers;
> +    char *str = NULL;
> +    QemuOpts *opts;
> +    int ret = 0;
> +    int i;
> +
> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
> +    qemu_opts_absorb_qdict(opts, options, &local_err);
> +    if (local_err) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
> +    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
> +    if (vxhs_filename) {
> +        trace_vxhs_qemu_init_filename(vxhs_filename);
> +    }
> +
> +    ptr = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
> +    if (!ptr) {
> +        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    s->vdisk_guid = g_strdup(ptr);

This is never freed anywhere.
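Freeing it in vxhs_close() would be the obvious spot, e.g. (sketch only,
assuming nothing else keeps a reference to it):

    g_free(s->vdisk_guid);
    s->vdisk_guid = NULL;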

> +    trace_vxhs_qemu_init_vdisk(ptr);
> +
> +    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
> +    if (num_servers < 1) {
> +        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    trace_vxhs_qemu_init_numservers(num_servers);
> +
> +    qemu_opts_del(opts);
> +
> +    for (i = 0; i < num_servers; i++) {

s->vdisk_hostinfo[] is an array with MAX_HOSTS (4) as a size.  There is no
bounds checking on num_servers, which is an issue: we can easily blow
past this.

In general, for any parameter supplied by a user, or read from a file /
protocol / etc, we need to do appropriate bounds checking to prevent
potential CVEs.
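A simple check before the loop would do, something like (sketch only; the
exact error wording is up to you):

    if (num_servers > MAX_HOSTS) {
        error_setg(&local_err, "Maximum %d servers allowed, %zu specified",
                   MAX_HOSTS, num_servers);
        ret = -EINVAL;
        goto out;
    }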

> +        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
> +        qdict_extract_subqdict(options, &backing_options, str);
> +
> +        /* create opts info from runtime_tcp_opts list */
> +        opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
> +        qemu_opts_absorb_qdict(opts, backing_options, &local_err);
> +        if (local_err) {
> +            ret = -EINVAL;
> +            qemu_opts_del(opts);
> +            goto out;
> +        }
> +
> +        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(opts,
> +                                                            VXHS_OPT_HOST));

This is never freed anywhere.

> +        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(opts,
> +                                                                 VXHS_OPT_PORT),
> +                                                    NULL, 0);
> +
> +        s->vdisk_hostinfo[i].qnio_cfd = -1;
> +        s->vdisk_hostinfo[i].vdisk_rfd = -1;
> +        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
> +                             s->vdisk_hostinfo[i].port);
> +        qemu_opts_del(opts);
> +    }
> +
> +    s->vdisk_nhosts = i;
> +    s->vdisk_cur_host_idx = 0;
> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);
> +
> +    /*
> +     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
> +     */
> +    if (global_qnio_ctx == NULL) {
> +        global_qnio_ctx = vxhs_setup_qnio();
> +        if (global_qnio_ctx == NULL) {
> +            error_setg(&local_err, "Failed vxhs_setup_qnio");
> +            ret = -EINVAL;
> +            goto out;
> +        }
> +    }
> +
> +    *cfd = qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
> +    if (*cfd < 0) {
> +        error_setg(&local_err, "Failed qemu_open_iio_conn");
> +        ret = -EIO;
> +        goto out;
> +    }
> +    *rfd = qemu_iio_devopen(global_qnio_ctx, *cfd, file_name, 0);
> +    if (*rfd < 0) {
> +        error_setg(&local_err, "Failed qemu_iio_devopen");
> +        ret = -EIO;
> +        goto out;
> +    }
> +
> +out:
> +    g_free(file_name);
> +    g_free(of_vsa_addr);
> +    if (str) {
> +        qdict_del(backing_options, str);
> +        g_free(str);
> +    }
> +
> +    if (ret < 0) {
> +        errno = -ret;
> +    }
> +    error_propagate(errp, local_err);
> +    return ret;
> +}
> +
> +int vxhs_open(BlockDriverState *bs, QDict *options,
> +              int bdrv_flags, Error **errp)
> +{
> +    BDRVVXHSState *s = bs->opaque;
> +    AioContext *aio_context;
> +    int qemu_qnio_cfd = -1;
> +    int qemu_rfd = -1;
> +    int ret = 0;
> +
> +    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
> +    if (ret < 0) {
> +        trace_vxhs_open_fail(ret);
> +        return ret;
> +    }
> +
> +    s->qnio_ctx = global_qnio_ctx;
> +    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
> +    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
> +    s->vdisk_size = 0;
> +    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
> +
> +    /*
> +     * Create a pipe for communicating between two threads in different
> +     * context. Set handler for read event, which gets triggered when
> +     * IO completion is done by non-QEMU context.
> +     */
> +    ret = qemu_pipe(s->fds);
> +    if (ret < 0) {
> +        trace_vxhs_open_epipe('.');
> +        ret = -errno;
> +        goto out;
> +    }
> +    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
> +
> +    aio_context = bdrv_get_aio_context(bs);
> +    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
> +                       false, vxhs_aio_event_reader, NULL, s);
> +
> +    /*
> +     * Allocate/Initialize the spin-locks.
> +     *
> +     * NOTE:
> +     *      Since spin lock is being allocated
> +     *      dynamically hence moving acb struct
> +     *      specific lock to BDRVVXHSState
> +     *      struct. The reason being,
> +     *      we don't want the overhead of spin
> +     *      lock being dynamically allocated and
> +     *      freed for every AIO.
> +     */
> +    s->vdisk_lock = VXHS_SPIN_LOCK_ALLOC;
> +    s->vdisk_acb_lock = VXHS_SPIN_LOCK_ALLOC;
> +
> +    return 0;
> +
> +out:
> +    if (s->vdisk_hostinfo[0].vdisk_rfd >= 0) {
> +        qemu_iio_devclose(s->qnio_ctx, 0,
> +                                 s->vdisk_hostinfo[0].vdisk_rfd);
> +    }
> +    /* never close qnio_cfd */

Surely we want to be able to close it at some point... or does the qnio
library take care of that cleanup somewhere/sometime?


> +    trace_vxhs_open_fail(ret);
> +    return ret;
> +}
> +
> +static const AIOCBInfo vxhs_aiocb_info = {
> +    .aiocb_size = sizeof(VXHSAIOCB)
> +};
> +
> +/*
> + * This is called in QNIO thread context when IO done
> + * on IO Manager and QNIO client received the data or
> + * ACK. It notify another event handler thread running in QEMU context
> + * by writing on the pipe
> + */
> +void vxhs_finish_aiocb(ssize_t ret, void *arg)
> +{
> +    VXHSAIOCB *acb = arg;
> +    BlockDriverState *bs = acb->common.bs;
> +    BDRVVXHSState *s = bs->opaque;
> +    int rv;
> +
> +    acb->ret = ret;
> +    rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
> +    if (rv != sizeof(acb)) {
> +        error_report("VXHS AIO completion failed: %s",
> +                     strerror(errno));
> +        abort();
> +    }
> +}

Another unused function.

> +
> +/*
> + * This allocates QEMU-VXHS callback for each IO
> + * and is passed to QNIO. When QNIO completes the work,
> + * it will be passed back through the callback.
> + */
> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
> +                                int64_t sector_num, QEMUIOVector *qiov,
> +                                int nb_sectors,
> +                                BlockCompletionFunc *cb,
> +                                void *opaque, int iodir)
> +{
> +    VXHSAIOCB *acb = NULL;
> +    BDRVVXHSState *s = bs->opaque;
> +    size_t size;
> +    uint64_t offset;
> +    int iio_flags = 0;
> +    int ret = 0;
> +
> +    offset = sector_num * BDRV_SECTOR_SIZE;
> +    size = nb_sectors * BDRV_SECTOR_SIZE;
> +
> +    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
> +    /*
> +     * Setup or initialize VXHSAIOCB.
> +     * Every single field should be initialized since
> +     * acb will be picked up from the slab without
> +     * initializing with zero.
> +     */
> +    acb->io_offset = offset;
> +    acb->size = size;
> +    acb->ret = 0;
> +    acb->flags = 0;
> +    acb->aio_done = VXHS_IO_INPROGRESS;
> +    acb->segments = 0;
> +    acb->buffer = 0;
> +    acb->qiov = qiov;
> +    acb->direction = iodir;
> +
> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> +    if (OF_VDISK_FAILED(s)) {
> +        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +        goto errout;
> +    }
> +    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
> +        s->vdisk_aio_retry_qd++;
> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
> +        goto out;
> +    }
> +    s->vdisk_aio_count++;
> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +
> +    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
> +
> +    switch (iodir) {
> +    case VDISK_AIO_WRITE:
> +            vxhs_inc_acb_segment_count(acb, 1);
> +            ret = qemu_iio_writev(s->qnio_ctx,
> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
> +            break;
> +    case VDISK_AIO_READ:
> +            vxhs_inc_acb_segment_count(acb, 1);
> +            ret = qemu_iio_readv(s->qnio_ctx,
> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
> +            break;
> +    default:
> +            trace_vxhs_aio_rw_invalid(iodir);
> +            goto errout;
> +    }
> +
> +    if (ret != 0) {
> +        trace_vxhs_aio_rw_ioerr(
> +                  s->vdisk_guid, iodir, size, offset,
> +                  acb, acb->segments, ret, errno);
> +        /*
> +         * Don't retry I/Os against vDisk having no
> +         * redundancy or stateful storage on compute
> +         *
> +         * TODO: Revisit this code path to see if any
> +         *       particular error needs to be handled.
> +         *       At this moment failing the I/O.
> +         */
> +        VXHS_SPIN_LOCK(s->vdisk_lock);
> +        if (s->vdisk_nhosts == 1) {
> +            trace_vxhs_aio_rw_iofail(s->vdisk_guid);
> +            s->vdisk_aio_count--;
> +            vxhs_dec_acb_segment_count(acb, 1);
> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +            goto errout;
> +        }
> +        if (OF_VDISK_FAILED(s)) {
> +            trace_vxhs_aio_rw_devfail(
> +                      s->vdisk_guid, iodir, size, offset);
> +            s->vdisk_aio_count--;
> +            vxhs_dec_acb_segment_count(acb, 1);
> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +            goto errout;
> +        }
> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
> +            /*
> +             * Queue all incoming io requests after failover starts.
> +             * Number of requests that can arrive is limited by io queue depth
> +             * so an app blasting independent ios will not exhaust memory.
> +             */
> +            QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
> +            s->vdisk_aio_retry_qd++;
> +            OF_AIOCB_FLAGS_SET_QUEUED(acb);
> +            s->vdisk_aio_count--;
> +            vxhs_dec_acb_segment_count(acb, 1);
> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +            trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 2);
> +            goto out;
> +        }
> +        OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
> +        s->vdisk_aio_retry_qd++;
> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
> +        vxhs_dec_acb_segment_count(acb, 1);
> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 3);
> +        /*
> +         * Start I/O failover if there is no active
> +         * AIO within vxhs block driver.
> +         */
> +        if (--s->vdisk_aio_count == 0) {
> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +            /*
> +             * Start IO failover
> +             */
> +            vxhs_failover_io(s);
> +            goto out;
> +        }
> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +    }
> +
> +out:
> +    return &acb->common;
> +
> +errout:
> +    qemu_aio_unref(acb);
> +    return NULL;
> +}
> +
> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
> +                                   int64_t sector_num, QEMUIOVector *qiov,
> +                                   int nb_sectors,
> +                                   BlockCompletionFunc *cb, void *opaque)
> +{
> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
> +                         cb, opaque, VDISK_AIO_READ);
> +}
> +
> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
> +                                    int64_t sector_num, QEMUIOVector *qiov,
> +                                    int nb_sectors,
> +                                    BlockCompletionFunc *cb, void *opaque)
> +{
> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
> +                         cb, opaque, VDISK_AIO_WRITE);
> +}
> +
> +/*
> + * This is called by QEMU when a flush gets triggered from within
> + * a guest at the block layer, either for IDE or SCSI disks.
> + */
> +int vxhs_co_flush(BlockDriverState *bs)
> +{
> +    BDRVVXHSState *s = bs->opaque;
> +    uint64_t size = 0;
> +    int ret = 0;
> +
> +    ret = qemu_iio_ioctl(s->qnio_ctx,
> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> +            VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
> +
> +    if (ret < 0) {
> +        /*
> +         * Currently not handling the flush ioctl
> +         * failure because of network connection
> +         * disconnect. Since all the writes are
> +         * commited into persistent storage hence
> +         * this flush call is noop and we can safely
> +         * return success status to the caller.

I'm not sure I understand here.  Are you saying the qemu_iio_ioctl() call
above is a noop?

> +         *
> +         * If any write failure occurs for inflight
> +         * write AIO because of network disconnect
> +         * then anyway IO failover will be triggered.
> +         */
> +        trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
> +        ret = 0;
> +    }
> +
> +    return ret;
> +}
> +
> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s)
> +{
> +    void *ctx = NULL;
> +    int flags = 0;
> +    unsigned long vdisk_size = 0;
> +    int ret = 0;
> +
> +    ret = qemu_iio_ioctl(s->qnio_ctx,
> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> +            VDISK_STAT, &vdisk_size, ctx, flags);
> +
> +    if (ret < 0) {
> +        trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno);
> +        return 0;
> +    }
> +
> +    trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size);
> +    return vdisk_size;
> +}
> +
> +/*
> + * Returns the size of vDisk in bytes. This is required
> + * by QEMU block upper block layer so that it is visible
> + * to guest.
> + */
> +int64_t vxhs_getlength(BlockDriverState *bs)
> +{
> +    BDRVVXHSState *s = bs->opaque;
> +    unsigned long vdisk_size = 0;
> +
> +    if (s->vdisk_size > 0) {
> +        vdisk_size = s->vdisk_size;

s->vdisk_size is int64_t, vdisk_size is unsigned long.  On 32-bit platforms,
I believe unsigned long is only 32 bits wide, so this assignment is
problematic.

What is the maximum disk size for vxhs?

(this comment applies to all the vdisk_size usage, but I'll only mention it
here)
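A sketch of what I mean, assuming the stat ioctl can be made to fill a
64-bit value:

    int64_t vdisk_size = 0;    /* fixed-width, matches s->vdisk_size */

    if (s->vdisk_size > 0) {
        vdisk_size = s->vdisk_size;
    } else {
        /* vxhs_get_vdisk_stat() would need to return int64_t as well */
        vdisk_size = vxhs_get_vdisk_stat(s);
        if (vdisk_size > 0) {
            s->vdisk_size = vdisk_size;
        }
    }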

> +    } else {
> +        /*
> +         * Fetch the vDisk size using stat ioctl
> +         */
> +        vdisk_size = vxhs_get_vdisk_stat(s);
> +        if (vdisk_size > 0) {
> +            s->vdisk_size = vdisk_size;
> +        }
> +    }
> +
> +    if (vdisk_size > 0) {
> +        return (int64_t)vdisk_size; /* return size in bytes */
> +    } else {
> +        return -EIO;
> +    }
> +}
> +
> +/*
> + * Returns actual blocks allocated for the vDisk.
> + * This is required by qemu-img utility.
> + */
> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs)
> +{
> +    BDRVVXHSState *s = bs->opaque;
> +    unsigned long vdisk_size = 0;
> +
> +    if (s->vdisk_size > 0) {
> +        vdisk_size = s->vdisk_size;
> +    } else {
> +        /*
> +         * TODO:
> +         * Once HyperScale storage-virtualizer provides
> +         * actual physical allocation of blocks then
> +         * fetch that information and return back to the
> +         * caller but for now just get the full size.
> +         */
> +        vdisk_size = vxhs_get_vdisk_stat(s);
> +        if (vdisk_size > 0) {
> +            s->vdisk_size = vdisk_size;
> +        }
> +    }
> +
> +    if (vdisk_size > 0) {
> +        return (int64_t)vdisk_size; /* return size in bytes */
> +    } else {
> +        return -EIO;
> +    }
> +}
> +
> +void vxhs_close(BlockDriverState *bs)
> +{
> +    BDRVVXHSState *s = bs->opaque;
> +
> +    close(s->fds[VDISK_FD_READ]);
> +    close(s->fds[VDISK_FD_WRITE]);
> +
> +    /*
> +     * never close channel - not ref counted, will
> +     * close for all vdisks
> +     */
> +    if (s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd >= 0) {
> +        qemu_iio_devclose(s->qnio_ctx, 0,
> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd);
> +    }
> +    if (s->vdisk_lock) {
> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_lock);
> +        s->vdisk_lock = NULL;
> +    }
> +    if (s->vdisk_acb_lock) {
> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_acb_lock);
> +        s->vdisk_acb_lock = NULL;
> +    }
> +
> +    /*
> +     * TODO: Verify that all the resources were relinguished.
> +     */

Which resources?  Allocated here in the driver, or stuff in libqnio?

> +}
> +
> +/*
> + * If errors are consistent with storage agent failure:
> + *  - Try to reconnect in case error is transient or storage agent restarted.
> + *  - Currently failover is being triggered on per vDisk basis. There is
> + *    a scope of further optimization where failover can be global (per VM).
> + *  - In case of network (storage agent) failure, for all the vDisks, having
> + *    no redundancy, I/Os will be failed without attempting for I/O failover
> + *    because of stateless nature of vDisk.
> + *  - If local or source storage agent is down then send an ioctl to remote
> + *    storage agent to check if remote storage agent in a state to accept
> + *    application I/Os.
> + *  - Once remote storage agent is ready to accept I/O, start I/O shipping.
> + *  - If I/Os cannot be serviced then vDisk will be marked failed so that
> + *    new incoming I/Os are returned with failure immediately.
> + *  - If vDisk I/O failover is in progress then all new/inflight I/Os will
> + *    queued and will be restarted or failed based on failover operation
> + *    is successful or not.
> + *  - I/O failover can be started either in I/O forward or I/O backward
> + *    path.
> + *  - I/O failover will be started as soon as all the pending acb(s)
> + *    are queued and there is no pending I/O count.
> + *  - If I/O failover couldn't be completed within QNIO_CONNECT_TIMOUT_SECS
> + *    then vDisk will be marked failed and all I/Os will be completed with
> + *    error.
> + */
> +
> +int vxhs_switch_storage_agent(BDRVVXHSState *s)
> +{
> +    int res = 0;
> +    int flags = (IIO_FLAG_ASYNC | IIO_FLAG_DONE);
> +
> +    trace_vxhs_switch_storage_agent(
> +              s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
> +              s->vdisk_guid);
> +
> +    res = vxhs_reopen_vdisk(s, s->vdisk_ask_failover_idx);
> +    if (res == 0) {
> +        res = qemu_iio_ioctl(s->qnio_ctx,
> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].vdisk_rfd,
> +                  VDISK_CHECK_IO_FAILOVER_READY, NULL, s, flags);
> +    } else {
> +        trace_vxhs_switch_storage_agent_failed(
> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
> +                  s->vdisk_guid, res, errno);
> +        /*
> +         * TODO: calling vxhs_failover_ioctl_cb from here ties up the qnio epoll
> +         * loop if qemu_iio_ioctl fails synchronously (-1) for all hosts in io
> +         * target list.
> +         */
> +
> +        /* try next host */
> +        vxhs_failover_ioctl_cb(res, s);
> +    }
> +    return res;
> +}
> +
> +void vxhs_failover_ioctl_cb(int res, void *ctx)
> +{
> +    BDRVVXHSState *s = ctx;
> +
> +    if (res == 0) {
> +        /* found failover target */
> +        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
> +        s->vdisk_ask_failover_idx = 0;
> +        trace_vxhs_failover_ioctl_cb(
> +                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
> +                   s->vdisk_guid);
> +        VXHS_SPIN_LOCK(s->vdisk_lock);
> +        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +        vxhs_handle_queued_ios(s);
> +    } else {
> +        /* keep looking */
> +        trace_vxhs_failover_ioctl_cb_retry(s->vdisk_guid);
> +        s->vdisk_ask_failover_idx++;
> +        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {

This function, vxhs_failover_ioctl_cb(), is called from the external
library callback, in addition to QEMU itself (i.e., vxhs_aio_rw).  This
makes me concerned that we may have (or be able to induce) an unsafe race
condition when incrementing the vdisk_ask_failover_idx index.
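If both contexts can reach this path at the same time, the index update
probably needs to happen under a lock, roughly (sketch only; assumes
vdisk_lock may be taken from the qnio callback context, and the sleep below
would have to stay outside the lock):

    VXHS_SPIN_LOCK(s->vdisk_lock);
    s->vdisk_ask_failover_idx++;
    if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
        s->vdisk_ask_failover_idx = 0;
    }
    VXHS_SPIN_UNLOCK(s->vdisk_lock);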

> +            /* pause and cycle through list again */
> +            sleep(QNIO_CONNECT_RETRY_SECS);
> +            s->vdisk_ask_failover_idx = 0;
> +        }
> +        res = vxhs_switch_storage_agent(s);
> +    }
> +}
> +
> +int vxhs_failover_io(BDRVVXHSState *s)
> +{
> +    int res = 0;
> +
> +    trace_vxhs_failover_io(s->vdisk_guid);
> +
> +    s->vdisk_ask_failover_idx = 0;
> +    res = vxhs_switch_storage_agent(s);
> +
> +    return res;
> +}
> +
> +/*
> + * Try to reopen the vDisk on one of the available hosts
> + * If vDisk reopen is successful on any of the host then
> + * check if that node is ready to accept I/O.
> + */
> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int index)
> +{
> +    char *of_vsa_addr = NULL;
> +    char *file_name = NULL;
> +    int  res = 0;
> +
> +    /*
> +     * Don't close the channel if it was opened
> +     * before successfully. It will be handled
> +     * within iio* api if the same channel open
> +     * fd is reused.
> +     *
> +     * close stale vdisk device remote fd since
> +     * it is invalid after channel disconnect.
> +     */
> +    if (s->vdisk_hostinfo[index].vdisk_rfd >= 0) {
> +        qemu_iio_devclose(s->qnio_ctx, 0,
> +                                 s->vdisk_hostinfo[index].vdisk_rfd);
> +        s->vdisk_hostinfo[index].vdisk_rfd = -1;
> +    }
> +
> +    /*
> +     * build storage agent address and vdisk device name strings
> +     */
> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
> +             s->vdisk_hostinfo[index].hostip, s->vdisk_hostinfo[index].port);
> +    /*
> +     * open qnio channel to storage agent if not opened before.
> +     */
> +    if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
> +        s->vdisk_hostinfo[index].qnio_cfd =
> +                qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
> +        if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
> +            trace_vxhs_reopen_vdisk(s->vdisk_hostinfo[index].hostip);
> +            res = ENODEV;
> +            goto out;
> +        }
> +    }
> +    /*
> +     * open vdisk device
> +     */
> +    s->vdisk_hostinfo[index].vdisk_rfd =
> +            qemu_iio_devopen(global_qnio_ctx,
> +             s->vdisk_hostinfo[index].qnio_cfd, file_name, 0);
> +    if (s->vdisk_hostinfo[index].vdisk_rfd < 0) {
> +        trace_vxhs_reopen_vdisk_openfail(file_name);
> +        res = EIO;
> +        goto out;
> +    }
> +
> +out:
> +    g_free(of_vsa_addr);
> +    g_free(file_name);
> +    return res;
> +}
> +
> +int vxhs_handle_queued_ios(BDRVVXHSState *s)
> +{
> +    VXHSAIOCB *acb = NULL;
> +    int res = 0;
> +
> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> +    while ((acb = QSIMPLEQ_FIRST(&s->vdisk_aio_retryq)) != NULL) {
> +        /*
> +         * Before we process the acb, check whether I/O failover
> +         * started again due to failback or cascading failure.
> +         */
> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +            goto out;
> +        }
> +        QSIMPLEQ_REMOVE_HEAD(&s->vdisk_aio_retryq, retry_entry);
> +        s->vdisk_aio_retry_qd--;
> +        OF_AIOCB_FLAGS_RESET_QUEUED(acb);
> +        if (OF_VDISK_FAILED(s)) {
> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +            vxhs_fail_aio(acb, EIO);
> +            VXHS_SPIN_LOCK(s->vdisk_lock);
> +        } else {
> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +            res = vxhs_restart_aio(acb);
> +            trace_vxhs_handle_queued_ios(acb, res);
> +            VXHS_SPIN_LOCK(s->vdisk_lock);
> +            if (res) {
> +                QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
> +                                     acb, retry_entry);
> +                OF_AIOCB_FLAGS_SET_QUEUED(acb);
> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +                goto out;
> +            }
> +        }
> +    }
> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> +out:
> +    return res;
> +}
> +
> +int vxhs_restart_aio(VXHSAIOCB *acb)
> +{
> +    BDRVVXHSState *s = NULL;
> +    int iio_flags = 0;
> +    int res = 0;
> +
> +    s = acb->common.bs->opaque;
> +
> +    if (acb->direction == VDISK_AIO_WRITE) {
> +        vxhs_inc_vdisk_iocount(s, 1);
> +        vxhs_inc_acb_segment_count(acb, 1);
> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
> +        res = qemu_iio_writev(s->qnio_ctx,
> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> +                acb->qiov->iov, acb->qiov->niov,
> +                acb->io_offset, (void *)acb, iio_flags);
> +    }
> +
> +    if (acb->direction == VDISK_AIO_READ) {
> +        vxhs_inc_vdisk_iocount(s, 1);
> +        vxhs_inc_acb_segment_count(acb, 1);
> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
> +        res = qemu_iio_readv(s->qnio_ctx,
> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> +                acb->qiov->iov, acb->qiov->niov,
> +                acb->io_offset, (void *)acb, iio_flags);
> +    }
> +
> +    if (res != 0) {
> +        vxhs_dec_vdisk_iocount(s, 1);
> +        vxhs_dec_acb_segment_count(acb, 1);
> +        trace_vxhs_restart_aio(acb->direction, res, errno);
> +    }
> +
> +    return res;
> +}
> +
> +void vxhs_fail_aio(VXHSAIOCB *acb, int err)
> +{
> +    BDRVVXHSState *s = NULL;
> +    int segcount = 0;
> +    int rv = 0;
> +
> +    s = acb->common.bs->opaque;
> +
> +    trace_vxhs_fail_aio(s->vdisk_guid, acb);
> +    if (!acb->ret) {
> +        acb->ret = err;
> +    }
> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> +    segcount = acb->segments;
> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> +    if (segcount == 0) {
> +        /*
> +         * Complete the io request
> +         */
> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
> +        if (rv != sizeof(acb)) {
> +            error_report("VXHS AIO completion failed: %s",
> +                         strerror(errno));
> +            abort();
> +        }
> +    }
> +}
> +
> +static void vxhs_detach_aio_context(BlockDriverState *bs)
> +{
> +    BDRVVXHSState *s = bs->opaque;
> +
> +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
> +                       false, NULL, NULL, NULL);
> +
> +}
> +
> +static void vxhs_attach_aio_context(BlockDriverState *bs,
> +                                   AioContext *new_context)
> +{
> +    BDRVVXHSState *s = bs->opaque;
> +
> +    aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ],
> +                       false, vxhs_aio_event_reader, NULL, s);
> +}
> +
> +static BlockDriver bdrv_vxhs = {
> +    .format_name                  = "vxhs",
> +    .protocol_name                = "vxhs",
> +    .instance_size                = sizeof(BDRVVXHSState),
> +    .bdrv_file_open               = vxhs_open,
> +    .bdrv_parse_filename          = vxhs_parse_filename,
> +    .bdrv_close                   = vxhs_close,
> +    .bdrv_getlength               = vxhs_getlength,
> +    .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks,
> +    .bdrv_aio_readv               = vxhs_aio_readv,
> +    .bdrv_aio_writev              = vxhs_aio_writev,
> +    .bdrv_co_flush_to_disk        = vxhs_co_flush,
> +    .bdrv_detach_aio_context      = vxhs_detach_aio_context,
> +    .bdrv_attach_aio_context      = vxhs_attach_aio_context,
> +};
> +
> +void bdrv_vxhs_init(void)
> +{
> +    trace_vxhs_bdrv_init('.');
> +    bdrv_register(&bdrv_vxhs);
> +}
> +
> +block_init(bdrv_vxhs_init);
> diff --git a/block/vxhs.h b/block/vxhs.h
> new file mode 100644
> index 0000000..38e0165
> --- /dev/null
> +++ b/block/vxhs.h
> @@ -0,0 +1,237 @@
> +/*
> + * QEMU Block driver for Veritas HyperScale (VxHS)
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef VXHSD_H
> +#define VXHSD_H
> +
> +#include <gmodule.h>
> +#include <inttypes.h>
> +#include <pthread.h>
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu/error-report.h"
> +#include "block/block_int.h"
> +#include "qemu/uri.h"
> +#include "qemu/queue.h"
> +
> +#define OF_GUID_STR_LEN             40
> +#define OF_GUID_STR_SZ              (OF_GUID_STR_LEN + 1)
> +#define QNIO_CONNECT_RETRY_SECS     5
> +#define QNIO_CONNECT_TIMOUT_SECS    120
> +
> +/* constants from io_qnio.h */
> +#define IIO_REASON_DONE     0x00000004
> +#define IIO_REASON_EVENT    0x00000008
> +#define IIO_REASON_HUP      0x00000010
> +
> +/*
> + * IO specific flags
> + */
> +#define IIO_FLAG_ASYNC      0x00000001
> +#define IIO_FLAG_DONE       0x00000010
> +#define IIO_FLAG_SYNC       0
> +
> +/* constants from error.h */
> +#define VXERROR_RETRY_ON_SOURCE     44
> +#define VXERROR_HUP                 901
> +#define VXERROR_CHANNEL_HUP         903
> +
> +/* constants from iomgr.h and opcode.h */
> +#define IRP_READ_REQUEST                    0x1FFF
> +#define IRP_WRITE_REQUEST                   0x2FFF
> +#define IRP_VDISK_CHECK_IO_FAILOVER_READY   2020
> +
> +/* Lock specific macros */
> +#define VXHS_SPIN_LOCK_ALLOC                  \
> +    (qemu_ck_initialize_lock())
> +#define VXHS_SPIN_LOCK(lock)                  \
> +    (qemu_ck_spin_lock(lock))
> +#define VXHS_SPIN_UNLOCK(lock)                \
> +    (qemu_ck_spin_unlock(lock))
> +#define VXHS_SPIN_LOCK_DESTROY(lock)          \
> +    (qemu_ck_destroy_lock(lock))

What's the value of these macros, as opposed to just calling the underlying
functions directly in the code?
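
For comparison, a minimal sketch of the direct calls (qemu_ck_spin_lock()
and qemu_ck_spin_unlock() being the libqnio functions the macros expand to):

    VXHS_SPIN_LOCK(s->vdisk_acb_lock);       /* with the macro */
    qemu_ck_spin_lock(s->vdisk_acb_lock);    /* direct call    */

    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);     /* with the macro */
    qemu_ck_spin_unlock(s->vdisk_acb_lock);  /* direct call    */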

> +
> +typedef enum {
> +    VXHS_IO_INPROGRESS,
> +    VXHS_IO_COMPLETED,
> +    VXHS_IO_ERROR
> +} VXHSIOState;
> +
> +
> +typedef void (*qnio_callback_t)(ssize_t retval, void *arg);
> +
> +#define VDISK_FD_READ 0
> +#define VDISK_FD_WRITE 1
> +
> +#define QNIO_VDISK_NONE        0x00
> +#define QNIO_VDISK_CREATE      0x01
> +
> +/* max IO size supported by QEMU NIO lib */
> +#define QNIO_MAX_IO_SIZE       4194304
> +
> +#define MAX_HOSTS               4
> +
> +/*
> + * Opcodes for making IOCTL on QEMU NIO library
> + */
> +#define BASE_OPCODE_SHARED     1000
> +#define BASE_OPCODE_DAL        2000
> +#define IRP_VDISK_STAT                  (BASE_OPCODE_SHARED + 5)
> +#define IRP_VDISK_GET_GEOMETRY          (BASE_OPCODE_DAL + 17)
> +#define IRP_VDISK_READ_PARTITION        (BASE_OPCODE_DAL + 18)
> +#define IRP_VDISK_FLUSH                 (BASE_OPCODE_DAL + 19)
> +
> +/*
> + * BDRVVXHSState specific flags
> + */
> +#define OF_VDISK_FLAGS_STATE_ACTIVE             0x0000000000000001
> +#define OF_VDISK_FLAGS_STATE_FAILED             0x0000000000000002
> +#define OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS   0x0000000000000004
> +
> +#define OF_VDISK_ACTIVE(s)                                              \
> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_ACTIVE)
> +#define OF_VDISK_SET_ACTIVE(s)                                          \
> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_ACTIVE)
> +#define OF_VDISK_RESET_ACTIVE(s)                                        \
> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_ACTIVE)
> +
> +#define OF_VDISK_FAILED(s)                                              \
> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_FAILED)
> +#define OF_VDISK_SET_FAILED(s)                                          \
> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_FAILED)
> +#define OF_VDISK_RESET_FAILED(s)                                        \
> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_FAILED)
> +
> +#define OF_VDISK_IOFAILOVER_IN_PROGRESS(s)                              \
> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
> +#define OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s)                          \
> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
> +#define OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s)                        \
> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
> +
> +/*
> + * VXHSAIOCB specific flags
> + */
> +#define OF_ACB_QUEUED       0x00000001
> +
> +#define OF_AIOCB_FLAGS_QUEUED(a)            \
> +        ((a)->flags & OF_ACB_QUEUED)
> +#define OF_AIOCB_FLAGS_SET_QUEUED(a)        \
> +        ((a)->flags |= OF_ACB_QUEUED)
> +#define OF_AIOCB_FLAGS_RESET_QUEUED(a)      \
> +        ((a)->flags &= ~OF_ACB_QUEUED)
> +

Many helper macros here... IMO, they decrease the readability of the code,
and my preference is to just have the underlying code without the macros.
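
For example, the open-coded equivalents of the flag helpers read much the
same (a sketch using the flag definitions above):

    /* with the macros */
    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
        OF_AIOCB_FLAGS_SET_QUEUED(acb);
    }

    /* open-coded */
    if (s->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS) {
        acb->flags |= OF_ACB_QUEUED;
    }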

> +typedef struct qemu2qnio_ctx {
> +    uint32_t            qnio_flag;
> +    uint64_t            qnio_size;
> +    char                *qnio_channel;
> +    char                *target;
> +    qnio_callback_t     qnio_cb;
> +} qemu2qnio_ctx_t;
> +
> +typedef qemu2qnio_ctx_t qnio2qemu_ctx_t;
> +
> +typedef struct LibQNIOSymbol {
> +        const char *name;
> +        gpointer *addr;
> +} LibQNIOSymbol;
> +
> +typedef void (*iio_cb_t) (uint32_t rfd, uint32_t reason, void *ctx,
> +                          void *reply);
> +
> +/*
> + * HyperScale AIO callbacks structure
> + */
> +typedef struct VXHSAIOCB {
> +    BlockAIOCB          common;
> +    size_t              ret;
> +    size_t              size;
> +    QEMUBH              *bh;
> +    int                 aio_done;
> +    int                 segments;
> +    int                 flags;
> +    size_t              io_offset;
> +    QEMUIOVector        *qiov;
> +    void                *buffer;
> +    int                 direction;  /* IO direction (r/w) */
> +    QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry;
> +} VXHSAIOCB;
> +
> +typedef struct VXHSvDiskHostsInfo {
> +    int     qnio_cfd;        /* Channel FD */
> +    int     vdisk_rfd;      /* vDisk remote FD */
> +    char    *hostip;        /* Host's IP addresses */
> +    int     port;           /* Host's port number */
> +} VXHSvDiskHostsInfo;
> +
> +/*
> + * Structure per vDisk maintained for state
> + */
> +typedef struct BDRVVXHSState {
> +    int                     fds[2];
> +    int64_t                 vdisk_size;
> +    int64_t                 vdisk_blocks;
> +    int64_t                 vdisk_flags;
> +    int                     vdisk_aio_count;
> +    int                     event_reader_pos;
> +    VXHSAIOCB               *qnio_event_acb;
> +    void                    *qnio_ctx;
> +    void                    *vdisk_lock; /* Lock to protect BDRVVXHSState */
> +    void                    *vdisk_acb_lock;  /* Protects ACB */
> +    VXHSvDiskHostsInfo      vdisk_hostinfo[MAX_HOSTS]; /* Per host info */
> +    int                     vdisk_nhosts;   /* Total number of hosts */
> +    int                     vdisk_cur_host_idx; /* IOs are being shipped to */
> +    int                     vdisk_ask_failover_idx; /*asking permsn to ship io*/
> +    QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
> +    int                     vdisk_aio_retry_qd;

Is vdisk_aio_retry_qd just used for debugging purposes?  I see it
incremented/decremented throughout the code, but no one checks it.

> +    char                    *vdisk_guid;
> +} BDRVVXHSState;
> +
> +void bdrv_vxhs_init(void);
> +void *vxhs_setup_qnio(void);
> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m);
> +void vxhs_aio_event_reader(void *opaque);
> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s);
> +int vxhs_aio_flush_cb(void *opaque);
> +void vxhs_finish_aiocb(ssize_t ret, void *arg);
> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s);
> +int vxhs_open(BlockDriverState *bs, QDict *options,
> +              int bdrv_flags, Error **errp);
> +void vxhs_close(BlockDriverState *bs);
> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, int64_t sector_num,
> +                                   QEMUIOVector *qiov, int nb_sectors,
> +                                   BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, int64_t sector_num,
> +                                    QEMUIOVector *qiov, int nb_sectors,
> +                                    BlockCompletionFunc *cb,
> +                                    void *opaque);
> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs);
> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
> +                                QEMUIOVector *qiov, int nb_sectors,
> +                                BlockCompletionFunc *cb,
> +                                void *opaque, int write);
> +int vxhs_co_flush(BlockDriverState *bs);
> +int64_t vxhs_getlength(BlockDriverState *bs);
> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t delta);
> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t delta);
> +uint32_t vxhs_get_vdisk_iocount(void *ptr);
> +void vxhs_inc_acb_segment_count(void *ptr, int count);
> +void vxhs_dec_acb_segment_count(void *ptr, int count);
> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count);
> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
> +int vxhs_failover_io(BDRVVXHSState *s);
> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int hostinfo_index);
> +int vxhs_switch_storage_agent(BDRVVXHSState *s);
> +int vxhs_handle_queued_ios(BDRVVXHSState *s);
> +int vxhs_restart_aio(VXHSAIOCB *acb);
> +void vxhs_fail_aio(VXHSAIOCB *acb, int err);
> +void vxhs_failover_ioctl_cb(int res, void *ctx);
> +
> +
> +#endif
> diff --git a/configure b/configure
> index 4b808f9..c71e270 100755
> --- a/configure
> +++ b/configure
> @@ -320,6 +320,7 @@ vhdx=""
>  numa=""
>  tcmalloc="no"
>  jemalloc="no"
> +vxhs=""
>  
>  # parse CC options first
>  for opt do
> @@ -1150,6 +1151,11 @@ for opt do
>    ;;
>    --enable-jemalloc) jemalloc="yes"
>    ;;
> +  --disable-vxhs) vxhs="no"
> +  ;;
> +  --enable-vxhs) vxhs="yes"
> +  ;;
> +
>    *)
>        echo "ERROR: unknown option $opt"
>        echo "Try '$0 --help' for more information"
> @@ -1380,6 +1386,7 @@ disabled with --disable-FEATURE, default is enabled if available:
>    numa            libnuma support
>    tcmalloc        tcmalloc support
>    jemalloc        jemalloc support
> +  vxhs            Veritas HyperScale vDisk backend support
>  
>  NOTE: The object files are built at the place where configure is launched
>  EOF
> @@ -4543,6 +4550,43 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO $TMPO; then
>  fi
>  
>  ##########################################
> +# Veritas HyperScale block driver VxHS
> +# Check if libqnio is installed
> +if test "$vxhs" != "no" ; then
> +  cat > $TMPC <<EOF
> +#include <stdio.h>
> +#include <qnio/qnio_api.h>
> +
> +void vxhs_inc_acb_segment_count(void *acb, int count);
> +void vxhs_dec_acb_segment_count(void *acb, int count);
> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
> +
> +void vxhs_inc_acb_segment_count(void *ptr, int count)
> +{
> +}
> +void vxhs_dec_acb_segment_count(void *ptr, int count)
> +{
> +}
> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> +{
> +}
> +int main(void) {
> +    qemu_ck_initialize_lock();
> +    return 0;
> +}
> +EOF
> +  vxhs_libs="-lqnioshim -lqnio"
> +  if compile_prog "" "$vxhs_libs" ; then
> +    vxhs=yes
> +  else
> +    if test "$vxhs" = "yes" ; then
> +      feature_not_found "vxhs block device" "Install libqnio. See github"
> +    fi
> +    vxhs=no
> +  fi
> +fi
> +
> +##########################################
>  # End of CC checks
>  # After here, no more $cc or $ld runs
>  
> @@ -5488,6 +5532,12 @@ if test "$pthread_setname_np" = "yes" ; then
>    echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak
>  fi
>  
> +if test "$vxhs" = "yes" ; then
> +  echo "CONFIG_VXHS=y" >> $config_host_mak
> +  echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak
> +  echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak
> +fi
> +
>  if test "$tcg_interpreter" = "yes"; then
>    QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
>  elif test "$ARCH" = "sparc64" ; then

Your configure output should indicate whether vxhs is enabled or not. For
instance, here is a snippet of where this occurs in the current configure file:

[...]
echo "NUMA host support $numa"
echo "tcmalloc support  $tcmalloc"
echo "jemalloc support  $jemalloc"
echo "avx2 optimization $avx2_opt"


> -- 
> 2.5.5
> 
>
Fam Zheng Sept. 6, 2016, 10:09 a.m. UTC | #2
On Mon, 08/22 23:56, Ashish Mittal wrote:
>  block/Makefile.objs |    2 +
>  block/trace-events  |   41 ++
>  block/vxhs.c        | 1304 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/vxhs.h        |  237 ++++++++++
>  configure           |   50 ++

If the vxhs code is contained in one file, there is no need to add vxhs.h; all
forward declarations can go into vxhs.c. Then local functions can be marked
"static", and unused functions can be cleaned up.
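
For example, a minimal sketch of a couple of the prototypes as file-local
declarations inside vxhs.c:

    static void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx,
                                  void *m);
    static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
                                   QEMUIOVector *qiov, int nb_sectors,
                                   BlockCompletionFunc *cb, void *opaque,
                                   int iodir);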

Fam
Ashish Mittal Sept. 7, 2016, 10:32 p.m. UTC | #3
Hi Jeff,

Thank you for the comments and corrections.

I have submitted a v5 patch incorporating most of your review comments.
Please find details inline.

Thanks,
Ashish

On Tue, Aug 30, 2016 at 10:35 AM, Jeff Cody <jcody@redhat.com> wrote:
>
> First-pass-over-the-code review:
>
> On Mon, Aug 22, 2016 at 11:56:30PM -0700, Ashish Mittal wrote:
>> This patch adds support for a new block device type called "vxhs".
>> Source code for the library that this code loads can be downloaded from:
>> https://github.com/MittalAshish/libqnio.git
>>
>> Sample command line using JSON syntax:
>> ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"},{"host":"172.172.17.2","port":"9999"}]}'
>>
>> Sample command line using URI syntax:
>> qemu-img convert -f raw -O raw -n /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D
>>
>
> Is there a reference specification for vxhs available?  If so, the commit
> message would be a good place to drop a link.
>

We don't have a reference specification document yet. However, I will
shortly be adding a test program to libqnio that can be used to test/demo
basic libqnio IO transfers.

>> Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com>
>> ---
>> v3 changelog:
>> =============
>> (1) Reworked QAPI/JSON parsing.
>> (2) Reworked URI parsing as suggested by Kevin.
>> (3) Fixes per review comments from Stefan on v1.
>> (4) Fixes per review comments from Daniel on v3.
>>
>>  block/Makefile.objs |    2 +
>>  block/trace-events  |   41 ++
>>  block/vxhs.c        | 1304 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  block/vxhs.h        |  237 ++++++++++
>>  configure           |   50 ++
>>  5 files changed, 1634 insertions(+)
>>  create mode 100644 block/vxhs.c
>>  create mode 100644 block/vxhs.h
>>
>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>> index 2593a2f..5f83305 100644
>> --- a/block/Makefile.objs
>> +++ b/block/Makefile.objs
>> @@ -20,6 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o
>>  block-obj-$(CONFIG_GLUSTERFS) += gluster.o
>>  block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
>>  block-obj-$(CONFIG_LIBSSH2) += ssh.o
>> +block-obj-$(CONFIG_VXHS) += vxhs.o
>>  block-obj-y += accounting.o dirty-bitmap.o
>>  block-obj-y += write-threshold.o
>>
>> @@ -43,3 +44,4 @@ block-obj-m        += dmg.o
>>  dmg.o-libs         := $(BZIP2_LIBS)
>>  qcow.o-libs        := -lz
>>  linux-aio.o-libs   := -laio
>> +vxhs.o-libs        := $(VXHS_LIBS)
>> diff --git a/block/trace-events b/block/trace-events
>> index 05fa13c..06c6d8c 100644
>> --- a/block/trace-events
>> +++ b/block/trace-events
>> @@ -114,3 +114,44 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
>>  qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>>  qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>>  qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
>> +
>> +# block/vxhs.c
>> +vxhs_bdrv_init(const char c) "Registering VxHS AIO driver%c"
>> +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, reason %d"
>> +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p"
>> +vxhs_setup_qnio_nwerror(char c) "Could not initialize the network channel. Bailing out%c"
>> +vxhs_iio_callback_iofail(int err, int reason, void *acb, int seg) "Read/Write failed: error %d, reason %d, acb %p, segment %d"
>> +vxhs_iio_callback_retry(char *guid, void *acb) "vDisk %s, added acb %p to retry queue (5)"
>> +vxhs_iio_callback_chnlfail(int error) "QNIO channel failed, no i/o (%d)"
>> +vxhs_iio_callback_fail(int r, void *acb, int seg, uint64_t size, int err) " ALERT: reason = %d , acb = %p, acb->segments = %d, acb->size = %lu Error = %d"
>> +vxhs_fail_aio(char * guid, void *acb) "vDisk %s, failing acb %p"
>> +vxhs_iio_callback_ready(char *vd, int err) "async vxhs_iio_callback: IRP_VDISK_CHECK_IO_FAILOVER_READY completed for vdisk %s with error %d"
>> +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d"
>> +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d"
>> +vxhs_open_fail(int ret) "Could not open the device. Error = %d"
>> +vxhs_open_epipe(char c) "Could not create a pipe for device. Bailing out%c"
>> +vxhs_aio_rw(char *guid, int iodir, uint64_t size, uint64_t offset) "vDisk %s, vDisk device is in failed state iodir = %d size = %lu offset = %lu"
>> +vxhs_aio_rw_retry(char *guid, void *acb, int queue) "vDisk %s, added acb %p to retry queue(%d)"
>> +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d"
>> +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int seg, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %lu offset = %lu ACB = %p Segments = %d. Error = %d, errno = %d"
>> +vxhs_co_flush(char *guid, int ret, int err) "vDisk (%s) Flush ioctl failed ret = %d errno = %d"
>> +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d"
>> +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %lu"
>> +vxhs_switch_storage_agent(char *ip, char *guid) "Query host %s for vdisk %s"
>> +vxhs_switch_storage_agent_failed(char *ip, char *guid, int res, int err) "Query to host %s for vdisk %s failed, res = %d, errno = %d"
>> +vxhs_failover_ioctl_cb(char *ip, char *guid) "Switched to storage server host-IP %s for vdisk %s"
>> +vxhs_failover_ioctl_cb_retry(char *guid) "failover_ioctl_cb: keep looking for io target for vdisk %s"
>> +vxhs_failover_io(char *vdisk) "I/O Failover starting for vDisk %s"
>> +vxhs_reopen_vdisk(char *ip) "Failed to connect to storage agent on host-ip %s"
>> +vxhs_reopen_vdisk_openfail(char *fname) "Failed to open vdisk device: %s"
>> +vxhs_handle_queued_ios(void *acb, int res) "Restarted acb %p res %d"
>> +vxhs_restart_aio(int dir, int res, int err) "IO ERROR FOR: Read/Write = %d Error = %d, errno = %d"
>> +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld"
>> +vxhs_aio_rw_iofail(char *guid) "vDisk %s, I/O operation failed."
>> +vxhs_aio_rw_devfail(char *guid, int dir, uint64_t size, uint64_t off) "vDisk %s, vDisk device failed iodir = %d size = %lu offset = %lu"
>> +vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s"
>> +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s"
>> +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = %d"
>> +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP %s, Port %d"
>> +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState"
>> +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s"
>> diff --git a/block/vxhs.c b/block/vxhs.c
>> new file mode 100644
>> index 0000000..32c37c5
>> --- /dev/null
>> +++ b/block/vxhs.c
>> @@ -0,0 +1,1304 @@
>> +/*
>> + * QEMU Block driver for Veritas HyperScale (VxHS)
>> + *
>
> Another good place for a link to the specification, if available.
>
>
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "vxhs.h"
>> +#include <qnio/qnio_api.h>
>> +#include "trace.h"
>> +#include "qapi/qmp/qerror.h"
>> +#include "qapi/qmp/qdict.h"
>> +#include "qapi/qmp/qstring.h"
>> +
>> +#define VXHS_OPT_FILENAME        "filename"
>> +#define VXHS_OPT_VDISK_ID        "vdisk_id"
>> +#define VXHS_OPT_SERVER          "server."
>> +#define VXHS_OPT_HOST            "host"
>> +#define VXHS_OPT_PORT            "port"
>> +
>> +/* qnio client ioapi_ctx */
>> +static void *global_qnio_ctx;
>> +
>> +/* vdisk prefix to pass to qnio */
>> +static const char vdisk_prefix[] = "/dev/of/vdisk";
>> +
>> +void vxhs_inc_acb_segment_count(void *ptr, int count)
>> +{
>> +    VXHSAIOCB *acb = ptr;
>> +    BDRVVXHSState *s = acb->common.bs->opaque;
>> +
>> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> +    acb->segments += count;
>> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> +}
>> +
>> +void vxhs_dec_acb_segment_count(void *ptr, int count)
>> +{
>> +    VXHSAIOCB *acb = ptr;
>> +    BDRVVXHSState *s = acb->common.bs->opaque;
>> +
>> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> +    acb->segments -= count;
>> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> +}
>> +
>> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count)
>> +{
>> +    VXHSAIOCB *acb = ptr;
>> +    BDRVVXHSState *s = acb->common.bs->opaque;
>> +    int segcount = 0;
>> +
>> +
>> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> +    acb->segments -= count;
>> +    segcount = acb->segments;
>> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> +
>> +    return segcount;
>> +}
>
> Unused function?
>

Yes, it is unused. Sorry, I forgot to remove it in v5; I will remove it in
the next iteration.

>> +
>> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
>> +{
>> +    VXHSAIOCB *acb = ptr;
>> +
>> +    acb->buffer = buffer;
>> +}
>> +
>
> Unused function?
>

This is called from within libqnio.

> (There are several unused functions - are there other patches that use these
> functions?  Maybe I missed them on the list.  If not, and they are for
> future work, maybe it would be best to introduce these functions in the
> patch series that will actually use them. It will cut down on the amount of
> code to be reviewed, and since it is not exercised it can't be verified
> anyway).
>

Removed all unused functions, except vxhs_dec_and_get_acb_segment_count().

>> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t count)
>> +{
>> +    BDRVVXHSState *s = ptr;
>> +
>> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> +    s->vdisk_aio_count += count;
>> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +}
>> +
>> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t count)
>> +{
>> +    BDRVVXHSState *s = ptr;
>> +
>> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> +    s->vdisk_aio_count -= count;
>> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +}
>> +
>> +uint32_t vxhs_get_vdisk_iocount(void *ptr)
>> +{
>> +    BDRVVXHSState *s = ptr;
>> +    uint32_t count = 0;
>> +
>> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> +    count = s->vdisk_aio_count;
>> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +
>> +    return count;
>> +}
>
> Function only used by another function that is not called from anywhere.
>
>

Removed.

>> +
>> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m)
>> +{
>> +    VXHSAIOCB *acb = NULL;
>> +    BDRVVXHSState *s = NULL;
>> +    int rv = 0;
>> +    int segcount = 0;
>> +    uint32_t error = 0;
>> +    uint32_t opcode = 0;
>> +
>> +    assert(m);
>> +    if (m) {
>> +        /* TODO: need common get message attrs, not two separate lib calls */
>> +        error = qemu_iio_extract_msg_error(m);
>> +        opcode = qemu_iio_extract_msg_opcode(m);
>> +    }
>> +    switch (opcode) {
>> +    case IRP_READ_REQUEST:
>> +    case IRP_WRITE_REQUEST:
>> +
>> +    /*
>> +     * ctx is VXHSAIOCB*
>> +     * ctx is NULL if error is VXERROR_CHANNEL_HUP or reason is IIO_REASON_HUP
>> +     */
>> +    if (ctx) {
>> +        acb = ctx;
>> +        s = acb->common.bs->opaque;
>> +    } else {
>> +        trace_vxhs_iio_callback(error, reason);
>> +        goto out;
>> +    }
>> +
>> +    if (error) {
>> +        trace_vxhs_iio_callback_iofail(error, reason, acb, acb->segments);
>> +
>> +        if (reason == IIO_REASON_DONE || reason == IIO_REASON_EVENT) {
>> +            /*
>> +             * Storage agent failed while I/O was in progress
>> +             * Fail over only if the qnio channel dropped, indicating
>> +             * storage agent failure. Don't fail over in response to other
>> +             * I/O errors such as disk failure.
>> +             */
>> +            if (error == VXERROR_RETRY_ON_SOURCE || error == VXERROR_HUP ||
>> +                error == VXERROR_CHANNEL_HUP || error == -1) {
>> +                /*
>> +                 * Start vDisk IO failover once callback is
>> +                 * called against all the pending IOs.
>> +                 * If vDisk has no redundancy enabled
>> +                 * then IO failover routine will mark
>> +                 * the vDisk failed and fail all the
>> +                 * AIOs without retry (stateless vDisk)
>> +                 */
>> +                VXHS_SPIN_LOCK(s->vdisk_lock);
>> +                if (!OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> +                    OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
>> +                }
>> +                /*
>> +                 * Check if this acb is already queued before.
>> +                 * It is possible in case if I/Os are submitted
>> +                 * in multiple segments (QNIO_MAX_IO_SIZE).
>> +                 */
>> +                VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> +                if (!OF_AIOCB_FLAGS_QUEUED(acb)) {
>> +                    QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
>> +                                         acb, retry_entry);
>> +                    OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> +                    s->vdisk_aio_retry_qd++;
>> +                    trace_vxhs_iio_callback_retry(s->vdisk_guid, acb);
>> +                }
>> +                segcount = --acb->segments;
>> +                VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> +                /*
>> +                 * Decrement AIO count only when callback is called
>> +                 * against all the segments of aiocb.
>> +                 */
>> +                if (segcount == 0 && --s->vdisk_aio_count == 0) {
>> +                    /*
>> +                     * Start vDisk I/O failover
>> +                     */
>> +                    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +                    /*
>> +                     * TODO:
>> +                     * Need to explore further if it is possible to optimize
>> +                     * the failover operation on Virtual-Machine (global)
>> +                     * specific rather vDisk specific.
>> +                     */
>> +                    vxhs_failover_io(s);
>> +                    goto out;
>> +                }
>> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +                goto out;
>> +            }
>> +        } else if (reason == IIO_REASON_HUP) {
>> +            /*
>> +             * Channel failed, spontaneous notification,
>> +             * not in response to I/O
>> +             */
>> +            trace_vxhs_iio_callback_chnlfail(error);
>> +            /*
>> +             * TODO: Start channel failover when no I/O is outstanding
>> +             */
>> +            goto out;
>> +        } else {
>> +            trace_vxhs_iio_callback_fail(reason, acb, acb->segments,
>> +                                         acb->size, error);
>> +        }
>> +    }
>> +    /*
>> +     * Set error into acb if not set. In case if acb is being
>> +     * submitted in multiple segments then need to set the error
>> +     * only once.
>> +     *
>> +     * Once acb done callback is called for the last segment
>> +     * then acb->ret return status will be sent back to the
>> +     * caller.
>> +     */
>> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> +    if (error && !acb->ret) {
>> +        acb->ret = error;
>> +    }
>> +    --acb->segments;
>> +    segcount = acb->segments;
>> +    assert(segcount >= 0);
>> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> +    /*
>> +     * Check if all the outstanding I/Os are done against acb.
>> +     * If yes then send signal for AIO completion.
>> +     */
>> +    if (segcount == 0) {
>> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
>> +        if (rv != sizeof(acb)) {
>> +            error_report("VXHS AIO completion failed: %s", strerror(errno));
>> +            abort();
>> +        }
>> +    }
>> +    break;
>> +
>> +    case IRP_VDISK_CHECK_IO_FAILOVER_READY:
>> +        /* ctx is BDRVVXHSState* */
>> +        assert(ctx);
>> +        trace_vxhs_iio_callback_ready(((BDRVVXHSState *)ctx)->vdisk_guid,
>> +                                      error);
>> +        vxhs_failover_ioctl_cb(error, ctx);
>> +        break;
>> +
>> +    default:
>> +        if (reason == IIO_REASON_HUP) {
>> +            /*
>> +             * Channel failed, spontaneous notification,
>> +             * not in response to I/O
>> +             */
>> +            trace_vxhs_iio_callback_chnfail(error, errno);
>> +            /*
>> +             * TODO: Start channel failover when no I/O is outstanding
>> +             */
>> +        } else {
>> +            trace_vxhs_iio_callback_unknwn(opcode, error);
>> +        }
>> +        break;
>> +    }
>> +out:
>> +    return;
>> +}
>> +
>> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s)
>> +{
>> +    BlockCompletionFunc *cb = acb->common.cb;
>> +    void *opaque = acb->common.opaque;
>> +    int ret = 0;
>> +
>> +    if (acb->ret != 0) {
>> +        trace_vxhs_complete_aio(acb, acb->ret);
>> +    /*
>> +     * We mask all the IO errors generically as EIO for upper layers
>> +     * Right now our IO Manager uses non standard error codes. Instead
>> +     * of confusing upper layers with incorrect interpretation we are
>> +     * doing this workaround.
>> +     */
>> +        ret = (-EIO);
>> +    }
>> +    /*
>> +     * Copy back contents from stabilization buffer into original iovector
>> +     * before returning the IO
>> +     */
>> +    if (acb->buffer != NULL) {
>> +        qemu_iovec_from_buf(acb->qiov, 0, acb->buffer, acb->qiov->size);
>> +        free(acb->buffer);
>> +        acb->buffer = NULL;
>> +    }
>> +    vxhs_dec_vdisk_iocount(s, 1);
>> +    acb->aio_done = VXHS_IO_COMPLETED;
>> +    qemu_aio_unref(acb);
>> +    cb(opaque, ret);
>> +}
>> +
>> +/*
>> + * This is the HyperScale event handler registered to QEMU.
>> + * It is invoked when any IO gets completed and written on pipe
>> + * by callback called from QNIO thread context. Then it marks
>> + * the AIO as completed, and releases HyperScale AIO callbacks.
>> + */
>> +void vxhs_aio_event_reader(void *opaque)
>> +{
>> +    BDRVVXHSState *s = opaque;
>> +    ssize_t ret;
>> +
>> +    do {
>> +        char *p = (char *)&s->qnio_event_acb;
>> +
>> +        ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos,
>> +                   sizeof(s->qnio_event_acb) - s->event_reader_pos);
>> +        if (ret > 0) {
>> +            s->event_reader_pos += ret;
>> +            if (s->event_reader_pos == sizeof(s->qnio_event_acb)) {
>> +                s->event_reader_pos = 0;
>> +                vxhs_complete_aio(s->qnio_event_acb, s);
>> +            }
>> +        }
>> +    } while (ret < 0 && errno == EINTR);
>> +}
>> +
>> +/*
>> + * QEMU calls this to check if there are any pending IO on vDisk.
>> + * It will wait in a loop until all the AIOs are completed.
>> + */
>> +int vxhs_aio_flush_cb(void *opaque)
>> +{
>> +    BDRVVXHSState *s = opaque;
>> +
>> +    return vxhs_get_vdisk_iocount(s);
>> +}
>
> Unused function.
>
>

Removed.

>> +
>> +/*
>> + * This will be called by QEMU while booting for each vDisk.
>> + * bs->opaque will be allocated by QEMU upper block layer before
>> + * calling open. It will load all the QNIO operations from
>> + * qemuqnio library and call QNIO operation to create channel to
>> + * do IO on vDisk. It parses the URI, gets the hostname, vDisk
>> + * path and then sets HyperScale event handler to QEMU.
>> + */
>> +void *vxhs_setup_qnio(void)
>> +{
>> +    void *qnio_ctx = NULL;
>> +
>> +    qnio_ctx = qemu_iio_init(vxhs_iio_callback);
>
> One concern with this callback function - when does it get called?  I am
> assuming that qemu_iio_init() is running threads / processes in the
> background, so this callback could happen at any time.
>
> (I'll discuss my specific concern later, in vxhs_failover_ioctl_cb())
>
>
>

Yes, that is correct. vxhs_iio_callback() is invoked from the threads
running in libqnio.



> An aside:
>
> It is odd that the qnio library is using a qemu_ prefix for functions.
> Seems like a good way to run into issues later, especially with relatively
> generic function names such as 'qemu_ck_spin_lock()'. Consider that most
> developers / maintainers are probably not going to compile in vxhs without a
> specific reason, given the external library dependency, so name collisions
> could sneak in.
>
> 'nm -g libqnioshim.so | grep qemu_' shows a few that give me pause:
>
> 0000000000004d07 T qemu_calculate_iovec_size
> 000000000000663d T qemu_ck_destroy_lock
> 00000000000065cf T qemu_ck_initialize_lock
> 00000000000065f7 T qemu_ck_spin_lock
> 000000000000661a T qemu_ck_spin_unlock
> 0000000000004dfb T qemu_convert_iovector_to_buffer
> 0000000000004d63 T qemu_copy_iov_to_buffer
> 0000000000005f2f T qemu_extract_flush_response
> 0000000000005c89 T qemu_extract_geometry_from_json
> 0000000000005a78 T qemu_extract_size_from_json
> 0000000000004f40 T qemu_is_iovector_read_aligned
>
>

Replaced all qemu_ prefixed symbols within qnio.

>
>> +
>> +    if (qnio_ctx != NULL) {
>> +        trace_vxhs_setup_qnio(qnio_ctx);
>> +    } else {
>> +        trace_vxhs_setup_qnio_nwerror('.');
>> +    }
>> +
>> +    return qnio_ctx;
>> +}
>> +
>> +static QemuOptsList runtime_opts = {
>> +    .name = "vxhs",
>> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
>> +    .desc = {
>> +        {
>> +            .name = VXHS_OPT_FILENAME,
>> +            .type = QEMU_OPT_STRING,
>> +            .help = "URI to the Veritas HyperScale image",
>> +        },
>> +        {
>> +            .name = VXHS_OPT_VDISK_ID,
>> +            .type = QEMU_OPT_STRING,
>> +            .help = "UUID of the VxHS vdisk",
>> +        },
>> +        { /* end of list */ }
>> +    },
>> +};
>> +
>> +static QemuOptsList runtime_tcp_opts = {
>> +    .name = "vxhs_tcp",
>> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head),
>> +    .desc = {
>> +        {
>> +            .name = VXHS_OPT_HOST,
>> +            .type = QEMU_OPT_STRING,
>> +            .help = "host address (ipv4 addresses)",
>> +        },
>> +        {
>> +            .name = VXHS_OPT_PORT,
>> +            .type = QEMU_OPT_NUMBER,
>> +            .help = "port number on which VxHSD is listening (default 9999)",
>> +            .def_value_str = "9999"
>> +        },
>> +        {
>> +            .name = "to",
>> +            .type = QEMU_OPT_NUMBER,
>> +            .help = "max port number, not supported by VxHS",
>> +        },
>> +        {
>> +            .name = "ipv4",
>> +            .type = QEMU_OPT_BOOL,
>> +            .help = "ipv4 bool value, not supported by VxHS",
>> +        },
>> +        {
>> +            .name = "ipv6",
>> +            .type = QEMU_OPT_BOOL,
>> +            .help = "ipv6 bool value, not supported by VxHS",
>> +        },
>> +        { /* end of list */ }
>> +    },
>> +};
>> +
>> +/*
>> + * Parse the incoming URI and populate *options with all the host(s)
>> + * information. Host at index 0 is local storage agent.
>> + * Remaining are the reflection target storage agents. The local storage agent
>> + * ip is the efficient internal address in the uri, e.g. 192.168.0.2.
>> + * The local storage agent address is stored at index 0. The reflection target
>> + * ips, are the E-W data network addresses of the reflection node agents, also
>> + * extracted from the uri.
>> + */
>> +static int vxhs_parse_uri(const char *filename, QDict *options)
>> +{
>> +    gchar **target_list;
>> +    URI *uri = NULL;
>> +    char *hoststr, *portstr;
>> +    char *port;
>> +    int ret = 0;
>> +    int i = 0;
>> +
>> +    trace_vxhs_parse_uri_filename(filename);
>> +    target_list = g_strsplit(filename, "%7D", 0);
>> +    assert(target_list != NULL && target_list[0] != NULL);
>> +
>> +    for (i = 0; target_list[i] != NULL && *target_list[i]; i++) {
>> +        uri = uri_parse(target_list[i]);
>> +        if (!uri || !uri->server) {
>> +            uri_free(uri);
>> +            ret = -EINVAL;
>> +            break;
>> +        }
>> +
>> +        hoststr = g_strdup_printf(VXHS_OPT_SERVER"%d.host", i);
>> +        qdict_put(options, hoststr, qstring_from_str(uri->server));
>> +
>> +        portstr = g_strdup_printf(VXHS_OPT_SERVER"%d.port", i);
>> +        if (uri->port) {
>> +            port = g_strdup_printf("%d", uri->port);
>> +            qdict_put(options, portstr, qstring_from_str(port));
>> +            g_free(port);
>> +        }
>> +
>> +        if (i == 0 && (strstr(uri->path, "vxhs") == NULL)) {
>> +            qdict_put(options, "vdisk_id", qstring_from_str(uri->path));
>> +        }
>> +
>> +        trace_vxhs_parse_uri_hostinfo(i + 1, uri->server, uri->port);
>> +        g_free(hoststr);
>> +        g_free(portstr);
>> +        uri_free(uri);
>> +    }
>> +
>> +    g_strfreev(target_list);
>> +    return ret;
>> +}
>> +
>> +static void vxhs_parse_filename(const char *filename, QDict *options,
>> +                               Error **errp)
>> +{
>> +    if (qdict_haskey(options, "host")
>> +        || qdict_haskey(options, "port")
>> +        || qdict_haskey(options, "path"))
>> +    {
>> +        error_setg(errp, "host/port/path and a file name may not be specified "
>> +                         "at the same time");
>> +        return;
>> +    }
>> +
>> +    if (strstr(filename, "://")) {
>> +        int ret = vxhs_parse_uri(filename, options);
>> +        if (ret < 0) {
>> +            error_setg(errp, "Invalid URI. URI should be of the form "
>> +                       "  vxhs://<host_ip>:<port>/{<vdisk_id>}");
>> +        }
>> +    }
>> +}
>> +
>> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
>> +                              int *cfd, int *rfd, Error **errp)
>> +{
>> +    QDict *backing_options = NULL;
>> +    const char *vxhs_filename;
>> +    char *of_vsa_addr = NULL;
>> +    Error *local_err = NULL;
>> +    const char *ptr = NULL;
>
> ptr does not need to be initialized to NULL here.  I also recommend using a
> more descriptive name like 'vdisk_id_opt' over something generic like 'ptr'.
>

Fixed.


>> +    char *file_name = NULL;
>> +    size_t num_servers;
>> +    char *str = NULL;
>> +    QemuOpts *opts;
>> +    int ret = 0;
>> +    int i;
>> +
>> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
>> +    qemu_opts_absorb_qdict(opts, options, &local_err);
>> +    if (local_err) {
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
>> +    if (vxhs_filename) {
>> +        trace_vxhs_qemu_init_filename(vxhs_filename);
>> +    }
>> +
>> +    ptr = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
>> +    if (!ptr) {
>> +        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +    s->vdisk_guid = g_strdup(ptr);
>
> This is never freed anywhere.
>

Now freed on error conditions and in vxhs_close().

>> +    trace_vxhs_qemu_init_vdisk(ptr);
>> +
>> +    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
>> +    if (num_servers < 1) {
>> +        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +    trace_vxhs_qemu_init_numservers(num_servers);
>> +
>> +    qemu_opts_del(opts);
>> +
>> +    for (i = 0; i < num_servers; i++) {
>
> s->vdisk_hostinfo[] is an array with MAX_HOSTS (4) as a size.  There is no
> bounds checking on num_servers, which is an issue: we can easily blow
> past this.
>
> In general, for any parameter supplied by a user, or read from a file /
> protocol / etc, we need to do appropriate bounds checking to prevent
> potential CVEs.
>

Fixed.
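
A minimal sketch of the kind of bounds check meant here (the actual v5
change may differ):

    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
    if (num_servers < 1) {
        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
        ret = -EINVAL;
        goto out;
    } else if (num_servers > MAX_HOSTS) {
        error_setg(&local_err, "Maximum %d servers allowed, %zu specified",
                   MAX_HOSTS, num_servers);
        ret = -EINVAL;
        goto out;
    }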

>> +        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
>> +        qdict_extract_subqdict(options, &backing_options, str);
>> +
>> +        /* create opts info from runtime_tcp_opts list */
>> +        opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
>> +        qemu_opts_absorb_qdict(opts, backing_options, &local_err);
>> +        if (local_err) {
>> +            ret = -EINVAL;
>> +            qemu_opts_del(opts);
>> +            goto out;
>> +        }
>> +
>> +        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(opts,
>> +                                                            VXHS_OPT_HOST));
>
> This is never freed anywhere.
>

Now freed on error and in vxhs_close().

>> +        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(opts,
>> +                                                                 VXHS_OPT_PORT),
>> +                                                    NULL, 0);
>> +
>> +        s->vdisk_hostinfo[i].qnio_cfd = -1;
>> +        s->vdisk_hostinfo[i].vdisk_rfd = -1;
>> +        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
>> +                             s->vdisk_hostinfo[i].port);
>> +        qemu_opts_del(opts);
>> +    }
>> +
>> +    s->vdisk_nhosts = i;
>> +    s->vdisk_cur_host_idx = 0;
>> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
>> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
>> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);
>> +
>> +    /*
>> +     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
>> +     */
>> +    if (global_qnio_ctx == NULL) {
>> +        global_qnio_ctx = vxhs_setup_qnio();
>> +        if (global_qnio_ctx == NULL) {
>> +            error_setg(&local_err, "Failed vxhs_setup_qnio");
>> +            ret = -EINVAL;
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    *cfd = qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
>> +    if (*cfd < 0) {
>> +        error_setg(&local_err, "Failed qemu_open_iio_conn");
>> +        ret = -EIO;
>> +        goto out;
>> +    }
>> +    *rfd = qemu_iio_devopen(global_qnio_ctx, *cfd, file_name, 0);
>> +    if (*rfd < 0) {
>> +        error_setg(&local_err, "Failed qemu_iio_devopen");
>> +        ret = -EIO;
>> +        goto out;
>> +    }
>> +
>> +out:
>> +    g_free(file_name);
>> +    g_free(of_vsa_addr);
>> +    if (str) {
>> +        qdict_del(backing_options, str);
>> +        g_free(str);
>> +    }
>> +
>> +    if (ret < 0) {
>> +        errno = -ret;
>> +    }
>> +    error_propagate(errp, local_err);
>> +    return ret;
>> +}
>> +
>> +int vxhs_open(BlockDriverState *bs, QDict *options,
>> +              int bdrv_flags, Error **errp)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    AioContext *aio_context;
>> +    int qemu_qnio_cfd = -1;
>> +    int qemu_rfd = -1;
>> +    int ret = 0;
>> +
>> +    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
>> +    if (ret < 0) {
>> +        trace_vxhs_open_fail(ret);
>> +        return ret;
>> +    }
>> +
>> +    s->qnio_ctx = global_qnio_ctx;
>> +    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
>> +    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
>> +    s->vdisk_size = 0;
>> +    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
>> +
>> +    /*
>> +     * Create a pipe for communicating between two threads in different
>> +     * context. Set handler for read event, which gets triggered when
>> +     * IO completion is done by non-QEMU context.
>> +     */
>> +    ret = qemu_pipe(s->fds);
>> +    if (ret < 0) {
>> +        trace_vxhs_open_epipe('.');
>> +        ret = -errno;
>> +        goto out;
>> +    }
>> +    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
>> +
>> +    aio_context = bdrv_get_aio_context(bs);
>> +    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
>> +                       false, vxhs_aio_event_reader, NULL, s);
>> +
>> +    /*
>> +     * Allocate/Initialize the spin-locks.
>> +     *
>> +     * NOTE:
>> +     *      Since spin lock is being allocated
>> +     *      dynamically hence moving acb struct
>> +     *      specific lock to BDRVVXHSState
>> +     *      struct. The reason being,
>> +     *      we don't want the overhead of spin
>> +     *      lock being dynamically allocated and
>> +     *      freed for every AIO.
>> +     */
>> +    s->vdisk_lock = VXHS_SPIN_LOCK_ALLOC;
>> +    s->vdisk_acb_lock = VXHS_SPIN_LOCK_ALLOC;
>> +
>> +    return 0;
>> +
>> +out:
>> +    if (s->vdisk_hostinfo[0].vdisk_rfd >= 0) {
>> +        qemu_iio_devclose(s->qnio_ctx, 0,
>> +                                 s->vdisk_hostinfo[0].vdisk_rfd);
>> +    }
>> +    /* never close qnio_cfd */
>
> Surely we want to be able to close it at some point... or does the qnio
> library take care of that cleanup somewhere/sometime?
>
>

Changed to close the channel fd on error conditions, in vxhs_close(), and in
vxhs_reopen_vdisk().


>> +    trace_vxhs_open_fail(ret);
>> +    return ret;
>> +}
>> +
>> +static const AIOCBInfo vxhs_aiocb_info = {
>> +    .aiocb_size = sizeof(VXHSAIOCB)
>> +};
>> +
>> +/*
>> + * This is called in QNIO thread context when IO done
>> + * on IO Manager and QNIO client received the data or
>> + * ACK. It notifies another event handler thread running in QEMU context
>> + * by writing on the pipe
>> + */
>> +void vxhs_finish_aiocb(ssize_t ret, void *arg)
>> +{
>> +    VXHSAIOCB *acb = arg;
>> +    BlockDriverState *bs = acb->common.bs;
>> +    BDRVVXHSState *s = bs->opaque;
>> +    int rv;
>> +
>> +    acb->ret = ret;
>> +    rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
>> +    if (rv != sizeof(acb)) {
>> +        error_report("VXHS AIO completion failed: %s",
>> +                     strerror(errno));
>> +        abort();
>> +    }
>> +}
>
> Another unused function.
>

Removed.


>> +
>> +/*
>> + * This allocates QEMU-VXHS callback for each IO
>> + * and is passed to QNIO. When QNIO completes the work,
>> + * it will be passed back through the callback.
>> + */
>> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
>> +                                int64_t sector_num, QEMUIOVector *qiov,
>> +                                int nb_sectors,
>> +                                BlockCompletionFunc *cb,
>> +                                void *opaque, int iodir)
>> +{
>> +    VXHSAIOCB *acb = NULL;
>> +    BDRVVXHSState *s = bs->opaque;
>> +    size_t size;
>> +    uint64_t offset;
>> +    int iio_flags = 0;
>> +    int ret = 0;
>> +
>> +    offset = sector_num * BDRV_SECTOR_SIZE;
>> +    size = nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> +    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
>> +    /*
>> +     * Setup or initialize VXHSAIOCB.
>> +     * Every single field should be initialized since
>> +     * acb will be picked up from the slab without
>> +     * initializing with zero.
>> +     */
>> +    acb->io_offset = offset;
>> +    acb->size = size;
>> +    acb->ret = 0;
>> +    acb->flags = 0;
>> +    acb->aio_done = VXHS_IO_INPROGRESS;
>> +    acb->segments = 0;
>> +    acb->buffer = 0;
>> +    acb->qiov = qiov;
>> +    acb->direction = iodir;
>> +
>> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> +    if (OF_VDISK_FAILED(s)) {
>> +        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
>> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +        goto errout;
>> +    }
>> +    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>> +        s->vdisk_aio_retry_qd++;
>> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
>> +        goto out;
>> +    }
>> +    s->vdisk_aio_count++;
>> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +
>> +    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>> +
>> +    switch (iodir) {
>> +    case VDISK_AIO_WRITE:
>> +            vxhs_inc_acb_segment_count(acb, 1);
>> +            ret = qemu_iio_writev(s->qnio_ctx,
>> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
>> +            break;
>> +    case VDISK_AIO_READ:
>> +            vxhs_inc_acb_segment_count(acb, 1);
>> +            ret = qemu_iio_readv(s->qnio_ctx,
>> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
>> +            break;
>> +    default:
>> +            trace_vxhs_aio_rw_invalid(iodir);
>> +            goto errout;
>> +    }
>> +
>> +    if (ret != 0) {
>> +        trace_vxhs_aio_rw_ioerr(
>> +                  s->vdisk_guid, iodir, size, offset,
>> +                  acb, acb->segments, ret, errno);
>> +        /*
>> +         * Don't retry I/Os against vDisk having no
>> +         * redundancy or stateful storage on compute
>> +         *
>> +         * TODO: Revisit this code path to see if any
>> +         *       particular error needs to be handled.
>> +         *       At this moment failing the I/O.
>> +         */
>> +        VXHS_SPIN_LOCK(s->vdisk_lock);
>> +        if (s->vdisk_nhosts == 1) {
>> +            trace_vxhs_aio_rw_iofail(s->vdisk_guid);
>> +            s->vdisk_aio_count--;
>> +            vxhs_dec_acb_segment_count(acb, 1);
>> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +            goto errout;
>> +        }
>> +        if (OF_VDISK_FAILED(s)) {
>> +            trace_vxhs_aio_rw_devfail(
>> +                      s->vdisk_guid, iodir, size, offset);
>> +            s->vdisk_aio_count--;
>> +            vxhs_dec_acb_segment_count(acb, 1);
>> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +            goto errout;
>> +        }
>> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> +            /*
>> +             * Queue all incoming io requests after failover starts.
>> +             * Number of requests that can arrive is limited by io queue depth
>> +             * so an app blasting independent ios will not exhaust memory.
>> +             */
>> +            QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>> +            s->vdisk_aio_retry_qd++;
>> +            OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> +            s->vdisk_aio_count--;
>> +            vxhs_dec_acb_segment_count(acb, 1);
>> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +            trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 2);
>> +            goto out;
>> +        }
>> +        OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
>> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>> +        s->vdisk_aio_retry_qd++;
>> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> +        vxhs_dec_acb_segment_count(acb, 1);
>> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 3);
>> +        /*
>> +         * Start I/O failover if there is no active
>> +         * AIO within vxhs block driver.
>> +         */
>> +        if (--s->vdisk_aio_count == 0) {
>> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +            /*
>> +             * Start IO failover
>> +             */
>> +            vxhs_failover_io(s);
>> +            goto out;
>> +        }
>> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +    }
>> +
>> +out:
>> +    return &acb->common;
>> +
>> +errout:
>> +    qemu_aio_unref(acb);
>> +    return NULL;
>> +}
>> +
>> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
>> +                                   int64_t sector_num, QEMUIOVector *qiov,
>> +                                   int nb_sectors,
>> +                                   BlockCompletionFunc *cb, void *opaque)
>> +{
>> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
>> +                         cb, opaque, VDISK_AIO_READ);
>> +}
>> +
>> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
>> +                                    int64_t sector_num, QEMUIOVector *qiov,
>> +                                    int nb_sectors,
>> +                                    BlockCompletionFunc *cb, void *opaque)
>> +{
>> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
>> +                         cb, opaque, VDISK_AIO_WRITE);
>> +}
>> +
>> +/*
>> + * This is called by QEMU when a flush gets triggered from within
>> + * a guest at the block layer, either for IDE or SCSI disks.
>> + */
>> +int vxhs_co_flush(BlockDriverState *bs)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    uint64_t size = 0;
>> +    int ret = 0;
>> +
>> +    ret = qemu_iio_ioctl(s->qnio_ctx,
>> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> +            VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
>> +
>> +    if (ret < 0) {
>> +        /*
>> +         * Currently not handling the flush ioctl
>> +         * failure because of network connection
>> +         * disconnect. Since all the writes are
>> +         * committed into persistent storage, hence
>> +         * this flush call is noop and we can safely
>> +         * return success status to the caller.
>
> I'm not sure I understand here.  Are you saying the qemu_iio_ioctl() call
> above is a noop?
>

Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a placeholder at present, in
case we later want to add some functionality to it. I have now added a
comment to this effect to avoid any confusion.

>> +         *
>> +         * If any write failure occurs for inflight
>> +         * write AIO because of network disconnect
>> +         * then anyway IO failover will be triggered.
>> +         */
>> +        trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
>> +        ret = 0;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s)
>> +{
>> +    void *ctx = NULL;
>> +    int flags = 0;
>> +    unsigned long vdisk_size = 0;
>> +    int ret = 0;
>> +
>> +    ret = qemu_iio_ioctl(s->qnio_ctx,
>> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> +            VDISK_STAT, &vdisk_size, ctx, flags);
>> +
>> +    if (ret < 0) {
>> +        trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno);
>> +        return 0;
>> +    }
>> +
>> +    trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size);
>> +    return vdisk_size;
>> +}
>> +
>> +/*
>> + * Returns the size of vDisk in bytes. This is required
>> + * by the QEMU upper block layer so that it is visible
>> + * to guest.
>> + */
>> +int64_t vxhs_getlength(BlockDriverState *bs)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    unsigned long vdisk_size = 0;
>> +
>> +    if (s->vdisk_size > 0) {
>> +        vdisk_size = s->vdisk_size;
>
> s->vdisk_size is int64_t, vdisk_size is unsigned long.  On 32-bit platforms,
> I believe unsigned long is only 32 bits, so this assignment is problematic.
>
> What is the maximum disk size for vxhs?
>
> (this comment applies to all the vdisk_size usage, but I'll only mention it
> here)
>

Changed to int64_t in all places. The code does not place any
restriction on the maximum size of the vdisk.
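
For illustration, a sketch of the length path with int64_t throughout,
assuming vxhs_get_vdisk_stat() is also changed to return int64_t:

    int64_t vxhs_getlength(BlockDriverState *bs)
    {
        BDRVVXHSState *s = bs->opaque;
        int64_t vdisk_size;

        if (s->vdisk_size > 0) {
            vdisk_size = s->vdisk_size;
        } else {
            /* Fetch the vDisk size using the stat ioctl */
            vdisk_size = vxhs_get_vdisk_stat(s);
            if (vdisk_size > 0) {
                s->vdisk_size = vdisk_size;
            }
        }

        /* Return the size in bytes, or -EIO if it could not be determined */
        return vdisk_size > 0 ? vdisk_size : -EIO;
    }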

>> +    } else {
>> +        /*
>> +         * Fetch the vDisk size using stat ioctl
>> +         */
>> +        vdisk_size = vxhs_get_vdisk_stat(s);
>> +        if (vdisk_size > 0) {
>> +            s->vdisk_size = vdisk_size;
>> +        }
>> +    }
>> +
>> +    if (vdisk_size > 0) {
>> +        return (int64_t)vdisk_size; /* return size in bytes */
>> +    } else {
>> +        return -EIO;
>> +    }
>> +}
>> +
>> +/*
>> + * Returns actual blocks allocated for the vDisk.
>> + * This is required by qemu-img utility.
>> + */
>> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    unsigned long vdisk_size = 0;
>> +
>> +    if (s->vdisk_size > 0) {
>> +        vdisk_size = s->vdisk_size;
>> +    } else {
>> +        /*
>> +         * TODO:
>> +         * Once HyperScale storage-virtualizer provides
>> +         * actual physical allocation of blocks then
>> +         * fetch that information and return back to the
>> +         * caller but for now just get the full size.
>> +         */
>> +        vdisk_size = vxhs_get_vdisk_stat(s);
>> +        if (vdisk_size > 0) {
>> +            s->vdisk_size = vdisk_size;
>> +        }
>> +    }
>> +
>> +    if (vdisk_size > 0) {
>> +        return (int64_t)vdisk_size; /* return size in bytes */
>> +    } else {
>> +        return -EIO;
>> +    }
>> +}
>> +
>> +void vxhs_close(BlockDriverState *bs)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +
>> +    close(s->fds[VDISK_FD_READ]);
>> +    close(s->fds[VDISK_FD_WRITE]);
>> +
>> +    /*
>> +     * never close channel - not ref counted, will
>> +     * close for all vdisks
>> +     */
>> +    if (s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd >= 0) {
>> +        qemu_iio_devclose(s->qnio_ctx, 0,
>> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd);
>> +    }
>> +    if (s->vdisk_lock) {
>> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_lock);
>> +        s->vdisk_lock = NULL;
>> +    }
>> +    if (s->vdisk_acb_lock) {
>> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_acb_lock);
>> +        s->vdisk_acb_lock = NULL;
>> +    }
>> +
>> +    /*
>> +     * TODO: Verify that all the resources were relinquished.
>> +     */
>
> Which resources?  Allocated here in the driver, or stuff in libqnio?
>

The comment was for all resources allocated within vxhs.c.
Added code to free all allocated memory and fds on close.
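
A minimal sketch of the sort of cleanup described, assuming it is appended
to vxhs_close() and reusing the v4 field names:

    int i;

    /* Release per-vDisk allocations made during vxhs_qemu_init() */
    g_free(s->vdisk_guid);
    s->vdisk_guid = NULL;

    for (i = 0; i < s->vdisk_nhosts; i++) {
        g_free(s->vdisk_hostinfo[i].hostip);
        s->vdisk_hostinfo[i].hostip = NULL;
    }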


>> +}
>> +
>> +/*
>> + * If errors are consistent with storage agent failure:
>> + *  - Try to reconnect in case error is transient or storage agent restarted.
>> + *  - Currently failover is being triggered on per vDisk basis. There is
>> + *    a scope of further optimization where failover can be global (per VM).
>> + *  - In case of network (storage agent) failure, for all the vDisks, having
>> + *    no redundancy, I/Os will be failed without attempting for I/O failover
>> + *    because of stateless nature of vDisk.
>> + *  - If local or source storage agent is down then send an ioctl to remote
>> + *    storage agent to check if remote storage agent in a state to accept
>> + *    application I/Os.
>> + *  - Once remote storage agent is ready to accept I/O, start I/O shipping.
>> + *  - If I/Os cannot be serviced then vDisk will be marked failed so that
>> + *    new incoming I/Os are returned with failure immediately.
>> + *  - If vDisk I/O failover is in progress then all new/inflight I/Os will
>> + *    queued and will be restarted or failed based on failover operation
>> + *    is successful or not.
>> + *  - I/O failover can be started either in I/O forward or I/O backward
>> + *    path.
>> + *  - I/O failover will be started as soon as all the pending acb(s)
>> + *    are queued and there is no pending I/O count.
>> + *  - If I/O failover couldn't be completed within QNIO_CONNECT_TIMOUT_SECS
>> + *    then vDisk will be marked failed and all I/Os will be completed with
>> + *    error.
>> + */
>> +
>> +int vxhs_switch_storage_agent(BDRVVXHSState *s)
>> +{
>> +    int res = 0;
>> +    int flags = (IIO_FLAG_ASYNC | IIO_FLAG_DONE);
>> +
>> +    trace_vxhs_switch_storage_agent(
>> +              s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
>> +              s->vdisk_guid);
>> +
>> +    res = vxhs_reopen_vdisk(s, s->vdisk_ask_failover_idx);
>> +    if (res == 0) {
>> +        res = qemu_iio_ioctl(s->qnio_ctx,
>> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].vdisk_rfd,
>> +                  VDISK_CHECK_IO_FAILOVER_READY, NULL, s, flags);
>> +    } else {
>> +        trace_vxhs_switch_storage_agent_failed(
>> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
>> +                  s->vdisk_guid, res, errno);
>> +        /*
>> +         * TODO: calling vxhs_failover_ioctl_cb from here ties up the qnio epoll
>> +         * loop if qemu_iio_ioctl fails synchronously (-1) for all hosts in io
>> +         * target list.
>> +         */
>> +
>> +        /* try next host */
>> +        vxhs_failover_ioctl_cb(res, s);
>> +    }
>> +    return res;
>> +}
>> +
>> +void vxhs_failover_ioctl_cb(int res, void *ctx)
>> +{
>> +    BDRVVXHSState *s = ctx;
>> +
>> +    if (res == 0) {
>> +        /* found failover target */
>> +        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
>> +        s->vdisk_ask_failover_idx = 0;
>> +        trace_vxhs_failover_ioctl_cb(
>> +                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> +                   s->vdisk_guid);
>> +        VXHS_SPIN_LOCK(s->vdisk_lock);
>> +        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
>> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +        vxhs_handle_queued_ios(s);
>> +    } else {
>> +        /* keep looking */
>> +        trace_vxhs_failover_ioctl_cb_retry(s->vdisk_guid);
>> +        s->vdisk_ask_failover_idx++;
>> +        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
>
> This function, vxhs_failover_ioctl_cb(), is called from the external
> library callback, in addition to QEMU itself (i.e., vxhs_aio_rw).  This
> makes me concerned that we may have (or be able to induce) an unsafe race
> condition when incrementing the vdisk_ask_failover_idx array index.
>

vxhs_failover_ioctl_cb() can be called from the vxhs_iio_callback()
handler (its IRP_VDISK_CHECK_IO_FAILOVER_READY case), in response to the
qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY) request.

That ioctl, qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY), is only
issued from within vxhs_switch_storage_agent().

A direct call to vxhs_failover_ioctl_cb() is only made from the same
function, vxhs_switch_storage_agent(), on its "else" path. So
vxhs_failover_ioctl_cb() can only be called from a single thread
context, either in response to the ioctl or directly. We should be
safe here, but thanks for pointing it out!

>> +            /* pause and cycle through list again */
>> +            sleep(QNIO_CONNECT_RETRY_SECS);
>> +            s->vdisk_ask_failover_idx = 0;
>> +        }
>> +        res = vxhs_switch_storage_agent(s);
>> +    }
>> +}
>> +
>> +int vxhs_failover_io(BDRVVXHSState *s)
>> +{
>> +    int res = 0;
>> +
>> +    trace_vxhs_failover_io(s->vdisk_guid);
>> +
>> +    s->vdisk_ask_failover_idx = 0;
>> +    res = vxhs_switch_storage_agent(s);
>> +
>> +    return res;
>> +}
>> +
>> +/*
>> + * Try to reopen the vDisk on one of the available hosts
>> + * If vDisk reopen is successful on any of the host then
>> + * check if that node is ready to accept I/O.
>> + */
>> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int index)
>> +{
>> +    char *of_vsa_addr = NULL;
>> +    char *file_name = NULL;
>> +    int  res = 0;
>> +
>> +    /*
>> +     * Don't close the channel if it was opened
>> +     * before successfully. It will be handled
>> +     * within iio* api if the same channel open
>> +     * fd is reused.
>> +     *
>> +     * close stale vdisk device remote fd since
>> +     * it is invalid after channel disconnect.
>> +     */
>> +    if (s->vdisk_hostinfo[index].vdisk_rfd >= 0) {
>> +        qemu_iio_devclose(s->qnio_ctx, 0,
>> +                                 s->vdisk_hostinfo[index].vdisk_rfd);
>> +        s->vdisk_hostinfo[index].vdisk_rfd = -1;
>> +    }
>> +
>> +    /*
>> +     * build storage agent address and vdisk device name strings
>> +     */
>> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
>> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
>> +             s->vdisk_hostinfo[index].hostip, s->vdisk_hostinfo[index].port);
>> +    /*
>> +     * open qnio channel to storage agent if not opened before.
>> +     */
>> +    if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
>> +        s->vdisk_hostinfo[index].qnio_cfd =
>> +                qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
>> +        if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
>> +            trace_vxhs_reopen_vdisk(s->vdisk_hostinfo[index].hostip);
>> +            res = ENODEV;
>> +            goto out;
>> +        }
>> +    }
>> +    /*
>> +     * open vdisk device
>> +     */
>> +    s->vdisk_hostinfo[index].vdisk_rfd =
>> +            qemu_iio_devopen(global_qnio_ctx,
>> +             s->vdisk_hostinfo[index].qnio_cfd, file_name, 0);
>> +    if (s->vdisk_hostinfo[index].vdisk_rfd < 0) {
>> +        trace_vxhs_reopen_vdisk_openfail(file_name);
>> +        res = EIO;
>> +        goto out;
>> +    }
>> +
>> +out:
>> +    g_free(of_vsa_addr);
>> +    g_free(file_name);
>> +    return res;
>> +}
>> +
>> +int vxhs_handle_queued_ios(BDRVVXHSState *s)
>> +{
>> +    VXHSAIOCB *acb = NULL;
>> +    int res = 0;
>> +
>> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> +    while ((acb = QSIMPLEQ_FIRST(&s->vdisk_aio_retryq)) != NULL) {
>> +        /*
>> +         * Before we process the acb, check whether I/O failover
>> +         * started again due to failback or cascading failure.
>> +         */
>> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +            goto out;
>> +        }
>> +        QSIMPLEQ_REMOVE_HEAD(&s->vdisk_aio_retryq, retry_entry);
>> +        s->vdisk_aio_retry_qd--;
>> +        OF_AIOCB_FLAGS_RESET_QUEUED(acb);
>> +        if (OF_VDISK_FAILED(s)) {
>> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +            vxhs_fail_aio(acb, EIO);
>> +            VXHS_SPIN_LOCK(s->vdisk_lock);
>> +        } else {
>> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +            res = vxhs_restart_aio(acb);
>> +            trace_vxhs_handle_queued_ios(acb, res);
>> +            VXHS_SPIN_LOCK(s->vdisk_lock);
>> +            if (res) {
>> +                QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
>> +                                     acb, retry_entry);
>> +                OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +                goto out;
>> +            }
>> +        }
>> +    }
>> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> +out:
>> +    return res;
>> +}
>> +
>> +int vxhs_restart_aio(VXHSAIOCB *acb)
>> +{
>> +    BDRVVXHSState *s = NULL;
>> +    int iio_flags = 0;
>> +    int res = 0;
>> +
>> +    s = acb->common.bs->opaque;
>> +
>> +    if (acb->direction == VDISK_AIO_WRITE) {
>> +        vxhs_inc_vdisk_iocount(s, 1);
>> +        vxhs_inc_acb_segment_count(acb, 1);
>> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>> +        res = qemu_iio_writev(s->qnio_ctx,
>> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> +                acb->qiov->iov, acb->qiov->niov,
>> +                acb->io_offset, (void *)acb, iio_flags);
>> +    }
>> +
>> +    if (acb->direction == VDISK_AIO_READ) {
>> +        vxhs_inc_vdisk_iocount(s, 1);
>> +        vxhs_inc_acb_segment_count(acb, 1);
>> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>> +        res = qemu_iio_readv(s->qnio_ctx,
>> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> +                acb->qiov->iov, acb->qiov->niov,
>> +                acb->io_offset, (void *)acb, iio_flags);
>> +    }
>> +
>> +    if (res != 0) {
>> +        vxhs_dec_vdisk_iocount(s, 1);
>> +        vxhs_dec_acb_segment_count(acb, 1);
>> +        trace_vxhs_restart_aio(acb->direction, res, errno);
>> +    }
>> +
>> +    return res;
>> +}
>> +
>> +void vxhs_fail_aio(VXHSAIOCB *acb, int err)
>> +{
>> +    BDRVVXHSState *s = NULL;
>> +    int segcount = 0;
>> +    int rv = 0;
>> +
>> +    s = acb->common.bs->opaque;
>> +
>> +    trace_vxhs_fail_aio(s->vdisk_guid, acb);
>> +    if (!acb->ret) {
>> +        acb->ret = err;
>> +    }
>> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> +    segcount = acb->segments;
>> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> +    if (segcount == 0) {
>> +        /*
>> +         * Complete the io request
>> +         */
>> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
>> +        if (rv != sizeof(acb)) {
>> +            error_report("VXHS AIO completion failed: %s",
>> +                         strerror(errno));
>> +            abort();
>> +        }
>> +    }
>> +}
>> +
>> +static void vxhs_detach_aio_context(BlockDriverState *bs)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +
>> +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
>> +                       false, NULL, NULL, NULL);
>> +
>> +}
>> +
>> +static void vxhs_attach_aio_context(BlockDriverState *bs,
>> +                                   AioContext *new_context)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +
>> +    aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ],
>> +                       false, vxhs_aio_event_reader, NULL, s);
>> +}
>> +
>> +static BlockDriver bdrv_vxhs = {
>> +    .format_name                  = "vxhs",
>> +    .protocol_name                = "vxhs",
>> +    .instance_size                = sizeof(BDRVVXHSState),
>> +    .bdrv_file_open               = vxhs_open,
>> +    .bdrv_parse_filename          = vxhs_parse_filename,
>> +    .bdrv_close                   = vxhs_close,
>> +    .bdrv_getlength               = vxhs_getlength,
>> +    .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks,
>> +    .bdrv_aio_readv               = vxhs_aio_readv,
>> +    .bdrv_aio_writev              = vxhs_aio_writev,
>> +    .bdrv_co_flush_to_disk        = vxhs_co_flush,
>> +    .bdrv_detach_aio_context      = vxhs_detach_aio_context,
>> +    .bdrv_attach_aio_context      = vxhs_attach_aio_context,
>> +};
>> +
>> +void bdrv_vxhs_init(void)
>> +{
>> +    trace_vxhs_bdrv_init('.');
>> +    bdrv_register(&bdrv_vxhs);
>> +}
>> +
>> +block_init(bdrv_vxhs_init);
>> diff --git a/block/vxhs.h b/block/vxhs.h
>> new file mode 100644
>> index 0000000..38e0165
>> --- /dev/null
>> +++ b/block/vxhs.h
>> @@ -0,0 +1,237 @@
>> +/*
>> + * QEMU Block driver for Veritas HyperScale (VxHS)
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#ifndef VXHSD_H
>> +#define VXHSD_H
>> +
>> +#include <gmodule.h>
>> +#include <inttypes.h>
>> +#include <pthread.h>
>> +
>> +#include "qemu/osdep.h"
>> +#include "qapi/error.h"
>> +#include "qemu/error-report.h"
>> +#include "block/block_int.h"
>> +#include "qemu/uri.h"
>> +#include "qemu/queue.h"
>> +
>> +#define OF_GUID_STR_LEN             40
>> +#define OF_GUID_STR_SZ              (OF_GUID_STR_LEN + 1)
>> +#define QNIO_CONNECT_RETRY_SECS     5
>> +#define QNIO_CONNECT_TIMOUT_SECS    120
>> +
>> +/* constants from io_qnio.h */
>> +#define IIO_REASON_DONE     0x00000004
>> +#define IIO_REASON_EVENT    0x00000008
>> +#define IIO_REASON_HUP      0x00000010
>> +
>> +/*
>> + * IO specific flags
>> + */
>> +#define IIO_FLAG_ASYNC      0x00000001
>> +#define IIO_FLAG_DONE       0x00000010
>> +#define IIO_FLAG_SYNC       0
>> +
>> +/* constants from error.h */
>> +#define VXERROR_RETRY_ON_SOURCE     44
>> +#define VXERROR_HUP                 901
>> +#define VXERROR_CHANNEL_HUP         903
>> +
>> +/* constants from iomgr.h and opcode.h */
>> +#define IRP_READ_REQUEST                    0x1FFF
>> +#define IRP_WRITE_REQUEST                   0x2FFF
>> +#define IRP_VDISK_CHECK_IO_FAILOVER_READY   2020
>> +
>> +/* Lock specific macros */
>> +#define VXHS_SPIN_LOCK_ALLOC                  \
>> +    (qemu_ck_initialize_lock())
>> +#define VXHS_SPIN_LOCK(lock)                  \
>> +    (qemu_ck_spin_lock(lock))
>> +#define VXHS_SPIN_UNLOCK(lock)                \
>> +    (qemu_ck_spin_unlock(lock))
>> +#define VXHS_SPIN_LOCK_DESTROY(lock)          \
>> +    (qemu_ck_destroy_lock(lock))
>
> What's the value of these macros, rather than just calling the underlying
> function directly in code?
>

This way we can easily swap out the CK locks for something else, such
as pthread_mutex, if required.
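
For example, the same call sites could be backed by pthread mutexes with
only the definitions changing (illustrative only, not part of this
patch):

#define VXHS_SPIN_LOCK(lock)      (pthread_mutex_lock(lock))
#define VXHS_SPIN_UNLOCK(lock)    (pthread_mutex_unlock(lock))

with the ALLOC/DESTROY variants wrapping pthread_mutex_init() and
pthread_mutex_destroy() plus allocation/freeing of the pthread_mutex_t.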

>> +
>> +typedef enum {
>> +    VXHS_IO_INPROGRESS,
>> +    VXHS_IO_COMPLETED,
>> +    VXHS_IO_ERROR
>> +} VXHSIOState;
>> +
>> +
>> +typedef void (*qnio_callback_t)(ssize_t retval, void *arg);
>> +
>> +#define VDISK_FD_READ 0
>> +#define VDISK_FD_WRITE 1
>> +
>> +#define QNIO_VDISK_NONE        0x00
>> +#define QNIO_VDISK_CREATE      0x01
>> +
>> +/* max IO size supported by QEMU NIO lib */
>> +#define QNIO_MAX_IO_SIZE       4194304
>> +
>> +#define MAX_HOSTS               4
>> +
>> +/*
>> + * Opcodes for making IOCTL on QEMU NIO library
>> + */
>> +#define BASE_OPCODE_SHARED     1000
>> +#define BASE_OPCODE_DAL        2000
>> +#define IRP_VDISK_STAT                  (BASE_OPCODE_SHARED + 5)
>> +#define IRP_VDISK_GET_GEOMETRY          (BASE_OPCODE_DAL + 17)
>> +#define IRP_VDISK_READ_PARTITION        (BASE_OPCODE_DAL + 18)
>> +#define IRP_VDISK_FLUSH                 (BASE_OPCODE_DAL + 19)
>> +
>> +/*
>> + * BDRVVXHSState specific flags
>> + */
>> +#define OF_VDISK_FLAGS_STATE_ACTIVE             0x0000000000000001
>> +#define OF_VDISK_FLAGS_STATE_FAILED             0x0000000000000002
>> +#define OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS   0x0000000000000004
>> +
>> +#define OF_VDISK_ACTIVE(s)                                              \
>> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_ACTIVE)
>> +#define OF_VDISK_SET_ACTIVE(s)                                          \
>> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_ACTIVE)
>> +#define OF_VDISK_RESET_ACTIVE(s)                                        \
>> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_ACTIVE)
>> +
>> +#define OF_VDISK_FAILED(s)                                              \
>> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_FAILED)
>> +#define OF_VDISK_SET_FAILED(s)                                          \
>> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_FAILED)
>> +#define OF_VDISK_RESET_FAILED(s)                                        \
>> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_FAILED)
>> +
>> +#define OF_VDISK_IOFAILOVER_IN_PROGRESS(s)                              \
>> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>> +#define OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s)                          \
>> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>> +#define OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s)                        \
>> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>> +
>> +/*
>> + * VXHSAIOCB specific flags
>> + */
>> +#define OF_ACB_QUEUED       0x00000001
>> +
>> +#define OF_AIOCB_FLAGS_QUEUED(a)            \
>> +        ((a)->flags & OF_ACB_QUEUED)
>> +#define OF_AIOCB_FLAGS_SET_QUEUED(a)        \
>> +        ((a)->flags |= OF_ACB_QUEUED)
>> +#define OF_AIOCB_FLAGS_RESET_QUEUED(a)      \
>> +        ((a)->flags &= ~OF_ACB_QUEUED)
>> +
>
> Many helper macros here... IMO, it decreases the readability of the code,
> and my preference is to just have the underlying code without the use of the
> macros.
>

Would like to keep this unchanged if that's OK!

>> +typedef struct qemu2qnio_ctx {
>> +    uint32_t            qnio_flag;
>> +    uint64_t            qnio_size;
>> +    char                *qnio_channel;
>> +    char                *target;
>> +    qnio_callback_t     qnio_cb;
>> +} qemu2qnio_ctx_t;
>> +
>> +typedef qemu2qnio_ctx_t qnio2qemu_ctx_t;
>> +
>> +typedef struct LibQNIOSymbol {
>> +        const char *name;
>> +        gpointer *addr;
>> +} LibQNIOSymbol;
>> +
>> +typedef void (*iio_cb_t) (uint32_t rfd, uint32_t reason, void *ctx,
>> +                          void *reply);
>> +
>> +/*
>> + * HyperScale AIO callbacks structure
>> + */
>> +typedef struct VXHSAIOCB {
>> +    BlockAIOCB          common;
>> +    size_t              ret;
>> +    size_t              size;
>> +    QEMUBH              *bh;
>> +    int                 aio_done;
>> +    int                 segments;
>> +    int                 flags;
>> +    size_t              io_offset;
>> +    QEMUIOVector        *qiov;
>> +    void                *buffer;
>> +    int                 direction;  /* IO direction (r/w) */
>> +    QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry;
>> +} VXHSAIOCB;
>> +
>> +typedef struct VXHSvDiskHostsInfo {
>> +    int     qnio_cfd;        /* Channel FD */
>> +    int     vdisk_rfd;      /* vDisk remote FD */
>> +    char    *hostip;        /* Host's IP addresses */
>> +    int     port;           /* Host's port number */
>> +} VXHSvDiskHostsInfo;
>> +
>> +/*
>> + * Structure per vDisk maintained for state
>> + */
>> +typedef struct BDRVVXHSState {
>> +    int                     fds[2];
>> +    int64_t                 vdisk_size;
>> +    int64_t                 vdisk_blocks;
>> +    int64_t                 vdisk_flags;
>> +    int                     vdisk_aio_count;
>> +    int                     event_reader_pos;
>> +    VXHSAIOCB               *qnio_event_acb;
>> +    void                    *qnio_ctx;
>> +    void                    *vdisk_lock; /* Lock to protect BDRVVXHSState */
>> +    void                    *vdisk_acb_lock;  /* Protects ACB */
>> +    VXHSvDiskHostsInfo      vdisk_hostinfo[MAX_HOSTS]; /* Per host info */
>> +    int                     vdisk_nhosts;   /* Total number of hosts */
>> +    int                     vdisk_cur_host_idx; /* IOs are being shipped to */
>> +    int                     vdisk_ask_failover_idx; /*asking permsn to ship io*/
>> +    QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
>> +    int                     vdisk_aio_retry_qd;
>
> Is vdisk_aio_retry_qd just used for debugging purposes?  I see it
> incremented/decremented throughout the code, but no one checks it.
>

Yes, it is for debugging only. Added a comment saying so.
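
Something along these lines (exact wording in v5 may differ):

    int    vdisk_aio_retry_qd;    /* debug only: IOs on the retry queue */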


>> +    char                    *vdisk_guid;
>> +} BDRVVXHSState;
>> +
>> +void bdrv_vxhs_init(void);
>> +void *vxhs_setup_qnio(void);
>> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m);
>> +void vxhs_aio_event_reader(void *opaque);
>> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s);
>> +int vxhs_aio_flush_cb(void *opaque);
>> +void vxhs_finish_aiocb(ssize_t ret, void *arg);
>> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s);
>> +int vxhs_open(BlockDriverState *bs, QDict *options,
>> +              int bdrv_flags, Error **errp);
>> +void vxhs_close(BlockDriverState *bs);
>> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, int64_t sector_num,
>> +                                   QEMUIOVector *qiov, int nb_sectors,
>> +                                   BlockCompletionFunc *cb, void *opaque);
>> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, int64_t sector_num,
>> +                                    QEMUIOVector *qiov, int nb_sectors,
>> +                                    BlockCompletionFunc *cb,
>> +                                    void *opaque);
>> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs);
>> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
>> +                                QEMUIOVector *qiov, int nb_sectors,
>> +                                BlockCompletionFunc *cb,
>> +                                void *opaque, int write);
>> +int vxhs_co_flush(BlockDriverState *bs);
>> +int64_t vxhs_getlength(BlockDriverState *bs);
>> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t delta);
>> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t delta);
>> +uint32_t vxhs_get_vdisk_iocount(void *ptr);
>> +void vxhs_inc_acb_segment_count(void *ptr, int count);
>> +void vxhs_dec_acb_segment_count(void *ptr, int count);
>> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count);
>> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
>> +int vxhs_failover_io(BDRVVXHSState *s);
>> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int hostinfo_index);
>> +int vxhs_switch_storage_agent(BDRVVXHSState *s);
>> +int vxhs_handle_queued_ios(BDRVVXHSState *s);
>> +int vxhs_restart_aio(VXHSAIOCB *acb);
>> +void vxhs_fail_aio(VXHSAIOCB *acb, int err);
>> +void vxhs_failover_ioctl_cb(int res, void *ctx);
>> +
>> +
>> +#endif
>> diff --git a/configure b/configure
>> index 4b808f9..c71e270 100755
>> --- a/configure
>> +++ b/configure
>> @@ -320,6 +320,7 @@ vhdx=""
>>  numa=""
>>  tcmalloc="no"
>>  jemalloc="no"
>> +vxhs=""
>>
>>  # parse CC options first
>>  for opt do
>> @@ -1150,6 +1151,11 @@ for opt do
>>    ;;
>>    --enable-jemalloc) jemalloc="yes"
>>    ;;
>> +  --disable-vxhs) vxhs="no"
>> +  ;;
>> +  --enable-vxhs) vxhs="yes"
>> +  ;;
>> +
>>    *)
>>        echo "ERROR: unknown option $opt"
>>        echo "Try '$0 --help' for more information"
>> @@ -1380,6 +1386,7 @@ disabled with --disable-FEATURE, default is enabled if available:
>>    numa            libnuma support
>>    tcmalloc        tcmalloc support
>>    jemalloc        jemalloc support
>> +  vxhs            Veritas HyperScale vDisk backend support
>>
>>  NOTE: The object files are built at the place where configure is launched
>>  EOF
>> @@ -4543,6 +4550,43 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO $TMPO; then
>>  fi
>>
>>  ##########################################
>> +# Veritas HyperScale block driver VxHS
>> +# Check if libqnio is installed
>> +if test "$vxhs" != "no" ; then
>> +  cat > $TMPC <<EOF
>> +#include <stdio.h>
>> +#include <qnio/qnio_api.h>
>> +
>> +void vxhs_inc_acb_segment_count(void *acb, int count);
>> +void vxhs_dec_acb_segment_count(void *acb, int count);
>> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
>> +
>> +void vxhs_inc_acb_segment_count(void *ptr, int count)
>> +{
>> +}
>> +void vxhs_dec_acb_segment_count(void *ptr, int count)
>> +{
>> +}
>> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
>> +{
>> +}
>> +int main(void) {
>> +    qemu_ck_initialize_lock();
>> +    return 0;
>> +}
>> +EOF
>> +  vxhs_libs="-lqnioshim -lqnio"
>> +  if compile_prog "" "$vxhs_libs" ; then
>> +    vxhs=yes
>> +  else
>> +    if test "$vxhs" = "yes" ; then
>> +      feature_not_found "vxhs block device" "Install libqnio. See github"
>> +    fi
>> +    vxhs=no
>> +  fi
>> +fi
>> +
>> +##########################################
>>  # End of CC checks
>>  # After here, no more $cc or $ld runs
>>
>> @@ -5488,6 +5532,12 @@ if test "$pthread_setname_np" = "yes" ; then
>>    echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak
>>  fi
>>
>> +if test "$vxhs" = "yes" ; then
>> +  echo "CONFIG_VXHS=y" >> $config_host_mak
>> +  echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak
>> +  echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak
>> +fi
>> +
>>  if test "$tcg_interpreter" = "yes"; then
>>    QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
>>  elif test "$ARCH" = "sparc64" ; then
>
> Your configure file should indicate if vxhs is enabled or not. For instance,
> here is a snippet from where this occurs in the current configure file:
>
> [...]
> echo "NUMA host support $numa"
> echo "tcmalloc support  $tcmalloc"
> echo "jemalloc support  $jemalloc"
> echo "avx2 optimization $avx2_opt"
>
>

Fixed.

>> --
>> 2.5.5
>>
>>
Kevin Wolf Sept. 8, 2016, 8:43 a.m. UTC | #4
On 08.09.2016 at 00:32, ashish mittal wrote:
> >> +
> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> >> +{
> >> +    VXHSAIOCB *acb = ptr;
> >> +
> >> +    acb->buffer = buffer;
> >> +}
> >> +
> >
> > Unused function?
> 
> This is called from within libqnio.

Wait, you mean the library references a symbol in the qemu binary? This
sounds completely wrong to me. I wouldn't even do something like this if
it were an internal qemu library because I think libraries should be
self-contained, but it's a much larger problem in something that is
supposed to be an independent library.

If the library has to call into qemu, it should only do so through
callbacks that qemu registered with the library (and then the function
wouldn't look unused in qemu).
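
For instance, a registration-style interface could look roughly like
this (the names are purely illustrative, this is not an existing libqnio
API):

/* hypothetical: qemu hands its ACB hooks to the library at init time */
typedef struct QnioAcbOps {
    void (*inc_segments)(void *acb, int count);
    void (*dec_segments)(void *acb, int count);
    void (*set_buffer)(void *acb, void *buffer);
} QnioAcbOps;

void *qnio_init_with_ops(const QnioAcbOps *ops, iio_cb_t io_cb);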

In an earlier version of the series I suggested moving the part directly
related to qemu (at least the functions called qemu_*) from the library
to the block driver. Did you consider this?

Kevin
Daniel P. Berrangé Sept. 8, 2016, 8:49 a.m. UTC | #5
On Thu, Sep 08, 2016 at 10:43:19AM +0200, Kevin Wolf wrote:
> On 08.09.2016 at 00:32, ashish mittal wrote:
> > >> +
> > >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> > >> +{
> > >> +    VXHSAIOCB *acb = ptr;
> > >> +
> > >> +    acb->buffer = buffer;
> > >> +}
> > >> +
> > >
> > > Unused function?
> > 
> > This is called from within libqnio.
> 
> Wait, you mean the library references a symbol in the qemu binary? This
> sounds completely wrong to me. I wouldn't even do something like this if
> it were an internal qemu library because I think libraries should be
> self-contained, but it's a much larger problem in something that is
> supposed to be an independent library.

I'd be surprised if that actually worked. If an external library wants
to reference symbols in the QEMU binary, then QEMU would need to be using
the -export-dynamic / -rdynamic linker flags to export all its symbols,
and AFAIK, we're not doing that.


Regards,
Daniel
Kevin Wolf Sept. 8, 2016, 9:29 a.m. UTC | #6
On 08.09.2016 at 10:49, Daniel P. Berrange wrote:
> On Thu, Sep 08, 2016 at 10:43:19AM +0200, Kevin Wolf wrote:
> > On 08.09.2016 at 00:32, ashish mittal wrote:
> > > >> +
> > > >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> > > >> +{
> > > >> +    VXHSAIOCB *acb = ptr;
> > > >> +
> > > >> +    acb->buffer = buffer;
> > > >> +}
> > > >> +
> > > >
> > > > Unused function?
> > > 
> > > This is called from within libqnio.
> > 
> > Wait, you mean the library references a symbol in the qemu binary? This
> > sounds completely wrong to me. I wouldn't even do something like this if
> > it were an internal qemu library because I think libraries should be
> > self-contained, but it's a much larger problem in something that is
> > supposed to be an independent library.
> 
> I'd be surprised if that actually worked. If an external library wants
> to reference symbols in the QEMU binary, then QEMU would need to be using
> the -export-dynamic / -rdynamic linker flags to export all its symbols,
> and AFAIK, we're not doing that.

Hm, but if the function is really used by the library, how else would it
be when its name isn't mentioned anywhere in the patch except in its
declaration? And it appears to be called there directly:

https://github.com/MittalAshish/libqnio/search?q=vxhs_set_acb_buffer

Anyway, something is fishy here.

Kevin
Daniel P. Berrangé Sept. 8, 2016, 9:34 a.m. UTC | #7
On Thu, Sep 08, 2016 at 11:29:41AM +0200, Kevin Wolf wrote:
> On 08.09.2016 at 10:49, Daniel P. Berrange wrote:
> > On Thu, Sep 08, 2016 at 10:43:19AM +0200, Kevin Wolf wrote:
> > > On 08.09.2016 at 00:32, ashish mittal wrote:
> > > > >> +
> > > > >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> > > > >> +{
> > > > >> +    VXHSAIOCB *acb = ptr;
> > > > >> +
> > > > >> +    acb->buffer = buffer;
> > > > >> +}
> > > > >> +
> > > > >
> > > > > Unused function?
> > > > 
> > > > This is called from within libqnio.
> > > 
> > > Wait, you mean the library references a symbol in the qemu binary? This
> > > sounds completely wrong to me. I wouldn't even do something like this if
> > > it were an internal qemu library because I think libraries should be
> > > self-contained, but it's a much larger problem in something that is
> > > supposed to be an independent library.
> > 
> > I'd be surprised if that actually worked. If an external library wants
> > to reference symbols in the QEMU binary, then QEMU would need to be using
> > the -export-dynamic / -rdynamic linker flags to export all its symbols,
> > and AFAIK, we're not doing that.
> 
> Hm, but if the function is really used by the library, how else would it
> be when its name isn't mentioned anywhere in the patch except in its
> declaration? And it appears to be called there directly:
> 
> https://github.com/MittalAshish/libqnio/search?q=vxhs_set_acb_buffer
> 
> Anyway, something is fishy here.

Yeah, and as you say, this approach is just plain wrong, and absolutely
can never be merged as is. The external library must never use anything
in QEMU that it wasn't explicitly given via a callback, otherwise that
library is de facto part of QEMU.

Regards,
Daniel
Daniel P. Berrangé Sept. 8, 2016, 9:43 a.m. UTC | #8
On Thu, Sep 08, 2016 at 11:29:41AM +0200, Kevin Wolf wrote:
> On 08.09.2016 at 10:49, Daniel P. Berrange wrote:
> > On Thu, Sep 08, 2016 at 10:43:19AM +0200, Kevin Wolf wrote:
> > > On 08.09.2016 at 00:32, ashish mittal wrote:
> > > > >> +
> > > > >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> > > > >> +{
> > > > >> +    VXHSAIOCB *acb = ptr;
> > > > >> +
> > > > >> +    acb->buffer = buffer;
> > > > >> +}
> > > > >> +
> > > > >
> > > > > Unused function?
> > > > 
> > > > This is called from within libqnio.
> > > 
> > > Wait, you mean the library references a symbol in the qemu binary? This
> > > sounds completely wrong to me. I wouldn't even do something like this if
> > > it were an internal qemu library because I think libraries should be
> > > self-contained, but it's a much larger problem in something that is
> > > supposed to be an independent library.
> > 
> > I'd be surprised if that actually worked. If an external library wants
> > to reference symbols in the QEMU binary, then QEMU would need to be using
> > the -export-dynamic / -rdynamic linker flags to export all its symbols,
> > and AFAIK, we're not doing that.
> 
> Hm, but if the function is really used by the library, how else would it
> be when its name isn't mentioned anywhere in the patch except in its
> declaration? And it appears to be called there directly:
> 
> https://github.com/MittalAshish/libqnio/search?q=vxhs_set_acb_buffer
> 
> Anyway, something is fishy here.

Oh, notice that .c file is actually part of the shim library, not the
main libqnio.so that QEMU would link against.

So, it really is unused and should be deleted from this block/vxhs.c
file.

Regards,
Daniel
Kevin Wolf Sept. 8, 2016, 10:18 a.m. UTC | #9
On 08.09.2016 at 11:43, Daniel P. Berrange wrote:
> On Thu, Sep 08, 2016 at 11:29:41AM +0200, Kevin Wolf wrote:
> On 08.09.2016 at 10:49, Daniel P. Berrange wrote:
> > > On Thu, Sep 08, 2016 at 10:43:19AM +0200, Kevin Wolf wrote:
> > > On 08.09.2016 at 00:32, ashish mittal wrote:
> > > > > >> +
> > > > > >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> > > > > >> +{
> > > > > >> +    VXHSAIOCB *acb = ptr;
> > > > > >> +
> > > > > >> +    acb->buffer = buffer;
> > > > > >> +}
> > > > > >> +
> > > > > >
> > > > > > Unused function?
> > > > > 
> > > > > This is called from within libqnio.
> > > > 
> > > > Wait, you mean the library references a symbol in the qemu binary? This
> > > > sounds completely wrong to me. I wouldn't even do something like this if
> > > > it were an internal qemu library because I think libraries should be
> > > > self-contained, but it's a much larger problem in something that is
> > > > supposed to be an independent library.
> > > 
> > > I'd be surprised if that actually worked. If an external library wants
> > > to reference symbols in the QEMU binary, then QEMU would need to be using
> > > the -export-dynamic / -rdynamic linker flags to export all its symbols,
> > > and AFAIK, we're not doing that.
> > 
> > Hm, but if the function is really used by the library, how else would it
> > be when its name isn't mentioned anywhere in the patch except in its
> > declaration? And it appears to be called there directly:
> > 
> > https://github.com/MittalAshish/libqnio/search?q=vxhs_set_acb_buffer
> > 
> > Anyway, something is fishy here.
> 
> Oh, notice that .c file is actually part of the shim library, not the
> main libqnio.so that QEMU would link against.
> 
> So, it really is unused and should be deleted from this block/vxhs.c
> file.

I haven't fully understood yet what's the deal with this shim library,
but this patch links against both.

Kevin
Paolo Bonzini Sept. 8, 2016, 10:23 a.m. UTC | #10
On 08/09/2016 12:18, Kevin Wolf wrote:
> On 08.09.2016 at 11:43, Daniel P. Berrange wrote:
>> On Thu, Sep 08, 2016 at 11:29:41AM +0200, Kevin Wolf wrote:
>>> On 08.09.2016 at 10:49, Daniel P. Berrange wrote:
>>>> On Thu, Sep 08, 2016 at 10:43:19AM +0200, Kevin Wolf wrote:
>>>>> On 08.09.2016 at 00:32, ashish mittal wrote:
>>>>>>>> +
>>>>>>>> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
>>>>>>>> +{
>>>>>>>> +    VXHSAIOCB *acb = ptr;
>>>>>>>> +
>>>>>>>> +    acb->buffer = buffer;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>
>>>>>>> Unused function?
>>>>>>
>>>>>> This is called from within libqnio.
>>>>>
>>>>> Wait, you mean the library references a symbol in the qemu binary? This
>>>>> sounds completely wrong to me. I wouldn't even do something like this if
>>>>> it were an internal qemu library because I think libraries should be
>>>>> self-contained, but it's a much larger problem in something that is
>>>>> supposed to be an independent library.
>>>>
>>>> I'd be surprised if that actually worked. If an external library wants
>>>> to reference symbols in the QEMU binary, then QEMU would need to be using
>>>> the -export-dynamic / -rdynamic linker flags to export all its symbols,
>>>> and AFAIK, we're not doing that.
>>>
>>> Hm, but if the function is really used by the library, how else would it
>>> be when its name isn't mentioned anywhere in the patch except in its
>>> declaration? And it appears to be called there directly:
>>>
>>> https://github.com/MittalAshish/libqnio/search?q=vxhs_set_acb_buffer
>>>
>>> Anyway, something is fishy here.
>>
>> Oh, notice that .c file is actually part of the shim library, not the
>> main libqnio.so that QEMU would link against.
>>
>> So, it really is unused and should be deleted from this block/vxhs.c
>> file.
> 
> I haven't fully understood yet what's the deal with this shim library,
> but this patch links against both.

Indeed, I suspect that the "shim library" should really be part of QEMU.
It's just 700 lines of code, and some parts (e.g.
qnio_ck_initialize_lock) even seem unused.  The main thing to do is
convert it from cJSON to QEMU's internal libraries for the same thing.
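
For the JSON part the conversion is mostly mechanical; roughly (a
sketch, assuming the qobject_from_json()/qdict_get_try_str() helpers,
with an illustrative key name):

    QObject *obj = qobject_from_json(json_str);
    QDict *args = obj ? qobject_to_qdict(obj) : NULL;
    const char *target = NULL;

    if (args) {
        target = qdict_get_try_str(args, "target");
    }
    /* ... use target, then release the parse tree ... */
    qobject_decref(obj);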

Paolo
Jeff Cody Sept. 8, 2016, 12:48 p.m. UTC | #11
On Thu, Sep 08, 2016 at 10:34:24AM +0100, Daniel P. Berrange wrote:
> On Thu, Sep 08, 2016 at 11:29:41AM +0200, Kevin Wolf wrote:
> > On 08.09.2016 at 10:49, Daniel P. Berrange wrote:
> > > On Thu, Sep 08, 2016 at 10:43:19AM +0200, Kevin Wolf wrote:
> > > > On 08.09.2016 at 00:32, ashish mittal wrote:
> > > > > >> +
> > > > > >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> > > > > >> +{
> > > > > >> +    VXHSAIOCB *acb = ptr;
> > > > > >> +
> > > > > >> +    acb->buffer = buffer;
> > > > > >> +}
> > > > > >> +
> > > > > >
> > > > > > Unused function?
> > > > > 
> > > > > This is called from within libqnio.
> > > > 
> > > > Wait, you mean the library references a symbol in the qemu binary? This
> > > > sounds completely wrong to me. I wouldn't even do something like this if
> > > > it were an internal qemu library because I think libraries should be
> > > > self-contained, but it's a much larger problem in something that is
> > > > supposed to be an independent library.
> > > 
> > > I'd be surprised if that actually worked. If an external library wants
> > > to reference symbols in the QEMU binary, then QEMU would need to be using
> > > the -export-dynamic / -rdynamic linker flags to export all its symbols,
> > > and AFAIK, we're not doing that.
> > 
> > Hm, but if the function is really used by the library, how else would it
> > be when its name isn't mentioned anywhere in the patch except in its
> > declaration? And it appears to be called there directly:
> > 
> > https://github.com/MittalAshish/libqnio/search?q=vxhs_set_acb_buffer
> > 
> > Anyway, something is fishy here.
> 
> Yeah, and as you say, this approach is just plain wrong, and absolutely
> can never be merged as is. The external library must never use anything
> in QEMU that it wasn't explicitly given via a callback, otherwise that
> library is de facto part of QEMU.
>

Looking at the qnio test server, it does not link against the shim; it looks
like only QEMU links against the shim.

There are at least 3 qemu symbols referenced by the shim library (as of v5):
vxhs_dec_acb_segment_count, vxhs_inc_acb_segment_count, vxhs_set_acb_buffer.

So given that:

1. QEMU is the only user of the shim,
2. The shim calls QEMU functions directly,
3. The shim only exists to bridge QEMU and the QNIO library,
   (From the shim .c file:
   "Network IO library for VxHS QEMU block driver ")
4. The shim makes assumptions about QEMU (e.g. sector size alignment)

I strongly agree with everyone that the shim should just be part of QEMU;
that is a better choice than simply refactoring the QEMU calls out of the
shim.


Jeff
Jeff Cody Sept. 8, 2016, 2 p.m. UTC | #12
On Wed, Sep 07, 2016 at 03:32:47PM -0700, ashish mittal wrote:
> Hi Jeff,
> 
> Thank you for the comments and corrections.
> 
> I have submitted a v5 patch after incorporating most of your review
> comments. Please find details in-line.
> 
> Thanks,
> Ashish
> 
> On Tue, Aug 30, 2016 at 10:35 AM, Jeff Cody <jcody@redhat.com> wrote:
> >
> > First-pass-over-the-code review:
> >
> > On Mon, Aug 22, 2016 at 11:56:30PM -0700, Ashish Mittal wrote:
> >> This patch adds support for a new block device type called "vxhs".
> >> Source code for the library that this code loads can be downloaded from:
> >> https://github.com/MittalAshish/libqnio.git
> >>
> >> Sample command line using JSON syntax:
> >> ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"},{"host":"172.172.17.2","port":"9999"}]}'
> >>
> >> Sample command line using URI syntax:
> >> qemu-img convert -f raw -O raw -n /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D
> >>
> >
> > Is there a reference specification for vxhs available?  If so, the commit
> > message would be a good place to drop a link.
> >
> 
> We don't have a reference specification doc yet, although I will
> shortly be adding a test program to libqnio that can be used to
> test/demo basic libqnio IO transfer.
> 
> >> Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com>
> >> ---
> >> v3 changelog:
> >> =============
> >> (1) Reworked QAPI/JSON parsing.
> >> (2) Reworked URI parsing as suggested by Kevin.
> >> (3) Fixes per review comments from Stefan on v1.
> >> (4) Fixes per review comments from Daniel on v3.
> >>
> >>  block/Makefile.objs |    2 +
> >>  block/trace-events  |   41 ++
> >>  block/vxhs.c        | 1304 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  block/vxhs.h        |  237 ++++++++++
> >>  configure           |   50 ++
> >>  5 files changed, 1634 insertions(+)
> >>  create mode 100644 block/vxhs.c
> >>  create mode 100644 block/vxhs.h
> >>
> >> diff --git a/block/Makefile.objs b/block/Makefile.objs
> >> index 2593a2f..5f83305 100644
> >> --- a/block/Makefile.objs
> >> +++ b/block/Makefile.objs
> >> @@ -20,6 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o
> >>  block-obj-$(CONFIG_GLUSTERFS) += gluster.o
> >>  block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
> >>  block-obj-$(CONFIG_LIBSSH2) += ssh.o
> >> +block-obj-$(CONFIG_VXHS) += vxhs.o
> >>  block-obj-y += accounting.o dirty-bitmap.o
> >>  block-obj-y += write-threshold.o
> >>
> >> @@ -43,3 +44,4 @@ block-obj-m        += dmg.o
> >>  dmg.o-libs         := $(BZIP2_LIBS)
> >>  qcow.o-libs        := -lz
> >>  linux-aio.o-libs   := -laio
> >> +vxhs.o-libs        := $(VXHS_LIBS)
> >> diff --git a/block/trace-events b/block/trace-events
> >> index 05fa13c..06c6d8c 100644
> >> --- a/block/trace-events
> >> +++ b/block/trace-events
> >> @@ -114,3 +114,44 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
> >>  qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
> >>  qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
> >>  qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
> >> +
> >> +# block/vxhs.c
> >> +vxhs_bdrv_init(const char c) "Registering VxHS AIO driver%c"
> >> +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, reason %d"
> >> +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p"
> >> +vxhs_setup_qnio_nwerror(char c) "Could not initialize the network channel. Bailing out%c"
> >> +vxhs_iio_callback_iofail(int err, int reason, void *acb, int seg) "Read/Write failed: error %d, reason %d, acb %p, segment %d"
> >> +vxhs_iio_callback_retry(char *guid, void *acb) "vDisk %s, added acb %p to retry queue (5)"
> >> +vxhs_iio_callback_chnlfail(int error) "QNIO channel failed, no i/o (%d)"
> >> +vxhs_iio_callback_fail(int r, void *acb, int seg, uint64_t size, int err) " ALERT: reason = %d , acb = %p, acb->segments = %d, acb->size = %lu Error = %d"
> >> +vxhs_fail_aio(char * guid, void *acb) "vDisk %s, failing acb %p"
> >> +vxhs_iio_callback_ready(char *vd, int err) "async vxhs_iio_callback: IRP_VDISK_CHECK_IO_FAILOVER_READY completed for vdisk %s with error %d"
> >> +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d"
> >> +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d"
> >> +vxhs_open_fail(int ret) "Could not open the device. Error = %d"
> >> +vxhs_open_epipe(char c) "Could not create a pipe for device. Bailing out%c"
> >> +vxhs_aio_rw(char *guid, int iodir, uint64_t size, uint64_t offset) "vDisk %s, vDisk device is in failed state iodir = %d size = %lu offset = %lu"
> >> +vxhs_aio_rw_retry(char *guid, void *acb, int queue) "vDisk %s, added acb %p to retry queue(%d)"
> >> +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d"
> >> +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int seg, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %lu offset = %lu ACB = %p Segments = %d. Error = %d, errno = %d"
> >> +vxhs_co_flush(char *guid, int ret, int err) "vDisk (%s) Flush ioctl failed ret = %d errno = %d"
> >> +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d"
> >> +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %lu"
> >> +vxhs_switch_storage_agent(char *ip, char *guid) "Query host %s for vdisk %s"
> >> +vxhs_switch_storage_agent_failed(char *ip, char *guid, int res, int err) "Query to host %s for vdisk %s failed, res = %d, errno = %d"
> >> +vxhs_failover_ioctl_cb(char *ip, char *guid) "Switched to storage server host-IP %s for vdisk %s"
> >> +vxhs_failover_ioctl_cb_retry(char *guid) "failover_ioctl_cb: keep looking for io target for vdisk %s"
> >> +vxhs_failover_io(char *vdisk) "I/O Failover starting for vDisk %s"
> >> +vxhs_reopen_vdisk(char *ip) "Failed to connect to storage agent on host-ip %s"
> >> +vxhs_reopen_vdisk_openfail(char *fname) "Failed to open vdisk device: %s"
> >> +vxhs_handle_queued_ios(void *acb, int res) "Restarted acb %p res %d"
> >> +vxhs_restart_aio(int dir, int res, int err) "IO ERROR FOR: Read/Write = %d Error = %d, errno = %d"
> >> +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld"
> >> +vxhs_aio_rw_iofail(char *guid) "vDisk %s, I/O operation failed."
> >> +vxhs_aio_rw_devfail(char *guid, int dir, uint64_t size, uint64_t off) "vDisk %s, vDisk device failed iodir = %d size = %lu offset = %lu"
> >> +vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s"
> >> +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s"
> >> +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = %d"
> >> +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP %s, Port %d"
> >> +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState"
> >> +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s"
> >> diff --git a/block/vxhs.c b/block/vxhs.c
> >> new file mode 100644
> >> index 0000000..32c37c5
> >> --- /dev/null
> >> +++ b/block/vxhs.c
> >> @@ -0,0 +1,1304 @@
> >> +/*
> >> + * QEMU Block driver for Veritas HyperScale (VxHS)
> >> + *
> >
> > Another good place for a link to the specification, if available.
> >
> >
> >> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >> + * See the COPYING file in the top-level directory.
> >> + *
> >> + */
> >> +
> >> +#include "vxhs.h"
> >> +#include <qnio/qnio_api.h>
> >> +#include "trace.h"
> >> +#include "qapi/qmp/qerror.h"
> >> +#include "qapi/qmp/qdict.h"
> >> +#include "qapi/qmp/qstring.h"
> >> +
> >> +#define VXHS_OPT_FILENAME        "filename"
> >> +#define VXHS_OPT_VDISK_ID        "vdisk_id"
> >> +#define VXHS_OPT_SERVER          "server."
> >> +#define VXHS_OPT_HOST            "host"
> >> +#define VXHS_OPT_PORT            "port"
> >> +
> >> +/* qnio client ioapi_ctx */
> >> +static void *global_qnio_ctx;
> >> +
> >> +/* vdisk prefix to pass to qnio */
> >> +static const char vdisk_prefix[] = "/dev/of/vdisk";
> >> +
> >> +void vxhs_inc_acb_segment_count(void *ptr, int count)
> >> +{
> >> +    VXHSAIOCB *acb = ptr;
> >> +    BDRVVXHSState *s = acb->common.bs->opaque;
> >> +
> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> >> +    acb->segments += count;
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> >> +}
> >> +
> >> +void vxhs_dec_acb_segment_count(void *ptr, int count)
> >> +{
> >> +    VXHSAIOCB *acb = ptr;
> >> +    BDRVVXHSState *s = acb->common.bs->opaque;
> >> +
> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> >> +    acb->segments -= count;
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> >> +}
> >> +
> >> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count)
> >> +{
> >> +    VXHSAIOCB *acb = ptr;
> >> +    BDRVVXHSState *s = acb->common.bs->opaque;
> >> +    int segcount = 0;
> >> +
> >> +
> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> >> +    acb->segments -= count;
> >> +    segcount = acb->segments;
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> >> +
> >> +    return segcount;
> >> +}
> >
> > Unused function?
> >
> 
> Yes, it is unused. Sorry I forgot to remove it from V5. Will remember
> to remove in the next iteration.
> 
> >> +
> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> >> +{
> >> +    VXHSAIOCB *acb = ptr;
> >> +
> >> +    acb->buffer = buffer;
> >> +}
> >> +
> >
> > Unused function?
> >
> 
> This is called from within libqnio.
> 
> > (There are several unused functions - are there other patches that use these
> > functions?  Maybe I missed them on the list.  If not, and they are for
> > future work, maybe it would be best to introduce these functions in the
> > patch series that will actually use them. It will cut down on the amount of
> > code to be reviewed, and since it is not exercised it can't be verified
> > anyway).
> >
> 
> Removed all unused functions, except vxhs_dec_and_get_acb_segment_count().
> 
> >> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t count)
> >> +{
> >> +    BDRVVXHSState *s = ptr;
> >> +
> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +    s->vdisk_aio_count += count;
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +}
> >> +
> >> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t count)
> >> +{
> >> +    BDRVVXHSState *s = ptr;
> >> +
> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +    s->vdisk_aio_count -= count;
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +}
> >> +
> >> +uint32_t vxhs_get_vdisk_iocount(void *ptr)
> >> +{
> >> +    BDRVVXHSState *s = ptr;
> >> +    uint32_t count = 0;
> >> +
> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +    count = s->vdisk_aio_count;
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +
> >> +    return count;
> >> +}
> >
> > Function only used by another function that is not called from anywhere.
> >
> >
> 
> Removed.
> 
> >> +
> >> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m)
> >> +{
> >> +    VXHSAIOCB *acb = NULL;
> >> +    BDRVVXHSState *s = NULL;
> >> +    int rv = 0;
> >> +    int segcount = 0;
> >> +    uint32_t error = 0;
> >> +    uint32_t opcode = 0;
> >> +
> >> +    assert(m);
> >> +    if (m) {
> >> +        /* TODO: need common get message attrs, not two separate lib calls */
> >> +        error = qemu_iio_extract_msg_error(m);
> >> +        opcode = qemu_iio_extract_msg_opcode(m);
> >> +    }
> >> +    switch (opcode) {
> >> +    case IRP_READ_REQUEST:
> >> +    case IRP_WRITE_REQUEST:
> >> +
> >> +    /*
> >> +     * ctx is VXHSAIOCB*
> >> +     * ctx is NULL if error is VXERROR_CHANNEL_HUP or reason is IIO_REASON_HUP
> >> +     */
> >> +    if (ctx) {
> >> +        acb = ctx;
> >> +        s = acb->common.bs->opaque;
> >> +    } else {
> >> +        trace_vxhs_iio_callback(error, reason);
> >> +        goto out;
> >> +    }
> >> +
> >> +    if (error) {
> >> +        trace_vxhs_iio_callback_iofail(error, reason, acb, acb->segments);
> >> +
> >> +        if (reason == IIO_REASON_DONE || reason == IIO_REASON_EVENT) {
> >> +            /*
> >> +             * Storage agent failed while I/O was in progress
> >> +             * Fail over only if the qnio channel dropped, indicating
> >> +             * storage agent failure. Don't fail over in response to other
> >> +             * I/O errors such as disk failure.
> >> +             */
> >> +            if (error == VXERROR_RETRY_ON_SOURCE || error == VXERROR_HUP ||
> >> +                error == VXERROR_CHANNEL_HUP || error == -1) {
> >> +                /*
> >> +                 * Start vDisk IO failover once callback is
> >> +                 * called against all the pending IOs.
> >> +                 * If vDisk has no redundancy enabled
> >> +                 * then IO failover routine will mark
> >> +                 * the vDisk failed and fail all the
> >> +                 * AIOs without retry (stateless vDisk)
> >> +                 */
> >> +                VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +                if (!OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
> >> +                    OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
> >> +                }
> >> +                /*
> >> +                 * Check if this acb is already queued before.
> >> +                 * It is possible in case if I/Os are submitted
> >> +                 * in multiple segments (QNIO_MAX_IO_SIZE).
> >> +                 */
> >> +                VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> >> +                if (!OF_AIOCB_FLAGS_QUEUED(acb)) {
> >> +                    QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
> >> +                                         acb, retry_entry);
> >> +                    OF_AIOCB_FLAGS_SET_QUEUED(acb);
> >> +                    s->vdisk_aio_retry_qd++;
> >> +                    trace_vxhs_iio_callback_retry(s->vdisk_guid, acb);
> >> +                }
> >> +                segcount = --acb->segments;
> >> +                VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> >> +                /*
> >> +                 * Decrement AIO count only when callback is called
> >> +                 * against all the segments of aiocb.
> >> +                 */
> >> +                if (segcount == 0 && --s->vdisk_aio_count == 0) {
> >> +                    /*
> >> +                     * Start vDisk I/O failover
> >> +                     */
> >> +                    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +                    /*
> >> +                     * TODO:
> >> +                     * Need to explore further if it is possible to optimize
> >> +                     * the failover operation on Virtual-Machine (global)
> >> +                     * specific rather vDisk specific.
> >> +                     */
> >> +                    vxhs_failover_io(s);
> >> +                    goto out;
> >> +                }
> >> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +                goto out;
> >> +            }
> >> +        } else if (reason == IIO_REASON_HUP) {
> >> +            /*
> >> +             * Channel failed, spontaneous notification,
> >> +             * not in response to I/O
> >> +             */
> >> +            trace_vxhs_iio_callback_chnlfail(error);
> >> +            /*
> >> +             * TODO: Start channel failover when no I/O is outstanding
> >> +             */
> >> +            goto out;
> >> +        } else {
> >> +            trace_vxhs_iio_callback_fail(reason, acb, acb->segments,
> >> +                                         acb->size, error);
> >> +        }
> >> +    }
> >> +    /*
> >> +     * Set error into acb if not set. In case if acb is being
> >> +     * submitted in multiple segments then need to set the error
> >> +     * only once.
> >> +     *
> >> +     * Once acb done callback is called for the last segment
> >> +     * then acb->ret return status will be sent back to the
> >> +     * caller.
> >> +     */
> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> >> +    if (error && !acb->ret) {
> >> +        acb->ret = error;
> >> +    }
> >> +    --acb->segments;
> >> +    segcount = acb->segments;
> >> +    assert(segcount >= 0);
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> >> +    /*
> >> +     * Check if all the outstanding I/Os are done against acb.
> >> +     * If yes then send signal for AIO completion.
> >> +     */
> >> +    if (segcount == 0) {
> >> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
> >> +        if (rv != sizeof(acb)) {
> >> +            error_report("VXHS AIO completion failed: %s", strerror(errno));
> >> +            abort();
> >> +        }
> >> +    }
> >> +    break;
> >> +
> >> +    case IRP_VDISK_CHECK_IO_FAILOVER_READY:
> >> +        /* ctx is BDRVVXHSState* */
> >> +        assert(ctx);
> >> +        trace_vxhs_iio_callback_ready(((BDRVVXHSState *)ctx)->vdisk_guid,
> >> +                                      error);
> >> +        vxhs_failover_ioctl_cb(error, ctx);
> >> +        break;
> >> +
> >> +    default:
> >> +        if (reason == IIO_REASON_HUP) {
> >> +            /*
> >> +             * Channel failed, spontaneous notification,
> >> +             * not in response to I/O
> >> +             */
> >> +            trace_vxhs_iio_callback_chnfail(error, errno);
> >> +            /*
> >> +             * TODO: Start channel failover when no I/O is outstanding
> >> +             */
> >> +        } else {
> >> +            trace_vxhs_iio_callback_unknwn(opcode, error);
> >> +        }
> >> +        break;
> >> +    }
> >> +out:
> >> +    return;
> >> +}
> >> +
> >> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s)
> >> +{
> >> +    BlockCompletionFunc *cb = acb->common.cb;
> >> +    void *opaque = acb->common.opaque;
> >> +    int ret = 0;
> >> +
> >> +    if (acb->ret != 0) {
> >> +        trace_vxhs_complete_aio(acb, acb->ret);
> >> +    /*
> >> +     * We mask all the IO errors generically as EIO for upper layers
> >> +     * Right now our IO Manager uses non standard error codes. Instead
> >> +     * of confusing upper layers with incorrect interpretation we are
> >> +     * doing this workaround.
> >> +     */
> >> +        ret = (-EIO);
> >> +    }
> >> +    /*
> >> +     * Copy back contents from stablization buffer into original iovector
> >> +     * before returning the IO
> >> +     */
> >> +    if (acb->buffer != NULL) {
> >> +        qemu_iovec_from_buf(acb->qiov, 0, acb->buffer, acb->qiov->size);
> >> +        free(acb->buffer);
> >> +        acb->buffer = NULL;
> >> +    }
> >> +    vxhs_dec_vdisk_iocount(s, 1);
> >> +    acb->aio_done = VXHS_IO_COMPLETED;
> >> +    qemu_aio_unref(acb);
> >> +    cb(opaque, ret);
> >> +}
> >> +
> >> +/*
> >> + * This is the HyperScale event handler registered to QEMU.
> >> + * It is invoked when any IO gets completed and written on pipe
> >> + * by callback called from QNIO thread context. Then it marks
> >> + * the AIO as completed, and releases HyperScale AIO callbacks.
> >> + */
> >> +void vxhs_aio_event_reader(void *opaque)
> >> +{
> >> +    BDRVVXHSState *s = opaque;
> >> +    ssize_t ret;
> >> +
> >> +    do {
> >> +        char *p = (char *)&s->qnio_event_acb;
> >> +
> >> +        ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos,
> >> +                   sizeof(s->qnio_event_acb) - s->event_reader_pos);
> >> +        if (ret > 0) {
> >> +            s->event_reader_pos += ret;
> >> +            if (s->event_reader_pos == sizeof(s->qnio_event_acb)) {
> >> +                s->event_reader_pos = 0;
> >> +                vxhs_complete_aio(s->qnio_event_acb, s);
> >> +            }
> >> +        }
> >> +    } while (ret < 0 && errno == EINTR);
> >> +}
> >> +
> >> +/*
> >> + * QEMU calls this to check if there are any pending IO on vDisk.
> >> + * It will wait in a loop until all the AIOs are completed.
> >> + */
> >> +int vxhs_aio_flush_cb(void *opaque)
> >> +{
> >> +    BDRVVXHSState *s = opaque;
> >> +
> >> +    return vxhs_get_vdisk_iocount(s);
> >> +}
> >
> > Unused function.
> >
> >
> 
> Removed.
> 
> >> +
> >> +/*
> >> + * This will be called by QEMU while booting for each vDisk.
> >> + * bs->opaque will be allocated by QEMU upper block layer before
> >> + * calling open. It will load all the QNIO operations from
> >> + * qemuqnio library and call QNIO operation to create channel to
> >> + * do IO on vDisk. It parses the URI, gets the hostname, vDisk
> >> + * path and then sets HyperScale event handler to QEMU.
> >> + */
> >> +void *vxhs_setup_qnio(void)
> >> +{
> >> +    void *qnio_ctx = NULL;
> >> +
> >> +    qnio_ctx = qemu_iio_init(vxhs_iio_callback);
> >
> > One concern with this callback function - when does it get called?  I am
> > assuming that qemu_iio_init() is running threads / processes in the
> > background, so this callback could happen at anytime.
> >
> > (I'll discuss my specific concern later, in vxhs_failover_ioctl_cb())
> >
> >
> >
> 
> Yes, that is correct. vxhs_iio_callback() is called from the threads
> running in libqnio.
> 
> 
> 
> > An aside:
> >
> > It is odd that the qnio library is using a qemu_ prefix for functions.
> > Seems like a good way to run into issues later, especially with relatively
> > generic function names such as 'qemu_ck_spin_lock()'. Consider that most
> > developers / maintainers are probably not going to compile in vxhs given the
> > external library dependence without a specific reason, so name collisions
> > could sneak in.
> >
> > 'nm -g libqnioshim.so | grep qemu_' shows a few that give me pause:
> >
> > 0000000000004d07 T qemu_calculate_iovec_size
> > 000000000000663d T qemu_ck_destroy_lock
> > 00000000000065cf T qemu_ck_initialize_lock
> > 00000000000065f7 T qemu_ck_spin_lock
> > 000000000000661a T qemu_ck_spin_unlock
> > 0000000000004dfb T qemu_convert_iovector_to_buffer
> > 0000000000004d63 T qemu_copy_iov_to_buffer
> > 0000000000005f2f T qemu_extract_flush_response
> > 0000000000005c89 T qemu_extract_geometry_from_json
> > 0000000000005a78 T qemu_extract_size_from_json
> > 0000000000004f40 T qemu_is_iovector_read_aligned
> >
> >
> 
> Replaced all qemu_ prefixed symbols within qnio.
> 
> >
> >> +
> >> +    if (qnio_ctx != NULL) {
> >> +        trace_vxhs_setup_qnio(qnio_ctx);
> >> +    } else {
> >> +        trace_vxhs_setup_qnio_nwerror('.');
> >> +    }
> >> +
> >> +    return qnio_ctx;
> >> +}
> >> +
> >> +static QemuOptsList runtime_opts = {
> >> +    .name = "vxhs",
> >> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> >> +    .desc = {
> >> +        {
> >> +            .name = VXHS_OPT_FILENAME,
> >> +            .type = QEMU_OPT_STRING,
> >> +            .help = "URI to the Veritas HyperScale image",
> >> +        },
> >> +        {
> >> +            .name = VXHS_OPT_VDISK_ID,
> >> +            .type = QEMU_OPT_STRING,
> >> +            .help = "UUID of the VxHS vdisk",
> >> +        },
> >> +        { /* end of list */ }
> >> +    },
> >> +};
> >> +
> >> +static QemuOptsList runtime_tcp_opts = {
> >> +    .name = "vxhs_tcp",
> >> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head),
> >> +    .desc = {
> >> +        {
> >> +            .name = VXHS_OPT_HOST,
> >> +            .type = QEMU_OPT_STRING,
> >> +            .help = "host address (ipv4 addresses)",
> >> +        },
> >> +        {
> >> +            .name = VXHS_OPT_PORT,
> >> +            .type = QEMU_OPT_NUMBER,
> >> +            .help = "port number on which VxHSD is listening (default 9999)",
> >> +            .def_value_str = "9999"
> >> +        },
> >> +        {
> >> +            .name = "to",
> >> +            .type = QEMU_OPT_NUMBER,
> >> +            .help = "max port number, not supported by VxHS",
> >> +        },
> >> +        {
> >> +            .name = "ipv4",
> >> +            .type = QEMU_OPT_BOOL,
> >> +            .help = "ipv4 bool value, not supported by VxHS",
> >> +        },
> >> +        {
> >> +            .name = "ipv6",
> >> +            .type = QEMU_OPT_BOOL,
> >> +            .help = "ipv6 bool value, not supported by VxHS",
> >> +        },
> >> +        { /* end of list */ }
> >> +    },
> >> +};
> >> +
> >> +/*
> >> + * Parse the incoming URI and populate *options with all the host(s)
> >> + * information. Host at index 0 is local storage agent.
> >> + * Remaining are the reflection target storage agents. The local storage agent
> >> + * ip is the efficient internal address in the uri, e.g. 192.168.0.2.
> >> + * The local storage agent address is stored at index 0. The reflection target
> >> + * ips, are the E-W data network addresses of the reflection node agents, also
> >> + * extracted from the uri.
> >> + */
> >> +static int vxhs_parse_uri(const char *filename, QDict *options)
> >> +{
> >> +    gchar **target_list;
> >> +    URI *uri = NULL;
> >> +    char *hoststr, *portstr;
> >> +    char *port;
> >> +    int ret = 0;
> >> +    int i = 0;
> >> +
> >> +    trace_vxhs_parse_uri_filename(filename);
> >> +    target_list = g_strsplit(filename, "%7D", 0);
> >> +    assert(target_list != NULL && target_list[0] != NULL);
> >> +
> >> +    for (i = 0; target_list[i] != NULL && *target_list[i]; i++) {
> >> +        uri = uri_parse(target_list[i]);
> >> +        if (!uri || !uri->server) {
> >> +            uri_free(uri);
> >> +            ret = -EINVAL;
> >> +            break;
> >> +        }
> >> +
> >> +        hoststr = g_strdup_printf(VXHS_OPT_SERVER"%d.host", i);
> >> +        qdict_put(options, hoststr, qstring_from_str(uri->server));
> >> +
> >> +        portstr = g_strdup_printf(VXHS_OPT_SERVER"%d.port", i);
> >> +        if (uri->port) {
> >> +            port = g_strdup_printf("%d", uri->port);
> >> +            qdict_put(options, portstr, qstring_from_str(port));
> >> +            g_free(port);
> >> +        }
> >> +
> >> +        if (i == 0 && (strstr(uri->path, "vxhs") == NULL)) {
> >> +            qdict_put(options, "vdisk_id", qstring_from_str(uri->path));
> >> +        }
> >> +
> >> +        trace_vxhs_parse_uri_hostinfo(i + 1, uri->server, uri->port);
> >> +        g_free(hoststr);
> >> +        g_free(portstr);
> >> +        uri_free(uri);
> >> +    }
> >> +
> >> +    g_strfreev(target_list);
> >> +    return ret;
> >> +}
> >> +
> >> +static void vxhs_parse_filename(const char *filename, QDict *options,
> >> +                               Error **errp)
> >> +{
> >> +    if (qdict_haskey(options, "host")
> >> +        || qdict_haskey(options, "port")
> >> +        || qdict_haskey(options, "path"))
> >> +    {
> >> +        error_setg(errp, "host/port/path and a file name may not be specified "
> >> +                         "at the same time");
> >> +        return;
> >> +    }
> >> +
> >> +    if (strstr(filename, "://")) {
> >> +        int ret = vxhs_parse_uri(filename, options);
> >> +        if (ret < 0) {
> >> +            error_setg(errp, "Invalid URI. URI should be of the form "
> >> +                       "  vxhs://<host_ip>:<port>/{<vdisk_id>}");
> >> +        }
> >> +    }
> >> +}
> >> +
> >> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
> >> +                              int *cfd, int *rfd, Error **errp)
> >> +{
> >> +    QDict *backing_options = NULL;
> >> +    const char *vxhs_filename;
> >> +    char *of_vsa_addr = NULL;
> >> +    Error *local_err = NULL;
> >> +    const char *ptr = NULL;
> >
> > ptr does not need to be initialized to NULL here.  I also recommend using a
> > more descriptive name like 'vdisk_id_opt' over something generic like 'ptr'.
> >
> 
> Fixed.
> 
> 
> >> +    char *file_name = NULL;
> >> +    size_t num_servers;
> >> +    char *str = NULL;
> >> +    QemuOpts *opts;
> >> +    int ret = 0;
> >> +    int i;
> >> +
> >> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
> >> +    qemu_opts_absorb_qdict(opts, options, &local_err);
> >> +    if (local_err) {
> >> +        ret = -EINVAL;
> >> +        goto out;
> >> +    }
> >> +
> >> +    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
> >> +    if (vxhs_filename) {
> >> +        trace_vxhs_qemu_init_filename(vxhs_filename);
> >> +    }
> >> +
> >> +    ptr = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
> >> +    if (!ptr) {
> >> +        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
> >> +        ret = -EINVAL;
> >> +        goto out;
> >> +    }
> >> +    s->vdisk_guid = g_strdup(ptr);
> >
> > This is never freed anywhere.
> >
> 
> Now freed on error conditions and in vxhs_close().
> 
> >> +    trace_vxhs_qemu_init_vdisk(ptr);
> >> +
> >> +    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
> >> +    if (num_servers < 1) {
> >> +        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
> >> +        ret = -EINVAL;
> >> +        goto out;
> >> +    }
> >> +    trace_vxhs_qemu_init_numservers(num_servers);
> >> +
> >> +    qemu_opts_del(opts);
> >> +
> >> +    for (i = 0; i < num_servers; i++) {
> >
> > s->vdisk_hostinfo[] is an array with MAX_HOSTS (4) as a size.  There is no
> > bounds checking on num_servers, which is an issue: we can easily blow
> > past this.
> >
> > In general, for any parameter supplied by a user, or read from a file /
> > protocol / etc, we need to do appropriate bounds checking to prevent
> > potential CVEs.
> >
> 
> Fixed.
> 
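
Thanks. For the record, the kind of check I had in mind is only a few lines
(untested sketch, reusing the names from the patch above and the existing
MAX_HOSTS array bound):

    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
    if (num_servers < 1 || num_servers > MAX_HOSTS) {
        error_setg(&local_err, "Expected 1..%d servers, got %zu",
                   MAX_HOSTS, num_servers);
        ret = -EINVAL;
        goto out;
    }
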
> >> +        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
> >> +        qdict_extract_subqdict(options, &backing_options, str);
> >> +
> >> +        /* create opts info from runtime_tcp_opts list */
> >> +        opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
> >> +        qemu_opts_absorb_qdict(opts, backing_options, &local_err);
> >> +        if (local_err) {
> >> +            ret = -EINVAL;
> >> +            qemu_opts_del(opts);
> >> +            goto out;
> >> +        }
> >> +
> >> +        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(opts,
> >> +                                                            VXHS_OPT_HOST));
> >
> > This is never freed anywhere.
> >
> 
> Now freed on error and in vxhs_close().
> 
> >> +        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(opts,
> >> +                                                                 VXHS_OPT_PORT),
> >> +                                                    NULL, 0);
> >> +
> >> +        s->vdisk_hostinfo[i].qnio_cfd = -1;
> >> +        s->vdisk_hostinfo[i].vdisk_rfd = -1;
> >> +        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
> >> +                             s->vdisk_hostinfo[i].port);
> >> +        qemu_opts_del(opts);
> >> +    }
> >> +
> >> +    s->vdisk_nhosts = i;
> >> +    s->vdisk_cur_host_idx = 0;
> >> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
> >> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
> >> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
> >> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);
> >> +
> >> +    /*
> >> +     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
> >> +     */
> >> +    if (global_qnio_ctx == NULL) {
> >> +        global_qnio_ctx = vxhs_setup_qnio();
> >> +        if (global_qnio_ctx == NULL) {
> >> +            error_setg(&local_err, "Failed vxhs_setup_qnio");
> >> +            ret = -EINVAL;
> >> +            goto out;
> >> +        }
> >> +    }
> >> +
> >> +    *cfd = qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
> >> +    if (*cfd < 0) {
> >> +        error_setg(&local_err, "Failed qemu_open_iio_conn");
> >> +        ret = -EIO;
> >> +        goto out;
> >> +    }
> >> +    *rfd = qemu_iio_devopen(global_qnio_ctx, *cfd, file_name, 0);
> >> +    if (*rfd < 0) {
> >> +        error_setg(&local_err, "Failed qemu_iio_devopen");
> >> +        ret = -EIO;
> >> +        goto out;
> >> +    }
> >> +
> >> +out:
> >> +    g_free(file_name);
> >> +    g_free(of_vsa_addr);
> >> +    if (str) {
> >> +        qdict_del(backing_options, str);
> >> +        g_free(str);
> >> +    }
> >> +
> >> +    if (ret < 0) {
> >> +        errno = -ret;
> >> +    }
> >> +    error_propagate(errp, local_err);
> >> +    return ret;
> >> +}
> >> +
> >> +int vxhs_open(BlockDriverState *bs, QDict *options,
> >> +              int bdrv_flags, Error **errp)
> >> +{
> >> +    BDRVVXHSState *s = bs->opaque;
> >> +    AioContext *aio_context;
> >> +    int qemu_qnio_cfd = -1;
> >> +    int qemu_rfd = -1;
> >> +    int ret = 0;
> >> +
> >> +    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
> >> +    if (ret < 0) {
> >> +        trace_vxhs_open_fail(ret);
> >> +        return ret;
> >> +    }
> >> +
> >> +    s->qnio_ctx = global_qnio_ctx;
> >> +    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
> >> +    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
> >> +    s->vdisk_size = 0;
> >> +    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
> >> +
> >> +    /*
> >> +     * Create a pipe for communicating between two threads in different
> >> +     * context. Set handler for read event, which gets triggered when
> >> +     * IO completion is done by non-QEMU context.
> >> +     */
> >> +    ret = qemu_pipe(s->fds);
> >> +    if (ret < 0) {
> >> +        trace_vxhs_open_epipe('.');
> >> +        ret = -errno;
> >> +        goto out;
> >> +    }
> >> +    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
> >> +
> >> +    aio_context = bdrv_get_aio_context(bs);
> >> +    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
> >> +                       false, vxhs_aio_event_reader, NULL, s);
> >> +
> >> +    /*
> >> +     * Allocate/Initialize the spin-locks.
> >> +     *
> >> +     * NOTE:
> >> +     *      Since spin lock is being allocated
> >> +     *      dynamically hence moving acb struct
> >> +     *      specific lock to BDRVVXHSState
> >> +     *      struct. The reason being,
> >> +     *      we don't want the overhead of spin
> >> +     *      lock being dynamically allocated and
> >> +     *      freed for every AIO.
> >> +     */
> >> +    s->vdisk_lock = VXHS_SPIN_LOCK_ALLOC;
> >> +    s->vdisk_acb_lock = VXHS_SPIN_LOCK_ALLOC;
> >> +
> >> +    return 0;
> >> +
> >> +out:
> >> +    if (s->vdisk_hostinfo[0].vdisk_rfd >= 0) {
> >> +        qemu_iio_devclose(s->qnio_ctx, 0,
> >> +                                 s->vdisk_hostinfo[0].vdisk_rfd);
> >> +    }
> >> +    /* never close qnio_cfd */
> >
> > Surely we want to be able to close it at some point... or does the qnio
> > library take care of that cleanup somewhere/sometime?
> >
> >
> 
> Changed to close channel fd on error conditions, vxhs_close() and
> vxhs_reopen_vdisk().
> 
> 
> >> +    trace_vxhs_open_fail(ret);
> >> +    return ret;
> >> +}
> >> +
> >> +static const AIOCBInfo vxhs_aiocb_info = {
> >> +    .aiocb_size = sizeof(VXHSAIOCB)
> >> +};
> >> +
> >> +/*
> >> + * This is called in QNIO thread context when IO done
> >> + * on IO Manager and QNIO client received the data or
> >> + * ACK. It notify another event handler thread running in QEMU context
> >> + * by writing on the pipe
> >> + */
> >> +void vxhs_finish_aiocb(ssize_t ret, void *arg)
> >> +{
> >> +    VXHSAIOCB *acb = arg;
> >> +    BlockDriverState *bs = acb->common.bs;
> >> +    BDRVVXHSState *s = bs->opaque;
> >> +    int rv;
> >> +
> >> +    acb->ret = ret;
> >> +    rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
> >> +    if (rv != sizeof(acb)) {
> >> +        error_report("VXHS AIO completion failed: %s",
> >> +                     strerror(errno));
> >> +        abort();
> >> +    }
> >> +}
> >
> > Another unused function.
> >
> 
> Removed.
> 
> 
> >> +
> >> +/*
> >> + * This allocates QEMU-VXHS callback for each IO
> >> + * and is passed to QNIO. When QNIO completes the work,
> >> + * it will be passed back through the callback.
> >> + */
> >> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
> >> +                                int64_t sector_num, QEMUIOVector *qiov,
> >> +                                int nb_sectors,
> >> +                                BlockCompletionFunc *cb,
> >> +                                void *opaque, int iodir)
> >> +{
> >> +    VXHSAIOCB *acb = NULL;
> >> +    BDRVVXHSState *s = bs->opaque;
> >> +    size_t size;
> >> +    uint64_t offset;
> >> +    int iio_flags = 0;
> >> +    int ret = 0;
> >> +
> >> +    offset = sector_num * BDRV_SECTOR_SIZE;
> >> +    size = nb_sectors * BDRV_SECTOR_SIZE;
> >> +
> >> +    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
> >> +    /*
> >> +     * Setup or initialize VXHSAIOCB.
> >> +     * Every single field should be initialized since
> >> +     * acb will be picked up from the slab without
> >> +     * initializing with zero.
> >> +     */
> >> +    acb->io_offset = offset;
> >> +    acb->size = size;
> >> +    acb->ret = 0;
> >> +    acb->flags = 0;
> >> +    acb->aio_done = VXHS_IO_INPROGRESS;
> >> +    acb->segments = 0;
> >> +    acb->buffer = 0;
> >> +    acb->qiov = qiov;
> >> +    acb->direction = iodir;
> >> +
> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +    if (OF_VDISK_FAILED(s)) {
> >> +        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +        goto errout;
> >> +    }
> >> +    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
> >> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
> >> +        s->vdisk_aio_retry_qd++;
> >> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
> >> +        goto out;
> >> +    }
> >> +    s->vdisk_aio_count++;
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +
> >> +    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
> >> +
> >> +    switch (iodir) {
> >> +    case VDISK_AIO_WRITE:
> >> +            vxhs_inc_acb_segment_count(acb, 1);
> >> +            ret = qemu_iio_writev(s->qnio_ctx,
> >> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> >> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
> >> +            break;
> >> +    case VDISK_AIO_READ:
> >> +            vxhs_inc_acb_segment_count(acb, 1);
> >> +            ret = qemu_iio_readv(s->qnio_ctx,
> >> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> >> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
> >> +            break;
> >> +    default:
> >> +            trace_vxhs_aio_rw_invalid(iodir);
> >> +            goto errout;
> >> +    }
> >> +
> >> +    if (ret != 0) {
> >> +        trace_vxhs_aio_rw_ioerr(
> >> +                  s->vdisk_guid, iodir, size, offset,
> >> +                  acb, acb->segments, ret, errno);
> >> +        /*
> >> +         * Don't retry I/Os against vDisk having no
> >> +         * redundancy or stateful storage on compute
> >> +         *
> >> +         * TODO: Revisit this code path to see if any
> >> +         *       particular error needs to be handled.
> >> +         *       At this moment failing the I/O.
> >> +         */
> >> +        VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +        if (s->vdisk_nhosts == 1) {
> >> +            trace_vxhs_aio_rw_iofail(s->vdisk_guid);
> >> +            s->vdisk_aio_count--;
> >> +            vxhs_dec_acb_segment_count(acb, 1);
> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +            goto errout;
> >> +        }
> >> +        if (OF_VDISK_FAILED(s)) {
> >> +            trace_vxhs_aio_rw_devfail(
> >> +                      s->vdisk_guid, iodir, size, offset);
> >> +            s->vdisk_aio_count--;
> >> +            vxhs_dec_acb_segment_count(acb, 1);
> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +            goto errout;
> >> +        }
> >> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
> >> +            /*
> >> +             * Queue all incoming io requests after failover starts.
> >> +             * Number of requests that can arrive is limited by io queue depth
> >> +             * so an app blasting independent ios will not exhaust memory.
> >> +             */
> >> +            QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
> >> +            s->vdisk_aio_retry_qd++;
> >> +            OF_AIOCB_FLAGS_SET_QUEUED(acb);
> >> +            s->vdisk_aio_count--;
> >> +            vxhs_dec_acb_segment_count(acb, 1);
> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +            trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 2);
> >> +            goto out;
> >> +        }
> >> +        OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
> >> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
> >> +        s->vdisk_aio_retry_qd++;
> >> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
> >> +        vxhs_dec_acb_segment_count(acb, 1);
> >> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 3);
> >> +        /*
> >> +         * Start I/O failover if there is no active
> >> +         * AIO within vxhs block driver.
> >> +         */
> >> +        if (--s->vdisk_aio_count == 0) {
> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +            /*
> >> +             * Start IO failover
> >> +             */
> >> +            vxhs_failover_io(s);
> >> +            goto out;
> >> +        }
> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +    }
> >> +
> >> +out:
> >> +    return &acb->common;
> >> +
> >> +errout:
> >> +    qemu_aio_unref(acb);
> >> +    return NULL;
> >> +}
> >> +
> >> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
> >> +                                   int64_t sector_num, QEMUIOVector *qiov,
> >> +                                   int nb_sectors,
> >> +                                   BlockCompletionFunc *cb, void *opaque)
> >> +{
> >> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
> >> +                         cb, opaque, VDISK_AIO_READ);
> >> +}
> >> +
> >> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
> >> +                                    int64_t sector_num, QEMUIOVector *qiov,
> >> +                                    int nb_sectors,
> >> +                                    BlockCompletionFunc *cb, void *opaque)
> >> +{
> >> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
> >> +                         cb, opaque, VDISK_AIO_WRITE);
> >> +}
> >> +
> >> +/*
> >> + * This is called by QEMU when a flush gets triggered from within
> >> + * a guest at the block layer, either for IDE or SCSI disks.
> >> + */
> >> +int vxhs_co_flush(BlockDriverState *bs)
> >> +{
> >> +    BDRVVXHSState *s = bs->opaque;
> >> +    uint64_t size = 0;
> >> +    int ret = 0;
> >> +
> >> +    ret = qemu_iio_ioctl(s->qnio_ctx,
> >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> >> +            VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
> >> +
> >> +    if (ret < 0) {
> >> +        /*
> >> +         * Currently not handling the flush ioctl
> >> +         * failure because of network connection
> >> +         * disconnect. Since all the writes are
> >> +         * commited into persistent storage hence
> >> +         * this flush call is noop and we can safely
> >> +         * return success status to the caller.
> >
> > I'm not sure I understand here.  Are you saying the qemu_iio_ioctl() call
> > above is a noop?
> >
> 
> Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a place-holder at present
> in case we later want to add some functionality to it. I have now
> added a comment to this effect to avoid any confusion.
> 

The problem is you don't know which version of the qnio library any given
QEMU binary will be using, since it is a shared library.  Future versions
may implement the flush ioctl as expressed above, in which case we may hide
a valid error.

Am I correct in assuming that this call suppresses errors because an error
is returned for an unknown ioctl operation of VDISK_AIO_FLUSH?  If so, and
you want a placeholder here for flushing, you should go all the way and stub
out the underlying ioctl call to return success.  Then QEMU can at least
rely on the error return from the flush operation.
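
To make that concrete, here is the sort of thing I mean (untested sketch,
reusing the names from the patch above; the libqnio side would need the
matching stub that returns success):

    int vxhs_co_flush(BlockDriverState *bs)
    {
        BDRVVXHSState *s = bs->opaque;
        uint64_t size = 0;
        int ret;

        ret = qemu_iio_ioctl(s->qnio_ctx,
                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
                VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
        if (ret < 0) {
            /* Propagate the failure instead of masking it as success */
            trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
            return errno ? -errno : -EIO;
        }
        return 0;
    }

That way, once a future libqnio starts returning real errors for
VDISK_AIO_FLUSH, QEMU will see them instead of silently reporting a
successful flush.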


> >> +         *
> >> +         * If any write failure occurs for inflight
> >> +         * write AIO because of network disconnect
> >> +         * then anyway IO failover will be triggered.
> >> +         */
> >> +        trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
> >> +        ret = 0;
> >> +    }
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s)
> >> +{
> >> +    void *ctx = NULL;
> >> +    int flags = 0;
> >> +    unsigned long vdisk_size = 0;
> >> +    int ret = 0;
> >> +
> >> +    ret = qemu_iio_ioctl(s->qnio_ctx,
> >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> >> +            VDISK_STAT, &vdisk_size, ctx, flags);
> >> +
> >> +    if (ret < 0) {
> >> +        trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno);
> >> +        return 0;
> >> +    }
> >> +
> >> +    trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size);
> >> +    return vdisk_size;
> >> +}
> >> +
> >> +/*
> >> + * Returns the size of vDisk in bytes. This is required
> >> + * by QEMU block upper block layer so that it is visible
> >> + * to guest.
> >> + */
> >> +int64_t vxhs_getlength(BlockDriverState *bs)
> >> +{
> >> +    BDRVVXHSState *s = bs->opaque;
> >> +    unsigned long vdisk_size = 0;
> >> +
> >> +    if (s->vdisk_size > 0) {
> >> +        vdisk_size = s->vdisk_size;
> >
> > s->vdisk_size is int64_t, vdisk_size is unsigned long.  On 32-bit platforms,
> > I believe unsigned long is only 32-bits, so this assignment is problematic.
> >
> > What is the maximum disk size for vxhs?
> >
> > (this comment applies to all the vdisk_size usage, but I'll only mention it
> > here)
> >
> 
> Changed to int64_t in all places. The code does not place any
> restriction on the maximum size of the vdisk.
> 
> >> +    } else {
> >> +        /*
> >> +         * Fetch the vDisk size using stat ioctl
> >> +         */
> >> +        vdisk_size = vxhs_get_vdisk_stat(s);
> >> +        if (vdisk_size > 0) {
> >> +            s->vdisk_size = vdisk_size;
> >> +        }
> >> +    }
> >> +
> >> +    if (vdisk_size > 0) {
> >> +        return (int64_t)vdisk_size; /* return size in bytes */
> >> +    } else {
> >> +        return -EIO;
> >> +    }
> >> +}
> >> +
> >> +/*
> >> + * Returns actual blocks allocated for the vDisk.
> >> + * This is required by qemu-img utility.
> >> + */
> >> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs)
> >> +{
> >> +    BDRVVXHSState *s = bs->opaque;
> >> +    unsigned long vdisk_size = 0;
> >> +
> >> +    if (s->vdisk_size > 0) {
> >> +        vdisk_size = s->vdisk_size;
> >> +    } else {
> >> +        /*
> >> +         * TODO:
> >> +         * Once HyperScale storage-virtualizer provides
> >> +         * actual physical allocation of blocks then
> >> +         * fetch that information and return back to the
> >> +         * caller but for now just get the full size.
> >> +         */
> >> +        vdisk_size = vxhs_get_vdisk_stat(s);
> >> +        if (vdisk_size > 0) {
> >> +            s->vdisk_size = vdisk_size;
> >> +        }
> >> +    }
> >> +
> >> +    if (vdisk_size > 0) {
> >> +        return (int64_t)vdisk_size; /* return size in bytes */
> >> +    } else {
> >> +        return -EIO;
> >> +    }
> >> +}
> >> +
> >> +void vxhs_close(BlockDriverState *bs)
> >> +{
> >> +    BDRVVXHSState *s = bs->opaque;
> >> +
> >> +    close(s->fds[VDISK_FD_READ]);
> >> +    close(s->fds[VDISK_FD_WRITE]);
> >> +
> >> +    /*
> >> +     * never close channel - not ref counted, will
> >> +     * close for all vdisks
> >> +     */
> >> +    if (s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd >= 0) {
> >> +        qemu_iio_devclose(s->qnio_ctx, 0,
> >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd);
> >> +    }
> >> +    if (s->vdisk_lock) {
> >> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_lock);
> >> +        s->vdisk_lock = NULL;
> >> +    }
> >> +    if (s->vdisk_acb_lock) {
> >> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_acb_lock);
> >> +        s->vdisk_acb_lock = NULL;
> >> +    }
> >> +
> >> +    /*
> >> +     * TODO: Verify that all the resources were relinguished.
> >> +     */
> >
> > Which resources?  Allocated here in the driver, or stuff in libqnio?
> >
> 
> The comment was for all resources allocated within vxhs.c.
> Added code to free all allocated memory and fds on close.
> 
> 
> >> +}
> >> +
> >> +/*
> >> + * If errors are consistent with storage agent failure:
> >> + *  - Try to reconnect in case error is transient or storage agent restarted.
> >> + *  - Currently failover is being triggered on per vDisk basis. There is
> >> + *    a scope of further optimization where failover can be global (per VM).
> >> + *  - In case of network (storage agent) failure, for all the vDisks, having
> >> + *    no redundancy, I/Os will be failed without attempting for I/O failover
> >> + *    because of stateless nature of vDisk.
> >> + *  - If local or source storage agent is down then send an ioctl to remote
> >> + *    storage agent to check if remote storage agent in a state to accept
> >> + *    application I/Os.
> >> + *  - Once remote storage agent is ready to accept I/O, start I/O shipping.
> >> + *  - If I/Os cannot be serviced then vDisk will be marked failed so that
> >> + *    new incoming I/Os are returned with failure immediately.
> >> + *  - If vDisk I/O failover is in progress then all new/inflight I/Os will
> >> + *    queued and will be restarted or failed based on failover operation
> >> + *    is successful or not.
> >> + *  - I/O failover can be started either in I/O forward or I/O backward
> >> + *    path.
> >> + *  - I/O failover will be started as soon as all the pending acb(s)
> >> + *    are queued and there is no pending I/O count.
> >> + *  - If I/O failover couldn't be completed within QNIO_CONNECT_TIMOUT_SECS
> >> + *    then vDisk will be marked failed and all I/Os will be completed with
> >> + *    error.
> >> + */
> >> +
> >> +int vxhs_switch_storage_agent(BDRVVXHSState *s)
> >> +{
> >> +    int res = 0;
> >> +    int flags = (IIO_FLAG_ASYNC | IIO_FLAG_DONE);
> >> +
> >> +    trace_vxhs_switch_storage_agent(
> >> +              s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
> >> +              s->vdisk_guid);
> >> +
> >> +    res = vxhs_reopen_vdisk(s, s->vdisk_ask_failover_idx);
> >> +    if (res == 0) {
> >> +        res = qemu_iio_ioctl(s->qnio_ctx,
> >> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].vdisk_rfd,
> >> +                  VDISK_CHECK_IO_FAILOVER_READY, NULL, s, flags);
> >> +    } else {
> >> +        trace_vxhs_switch_storage_agent_failed(
> >> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
> >> +                  s->vdisk_guid, res, errno);
> >> +        /*
> >> +         * TODO: calling vxhs_failover_ioctl_cb from here ties up the qnio epoll
> >> +         * loop if qemu_iio_ioctl fails synchronously (-1) for all hosts in io
> >> +         * target list.
> >> +         */
> >> +
> >> +        /* try next host */
> >> +        vxhs_failover_ioctl_cb(res, s);
> >> +    }
> >> +    return res;
> >> +}
> >> +
> >> +void vxhs_failover_ioctl_cb(int res, void *ctx)
> >> +{
> >> +    BDRVVXHSState *s = ctx;
> >> +
> >> +    if (res == 0) {
> >> +        /* found failover target */
> >> +        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
> >> +        s->vdisk_ask_failover_idx = 0;
> >> +        trace_vxhs_failover_ioctl_cb(
> >> +                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
> >> +                   s->vdisk_guid);
> >> +        VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +        vxhs_handle_queued_ios(s);
> >> +    } else {
> >> +        /* keep looking */
> >> +        trace_vxhs_failover_ioctl_cb_retry(s->vdisk_guid);
> >> +        s->vdisk_ask_failover_idx++;
> >> +        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
> >
> > This function, vxhs_failover_ioctrl_cb(), is called from the external
> > library callback, in addition to QEMU itself (i.e., vxhs_aio_rw).  This
> > makes me concerned that we may have (or be able to induce) an unsafe race
> > condition when incrementing the vdisk_ask_failover_id array index.
> >
> 
> vxhs_failover_ioctl_cb() can be called from the callback
> vxhs_iio_callback(), case IRP_VDISK_CHECK_IO_FAILOVER_READY, in response
> to qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY).
> 
> This ioctl, qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY), is only
> called from within vxhs_switch_storage_agent().
> 
> A direct call to vxhs_failover_ioctl_cb() is only issued from the same
> function, vxhs_switch_storage_agent(), on an "else" condition. So
> vxhs_failover_ioctl_cb() can only be called from a single thread
> context, either in response to the ioctl or directly. We should be
> safe here, but thanks for pointing it out!


The callback function 'vxhs_iio_callback' also calls
vxhs_failover_ioctl_cb().

vxhs_failover_ioctl_cb() is also called from vxhs_aio_rw() by QEMU (via
'vxhs_failover_io').

The above two paths make me concerned about racing updates to the
vdisk_ask_failover_idx array index.

(BTW: 'vxhs_failover_ioctl_cb' seems to be misnamed, because it is not a
callback function, but just a helper function).

Also related: this code reads like it will hang QEMU if QNIO goes offline;
we will be caught in perpetual retries, in which case vxhs_aio_rw() may
never return.
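
For the index update itself, something like the following (untested sketch,
reusing the existing vdisk_lock; I have not thought through lock ordering
against the qnio callback) would at least serialize the two paths:

    /* Sketch: serialize updates to the failover index */
    VXHS_SPIN_LOCK(s->vdisk_lock);
    s->vdisk_ask_failover_idx++;
    if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
        /* cycle through the host list again */
        s->vdisk_ask_failover_idx = 0;
    }
    VXHS_SPIN_UNLOCK(s->vdisk_lock);

The sleep(QNIO_CONNECT_RETRY_SECS) would have to move outside the lock, and
ideally be replaced by a bounded retry or a timer, so that vxhs_aio_rw()
cannot end up spinning forever when QNIO stays offline.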

> 
> >> +            /* pause and cycle through list again */
> >> +            sleep(QNIO_CONNECT_RETRY_SECS);
> >> +            s->vdisk_ask_failover_idx = 0;
> >> +        }
> >> +        res = vxhs_switch_storage_agent(s);
> >> +    }
> >> +}
> >> +
> >> +int vxhs_failover_io(BDRVVXHSState *s)
> >> +{
> >> +    int res = 0;
> >> +
> >> +    trace_vxhs_failover_io(s->vdisk_guid);
> >> +
> >> +    s->vdisk_ask_failover_idx = 0;
> >> +    res = vxhs_switch_storage_agent(s);
> >> +
> >> +    return res;
> >> +}
> >> +
> >> +/*
> >> + * Try to reopen the vDisk on one of the available hosts
> >> + * If vDisk reopen is successful on any of the host then
> >> + * check if that node is ready to accept I/O.
> >> + */
> >> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int index)
> >> +{
> >> +    char *of_vsa_addr = NULL;
> >> +    char *file_name = NULL;
> >> +    int  res = 0;
> >> +
> >> +    /*
> >> +     * Don't close the channel if it was opened
> >> +     * before successfully. It will be handled
> >> +     * within iio* api if the same channel open
> >> +     * fd is reused.
> >> +     *
> >> +     * close stale vdisk device remote fd since
> >> +     * it is invalid after channel disconnect.
> >> +     */
> >> +    if (s->vdisk_hostinfo[index].vdisk_rfd >= 0) {
> >> +        qemu_iio_devclose(s->qnio_ctx, 0,
> >> +                                 s->vdisk_hostinfo[index].vdisk_rfd);
> >> +        s->vdisk_hostinfo[index].vdisk_rfd = -1;
> >> +    }
> >> +
> >> +    /*
> >> +     * build storage agent address and vdisk device name strings
> >> +     */
> >> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
> >> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
> >> +             s->vdisk_hostinfo[index].hostip, s->vdisk_hostinfo[index].port);
> >> +    /*
> >> +     * open qnio channel to storage agent if not opened before.
> >> +     */
> >> +    if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
> >> +        s->vdisk_hostinfo[index].qnio_cfd =
> >> +                qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
> >> +        if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
> >> +            trace_vxhs_reopen_vdisk(s->vdisk_hostinfo[index].hostip);
> >> +            res = ENODEV;
> >> +            goto out;
> >> +        }
> >> +    }
> >> +    /*
> >> +     * open vdisk device
> >> +     */
> >> +    s->vdisk_hostinfo[index].vdisk_rfd =
> >> +            qemu_iio_devopen(global_qnio_ctx,
> >> +             s->vdisk_hostinfo[index].qnio_cfd, file_name, 0);
> >> +    if (s->vdisk_hostinfo[index].vdisk_rfd < 0) {
> >> +        trace_vxhs_reopen_vdisk_openfail(file_name);
> >> +        res = EIO;
> >> +        goto out;
> >> +    }
> >> +
> >> +out:
> >> +    g_free(of_vsa_addr);
> >> +    g_free(file_name);
> >> +    return res;
> >> +}
> >> +
> >> +int vxhs_handle_queued_ios(BDRVVXHSState *s)
> >> +{
> >> +    VXHSAIOCB *acb = NULL;
> >> +    int res = 0;
> >> +
> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +    while ((acb = QSIMPLEQ_FIRST(&s->vdisk_aio_retryq)) != NULL) {
> >> +        /*
> >> +         * Before we process the acb, check whether I/O failover
> >> +         * started again due to failback or cascading failure.
> >> +         */
> >> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +            goto out;
> >> +        }
> >> +        QSIMPLEQ_REMOVE_HEAD(&s->vdisk_aio_retryq, retry_entry);
> >> +        s->vdisk_aio_retry_qd--;
> >> +        OF_AIOCB_FLAGS_RESET_QUEUED(acb);
> >> +        if (OF_VDISK_FAILED(s)) {
> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +            vxhs_fail_aio(acb, EIO);
> >> +            VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +        } else {
> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +            res = vxhs_restart_aio(acb);
> >> +            trace_vxhs_handle_queued_ios(acb, res);
> >> +            VXHS_SPIN_LOCK(s->vdisk_lock);
> >> +            if (res) {
> >> +                QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
> >> +                                     acb, retry_entry);
> >> +                OF_AIOCB_FLAGS_SET_QUEUED(acb);
> >> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +                goto out;
> >> +            }
> >> +        }
> >> +    }
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
> >> +out:
> >> +    return res;
> >> +}
> >> +
> >> +int vxhs_restart_aio(VXHSAIOCB *acb)
> >> +{
> >> +    BDRVVXHSState *s = NULL;
> >> +    int iio_flags = 0;
> >> +    int res = 0;
> >> +
> >> +    s = acb->common.bs->opaque;
> >> +
> >> +    if (acb->direction == VDISK_AIO_WRITE) {
> >> +        vxhs_inc_vdisk_iocount(s, 1);
> >> +        vxhs_inc_acb_segment_count(acb, 1);
> >> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
> >> +        res = qemu_iio_writev(s->qnio_ctx,
> >> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> >> +                acb->qiov->iov, acb->qiov->niov,
> >> +                acb->io_offset, (void *)acb, iio_flags);
> >> +    }
> >> +
> >> +    if (acb->direction == VDISK_AIO_READ) {
> >> +        vxhs_inc_vdisk_iocount(s, 1);
> >> +        vxhs_inc_acb_segment_count(acb, 1);
> >> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
> >> +        res = qemu_iio_readv(s->qnio_ctx,
> >> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> >> +                acb->qiov->iov, acb->qiov->niov,
> >> +                acb->io_offset, (void *)acb, iio_flags);
> >> +    }
> >> +
> >> +    if (res != 0) {
> >> +        vxhs_dec_vdisk_iocount(s, 1);
> >> +        vxhs_dec_acb_segment_count(acb, 1);
> >> +        trace_vxhs_restart_aio(acb->direction, res, errno);
> >> +    }
> >> +
> >> +    return res;
> >> +}
> >> +
> >> +void vxhs_fail_aio(VXHSAIOCB *acb, int err)
> >> +{
> >> +    BDRVVXHSState *s = NULL;
> >> +    int segcount = 0;
> >> +    int rv = 0;
> >> +
> >> +    s = acb->common.bs->opaque;
> >> +
> >> +    trace_vxhs_fail_aio(s->vdisk_guid, acb);
> >> +    if (!acb->ret) {
> >> +        acb->ret = err;
> >> +    }
> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
> >> +    segcount = acb->segments;
> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
> >> +    if (segcount == 0) {
> >> +        /*
> >> +         * Complete the io request
> >> +         */
> >> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
> >> +        if (rv != sizeof(acb)) {
> >> +            error_report("VXHS AIO completion failed: %s",
> >> +                         strerror(errno));
> >> +            abort();
> >> +        }
> >> +    }
> >> +}
> >> +
> >> +static void vxhs_detach_aio_context(BlockDriverState *bs)
> >> +{
> >> +    BDRVVXHSState *s = bs->opaque;
> >> +
> >> +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
> >> +                       false, NULL, NULL, NULL);
> >> +
> >> +}
> >> +
> >> +static void vxhs_attach_aio_context(BlockDriverState *bs,
> >> +                                   AioContext *new_context)
> >> +{
> >> +    BDRVVXHSState *s = bs->opaque;
> >> +
> >> +    aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ],
> >> +                       false, vxhs_aio_event_reader, NULL, s);
> >> +}
> >> +
> >> +static BlockDriver bdrv_vxhs = {
> >> +    .format_name                  = "vxhs",
> >> +    .protocol_name                = "vxhs",
> >> +    .instance_size                = sizeof(BDRVVXHSState),
> >> +    .bdrv_file_open               = vxhs_open,
> >> +    .bdrv_parse_filename          = vxhs_parse_filename,
> >> +    .bdrv_close                   = vxhs_close,
> >> +    .bdrv_getlength               = vxhs_getlength,
> >> +    .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks,
> >> +    .bdrv_aio_readv               = vxhs_aio_readv,
> >> +    .bdrv_aio_writev              = vxhs_aio_writev,
> >> +    .bdrv_co_flush_to_disk        = vxhs_co_flush,
> >> +    .bdrv_detach_aio_context      = vxhs_detach_aio_context,
> >> +    .bdrv_attach_aio_context      = vxhs_attach_aio_context,
> >> +};
> >> +
> >> +void bdrv_vxhs_init(void)
> >> +{
> >> +    trace_vxhs_bdrv_init('.');
> >> +    bdrv_register(&bdrv_vxhs);
> >> +}
> >> +
> >> +block_init(bdrv_vxhs_init);
> >> diff --git a/block/vxhs.h b/block/vxhs.h
> >> new file mode 100644
> >> index 0000000..38e0165
> >> --- /dev/null
> >> +++ b/block/vxhs.h
> >> @@ -0,0 +1,237 @@
> >> +/*
> >> + * QEMU Block driver for Veritas HyperScale (VxHS)
> >> + *
> >> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >> + * See the COPYING file in the top-level directory.
> >> + *
> >> + */
> >> +
> >> +#ifndef VXHSD_H
> >> +#define VXHSD_H
> >> +
> >> +#include <gmodule.h>
> >> +#include <inttypes.h>
> >> +#include <pthread.h>
> >> +
> >> +#include "qemu/osdep.h"
> >> +#include "qapi/error.h"
> >> +#include "qemu/error-report.h"
> >> +#include "block/block_int.h"
> >> +#include "qemu/uri.h"
> >> +#include "qemu/queue.h"
> >> +
> >> +#define OF_GUID_STR_LEN             40
> >> +#define OF_GUID_STR_SZ              (OF_GUID_STR_LEN + 1)
> >> +#define QNIO_CONNECT_RETRY_SECS     5
> >> +#define QNIO_CONNECT_TIMOUT_SECS    120
> >> +
> >> +/* constants from io_qnio.h */
> >> +#define IIO_REASON_DONE     0x00000004
> >> +#define IIO_REASON_EVENT    0x00000008
> >> +#define IIO_REASON_HUP      0x00000010
> >> +
> >> +/*
> >> + * IO specific flags
> >> + */
> >> +#define IIO_FLAG_ASYNC      0x00000001
> >> +#define IIO_FLAG_DONE       0x00000010
> >> +#define IIO_FLAG_SYNC       0
> >> +
> >> +/* constants from error.h */
> >> +#define VXERROR_RETRY_ON_SOURCE     44
> >> +#define VXERROR_HUP                 901
> >> +#define VXERROR_CHANNEL_HUP         903
> >> +
> >> +/* constants from iomgr.h and opcode.h */
> >> +#define IRP_READ_REQUEST                    0x1FFF
> >> +#define IRP_WRITE_REQUEST                   0x2FFF
> >> +#define IRP_VDISK_CHECK_IO_FAILOVER_READY   2020
> >> +
> >> +/* Lock specific macros */
> >> +#define VXHS_SPIN_LOCK_ALLOC                  \
> >> +    (qemu_ck_initialize_lock())
> >> +#define VXHS_SPIN_LOCK(lock)                  \
> >> +    (qemu_ck_spin_lock(lock))
> >> +#define VXHS_SPIN_UNLOCK(lock)                \
> >> +    (qemu_ck_spin_unlock(lock))
> >> +#define VXHS_SPIN_LOCK_DESTROY(lock)          \
> >> +    (qemu_ck_destroy_lock(lock))
> >
> > What's the value of these macros, rather than just calling the underlying
> > function directly in code?
> >
> 
> This way we can easily swap out CK locks with something else like
> pthread_mutex if required.
> 
> >> +
> >> +typedef enum {
> >> +    VXHS_IO_INPROGRESS,
> >> +    VXHS_IO_COMPLETED,
> >> +    VXHS_IO_ERROR
> >> +} VXHSIOState;
> >> +
> >> +
> >> +typedef void (*qnio_callback_t)(ssize_t retval, void *arg);
> >> +
> >> +#define VDISK_FD_READ 0
> >> +#define VDISK_FD_WRITE 1
> >> +
> >> +#define QNIO_VDISK_NONE        0x00
> >> +#define QNIO_VDISK_CREATE      0x01
> >> +
> >> +/* max IO size supported by QEMU NIO lib */
> >> +#define QNIO_MAX_IO_SIZE       4194304
> >> +
> >> +#define MAX_HOSTS               4
> >> +
> >> +/*
> >> + * Opcodes for making IOCTL on QEMU NIO library
> >> + */
> >> +#define BASE_OPCODE_SHARED     1000
> >> +#define BASE_OPCODE_DAL        2000
> >> +#define IRP_VDISK_STAT                  (BASE_OPCODE_SHARED + 5)
> >> +#define IRP_VDISK_GET_GEOMETRY          (BASE_OPCODE_DAL + 17)
> >> +#define IRP_VDISK_READ_PARTITION        (BASE_OPCODE_DAL + 18)
> >> +#define IRP_VDISK_FLUSH                 (BASE_OPCODE_DAL + 19)
> >> +
> >> +/*
> >> + * BDRVVXHSState specific flags
> >> + */
> >> +#define OF_VDISK_FLAGS_STATE_ACTIVE             0x0000000000000001
> >> +#define OF_VDISK_FLAGS_STATE_FAILED             0x0000000000000002
> >> +#define OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS   0x0000000000000004
> >> +
> >> +#define OF_VDISK_ACTIVE(s)                                              \
> >> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_ACTIVE)
> >> +#define OF_VDISK_SET_ACTIVE(s)                                          \
> >> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_ACTIVE)
> >> +#define OF_VDISK_RESET_ACTIVE(s)                                        \
> >> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_ACTIVE)
> >> +
> >> +#define OF_VDISK_FAILED(s)                                              \
> >> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_FAILED)
> >> +#define OF_VDISK_SET_FAILED(s)                                          \
> >> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_FAILED)
> >> +#define OF_VDISK_RESET_FAILED(s)                                        \
> >> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_FAILED)
> >> +
> >> +#define OF_VDISK_IOFAILOVER_IN_PROGRESS(s)                              \
> >> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
> >> +#define OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s)                          \
> >> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
> >> +#define OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s)                        \
> >> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
> >> +
> >> +/*
> >> + * VXHSAIOCB specific flags
> >> + */
> >> +#define OF_ACB_QUEUED       0x00000001
> >> +
> >> +#define OF_AIOCB_FLAGS_QUEUED(a)            \
> >> +        ((a)->flags & OF_ACB_QUEUED)
> >> +#define OF_AIOCB_FLAGS_SET_QUEUED(a)        \
> >> +        ((a)->flags |= OF_ACB_QUEUED)
> >> +#define OF_AIOCB_FLAGS_RESET_QUEUED(a)      \
> >> +        ((a)->flags &= ~OF_ACB_QUEUED)
> >> +
> >
> > Many helper macros here... IMO, it decreases the readability of the code,
> > and my preference is to just have the underlying code without the use of the
> > macros.
> >
> 
> Would like to keep this unchanged if that's OK!
> 
> >> +typedef struct qemu2qnio_ctx {
> >> +    uint32_t            qnio_flag;
> >> +    uint64_t            qnio_size;
> >> +    char                *qnio_channel;
> >> +    char                *target;
> >> +    qnio_callback_t     qnio_cb;
> >> +} qemu2qnio_ctx_t;
> >> +
> >> +typedef qemu2qnio_ctx_t qnio2qemu_ctx_t;
> >> +
> >> +typedef struct LibQNIOSymbol {
> >> +        const char *name;
> >> +        gpointer *addr;
> >> +} LibQNIOSymbol;
> >> +
> >> +typedef void (*iio_cb_t) (uint32_t rfd, uint32_t reason, void *ctx,
> >> +                          void *reply);
> >> +
> >> +/*
> >> + * HyperScale AIO callbacks structure
> >> + */
> >> +typedef struct VXHSAIOCB {
> >> +    BlockAIOCB          common;
> >> +    size_t              ret;
> >> +    size_t              size;
> >> +    QEMUBH              *bh;
> >> +    int                 aio_done;
> >> +    int                 segments;
> >> +    int                 flags;
> >> +    size_t              io_offset;
> >> +    QEMUIOVector        *qiov;
> >> +    void                *buffer;
> >> +    int                 direction;  /* IO direction (r/w) */
> >> +    QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry;
> >> +} VXHSAIOCB;
> >> +
> >> +typedef struct VXHSvDiskHostsInfo {
> >> +    int     qnio_cfd;        /* Channel FD */
> >> +    int     vdisk_rfd;      /* vDisk remote FD */
> >> +    char    *hostip;        /* Host's IP addresses */
> >> +    int     port;           /* Host's port number */
> >> +} VXHSvDiskHostsInfo;
> >> +
> >> +/*
> >> + * Structure per vDisk maintained for state
> >> + */
> >> +typedef struct BDRVVXHSState {
> >> +    int                     fds[2];
> >> +    int64_t                 vdisk_size;
> >> +    int64_t                 vdisk_blocks;
> >> +    int64_t                 vdisk_flags;
> >> +    int                     vdisk_aio_count;
> >> +    int                     event_reader_pos;
> >> +    VXHSAIOCB               *qnio_event_acb;
> >> +    void                    *qnio_ctx;
> >> +    void                    *vdisk_lock; /* Lock to protect BDRVVXHSState */
> >> +    void                    *vdisk_acb_lock;  /* Protects ACB */
> >> +    VXHSvDiskHostsInfo      vdisk_hostinfo[MAX_HOSTS]; /* Per host info */
> >> +    int                     vdisk_nhosts;   /* Total number of hosts */
> >> +    int                     vdisk_cur_host_idx; /* IOs are being shipped to */
> >> +    int                     vdisk_ask_failover_idx; /*asking permsn to ship io*/
> >> +    QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
> >> +    int                     vdisk_aio_retry_qd;
> >
> > Is vdisk_aio_retry_qd just used for debugging purposes?  I see it
> > incremented/decremented throughout the code, but no one checks it.
> >
> 
> Yes, it is for debugging only. Added a comment to state the same.
> 
> 
> >> +    char                    *vdisk_guid;
> >> +} BDRVVXHSState;
> >> +
> >> +void bdrv_vxhs_init(void);
> >> +void *vxhs_setup_qnio(void);
> >> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m);
> >> +void vxhs_aio_event_reader(void *opaque);
> >> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s);
> >> +int vxhs_aio_flush_cb(void *opaque);
> >> +void vxhs_finish_aiocb(ssize_t ret, void *arg);
> >> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s);
> >> +int vxhs_open(BlockDriverState *bs, QDict *options,
> >> +              int bdrv_flags, Error **errp);
> >> +void vxhs_close(BlockDriverState *bs);
> >> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, int64_t sector_num,
> >> +                                   QEMUIOVector *qiov, int nb_sectors,
> >> +                                   BlockCompletionFunc *cb, void *opaque);
> >> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, int64_t sector_num,
> >> +                                    QEMUIOVector *qiov, int nb_sectors,
> >> +                                    BlockCompletionFunc *cb,
> >> +                                    void *opaque);
> >> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs);
> >> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
> >> +                                QEMUIOVector *qiov, int nb_sectors,
> >> +                                BlockCompletionFunc *cb,
> >> +                                void *opaque, int write);
> >> +int vxhs_co_flush(BlockDriverState *bs);
> >> +int64_t vxhs_getlength(BlockDriverState *bs);
> >> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t delta);
> >> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t delta);
> >> +uint32_t vxhs_get_vdisk_iocount(void *ptr);
> >> +void vxhs_inc_acb_segment_count(void *ptr, int count);
> >> +void vxhs_dec_acb_segment_count(void *ptr, int count);
> >> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count);
> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
> >> +int vxhs_failover_io(BDRVVXHSState *s);
> >> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int hostinfo_index);
> >> +int vxhs_switch_storage_agent(BDRVVXHSState *s);
> >> +int vxhs_handle_queued_ios(BDRVVXHSState *s);
> >> +int vxhs_restart_aio(VXHSAIOCB *acb);
> >> +void vxhs_fail_aio(VXHSAIOCB *acb, int err);
> >> +void vxhs_failover_ioctl_cb(int res, void *ctx);
> >> +
> >> +
> >> +#endif
> >> diff --git a/configure b/configure
> >> index 4b808f9..c71e270 100755
> >> --- a/configure
> >> +++ b/configure
> >> @@ -320,6 +320,7 @@ vhdx=""
> >>  numa=""
> >>  tcmalloc="no"
> >>  jemalloc="no"
> >> +vxhs=""
> >>
> >>  # parse CC options first
> >>  for opt do
> >> @@ -1150,6 +1151,11 @@ for opt do
> >>    ;;
> >>    --enable-jemalloc) jemalloc="yes"
> >>    ;;
> >> +  --disable-vxhs) vxhs="no"
> >> +  ;;
> >> +  --enable-vxhs) vxhs="yes"
> >> +  ;;
> >> +
> >>    *)
> >>        echo "ERROR: unknown option $opt"
> >>        echo "Try '$0 --help' for more information"
> >> @@ -1380,6 +1386,7 @@ disabled with --disable-FEATURE, default is enabled if available:
> >>    numa            libnuma support
> >>    tcmalloc        tcmalloc support
> >>    jemalloc        jemalloc support
> >> +  vxhs            Veritas HyperScale vDisk backend support
> >>
> >>  NOTE: The object files are built at the place where configure is launched
> >>  EOF
> >> @@ -4543,6 +4550,43 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO $TMPO; then
> >>  fi
> >>
> >>  ##########################################
> >> +# Veritas HyperScale block driver VxHS
> >> +# Check if libqnio is installed
> >> +if test "$vxhs" != "no" ; then
> >> +  cat > $TMPC <<EOF
> >> +#include <stdio.h>
> >> +#include <qnio/qnio_api.h>
> >> +
> >> +void vxhs_inc_acb_segment_count(void *acb, int count);
> >> +void vxhs_dec_acb_segment_count(void *acb, int count);
> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
> >> +
> >> +void vxhs_inc_acb_segment_count(void *ptr, int count)
> >> +{
> >> +}
> >> +void vxhs_dec_acb_segment_count(void *ptr, int count)
> >> +{
> >> +}
> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
> >> +{
> >> +}
> >> +int main(void) {
> >> +    qemu_ck_initialize_lock();
> >> +    return 0;
> >> +}
> >> +EOF
> >> +  vxhs_libs="-lqnioshim -lqnio"
> >> +  if compile_prog "" "$vxhs_libs" ; then
> >> +    vxhs=yes
> >> +  else
> >> +    if test "$vxhs" = "yes" ; then
> >> +      feature_not_found "vxhs block device" "Install libqnio. See github"
> >> +    fi
> >> +    vxhs=no
> >> +  fi
> >> +fi
> >> +
> >> +##########################################
> >>  # End of CC checks
> >>  # After here, no more $cc or $ld runs
> >>
> >> @@ -5488,6 +5532,12 @@ if test "$pthread_setname_np" = "yes" ; then
> >>    echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak
> >>  fi
> >>
> >> +if test "$vxhs" = "yes" ; then
> >> +  echo "CONFIG_VXHS=y" >> $config_host_mak
> >> +  echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak
> >> +  echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak
> >> +fi
> >> +
> >>  if test "$tcg_interpreter" = "yes"; then
> >>    QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
> >>  elif test "$ARCH" = "sparc64" ; then
> >
> > Your configure file should indicate if vxhs is enabled or not. For instance,
> > here is a snippet from where this occurs in the current configure file:
> >
> > [...]
> > echo "NUMA host support $numa"
> > echo "tcmalloc support  $tcmalloc"
> > echo "jemalloc support  $jemalloc"
> > echo "avx2 optimization $avx2_opt"
> >
> >
> 
> Fixed.
> 
> >> --
> >> 2.5.5
> >>
> >>
Kevin Wolf Sept. 8, 2016, 2:20 p.m. UTC | #13
Am 08.09.2016 um 16:00 hat Jeff Cody geschrieben:
> > >> +/*
> > >> + * This is called by QEMU when a flush gets triggered from within
> > >> + * a guest at the block layer, either for IDE or SCSI disks.
> > >> + */
> > >> +int vxhs_co_flush(BlockDriverState *bs)
> > >> +{
> > >> +    BDRVVXHSState *s = bs->opaque;
> > >> +    uint64_t size = 0;
> > >> +    int ret = 0;
> > >> +
> > >> +    ret = qemu_iio_ioctl(s->qnio_ctx,
> > >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> > >> +            VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
> > >> +
> > >> +    if (ret < 0) {
> > >> +        /*
> > >> +         * Currently not handling the flush ioctl
> > >> +         * failure because of network connection
> > >> +         * disconnect. Since all the writes are
> >> +         * committed into persistent storage hence
> > >> +         * this flush call is noop and we can safely
> > >> +         * return success status to the caller.
> > >
> > > I'm not sure I understand here.  Are you saying the qemu_iio_ioctl() call
> > > above is a noop?
> > >
> > 
> > Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a place-holder at present
> > in case we later want to add some functionality to it. I have now
> > added a comment to this effect to avoid any confusion.
> > 
> 
> The problem is you don't know which version of the qnio library any given
> QEMU binary will be using, since it is a shared library.  Future versions
> may implement the flush ioctl as expressed above, in which case we may hide
> a valid error.
> 
> Am I correct in assuming that this call suppresses errors because an error
> is returned for an unknown ioctl operation of VDISK_AIO_FLUSH?  If so, and
> you want a placeholder here for flushing, you should go all the way and stub
> out the underlying ioctl call to return success.  Then QEMU can at least
> rely on the error return from the flush operation.

So what's the story behind the missing flush command?

Does the server always use something like O_SYNC, i.e. all potential
write caches in the stack operate in a writethrough mode? So each write
request is only completed successfully if it is ensured that the data is
safe on disk rather than in a volatile writeback cache?

As soon as any writeback cache can be involved (e.g. the kernel page
cache or a volatile disk cache) and there is no flush command (a real
one, not just stubbed), the driver is not operating correctly and
therefore not ready for inclusion.

So Ashish, can you tell us something about caching behaviour across the
storage stack when vxhs is involved?

Kevin
Ashish Mittal Sept. 8, 2016, 8:46 p.m. UTC | #14
Hi Kevin,

By design, our write-back cache is on a non-volatile SSD device. We do
async writes to this cache and also maintain a persistent index map of
the data written. This gives us the capability to recover the write-back
cache if needed.

Thanks,
Ashish

On Thu, Sep 8, 2016 at 7:20 AM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 08.09.2016 um 16:00 hat Jeff Cody geschrieben:
>> > >> +/*
>> > >> + * This is called by QEMU when a flush gets triggered from within
>> > >> + * a guest at the block layer, either for IDE or SCSI disks.
>> > >> + */
>> > >> +int vxhs_co_flush(BlockDriverState *bs)
>> > >> +{
>> > >> +    BDRVVXHSState *s = bs->opaque;
>> > >> +    uint64_t size = 0;
>> > >> +    int ret = 0;
>> > >> +
>> > >> +    ret = qemu_iio_ioctl(s->qnio_ctx,
>> > >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> > >> +            VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
>> > >> +
>> > >> +    if (ret < 0) {
>> > >> +        /*
>> > >> +         * Currently not handling the flush ioctl
>> > >> +         * failure because of network connection
>> > >> +         * disconnect. Since all the writes are
>> > >> +         * committed into persistent storage hence
>> > >> +         * this flush call is noop and we can safely
>> > >> +         * return success status to the caller.
>> > >
>> > > I'm not sure I understand here.  Are you saying the qemu_iio_ioctl() call
>> > > above is a noop?
>> > >
>> >
>> > Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a place-holder at present
>> > in case we later want to add some functionality to it. I have now
>> > added a comment to this effect to avoid any confusion.
>> >
>>
>> The problem is you don't know which version of the qnio library any given
>> QEMU binary will be using, since it is a shared library.  Future versions
>> may implement the flush ioctl as expressed above, in which case we may hide
>> a valid error.
>>
>> Am I correct in assuming that this call suppresses errors because an error
>> is returned for an unknown ioctl operation of VDISK_AIO_FLUSH?  If so, and
>> you want a placeholder here for flushing, you should go all the way and stub
>> out the underlying ioctl call to return success.  Then QEMU can at least
>> rely on the error return from the flush operation.
>
> So what's the story behind the missing flush command?
>
> Does the server always use something like O_SYNC, i.e. all potential
> write caches in the stack operate in a writethrough mode? So each write
> request is only completed successfully if it is ensured that the data is
> safe on disk rather than in a volatile writeback cache?
>
> As soon as any writeback cache can be involved (e.g. the kernel page
> cache or a volatile disk cache) and there is no flush command (a real
> one, not just stubbed), the driver is not operating correctly and
> therefore not ready for inclusion.
>
> So Ashish, can you tell us something about caching behaviour across the
> storage stack when vxhs is involved?
>
> Kevin
Ashish Mittal Sept. 8, 2016, 11:15 p.m. UTC | #15
>> Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a place-holder at present
>> in case we later want to add some functionality to it. I have now
>> added a comment to this effect to avoid any confusion.
>>
>
> The problem is you don't know which version of the qnio library any given
> QEMU binary will be using, since it is a shared library.  Future versions
> may implement the flush ioctl as expressed above, in which case we may hide
> a valid error.
>
> Am I correct in assuming that this call suppresses errors because an error
> is returned for an unknown ioctl operation of VDISK_AIO_FLUSH?  If so, and
> you want a placeholder here for flushing, you should go all the way and stub
> out the underlying ioctl call to return success.  Then QEMU can at least
> rely on the error return from the flush operation.
>
>

I agree. Will change accordingly. I also completely agree on the
discussion of libqnio calling a qemu function. Will check on the best
way to resolve that and get back.

Thanks,
Ashish
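
For reference, a minimal sketch of the direction suggested above: if the
library-side VDISK_AIO_FLUSH stub is changed to return success, the driver
can propagate any remaining error instead of masking it. Names and
signatures are taken from the patch, so treat this as illustrative only.

    int vxhs_co_flush(BlockDriverState *bs)
    {
        BDRVVXHSState *s = bs->opaque;
        uint64_t size = 0;
        int ret;

        ret = qemu_iio_ioctl(s->qnio_ctx,
                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
                VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
        if (ret < 0) {
            /* With the library stub returning success, any failure seen
             * here is a real error and is reported to the block layer
             * rather than silently dropped. */
            trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
            return -EIO;
        }
        return 0;
    }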


On Thu, Sep 8, 2016 at 7:00 AM, Jeff Cody <jcody@redhat.com> wrote:
> On Wed, Sep 07, 2016 at 03:32:47PM -0700, ashish mittal wrote:
>> Hi Jeff,
>>
>> Thank you for the comments and corrections.
>>
>> I have submitted a v5 patch after incorporating most of your review
>> comments. Please find details in-line.
>>
>> Thanks,
>> Ashish
>>
>> On Tue, Aug 30, 2016 at 10:35 AM, Jeff Cody <jcody@redhat.com> wrote:
>> >
>> > First-pass-over-the-code review:
>> >
>> > On Mon, Aug 22, 2016 at 11:56:30PM -0700, Ashish Mittal wrote:
>> >> This patch adds support for a new block device type called "vxhs".
>> >> Source code for the library that this code loads can be downloaded from:
>> >> https://github.com/MittalAshish/libqnio.git
>> >>
>> >> Sample command line using JSON syntax:
>> >> ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"},{"host":"172.172.17.2","port":"9999"}]}'
>> >>
>> >> Sample command line using URI syntax:
>> >> qemu-img convert -f raw -O raw -n /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D
>> >>
>> >
>> > Is there a reference specification for vxhs available?  If so, the commit
>> > message would be a good place to drop a link.
>> >
>>
>> We don't have a reference specification doc yet, although I will
>> shortly be adding a test program to libqnio that can be used to
>> test/demo basic libqnio IO transfer.
>>
>> >> Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com>
>> >> ---
>> >> v3 changelog:
>> >> =============
>> >> (1) Reworked QAPI/JSON parsing.
>> >> (2) Reworked URI parsing as suggested by Kevin.
>> >> (3) Fixes per review comments from Stefan on v1.
>> >> (4) Fixes per review comments from Daniel on v3.
>> >>
>> >>  block/Makefile.objs |    2 +
>> >>  block/trace-events  |   41 ++
>> >>  block/vxhs.c        | 1304 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> >>  block/vxhs.h        |  237 ++++++++++
>> >>  configure           |   50 ++
>> >>  5 files changed, 1634 insertions(+)
>> >>  create mode 100644 block/vxhs.c
>> >>  create mode 100644 block/vxhs.h
>> >>
>> >> diff --git a/block/Makefile.objs b/block/Makefile.objs
>> >> index 2593a2f..5f83305 100644
>> >> --- a/block/Makefile.objs
>> >> +++ b/block/Makefile.objs
>> >> @@ -20,6 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o
>> >>  block-obj-$(CONFIG_GLUSTERFS) += gluster.o
>> >>  block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
>> >>  block-obj-$(CONFIG_LIBSSH2) += ssh.o
>> >> +block-obj-$(CONFIG_VXHS) += vxhs.o
>> >>  block-obj-y += accounting.o dirty-bitmap.o
>> >>  block-obj-y += write-threshold.o
>> >>
>> >> @@ -43,3 +44,4 @@ block-obj-m        += dmg.o
>> >>  dmg.o-libs         := $(BZIP2_LIBS)
>> >>  qcow.o-libs        := -lz
>> >>  linux-aio.o-libs   := -laio
>> >> +vxhs.o-libs        := $(VXHS_LIBS)
>> >> diff --git a/block/trace-events b/block/trace-events
>> >> index 05fa13c..06c6d8c 100644
>> >> --- a/block/trace-events
>> >> +++ b/block/trace-events
>> >> @@ -114,3 +114,44 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
>> >>  qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>> >>  qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>> >>  qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
>> >> +
>> >> +# block/vxhs.c
>> >> +vxhs_bdrv_init(const char c) "Registering VxHS AIO driver%c"
>> >> +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, reason %d"
>> >> +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p"
>> >> +vxhs_setup_qnio_nwerror(char c) "Could not initialize the network channel. Bailing out%c"
>> >> +vxhs_iio_callback_iofail(int err, int reason, void *acb, int seg) "Read/Write failed: error %d, reason %d, acb %p, segment %d"
>> >> +vxhs_iio_callback_retry(char *guid, void *acb) "vDisk %s, added acb %p to retry queue (5)"
>> >> +vxhs_iio_callback_chnlfail(int error) "QNIO channel failed, no i/o (%d)"
>> >> +vxhs_iio_callback_fail(int r, void *acb, int seg, uint64_t size, int err) " ALERT: reason = %d , acb = %p, acb->segments = %d, acb->size = %lu Error = %d"
>> >> +vxhs_fail_aio(char * guid, void *acb) "vDisk %s, failing acb %p"
>> >> +vxhs_iio_callback_ready(char *vd, int err) "async vxhs_iio_callback: IRP_VDISK_CHECK_IO_FAILOVER_READY completed for vdisk %s with error %d"
>> >> +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d"
>> >> +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d"
>> >> +vxhs_open_fail(int ret) "Could not open the device. Error = %d"
>> >> +vxhs_open_epipe(char c) "Could not create a pipe for device. Bailing out%c"
>> >> +vxhs_aio_rw(char *guid, int iodir, uint64_t size, uint64_t offset) "vDisk %s, vDisk device is in failed state iodir = %d size = %lu offset = %lu"
>> >> +vxhs_aio_rw_retry(char *guid, void *acb, int queue) "vDisk %s, added acb %p to retry queue(%d)"
>> >> +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d"
>> >> +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int seg, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %lu offset = %lu ACB = %p Segments = %d. Error = %d, errno = %d"
>> >> +vxhs_co_flush(char *guid, int ret, int err) "vDisk (%s) Flush ioctl failed ret = %d errno = %d"
>> >> +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d"
>> >> +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %lu"
>> >> +vxhs_switch_storage_agent(char *ip, char *guid) "Query host %s for vdisk %s"
>> >> +vxhs_switch_storage_agent_failed(char *ip, char *guid, int res, int err) "Query to host %s for vdisk %s failed, res = %d, errno = %d"
>> >> +vxhs_failover_ioctl_cb(char *ip, char *guid) "Switched to storage server host-IP %s for vdisk %s"
>> >> +vxhs_failover_ioctl_cb_retry(char *guid) "failover_ioctl_cb: keep looking for io target for vdisk %s"
>> >> +vxhs_failover_io(char *vdisk) "I/O Failover starting for vDisk %s"
>> >> +vxhs_reopen_vdisk(char *ip) "Failed to connect to storage agent on host-ip %s"
>> >> +vxhs_reopen_vdisk_openfail(char *fname) "Failed to open vdisk device: %s"
>> >> +vxhs_handle_queued_ios(void *acb, int res) "Restarted acb %p res %d"
>> >> +vxhs_restart_aio(int dir, int res, int err) "IO ERROR FOR: Read/Write = %d Error = %d, errno = %d"
>> >> +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld"
>> >> +vxhs_aio_rw_iofail(char *guid) "vDisk %s, I/O operation failed."
>> >> +vxhs_aio_rw_devfail(char *guid, int dir, uint64_t size, uint64_t off) "vDisk %s, vDisk device failed iodir = %d size = %lu offset = %lu"
>> >> +vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s"
>> >> +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s"
>> >> +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = %d"
>> >> +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP %s, Port %d"
>> >> +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState"
>> >> +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s"
>> >> diff --git a/block/vxhs.c b/block/vxhs.c
>> >> new file mode 100644
>> >> index 0000000..32c37c5
>> >> --- /dev/null
>> >> +++ b/block/vxhs.c
>> >> @@ -0,0 +1,1304 @@
>> >> +/*
>> >> + * QEMU Block driver for Veritas HyperScale (VxHS)
>> >> + *
>> >
>> > Another good place for a link to the specification, if available.
>> >
>> >
>> >> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> >> + * See the COPYING file in the top-level directory.
>> >> + *
>> >> + */
>> >> +
>> >> +#include "vxhs.h"
>> >> +#include <qnio/qnio_api.h>
>> >> +#include "trace.h"
>> >> +#include "qapi/qmp/qerror.h"
>> >> +#include "qapi/qmp/qdict.h"
>> >> +#include "qapi/qmp/qstring.h"
>> >> +
>> >> +#define VXHS_OPT_FILENAME        "filename"
>> >> +#define VXHS_OPT_VDISK_ID        "vdisk_id"
>> >> +#define VXHS_OPT_SERVER          "server."
>> >> +#define VXHS_OPT_HOST            "host"
>> >> +#define VXHS_OPT_PORT            "port"
>> >> +
>> >> +/* qnio client ioapi_ctx */
>> >> +static void *global_qnio_ctx;
>> >> +
>> >> +/* vdisk prefix to pass to qnio */
>> >> +static const char vdisk_prefix[] = "/dev/of/vdisk";
>> >> +
>> >> +void vxhs_inc_acb_segment_count(void *ptr, int count)
>> >> +{
>> >> +    VXHSAIOCB *acb = ptr;
>> >> +    BDRVVXHSState *s = acb->common.bs->opaque;
>> >> +
>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> >> +    acb->segments += count;
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> >> +}
>> >> +
>> >> +void vxhs_dec_acb_segment_count(void *ptr, int count)
>> >> +{
>> >> +    VXHSAIOCB *acb = ptr;
>> >> +    BDRVVXHSState *s = acb->common.bs->opaque;
>> >> +
>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> >> +    acb->segments -= count;
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> >> +}
>> >> +
>> >> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count)
>> >> +{
>> >> +    VXHSAIOCB *acb = ptr;
>> >> +    BDRVVXHSState *s = acb->common.bs->opaque;
>> >> +    int segcount = 0;
>> >> +
>> >> +
>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> >> +    acb->segments -= count;
>> >> +    segcount = acb->segments;
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> >> +
>> >> +    return segcount;
>> >> +}
>> >
>> > Unused function?
>> >
>>
>> Yes, it is unused. Sorry I forgot to remove it from V5. Will remember
>> to remove in the next iteration.
>>
>> >> +
>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
>> >> +{
>> >> +    VXHSAIOCB *acb = ptr;
>> >> +
>> >> +    acb->buffer = buffer;
>> >> +}
>> >> +
>> >
>> > Unused function?
>> >
>>
>> This is called from within libqnio.
>>
>> > (There are several unused functions - are there other patches that use these
>> > functions?  Maybe I missed them on the list.  If not, and they are for
>> > future work, maybe it would be best to introduce these functions in the
>> > patch series that will actually use them. It will cut down on the amount of
>> > code to be reviewed, and since it is not exercised it can't be verified
>> > anyway).
>> >
>>
>> Removed all unused functions, except vxhs_dec_and_get_acb_segment_count().
>>
>> >> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t count)
>> >> +{
>> >> +    BDRVVXHSState *s = ptr;
>> >> +
>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +    s->vdisk_aio_count += count;
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +}
>> >> +
>> >> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t count)
>> >> +{
>> >> +    BDRVVXHSState *s = ptr;
>> >> +
>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +    s->vdisk_aio_count -= count;
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +}
>> >> +
>> >> +uint32_t vxhs_get_vdisk_iocount(void *ptr)
>> >> +{
>> >> +    BDRVVXHSState *s = ptr;
>> >> +    uint32_t count = 0;
>> >> +
>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +    count = s->vdisk_aio_count;
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +
>> >> +    return count;
>> >> +}
>> >
>> > Function only used by another function that is not called from anywhere.
>> >
>> >
>>
>> Removed.
>>
>> >> +
>> >> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m)
>> >> +{
>> >> +    VXHSAIOCB *acb = NULL;
>> >> +    BDRVVXHSState *s = NULL;
>> >> +    int rv = 0;
>> >> +    int segcount = 0;
>> >> +    uint32_t error = 0;
>> >> +    uint32_t opcode = 0;
>> >> +
>> >> +    assert(m);
>> >> +    if (m) {
>> >> +        /* TODO: need common get message attrs, not two separate lib calls */
>> >> +        error = qemu_iio_extract_msg_error(m);
>> >> +        opcode = qemu_iio_extract_msg_opcode(m);
>> >> +    }
>> >> +    switch (opcode) {
>> >> +    case IRP_READ_REQUEST:
>> >> +    case IRP_WRITE_REQUEST:
>> >> +
>> >> +    /*
>> >> +     * ctx is VXHSAIOCB*
>> >> +     * ctx is NULL if error is VXERROR_CHANNEL_HUP or reason is IIO_REASON_HUP
>> >> +     */
>> >> +    if (ctx) {
>> >> +        acb = ctx;
>> >> +        s = acb->common.bs->opaque;
>> >> +    } else {
>> >> +        trace_vxhs_iio_callback(error, reason);
>> >> +        goto out;
>> >> +    }
>> >> +
>> >> +    if (error) {
>> >> +        trace_vxhs_iio_callback_iofail(error, reason, acb, acb->segments);
>> >> +
>> >> +        if (reason == IIO_REASON_DONE || reason == IIO_REASON_EVENT) {
>> >> +            /*
>> >> +             * Storage agent failed while I/O was in progress
>> >> +             * Fail over only if the qnio channel dropped, indicating
>> >> +             * storage agent failure. Don't fail over in response to other
>> >> +             * I/O errors such as disk failure.
>> >> +             */
>> >> +            if (error == VXERROR_RETRY_ON_SOURCE || error == VXERROR_HUP ||
>> >> +                error == VXERROR_CHANNEL_HUP || error == -1) {
>> >> +                /*
>> >> +                 * Start vDisk IO failover once callback is
>> >> +                 * called against all the pending IOs.
>> >> +                 * If vDisk has no redundancy enabled
>> >> +                 * then IO failover routine will mark
>> >> +                 * the vDisk failed and fail all the
>> >> +                 * AIOs without retry (stateless vDisk)
>> >> +                 */
>> >> +                VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +                if (!OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> >> +                    OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
>> >> +                }
>> >> +                /*
>> >> +                 * Check if this acb is already queued before.
>> >> +                 * It is possible in case if I/Os are submitted
>> >> +                 * in multiple segments (QNIO_MAX_IO_SIZE).
>> >> +                 */
>> >> +                VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> >> +                if (!OF_AIOCB_FLAGS_QUEUED(acb)) {
>> >> +                    QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
>> >> +                                         acb, retry_entry);
>> >> +                    OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> >> +                    s->vdisk_aio_retry_qd++;
>> >> +                    trace_vxhs_iio_callback_retry(s->vdisk_guid, acb);
>> >> +                }
>> >> +                segcount = --acb->segments;
>> >> +                VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> >> +                /*
>> >> +                 * Decrement AIO count only when callback is called
>> >> +                 * against all the segments of aiocb.
>> >> +                 */
>> >> +                if (segcount == 0 && --s->vdisk_aio_count == 0) {
>> >> +                    /*
>> >> +                     * Start vDisk I/O failover
>> >> +                     */
>> >> +                    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +                    /*
>> >> +                     * TODO:
>> >> +                     * Need to explore further if it is possible to optimize
>> >> +                     * the failover operation on Virtual-Machine (global)
>> >> +                     * specific rather vDisk specific.
>> >> +                     */
>> >> +                    vxhs_failover_io(s);
>> >> +                    goto out;
>> >> +                }
>> >> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +                goto out;
>> >> +            }
>> >> +        } else if (reason == IIO_REASON_HUP) {
>> >> +            /*
>> >> +             * Channel failed, spontaneous notification,
>> >> +             * not in response to I/O
>> >> +             */
>> >> +            trace_vxhs_iio_callback_chnlfail(error);
>> >> +            /*
>> >> +             * TODO: Start channel failover when no I/O is outstanding
>> >> +             */
>> >> +            goto out;
>> >> +        } else {
>> >> +            trace_vxhs_iio_callback_fail(reason, acb, acb->segments,
>> >> +                                         acb->size, error);
>> >> +        }
>> >> +    }
>> >> +    /*
>> >> +     * Set error into acb if not set. In case if acb is being
>> >> +     * submitted in multiple segments then need to set the error
>> >> +     * only once.
>> >> +     *
>> >> +     * Once acb done callback is called for the last segment
>> >> +     * then acb->ret return status will be sent back to the
>> >> +     * caller.
>> >> +     */
>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> >> +    if (error && !acb->ret) {
>> >> +        acb->ret = error;
>> >> +    }
>> >> +    --acb->segments;
>> >> +    segcount = acb->segments;
>> >> +    assert(segcount >= 0);
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> >> +    /*
>> >> +     * Check if all the outstanding I/Os are done against acb.
>> >> +     * If yes then send signal for AIO completion.
>> >> +     */
>> >> +    if (segcount == 0) {
>> >> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
>> >> +        if (rv != sizeof(acb)) {
>> >> +            error_report("VXHS AIO completion failed: %s", strerror(errno));
>> >> +            abort();
>> >> +        }
>> >> +    }
>> >> +    break;
>> >> +
>> >> +    case IRP_VDISK_CHECK_IO_FAILOVER_READY:
>> >> +        /* ctx is BDRVVXHSState* */
>> >> +        assert(ctx);
>> >> +        trace_vxhs_iio_callback_ready(((BDRVVXHSState *)ctx)->vdisk_guid,
>> >> +                                      error);
>> >> +        vxhs_failover_ioctl_cb(error, ctx);
>> >> +        break;
>> >> +
>> >> +    default:
>> >> +        if (reason == IIO_REASON_HUP) {
>> >> +            /*
>> >> +             * Channel failed, spontaneous notification,
>> >> +             * not in response to I/O
>> >> +             */
>> >> +            trace_vxhs_iio_callback_chnfail(error, errno);
>> >> +            /*
>> >> +             * TODO: Start channel failover when no I/O is outstanding
>> >> +             */
>> >> +        } else {
>> >> +            trace_vxhs_iio_callback_unknwn(opcode, error);
>> >> +        }
>> >> +        break;
>> >> +    }
>> >> +out:
>> >> +    return;
>> >> +}
>> >> +
>> >> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s)
>> >> +{
>> >> +    BlockCompletionFunc *cb = acb->common.cb;
>> >> +    void *opaque = acb->common.opaque;
>> >> +    int ret = 0;
>> >> +
>> >> +    if (acb->ret != 0) {
>> >> +        trace_vxhs_complete_aio(acb, acb->ret);
>> >> +    /*
>> >> +     * We mask all the IO errors generically as EIO for upper layers
>> >> +     * Right now our IO Manager uses non standard error codes. Instead
>> >> +     * of confusing upper layers with incorrect interpretation we are
>> >> +     * doing this workaround.
>> >> +     */
>> >> +        ret = (-EIO);
>> >> +    }
>> >> +    /*
>> >> +     * Copy back contents from stabilization buffer into original iovector
>> >> +     * before returning the IO
>> >> +     */
>> >> +    if (acb->buffer != NULL) {
>> >> +        qemu_iovec_from_buf(acb->qiov, 0, acb->buffer, acb->qiov->size);
>> >> +        free(acb->buffer);
>> >> +        acb->buffer = NULL;
>> >> +    }
>> >> +    vxhs_dec_vdisk_iocount(s, 1);
>> >> +    acb->aio_done = VXHS_IO_COMPLETED;
>> >> +    qemu_aio_unref(acb);
>> >> +    cb(opaque, ret);
>> >> +}
>> >> +
>> >> +/*
>> >> + * This is the HyperScale event handler registered to QEMU.
>> >> + * It is invoked when any IO gets completed and written on the pipe
>> >> + * by the callback called from QNIO thread context. Then it marks
>> >> + * the AIO as completed, and releases HyperScale AIO callbacks.
>> >> + */
>> >> +void vxhs_aio_event_reader(void *opaque)
>> >> +{
>> >> +    BDRVVXHSState *s = opaque;
>> >> +    ssize_t ret;
>> >> +
>> >> +    do {
>> >> +        char *p = (char *)&s->qnio_event_acb;
>> >> +
>> >> +        ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos,
>> >> +                   sizeof(s->qnio_event_acb) - s->event_reader_pos);
>> >> +        if (ret > 0) {
>> >> +            s->event_reader_pos += ret;
>> >> +            if (s->event_reader_pos == sizeof(s->qnio_event_acb)) {
>> >> +                s->event_reader_pos = 0;
>> >> +                vxhs_complete_aio(s->qnio_event_acb, s);
>> >> +            }
>> >> +        }
>> >> +    } while (ret < 0 && errno == EINTR);
>> >> +}
>> >> +
>> >> +/*
>> >> + * QEMU calls this to check if there are any pending IO on vDisk.
>> >> + * It will wait in a loop until all the AIOs are completed.
>> >> + */
>> >> +int vxhs_aio_flush_cb(void *opaque)
>> >> +{
>> >> +    BDRVVXHSState *s = opaque;
>> >> +
>> >> +    return vxhs_get_vdisk_iocount(s);
>> >> +}
>> >
>> > Unused function.
>> >
>> >
>>
>> Removed.
>>
>> >> +
>> >> +/*
>> >> + * This will be called by QEMU while booting for each vDisk.
>> >> + * bs->opaque will be allocated by QEMU upper block layer before
>> >> + * calling open. It will load all the QNIO operations from
>> >> + * qemuqnio library and call QNIO operation to create channel to
>> >> + * do IO on vDisk. It parses the URI, gets the hostname, vDisk
>> >> + * path and then sets HyperScale event handler to QEMU.
>> >> + */
>> >> +void *vxhs_setup_qnio(void)
>> >> +{
>> >> +    void *qnio_ctx = NULL;
>> >> +
>> >> +    qnio_ctx = qemu_iio_init(vxhs_iio_callback);
>> >
>> > One concern with this callback function - when does it get called?  I am
>> > assuming that qemu_iio_init() is running threads / processes in the
>> > background, so this callback could happen at anytime.
>> >
>> > (I'll discuss my specific concern later, in vxhs_failover_ioctl_cb())
>> >
>> >
>> >
>>
>> Yes, that is correct. vxhs_iio_callback() is called from the threads
>> running in libqnio.
>>
>>
>>
>> > An aside:
>> >
>> > It is odd that the qnio library is using a qemu_ prefix for functions.
>> > Seems like a good way to run into issues later, especially with relatively
>> > generic function names such as 'qemu_ck_spin_lock()'. Consider that most
>> > developers / maintainers are probably not going to compile in vxhs given the
>> > external library dependence without a specific reason, so name collisions
>> > could sneak in.
>> >
>> > 'nm -g libqnioshim.so | grep qemu_' shows a few that give me pause:
>> >
>> > 0000000000004d07 T qemu_calculate_iovec_size
>> > 000000000000663d T qemu_ck_destroy_lock
>> > 00000000000065cf T qemu_ck_initialize_lock
>> > 00000000000065f7 T qemu_ck_spin_lock
>> > 000000000000661a T qemu_ck_spin_unlock
>> > 0000000000004dfb T qemu_convert_iovector_to_buffer
>> > 0000000000004d63 T qemu_copy_iov_to_buffer
>> > 0000000000005f2f T qemu_extract_flush_response
>> > 0000000000005c89 T qemu_extract_geometry_from_json
>> > 0000000000005a78 T qemu_extract_size_from_json
>> > 0000000000004f40 T qemu_is_iovector_read_aligned
>> >
>> >
>>
>> Replaced all qemu_ prefixed symbols within qnio.
>>
>> >
>> >> +
>> >> +    if (qnio_ctx != NULL) {
>> >> +        trace_vxhs_setup_qnio(qnio_ctx);
>> >> +    } else {
>> >> +        trace_vxhs_setup_qnio_nwerror('.');
>> >> +    }
>> >> +
>> >> +    return qnio_ctx;
>> >> +}
>> >> +
>> >> +static QemuOptsList runtime_opts = {
>> >> +    .name = "vxhs",
>> >> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
>> >> +    .desc = {
>> >> +        {
>> >> +            .name = VXHS_OPT_FILENAME,
>> >> +            .type = QEMU_OPT_STRING,
>> >> +            .help = "URI to the Veritas HyperScale image",
>> >> +        },
>> >> +        {
>> >> +            .name = VXHS_OPT_VDISK_ID,
>> >> +            .type = QEMU_OPT_STRING,
>> >> +            .help = "UUID of the VxHS vdisk",
>> >> +        },
>> >> +        { /* end of list */ }
>> >> +    },
>> >> +};
>> >> +
>> >> +static QemuOptsList runtime_tcp_opts = {
>> >> +    .name = "vxhs_tcp",
>> >> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head),
>> >> +    .desc = {
>> >> +        {
>> >> +            .name = VXHS_OPT_HOST,
>> >> +            .type = QEMU_OPT_STRING,
>> >> +            .help = "host address (ipv4 addresses)",
>> >> +        },
>> >> +        {
>> >> +            .name = VXHS_OPT_PORT,
>> >> +            .type = QEMU_OPT_NUMBER,
>> >> +            .help = "port number on which VxHSD is listening (default 9999)",
>> >> +            .def_value_str = "9999"
>> >> +        },
>> >> +        {
>> >> +            .name = "to",
>> >> +            .type = QEMU_OPT_NUMBER,
>> >> +            .help = "max port number, not supported by VxHS",
>> >> +        },
>> >> +        {
>> >> +            .name = "ipv4",
>> >> +            .type = QEMU_OPT_BOOL,
>> >> +            .help = "ipv4 bool value, not supported by VxHS",
>> >> +        },
>> >> +        {
>> >> +            .name = "ipv6",
>> >> +            .type = QEMU_OPT_BOOL,
>> >> +            .help = "ipv6 bool value, not supported by VxHS",
>> >> +        },
>> >> +        { /* end of list */ }
>> >> +    },
>> >> +};
>> >> +
>> >> +/*
>> >> + * Parse the incoming URI and populate *options with all the host(s)
>> >> + * information. Host at index 0 is local storage agent.
>> >> + * Remaining are the reflection target storage agents. The local storage agent
>> >> + * ip is the efficient internal address in the uri, e.g. 192.168.0.2.
>> >> + * The local storage agent address is stored at index 0. The reflection target
>> >> + * ips, are the E-W data network addresses of the reflection node agents, also
>> >> + * extracted from the uri.
>> >> + */
>> >> +static int vxhs_parse_uri(const char *filename, QDict *options)
>> >> +{
>> >> +    gchar **target_list;
>> >> +    URI *uri = NULL;
>> >> +    char *hoststr, *portstr;
>> >> +    char *port;
>> >> +    int ret = 0;
>> >> +    int i = 0;
>> >> +
>> >> +    trace_vxhs_parse_uri_filename(filename);
>> >> +    target_list = g_strsplit(filename, "%7D", 0);
>> >> +    assert(target_list != NULL && target_list[0] != NULL);
>> >> +
>> >> +    for (i = 0; target_list[i] != NULL && *target_list[i]; i++) {
>> >> +        uri = uri_parse(target_list[i]);
>> >> +        if (!uri || !uri->server) {
>> >> +            uri_free(uri);
>> >> +            ret = -EINVAL;
>> >> +            break;
>> >> +        }
>> >> +
>> >> +        hoststr = g_strdup_printf(VXHS_OPT_SERVER"%d.host", i);
>> >> +        qdict_put(options, hoststr, qstring_from_str(uri->server));
>> >> +
>> >> +        portstr = g_strdup_printf(VXHS_OPT_SERVER"%d.port", i);
>> >> +        if (uri->port) {
>> >> +            port = g_strdup_printf("%d", uri->port);
>> >> +            qdict_put(options, portstr, qstring_from_str(port));
>> >> +            g_free(port);
>> >> +        }
>> >> +
>> >> +        if (i == 0 && (strstr(uri->path, "vxhs") == NULL)) {
>> >> +            qdict_put(options, "vdisk_id", qstring_from_str(uri->path));
>> >> +        }
>> >> +
>> >> +        trace_vxhs_parse_uri_hostinfo(i + 1, uri->server, uri->port);
>> >> +        g_free(hoststr);
>> >> +        g_free(portstr);
>> >> +        uri_free(uri);
>> >> +    }
>> >> +
>> >> +    g_strfreev(target_list);
>> >> +    return ret;
>> >> +}
>> >> +
>> >> +static void vxhs_parse_filename(const char *filename, QDict *options,
>> >> +                               Error **errp)
>> >> +{
>> >> +    if (qdict_haskey(options, "host")
>> >> +        || qdict_haskey(options, "port")
>> >> +        || qdict_haskey(options, "path"))
>> >> +    {
>> >> +        error_setg(errp, "host/port/path and a file name may not be specified "
>> >> +                         "at the same time");
>> >> +        return;
>> >> +    }
>> >> +
>> >> +    if (strstr(filename, "://")) {
>> >> +        int ret = vxhs_parse_uri(filename, options);
>> >> +        if (ret < 0) {
>> >> +            error_setg(errp, "Invalid URI. URI should be of the form "
>> >> +                       "  vxhs://<host_ip>:<port>/{<vdisk_id>}");
>> >> +        }
>> >> +    }
>> >> +}
>> >> +
>> >> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
>> >> +                              int *cfd, int *rfd, Error **errp)
>> >> +{
>> >> +    QDict *backing_options = NULL;
>> >> +    const char *vxhs_filename;
>> >> +    char *of_vsa_addr = NULL;
>> >> +    Error *local_err = NULL;
>> >> +    const char *ptr = NULL;
>> >
>> > ptr does not need to be initialized to NULL here.  I also recommend using a
>> > more descriptive name like 'vdisk_id_opt' over something generic like 'ptr'.
>> >
>>
>> Fixed.
>>
>>
>> >> +    char *file_name = NULL;
>> >> +    size_t num_servers;
>> >> +    char *str = NULL;
>> >> +    QemuOpts *opts;
>> >> +    int ret = 0;
>> >> +    int i;
>> >> +
>> >> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
>> >> +    qemu_opts_absorb_qdict(opts, options, &local_err);
>> >> +    if (local_err) {
>> >> +        ret = -EINVAL;
>> >> +        goto out;
>> >> +    }
>> >> +
>> >> +    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
>> >> +    if (vxhs_filename) {
>> >> +        trace_vxhs_qemu_init_filename(vxhs_filename);
>> >> +    }
>> >> +
>> >> +    ptr = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
>> >> +    if (!ptr) {
>> >> +        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
>> >> +        ret = -EINVAL;
>> >> +        goto out;
>> >> +    }
>> >> +    s->vdisk_guid = g_strdup(ptr);
>> >
>> > This is never freed anywhere.
>> >
>>
>> Now freed on error conditions and in vxhs_close().
>>
>> >> +    trace_vxhs_qemu_init_vdisk(ptr);
>> >> +
>> >> +    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
>> >> +    if (num_servers < 1) {
>> >> +        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
>> >> +        ret = -EINVAL;
>> >> +        goto out;
>> >> +    }
>> >> +    trace_vxhs_qemu_init_numservers(num_servers);
>> >> +
>> >> +    qemu_opts_del(opts);
>> >> +
>> >> +    for (i = 0; i < num_servers; i++) {
>> >
>> > s->vdisk_hostinfo[] is an array with MAX_HOSTS (4) as a size.  There is no
>> > bounds checking on num_servers, which is an issue: we can easily blow
>> > past this.
>> >
>> > In general, for any parameter supplied by a user, or read from a file /
>> > protocol / etc, we need to do appropriate bounds checking to prevent
>> > potential CVEs.
>> >
>>
>> Fixed.
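
A minimal sketch of the bounds check being asked for, assuming MAX_HOSTS is
the size of s->vdisk_hostinfo[] as noted in the review; the error message
text is illustrative:

    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
    if (num_servers < 1) {
        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
        ret = -EINVAL;
        goto out;
    } else if (num_servers > MAX_HOSTS) {
        /* Reject configurations that would overflow vdisk_hostinfo[] */
        error_setg(&local_err, "Maximum %d servers allowed", MAX_HOSTS);
        ret = -EINVAL;
        goto out;
    }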
>>
>> >> +        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
>> >> +        qdict_extract_subqdict(options, &backing_options, str);
>> >> +
>> >> +        /* create opts info from runtime_tcp_opts list */
>> >> +        opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
>> >> +        qemu_opts_absorb_qdict(opts, backing_options, &local_err);
>> >> +        if (local_err) {
>> >> +            ret = -EINVAL;
>> >> +            qemu_opts_del(opts);
>> >> +            goto out;
>> >> +        }
>> >> +
>> >> +        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(opts,
>> >> +                                                            VXHS_OPT_HOST));
>> >
>> > This is never freed anywhere.
>> >
>>
>> Now freed on error and vxhs_close().
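
A minimal sketch of the cleanup this implies, assuming the vdisk_guid and
per-host hostip strings are the only heap allocations to release; the helper
name is hypothetical:

    static void vxhs_free_resources(BDRVVXHSState *s)
    {
        int i;

        g_free(s->vdisk_guid);
        s->vdisk_guid = NULL;
        for (i = 0; i < s->vdisk_nhosts; i++) {
            g_free(s->vdisk_hostinfo[i].hostip);
            s->vdisk_hostinfo[i].hostip = NULL;
        }
    }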
>>
>> >> +        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(opts,
>> >> +                                                                 VXHS_OPT_PORT),
>> >> +                                                    NULL, 0);
>> >> +
>> >> +        s->vdisk_hostinfo[i].qnio_cfd = -1;
>> >> +        s->vdisk_hostinfo[i].vdisk_rfd = -1;
>> >> +        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
>> >> +                             s->vdisk_hostinfo[i].port);
>> >> +        qemu_opts_del(opts);
>> >> +    }
>> >> +
>> >> +    s->vdisk_nhosts = i;
>> >> +    s->vdisk_cur_host_idx = 0;
>> >> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
>> >> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
>> >> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> >> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);
>> >> +
>> >> +    /*
>> >> +     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
>> >> +     */
>> >> +    if (global_qnio_ctx == NULL) {
>> >> +        global_qnio_ctx = vxhs_setup_qnio();
>> >> +        if (global_qnio_ctx == NULL) {
>> >> +            error_setg(&local_err, "Failed vxhs_setup_qnio");
>> >> +            ret = -EINVAL;
>> >> +            goto out;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    *cfd = qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
>> >> +    if (*cfd < 0) {
>> >> +        error_setg(&local_err, "Failed qemu_open_iio_conn");
>> >> +        ret = -EIO;
>> >> +        goto out;
>> >> +    }
>> >> +    *rfd = qemu_iio_devopen(global_qnio_ctx, *cfd, file_name, 0);
>> >> +    if (*rfd < 0) {
>> >> +        error_setg(&local_err, "Failed qemu_iio_devopen");
>> >> +        ret = -EIO;
>> >> +        goto out;
>> >> +    }
>> >> +
>> >> +out:
>> >> +    g_free(file_name);
>> >> +    g_free(of_vsa_addr);
>> >> +    if (str) {
>> >> +        qdict_del(backing_options, str);
>> >> +        g_free(str);
>> >> +    }
>> >> +
>> >> +    if (ret < 0) {
>> >> +        errno = -ret;
>> >> +    }
>> >> +    error_propagate(errp, local_err);
>> >> +    return ret;
>> >> +}
>> >> +
>> >> +int vxhs_open(BlockDriverState *bs, QDict *options,
>> >> +              int bdrv_flags, Error **errp)
>> >> +{
>> >> +    BDRVVXHSState *s = bs->opaque;
>> >> +    AioContext *aio_context;
>> >> +    int qemu_qnio_cfd = -1;
>> >> +    int qemu_rfd = -1;
>> >> +    int ret = 0;
>> >> +
>> >> +    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
>> >> +    if (ret < 0) {
>> >> +        trace_vxhs_open_fail(ret);
>> >> +        return ret;
>> >> +    }
>> >> +
>> >> +    s->qnio_ctx = global_qnio_ctx;
>> >> +    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
>> >> +    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
>> >> +    s->vdisk_size = 0;
>> >> +    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
>> >> +
>> >> +    /*
>> >> +     * Create a pipe for communicating between two threads in different
>> >> +     * context. Set handler for read event, which gets triggered when
>> >> +     * IO completion is done by non-QEMU context.
>> >> +     */
>> >> +    ret = qemu_pipe(s->fds);
>> >> +    if (ret < 0) {
>> >> +        trace_vxhs_open_epipe('.');
>> >> +        ret = -errno;
>> >> +        goto out;
>> >> +    }
>> >> +    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
>> >> +
>> >> +    aio_context = bdrv_get_aio_context(bs);
>> >> +    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
>> >> +                       false, vxhs_aio_event_reader, NULL, s);
>> >> +
>> >> +    /*
>> >> +     * Allocate/Initialize the spin-locks.
>> >> +     *
>> >> +     * NOTE:
>> >> +     *      Since spin lock is being allocated
>> >> +     *      dynamically hence moving acb struct
>> >> +     *      specific lock to BDRVVXHSState
>> >> +     *      struct. The reason being,
>> >> +     *      we don't want the overhead of spin
>> >> +     *      lock being dynamically allocated and
>> >> +     *      freed for every AIO.
>> >> +     */
>> >> +    s->vdisk_lock = VXHS_SPIN_LOCK_ALLOC;
>> >> +    s->vdisk_acb_lock = VXHS_SPIN_LOCK_ALLOC;
>> >> +
>> >> +    return 0;
>> >> +
>> >> +out:
>> >> +    if (s->vdisk_hostinfo[0].vdisk_rfd >= 0) {
>> >> +        qemu_iio_devclose(s->qnio_ctx, 0,
>> >> +                                 s->vdisk_hostinfo[0].vdisk_rfd);
>> >> +    }
>> >> +    /* never close qnio_cfd */
>> >
>> > Surely we want to be able to close it at some point... or does the qnio
>> > library take care of that cleanup somewhere/sometime?
>> >
>> >
>>
>> Changed to close channel fd on error conditions, vxhs_close() and
>> vxhs_reopen_vdisk().
>>
>>
>> >> +    trace_vxhs_open_fail(ret);
>> >> +    return ret;
>> >> +}
>> >> +
>> >> +static const AIOCBInfo vxhs_aiocb_info = {
>> >> +    .aiocb_size = sizeof(VXHSAIOCB)
>> >> +};
>> >> +
>> >> +/*
>> >> + * This is called in QNIO thread context when IO is done
>> >> + * on the IO Manager and the QNIO client has received the data or
>> >> + * ACK. It notifies another event handler thread running in QEMU context
>> >> + * by writing on the pipe.
>> >> + */
>> >> +void vxhs_finish_aiocb(ssize_t ret, void *arg)
>> >> +{
>> >> +    VXHSAIOCB *acb = arg;
>> >> +    BlockDriverState *bs = acb->common.bs;
>> >> +    BDRVVXHSState *s = bs->opaque;
>> >> +    int rv;
>> >> +
>> >> +    acb->ret = ret;
>> >> +    rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
>> >> +    if (rv != sizeof(acb)) {
>> >> +        error_report("VXHS AIO completion failed: %s",
>> >> +                     strerror(errno));
>> >> +        abort();
>> >> +    }
>> >> +}
>> >
>> > Another unused function.
>> >
>>
>> Removed.
>>
>>
>> >> +
>> >> +/*
>> >> + * This allocates QEMU-VXHS callback for each IO
>> >> + * and is passed to QNIO. When QNIO completes the work,
>> >> + * it will be passed back through the callback.
>> >> + */
>> >> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
>> >> +                                int64_t sector_num, QEMUIOVector *qiov,
>> >> +                                int nb_sectors,
>> >> +                                BlockCompletionFunc *cb,
>> >> +                                void *opaque, int iodir)
>> >> +{
>> >> +    VXHSAIOCB *acb = NULL;
>> >> +    BDRVVXHSState *s = bs->opaque;
>> >> +    size_t size;
>> >> +    uint64_t offset;
>> >> +    int iio_flags = 0;
>> >> +    int ret = 0;
>> >> +
>> >> +    offset = sector_num * BDRV_SECTOR_SIZE;
>> >> +    size = nb_sectors * BDRV_SECTOR_SIZE;
>> >> +
>> >> +    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
>> >> +    /*
>> >> +     * Setup or initialize VXHSAIOCB.
>> >> +     * Every single field should be initialized since
>> >> +     * acb will be picked up from the slab without
>> >> +     * initializing with zero.
>> >> +     */
>> >> +    acb->io_offset = offset;
>> >> +    acb->size = size;
>> >> +    acb->ret = 0;
>> >> +    acb->flags = 0;
>> >> +    acb->aio_done = VXHS_IO_INPROGRESS;
>> >> +    acb->segments = 0;
>> >> +    acb->buffer = 0;
>> >> +    acb->qiov = qiov;
>> >> +    acb->direction = iodir;
>> >> +
>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +    if (OF_VDISK_FAILED(s)) {
>> >> +        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
>> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +        goto errout;
>> >> +    }
>> >> +    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> >> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>> >> +        s->vdisk_aio_retry_qd++;
>> >> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
>> >> +        goto out;
>> >> +    }
>> >> +    s->vdisk_aio_count++;
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +
>> >> +    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>> >> +
>> >> +    switch (iodir) {
>> >> +    case VDISK_AIO_WRITE:
>> >> +            vxhs_inc_acb_segment_count(acb, 1);
>> >> +            ret = qemu_iio_writev(s->qnio_ctx,
>> >> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> >> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
>> >> +            break;
>> >> +    case VDISK_AIO_READ:
>> >> +            vxhs_inc_acb_segment_count(acb, 1);
>> >> +            ret = qemu_iio_readv(s->qnio_ctx,
>> >> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> >> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
>> >> +            break;
>> >> +    default:
>> >> +            trace_vxhs_aio_rw_invalid(iodir);
>> >> +            goto errout;
>> >> +    }
>> >> +
>> >> +    if (ret != 0) {
>> >> +        trace_vxhs_aio_rw_ioerr(
>> >> +                  s->vdisk_guid, iodir, size, offset,
>> >> +                  acb, acb->segments, ret, errno);
>> >> +        /*
>> >> +         * Don't retry I/Os against vDisk having no
>> >> +         * redundancy or stateful storage on compute
>> >> +         *
>> >> +         * TODO: Revisit this code path to see if any
>> >> +         *       particular error needs to be handled.
>> >> +         *       At this moment failing the I/O.
>> >> +         */
>> >> +        VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +        if (s->vdisk_nhosts == 1) {
>> >> +            trace_vxhs_aio_rw_iofail(s->vdisk_guid);
>> >> +            s->vdisk_aio_count--;
>> >> +            vxhs_dec_acb_segment_count(acb, 1);
>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +            goto errout;
>> >> +        }
>> >> +        if (OF_VDISK_FAILED(s)) {
>> >> +            trace_vxhs_aio_rw_devfail(
>> >> +                      s->vdisk_guid, iodir, size, offset);
>> >> +            s->vdisk_aio_count--;
>> >> +            vxhs_dec_acb_segment_count(acb, 1);
>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +            goto errout;
>> >> +        }
>> >> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> >> +            /*
>> >> +             * Queue all incoming io requests after failover starts.
>> >> +             * Number of requests that can arrive is limited by io queue depth
>> >> +             * so an app blasting independent ios will not exhaust memory.
>> >> +             */
>> >> +            QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>> >> +            s->vdisk_aio_retry_qd++;
>> >> +            OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> >> +            s->vdisk_aio_count--;
>> >> +            vxhs_dec_acb_segment_count(acb, 1);
>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +            trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 2);
>> >> +            goto out;
>> >> +        }
>> >> +        OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
>> >> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>> >> +        s->vdisk_aio_retry_qd++;
>> >> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> >> +        vxhs_dec_acb_segment_count(acb, 1);
>> >> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 3);
>> >> +        /*
>> >> +         * Start I/O failover if there is no active
>> >> +         * AIO within vxhs block driver.
>> >> +         */
>> >> +        if (--s->vdisk_aio_count == 0) {
>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +            /*
>> >> +             * Start IO failover
>> >> +             */
>> >> +            vxhs_failover_io(s);
>> >> +            goto out;
>> >> +        }
>> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +    }
>> >> +
>> >> +out:
>> >> +    return &acb->common;
>> >> +
>> >> +errout:
>> >> +    qemu_aio_unref(acb);
>> >> +    return NULL;
>> >> +}
>> >> +
>> >> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
>> >> +                                   int64_t sector_num, QEMUIOVector *qiov,
>> >> +                                   int nb_sectors,
>> >> +                                   BlockCompletionFunc *cb, void *opaque)
>> >> +{
>> >> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
>> >> +                         cb, opaque, VDISK_AIO_READ);
>> >> +}
>> >> +
>> >> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
>> >> +                                    int64_t sector_num, QEMUIOVector *qiov,
>> >> +                                    int nb_sectors,
>> >> +                                    BlockCompletionFunc *cb, void *opaque)
>> >> +{
>> >> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
>> >> +                         cb, opaque, VDISK_AIO_WRITE);
>> >> +}
>> >> +
>> >> +/*
>> >> + * This is called by QEMU when a flush gets triggered from within
>> >> + * a guest at the block layer, either for IDE or SCSI disks.
>> >> + */
>> >> +int vxhs_co_flush(BlockDriverState *bs)
>> >> +{
>> >> +    BDRVVXHSState *s = bs->opaque;
>> >> +    uint64_t size = 0;
>> >> +    int ret = 0;
>> >> +
>> >> +    ret = qemu_iio_ioctl(s->qnio_ctx,
>> >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> >> +            VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
>> >> +
>> >> +    if (ret < 0) {
>> >> +        /*
>> >> +         * Currently not handling the flush ioctl
>> >> +         * failure because of network connection
>> >> +         * disconnect. Since all the writes are
>> >> +         * committed into persistent storage hence
>> >> +         * this flush call is noop and we can safely
>> >> +         * return success status to the caller.
>> >
>> > I'm not sure I understand here.  Are you saying the qemu_iio_ioctl() call
>> > above is a noop?
>> >
>>
>> Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a place-holder at present
>> in case we later want to add some functionality to it. I have now
>> added a comment to this effect to avoid any confusion.
>>
>
> The problem is you don't know which version of the qnio library any given
> QEMU binary will be using, since it is a shared library.  Future versions
> may implement the flush ioctl as expressed above, in which case we may hide
> a valid error.
>
> Am I correct in assuming that this call suppresses errors because an error
> is returned for an unknown ioctl operation of VDISK_AIO_FLUSH?  If so, and
> you want a placeholder here for flushing, you should go all the way and stub
> out the underlying ioctl call to return success.  Then QEMU can at least
> rely on the error return from the flush operation.
>
>
>> >> +         *
>> >> +         * If any write failure occurs for inflight
>> >> +         * write AIO because of network disconnect
>> >> +         * then anyway IO failover will be triggered.
>> >> +         */
>> >> +        trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
>> >> +        ret = 0;
>> >> +    }
>> >> +
>> >> +    return ret;
>> >> +}
>> >> +
>> >> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s)
>> >> +{
>> >> +    void *ctx = NULL;
>> >> +    int flags = 0;
>> >> +    unsigned long vdisk_size = 0;
>> >> +    int ret = 0;
>> >> +
>> >> +    ret = qemu_iio_ioctl(s->qnio_ctx,
>> >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> >> +            VDISK_STAT, &vdisk_size, ctx, flags);
>> >> +
>> >> +    if (ret < 0) {
>> >> +        trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno);
>> >> +        return 0;
>> >> +    }
>> >> +
>> >> +    trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size);
>> >> +    return vdisk_size;
>> >> +}
>> >> +
>> >> +/*
>> >> + * Returns the size of vDisk in bytes. This is required
>> >> + * by QEMU block upper block layer so that it is visible
>> >> + * to guest.
>> >> + */
>> >> +int64_t vxhs_getlength(BlockDriverState *bs)
>> >> +{
>> >> +    BDRVVXHSState *s = bs->opaque;
>> >> +    unsigned long vdisk_size = 0;
>> >> +
>> >> +    if (s->vdisk_size > 0) {
>> >> +        vdisk_size = s->vdisk_size;
>> >
>> > s->vdisk_size is int64_t, vdisk_size is unsigned long.  On 32-bit platforms,
>> > I believe unsigned long is only 32-bits, so this assignment is problematic.
>> >
>> > What is the maximum disk size for vxhs?
>> >
>> > (this comment applies to all the vdisk_size usage, but I'll only mention it
>> > here)
>> >
>>
>> Changed to int64_t in all places. The code does not place any
>> restriction on the maximum size of the vdisk.
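
A minimal sketch of vxhs_getlength() with int64_t used throughout, assuming
vxhs_get_vdisk_stat() is likewise changed to return int64_t:

    int64_t vxhs_getlength(BlockDriverState *bs)
    {
        BDRVVXHSState *s = bs->opaque;
        int64_t vdisk_size;

        if (s->vdisk_size > 0) {
            vdisk_size = s->vdisk_size;
        } else {
            /* Fetch the vDisk size using the stat ioctl */
            vdisk_size = vxhs_get_vdisk_stat(s);
            if (vdisk_size > 0) {
                s->vdisk_size = vdisk_size;
            }
        }
        return vdisk_size > 0 ? vdisk_size : -EIO;
    }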
>>
>> >> +    } else {
>> >> +        /*
>> >> +         * Fetch the vDisk size using stat ioctl
>> >> +         */
>> >> +        vdisk_size = vxhs_get_vdisk_stat(s);
>> >> +        if (vdisk_size > 0) {
>> >> +            s->vdisk_size = vdisk_size;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    if (vdisk_size > 0) {
>> >> +        return (int64_t)vdisk_size; /* return size in bytes */
>> >> +    } else {
>> >> +        return -EIO;
>> >> +    }
>> >> +}
>> >> +
>> >> +/*
>> >> + * Returns actual blocks allocated for the vDisk.
>> >> + * This is required by qemu-img utility.
>> >> + */
>> >> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs)
>> >> +{
>> >> +    BDRVVXHSState *s = bs->opaque;
>> >> +    unsigned long vdisk_size = 0;
>> >> +
>> >> +    if (s->vdisk_size > 0) {
>> >> +        vdisk_size = s->vdisk_size;
>> >> +    } else {
>> >> +        /*
>> >> +         * TODO:
>> >> +         * Once HyperScale storage-virtualizer provides
>> >> +         * actual physical allocation of blocks then
>> >> +         * fetch that information and return back to the
>> >> +         * caller but for now just get the full size.
>> >> +         */
>> >> +        vdisk_size = vxhs_get_vdisk_stat(s);
>> >> +        if (vdisk_size > 0) {
>> >> +            s->vdisk_size = vdisk_size;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    if (vdisk_size > 0) {
>> >> +        return (int64_t)vdisk_size; /* return size in bytes */
>> >> +    } else {
>> >> +        return -EIO;
>> >> +    }
>> >> +}
>> >> +
>> >> +void vxhs_close(BlockDriverState *bs)
>> >> +{
>> >> +    BDRVVXHSState *s = bs->opaque;
>> >> +
>> >> +    close(s->fds[VDISK_FD_READ]);
>> >> +    close(s->fds[VDISK_FD_WRITE]);
>> >> +
>> >> +    /*
>> >> +     * Never close the channel - it is not ref counted and
>> >> +     * closing it would close it for all vdisks.
>> >> +     */
>> >> +    if (s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd >= 0) {
>> >> +        qemu_iio_devclose(s->qnio_ctx, 0,
>> >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd);
>> >> +    }
>> >> +    if (s->vdisk_lock) {
>> >> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_lock);
>> >> +        s->vdisk_lock = NULL;
>> >> +    }
>> >> +    if (s->vdisk_acb_lock) {
>> >> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_acb_lock);
>> >> +        s->vdisk_acb_lock = NULL;
>> >> +    }
>> >> +
>> >> +    /*
>> >> +     * TODO: Verify that all the resources were relinquished.
>> >> +     */
>> >
>> > Which resources?  Allocated here in the driver, or stuff in libqnio?
>> >
>>
>> The comment was for all resources allocated within vxhs.c.
>> Added code to free all allocated memory and fds on close.
>>
>>
>> >> +}
>> >> +
>> >> +/*
>> >> + * If errors are consistent with storage agent failure:
>> >> + *  - Try to reconnect in case error is transient or storage agent restarted.
>> >> + *  - Currently failover is triggered on a per-vDisk basis. There is
>> >> + *    scope for further optimization where failover could be global (per VM).
>> >> + *  - In case of a network (storage agent) failure, I/Os for all vDisks
>> >> + *    with no redundancy are failed without attempting I/O failover,
>> >> + *    because of the stateless nature of the vDisk.
>> >> + *  - If the local or source storage agent is down, send an ioctl to the
>> >> + *    remote storage agent to check whether it is in a state to accept
>> >> + *    application I/Os.
>> >> + *  - Once remote storage agent is ready to accept I/O, start I/O shipping.
>> >> + *  - If I/Os cannot be serviced then vDisk will be marked failed so that
>> >> + *    new incoming I/Os are returned with failure immediately.
>> >> + *  - If vDisk I/O failover is in progress then all new/inflight I/Os are
>> >> + *    queued, and are restarted or failed depending on whether the failover
>> >> + *    operation succeeds.
>> >> + *  - I/O failover can be started either in I/O forward or I/O backward
>> >> + *    path.
>> >> + *  - I/O failover will be started as soon as all the pending acb(s)
>> >> + *    are queued and there is no pending I/O count.
>> >> + *  - If I/O failover couldn't be completed within QNIO_CONNECT_TIMOUT_SECS
>> >> + *    then vDisk will be marked failed and all I/Os will be completed with
>> >> + *    error.
>> >> + */
>> >> +
>> >> +int vxhs_switch_storage_agent(BDRVVXHSState *s)
>> >> +{
>> >> +    int res = 0;
>> >> +    int flags = (IIO_FLAG_ASYNC | IIO_FLAG_DONE);
>> >> +
>> >> +    trace_vxhs_switch_storage_agent(
>> >> +              s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
>> >> +              s->vdisk_guid);
>> >> +
>> >> +    res = vxhs_reopen_vdisk(s, s->vdisk_ask_failover_idx);
>> >> +    if (res == 0) {
>> >> +        res = qemu_iio_ioctl(s->qnio_ctx,
>> >> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].vdisk_rfd,
>> >> +                  VDISK_CHECK_IO_FAILOVER_READY, NULL, s, flags);
>> >> +    } else {
>> >> +        trace_vxhs_switch_storage_agent_failed(
>> >> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
>> >> +                  s->vdisk_guid, res, errno);
>> >> +        /*
>> >> +         * TODO: calling vxhs_failover_ioctl_cb from here ties up the qnio epoll
>> >> +         * loop if qemu_iio_ioctl fails synchronously (-1) for all hosts in io
>> >> +         * target list.
>> >> +         */
>> >> +
>> >> +        /* try next host */
>> >> +        vxhs_failover_ioctl_cb(res, s);
>> >> +    }
>> >> +    return res;
>> >> +}
>> >> +
>> >> +void vxhs_failover_ioctl_cb(int res, void *ctx)
>> >> +{
>> >> +    BDRVVXHSState *s = ctx;
>> >> +
>> >> +    if (res == 0) {
>> >> +        /* found failover target */
>> >> +        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
>> >> +        s->vdisk_ask_failover_idx = 0;
>> >> +        trace_vxhs_failover_ioctl_cb(
>> >> +                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> >> +                   s->vdisk_guid);
>> >> +        VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
>> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +        vxhs_handle_queued_ios(s);
>> >> +    } else {
>> >> +        /* keep looking */
>> >> +        trace_vxhs_failover_ioctl_cb_retry(s->vdisk_guid);
>> >> +        s->vdisk_ask_failover_idx++;
>> >> +        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
>> >
>> > This function, vxhs_failover_ioctrl_cb(), is called from the external
>> > library callback, in addition to QEMU itself (i.e., vxhs_aio_rw).  This
>> > makes me concerned that we may have (or be able to induce) an unsafe race
>> > condition when incrementing the vdisk_ask_failover_id array index.
>> >
>>
>> vxhs_failover_ioctl_cb()  can be called from the callback
>> vxhs_iio_callback() case IRP_VDISK_CHECK_IO_FAILOVER_READY in response
>> to the qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY)
>>
>> This ioctl qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY) is only
>> called from within vxhs_switch_storage_agent()
>>
>> A direct call to vxhs_failover_ioctl_cb() is only issued from the same
>> function vxhs_switch_storage_agent() on an "else" condition. So
>> vxhs_failover_ioctl_cb() can only be called from a single thread
>> context either in response to the ioctl, or directly. We should be
>> safe here, but thanks for pointing it out!
>
>
> The callback function 'vxhs_iio_callback' also calls
> vxhs_failover_ioctl_cb().
>
> vxhs_failover_ioctl_cb() is also called from vxhs_aio_rw() by QEMU (via
> 'vxds_failover_io').
>
> The above two paths make me concerned about the array pointer being
> incremented.
>
> (BTW: 'vxhs_failover_ioctl_cb' seems to be misnamed, because it is not a
> callback function, but just a helper function).
>
> Also related: this code reads like it will hang QEMU if QNIO goes offline;
> we will be caught in perpetual retries, in which case vxhs_aio_rw() may
> never return.
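If both paths really can run concurrently, one possible mitigation is to fold the index bump under the existing vdisk_lock; a minimal sketch of that idea (a hypothetical helper, not code from the patch):

    /* hypothetical helper: advance the failover index under vdisk_lock so a
     * concurrent caller cannot race the increment/wrap-around */
    static int vxhs_advance_failover_idx(BDRVVXHSState *s)
    {
        int idx;

        VXHS_SPIN_LOCK(s->vdisk_lock);
        s->vdisk_ask_failover_idx++;
        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
            s->vdisk_ask_failover_idx = 0;    /* wrap and retry the host list */
        }
        idx = s->vdisk_ask_failover_idx;
        VXHS_SPIN_UNLOCK(s->vdisk_lock);

        return idx;
    }
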
>
>>
>> >> +            /* pause and cycle through list again */
>> >> +            sleep(QNIO_CONNECT_RETRY_SECS);
>> >> +            s->vdisk_ask_failover_idx = 0;
>> >> +        }
>> >> +        res = vxhs_switch_storage_agent(s);
>> >> +    }
>> >> +}
>> >> +
>> >> +int vxhs_failover_io(BDRVVXHSState *s)
>> >> +{
>> >> +    int res = 0;
>> >> +
>> >> +    trace_vxhs_failover_io(s->vdisk_guid);
>> >> +
>> >> +    s->vdisk_ask_failover_idx = 0;
>> >> +    res = vxhs_switch_storage_agent(s);
>> >> +
>> >> +    return res;
>> >> +}
>> >> +
>> >> +/*
>> >> + * Try to reopen the vDisk on one of the available hosts.
>> >> + * If the vDisk reopen succeeds on any of the hosts, then
>> >> + * check whether that node is ready to accept I/O.
>> >> + */
>> >> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int index)
>> >> +{
>> >> +    char *of_vsa_addr = NULL;
>> >> +    char *file_name = NULL;
>> >> +    int  res = 0;
>> >> +
>> >> +    /*
>> >> +     * Don't close the channel if it was successfully
>> >> +     * opened before. Reuse of the same channel open
>> >> +     * fd is handled within the iio* API.
>> >> +     *
>> >> +     * Close the stale vdisk device remote fd since
>> >> +     * it is invalid after a channel disconnect.
>> >> +     */
>> >> +    if (s->vdisk_hostinfo[index].vdisk_rfd >= 0) {
>> >> +        qemu_iio_devclose(s->qnio_ctx, 0,
>> >> +                                 s->vdisk_hostinfo[index].vdisk_rfd);
>> >> +        s->vdisk_hostinfo[index].vdisk_rfd = -1;
>> >> +    }
>> >> +
>> >> +    /*
>> >> +     * build storage agent address and vdisk device name strings
>> >> +     */
>> >> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
>> >> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
>> >> +             s->vdisk_hostinfo[index].hostip, s->vdisk_hostinfo[index].port);
>> >> +    /*
>> >> +     * open qnio channel to storage agent if not opened before.
>> >> +     */
>> >> +    if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
>> >> +        s->vdisk_hostinfo[index].qnio_cfd =
>> >> +                qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
>> >> +        if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
>> >> +            trace_vxhs_reopen_vdisk(s->vdisk_hostinfo[index].hostip);
>> >> +            res = ENODEV;
>> >> +            goto out;
>> >> +        }
>> >> +    }
>> >> +    /*
>> >> +     * open vdisk device
>> >> +     */
>> >> +    s->vdisk_hostinfo[index].vdisk_rfd =
>> >> +            qemu_iio_devopen(global_qnio_ctx,
>> >> +             s->vdisk_hostinfo[index].qnio_cfd, file_name, 0);
>> >> +    if (s->vdisk_hostinfo[index].vdisk_rfd < 0) {
>> >> +        trace_vxhs_reopen_vdisk_openfail(file_name);
>> >> +        res = EIO;
>> >> +        goto out;
>> >> +    }
>> >> +
>> >> +out:
>> >> +    g_free(of_vsa_addr);
>> >> +    g_free(file_name);
>> >> +    return res;
>> >> +}
>> >> +
>> >> +int vxhs_handle_queued_ios(BDRVVXHSState *s)
>> >> +{
>> >> +    VXHSAIOCB *acb = NULL;
>> >> +    int res = 0;
>> >> +
>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +    while ((acb = QSIMPLEQ_FIRST(&s->vdisk_aio_retryq)) != NULL) {
>> >> +        /*
>> >> +         * Before we process the acb, check whether I/O failover
>> >> +         * started again due to failback or cascading failure.
>> >> +         */
>> >> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +            goto out;
>> >> +        }
>> >> +        QSIMPLEQ_REMOVE_HEAD(&s->vdisk_aio_retryq, retry_entry);
>> >> +        s->vdisk_aio_retry_qd--;
>> >> +        OF_AIOCB_FLAGS_RESET_QUEUED(acb);
>> >> +        if (OF_VDISK_FAILED(s)) {
>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +            vxhs_fail_aio(acb, EIO);
>> >> +            VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +        } else {
>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +            res = vxhs_restart_aio(acb);
>> >> +            trace_vxhs_handle_queued_ios(acb, res);
>> >> +            VXHS_SPIN_LOCK(s->vdisk_lock);
>> >> +            if (res) {
>> >> +                QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
>> >> +                                     acb, retry_entry);
>> >> +                OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> >> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +                goto out;
>> >> +            }
>> >> +        }
>> >> +    }
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>> >> +out:
>> >> +    return res;
>> >> +}
>> >> +
>> >> +int vxhs_restart_aio(VXHSAIOCB *acb)
>> >> +{
>> >> +    BDRVVXHSState *s = NULL;
>> >> +    int iio_flags = 0;
>> >> +    int res = 0;
>> >> +
>> >> +    s = acb->common.bs->opaque;
>> >> +
>> >> +    if (acb->direction == VDISK_AIO_WRITE) {
>> >> +        vxhs_inc_vdisk_iocount(s, 1);
>> >> +        vxhs_inc_acb_segment_count(acb, 1);
>> >> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>> >> +        res = qemu_iio_writev(s->qnio_ctx,
>> >> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> >> +                acb->qiov->iov, acb->qiov->niov,
>> >> +                acb->io_offset, (void *)acb, iio_flags);
>> >> +    }
>> >> +
>> >> +    if (acb->direction == VDISK_AIO_READ) {
>> >> +        vxhs_inc_vdisk_iocount(s, 1);
>> >> +        vxhs_inc_acb_segment_count(acb, 1);
>> >> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>> >> +        res = qemu_iio_readv(s->qnio_ctx,
>> >> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>> >> +                acb->qiov->iov, acb->qiov->niov,
>> >> +                acb->io_offset, (void *)acb, iio_flags);
>> >> +    }
>> >> +
>> >> +    if (res != 0) {
>> >> +        vxhs_dec_vdisk_iocount(s, 1);
>> >> +        vxhs_dec_acb_segment_count(acb, 1);
>> >> +        trace_vxhs_restart_aio(acb->direction, res, errno);
>> >> +    }
>> >> +
>> >> +    return res;
>> >> +}
>> >> +
>> >> +void vxhs_fail_aio(VXHSAIOCB *acb, int err)
>> >> +{
>> >> +    BDRVVXHSState *s = NULL;
>> >> +    int segcount = 0;
>> >> +    int rv = 0;
>> >> +
>> >> +    s = acb->common.bs->opaque;
>> >> +
>> >> +    trace_vxhs_fail_aio(s->vdisk_guid, acb);
>> >> +    if (!acb->ret) {
>> >> +        acb->ret = err;
>> >> +    }
>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>> >> +    segcount = acb->segments;
>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>> >> +    if (segcount == 0) {
>> >> +        /*
>> >> +         * Complete the io request
>> >> +         */
>> >> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
>> >> +        if (rv != sizeof(acb)) {
>> >> +            error_report("VXHS AIO completion failed: %s",
>> >> +                         strerror(errno));
>> >> +            abort();
>> >> +        }
>> >> +    }
>> >> +}
>> >> +
>> >> +static void vxhs_detach_aio_context(BlockDriverState *bs)
>> >> +{
>> >> +    BDRVVXHSState *s = bs->opaque;
>> >> +
>> >> +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
>> >> +                       false, NULL, NULL, NULL);
>> >> +
>> >> +}
>> >> +
>> >> +static void vxhs_attach_aio_context(BlockDriverState *bs,
>> >> +                                   AioContext *new_context)
>> >> +{
>> >> +    BDRVVXHSState *s = bs->opaque;
>> >> +
>> >> +    aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ],
>> >> +                       false, vxhs_aio_event_reader, NULL, s);
>> >> +}
>> >> +
>> >> +static BlockDriver bdrv_vxhs = {
>> >> +    .format_name                  = "vxhs",
>> >> +    .protocol_name                = "vxhs",
>> >> +    .instance_size                = sizeof(BDRVVXHSState),
>> >> +    .bdrv_file_open               = vxhs_open,
>> >> +    .bdrv_parse_filename          = vxhs_parse_filename,
>> >> +    .bdrv_close                   = vxhs_close,
>> >> +    .bdrv_getlength               = vxhs_getlength,
>> >> +    .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks,
>> >> +    .bdrv_aio_readv               = vxhs_aio_readv,
>> >> +    .bdrv_aio_writev              = vxhs_aio_writev,
>> >> +    .bdrv_co_flush_to_disk        = vxhs_co_flush,
>> >> +    .bdrv_detach_aio_context      = vxhs_detach_aio_context,
>> >> +    .bdrv_attach_aio_context      = vxhs_attach_aio_context,
>> >> +};
>> >> +
>> >> +void bdrv_vxhs_init(void)
>> >> +{
>> >> +    trace_vxhs_bdrv_init('.');
>> >> +    bdrv_register(&bdrv_vxhs);
>> >> +}
>> >> +
>> >> +block_init(bdrv_vxhs_init);
>> >> diff --git a/block/vxhs.h b/block/vxhs.h
>> >> new file mode 100644
>> >> index 0000000..38e0165
>> >> --- /dev/null
>> >> +++ b/block/vxhs.h
>> >> @@ -0,0 +1,237 @@
>> >> +/*
>> >> + * QEMU Block driver for Veritas HyperScale (VxHS)
>> >> + *
>> >> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> >> + * See the COPYING file in the top-level directory.
>> >> + *
>> >> + */
>> >> +
>> >> +#ifndef VXHSD_H
>> >> +#define VXHSD_H
>> >> +
>> >> +#include <gmodule.h>
>> >> +#include <inttypes.h>
>> >> +#include <pthread.h>
>> >> +
>> >> +#include "qemu/osdep.h"
>> >> +#include "qapi/error.h"
>> >> +#include "qemu/error-report.h"
>> >> +#include "block/block_int.h"
>> >> +#include "qemu/uri.h"
>> >> +#include "qemu/queue.h"
>> >> +
>> >> +#define OF_GUID_STR_LEN             40
>> >> +#define OF_GUID_STR_SZ              (OF_GUID_STR_LEN + 1)
>> >> +#define QNIO_CONNECT_RETRY_SECS     5
>> >> +#define QNIO_CONNECT_TIMOUT_SECS    120
>> >> +
>> >> +/* constants from io_qnio.h */
>> >> +#define IIO_REASON_DONE     0x00000004
>> >> +#define IIO_REASON_EVENT    0x00000008
>> >> +#define IIO_REASON_HUP      0x00000010
>> >> +
>> >> +/*
>> >> + * IO specific flags
>> >> + */
>> >> +#define IIO_FLAG_ASYNC      0x00000001
>> >> +#define IIO_FLAG_DONE       0x00000010
>> >> +#define IIO_FLAG_SYNC       0
>> >> +
>> >> +/* constants from error.h */
>> >> +#define VXERROR_RETRY_ON_SOURCE     44
>> >> +#define VXERROR_HUP                 901
>> >> +#define VXERROR_CHANNEL_HUP         903
>> >> +
>> >> +/* constants from iomgr.h and opcode.h */
>> >> +#define IRP_READ_REQUEST                    0x1FFF
>> >> +#define IRP_WRITE_REQUEST                   0x2FFF
>> >> +#define IRP_VDISK_CHECK_IO_FAILOVER_READY   2020
>> >> +
>> >> +/* Lock specific macros */
>> >> +#define VXHS_SPIN_LOCK_ALLOC                  \
>> >> +    (qemu_ck_initialize_lock())
>> >> +#define VXHS_SPIN_LOCK(lock)                  \
>> >> +    (qemu_ck_spin_lock(lock))
>> >> +#define VXHS_SPIN_UNLOCK(lock)                \
>> >> +    (qemu_ck_spin_unlock(lock))
>> >> +#define VXHS_SPIN_LOCK_DESTROY(lock)          \
>> >> +    (qemu_ck_destroy_lock(lock))
>> >
>> > What's the value of these macros, rather than just calling the underlying
>> > function directly in code?
>> >
>>
>> This way we can easily swap out CK locks with something else like
>> pthread_mutex if required.
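
For example, a minimal sketch of what swapping in pthread mutexes behind the same macro names could look like (assuming the lock continues to be passed around as a void pointer, as in the current code):

    /* hypothetical pthread-based definitions for the same macro names */
    static inline void *vxhs_lock_alloc(void)
    {
        pthread_mutex_t *m = g_new(pthread_mutex_t, 1);
        pthread_mutex_init(m, NULL);
        return m;
    }

    static inline void vxhs_lock_free(void *lock)
    {
        pthread_mutex_destroy(lock);
        g_free(lock);
    }

    #define VXHS_SPIN_LOCK_ALLOC          (vxhs_lock_alloc())
    #define VXHS_SPIN_LOCK(lock)          (pthread_mutex_lock(lock))
    #define VXHS_SPIN_UNLOCK(lock)        (pthread_mutex_unlock(lock))
    #define VXHS_SPIN_LOCK_DESTROY(lock)  (vxhs_lock_free(lock))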
>>
>> >> +
>> >> +typedef enum {
>> >> +    VXHS_IO_INPROGRESS,
>> >> +    VXHS_IO_COMPLETED,
>> >> +    VXHS_IO_ERROR
>> >> +} VXHSIOState;
>> >> +
>> >> +
>> >> +typedef void (*qnio_callback_t)(ssize_t retval, void *arg);
>> >> +
>> >> +#define VDISK_FD_READ 0
>> >> +#define VDISK_FD_WRITE 1
>> >> +
>> >> +#define QNIO_VDISK_NONE        0x00
>> >> +#define QNIO_VDISK_CREATE      0x01
>> >> +
>> >> +/* max IO size supported by QEMU NIO lib */
>> >> +#define QNIO_MAX_IO_SIZE       4194304
>> >> +
>> >> +#define MAX_HOSTS               4
>> >> +
>> >> +/*
>> >> + * Opcodes for making IOCTL on QEMU NIO library
>> >> + */
>> >> +#define BASE_OPCODE_SHARED     1000
>> >> +#define BASE_OPCODE_DAL        2000
>> >> +#define IRP_VDISK_STAT                  (BASE_OPCODE_SHARED + 5)
>> >> +#define IRP_VDISK_GET_GEOMETRY          (BASE_OPCODE_DAL + 17)
>> >> +#define IRP_VDISK_READ_PARTITION        (BASE_OPCODE_DAL + 18)
>> >> +#define IRP_VDISK_FLUSH                 (BASE_OPCODE_DAL + 19)
>> >> +
>> >> +/*
>> >> + * BDRVVXHSState specific flags
>> >> + */
>> >> +#define OF_VDISK_FLAGS_STATE_ACTIVE             0x0000000000000001
>> >> +#define OF_VDISK_FLAGS_STATE_FAILED             0x0000000000000002
>> >> +#define OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS   0x0000000000000004
>> >> +
>> >> +#define OF_VDISK_ACTIVE(s)                                              \
>> >> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_ACTIVE)
>> >> +#define OF_VDISK_SET_ACTIVE(s)                                          \
>> >> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_ACTIVE)
>> >> +#define OF_VDISK_RESET_ACTIVE(s)                                        \
>> >> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_ACTIVE)
>> >> +
>> >> +#define OF_VDISK_FAILED(s)                                              \
>> >> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_FAILED)
>> >> +#define OF_VDISK_SET_FAILED(s)                                          \
>> >> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_FAILED)
>> >> +#define OF_VDISK_RESET_FAILED(s)                                        \
>> >> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_FAILED)
>> >> +
>> >> +#define OF_VDISK_IOFAILOVER_IN_PROGRESS(s)                              \
>> >> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>> >> +#define OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s)                          \
>> >> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>> >> +#define OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s)                        \
>> >> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>> >> +
>> >> +/*
>> >> + * VXHSAIOCB specific flags
>> >> + */
>> >> +#define OF_ACB_QUEUED       0x00000001
>> >> +
>> >> +#define OF_AIOCB_FLAGS_QUEUED(a)            \
>> >> +        ((a)->flags & OF_ACB_QUEUED)
>> >> +#define OF_AIOCB_FLAGS_SET_QUEUED(a)        \
>> >> +        ((a)->flags |= OF_ACB_QUEUED)
>> >> +#define OF_AIOCB_FLAGS_RESET_QUEUED(a)      \
>> >> +        ((a)->flags &= ~OF_ACB_QUEUED)
>> >> +
>> >
>> > Many helper macros here... IMO, it decreases the readability of the code,
>> > and my preference is to just have the underlying code without the use of the
>> > macros.
>> >
>>
>> Would like to keep this unchanged if that's OK!
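
For comparison, the two styles read as follows for the failed-state check; both forms rely only on definitions already present in this patch:

    /* the macro form used throughout the driver ... */
    static bool vdisk_is_failed_macro(BDRVVXHSState *s)
    {
        return OF_VDISK_FAILED(s) != 0;
    }

    /* ... and the equivalent open-coded form being suggested in review */
    static bool vdisk_is_failed_direct(BDRVVXHSState *s)
    {
        return (s->vdisk_flags & OF_VDISK_FLAGS_STATE_FAILED) != 0;
    }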
>>
>> >> +typedef struct qemu2qnio_ctx {
>> >> +    uint32_t            qnio_flag;
>> >> +    uint64_t            qnio_size;
>> >> +    char                *qnio_channel;
>> >> +    char                *target;
>> >> +    qnio_callback_t     qnio_cb;
>> >> +} qemu2qnio_ctx_t;
>> >> +
>> >> +typedef qemu2qnio_ctx_t qnio2qemu_ctx_t;
>> >> +
>> >> +typedef struct LibQNIOSymbol {
>> >> +        const char *name;
>> >> +        gpointer *addr;
>> >> +} LibQNIOSymbol;
>> >> +
>> >> +typedef void (*iio_cb_t) (uint32_t rfd, uint32_t reason, void *ctx,
>> >> +                          void *reply);
>> >> +
>> >> +/*
>> >> + * HyperScale AIO callbacks structure
>> >> + */
>> >> +typedef struct VXHSAIOCB {
>> >> +    BlockAIOCB          common;
>> >> +    size_t              ret;
>> >> +    size_t              size;
>> >> +    QEMUBH              *bh;
>> >> +    int                 aio_done;
>> >> +    int                 segments;
>> >> +    int                 flags;
>> >> +    size_t              io_offset;
>> >> +    QEMUIOVector        *qiov;
>> >> +    void                *buffer;
>> >> +    int                 direction;  /* IO direction (r/w) */
>> >> +    QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry;
>> >> +} VXHSAIOCB;
>> >> +
>> >> +typedef struct VXHSvDiskHostsInfo {
>> >> +    int     qnio_cfd;        /* Channel FD */
>> >> +    int     vdisk_rfd;      /* vDisk remote FD */
>> >> +    char    *hostip;        /* Host's IP address */
>> >> +    int     port;           /* Host's port number */
>> >> +} VXHSvDiskHostsInfo;
>> >> +
>> >> +/*
>> >> + * Structure per vDisk maintained for state
>> >> + */
>> >> +typedef struct BDRVVXHSState {
>> >> +    int                     fds[2];
>> >> +    int64_t                 vdisk_size;
>> >> +    int64_t                 vdisk_blocks;
>> >> +    int64_t                 vdisk_flags;
>> >> +    int                     vdisk_aio_count;
>> >> +    int                     event_reader_pos;
>> >> +    VXHSAIOCB               *qnio_event_acb;
>> >> +    void                    *qnio_ctx;
>> >> +    void                    *vdisk_lock; /* Lock to protect BDRVVXHSState */
>> >> +    void                    *vdisk_acb_lock;  /* Protects ACB */
>> >> +    VXHSvDiskHostsInfo      vdisk_hostinfo[MAX_HOSTS]; /* Per host info */
>> >> +    int                     vdisk_nhosts;   /* Total number of hosts */
>> >> +    int                     vdisk_cur_host_idx; /* IOs are being shipped to */
>> >> +    int                     vdisk_ask_failover_idx; /* asking permission to ship I/O */
>> >> +    QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
>> >> +    int                     vdisk_aio_retry_qd;
>> >
>> > Is vdisk_aio_retry_qd just used for debugging purposes?  I see it
>> > incremented/decremented throughout the code, but no one checks it.
>> >
>>
>> Yes, it is for debugging only. Added a comment to state the same.
>>
>>
>> >> +    char                    *vdisk_guid;
>> >> +} BDRVVXHSState;
>> >> +
>> >> +void bdrv_vxhs_init(void);
>> >> +void *vxhs_setup_qnio(void);
>> >> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m);
>> >> +void vxhs_aio_event_reader(void *opaque);
>> >> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s);
>> >> +int vxhs_aio_flush_cb(void *opaque);
>> >> +void vxhs_finish_aiocb(ssize_t ret, void *arg);
>> >> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s);
>> >> +int vxhs_open(BlockDriverState *bs, QDict *options,
>> >> +              int bdrv_flags, Error **errp);
>> >> +void vxhs_close(BlockDriverState *bs);
>> >> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, int64_t sector_num,
>> >> +                                   QEMUIOVector *qiov, int nb_sectors,
>> >> +                                   BlockCompletionFunc *cb, void *opaque);
>> >> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, int64_t sector_num,
>> >> +                                    QEMUIOVector *qiov, int nb_sectors,
>> >> +                                    BlockCompletionFunc *cb,
>> >> +                                    void *opaque);
>> >> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs);
>> >> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
>> >> +                                QEMUIOVector *qiov, int nb_sectors,
>> >> +                                BlockCompletionFunc *cb,
>> >> +                                void *opaque, int write);
>> >> +int vxhs_co_flush(BlockDriverState *bs);
>> >> +int64_t vxhs_getlength(BlockDriverState *bs);
>> >> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t delta);
>> >> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t delta);
>> >> +uint32_t vxhs_get_vdisk_iocount(void *ptr);
>> >> +void vxhs_inc_acb_segment_count(void *ptr, int count);
>> >> +void vxhs_dec_acb_segment_count(void *ptr, int count);
>> >> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count);
>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
>> >> +int vxhs_failover_io(BDRVVXHSState *s);
>> >> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int hostinfo_index);
>> >> +int vxhs_switch_storage_agent(BDRVVXHSState *s);
>> >> +int vxhs_handle_queued_ios(BDRVVXHSState *s);
>> >> +int vxhs_restart_aio(VXHSAIOCB *acb);
>> >> +void vxhs_fail_aio(VXHSAIOCB *acb, int err);
>> >> +void vxhs_failover_ioctl_cb(int res, void *ctx);
>> >> +
>> >> +
>> >> +#endif
>> >> diff --git a/configure b/configure
>> >> index 4b808f9..c71e270 100755
>> >> --- a/configure
>> >> +++ b/configure
>> >> @@ -320,6 +320,7 @@ vhdx=""
>> >>  numa=""
>> >>  tcmalloc="no"
>> >>  jemalloc="no"
>> >> +vxhs=""
>> >>
>> >>  # parse CC options first
>> >>  for opt do
>> >> @@ -1150,6 +1151,11 @@ for opt do
>> >>    ;;
>> >>    --enable-jemalloc) jemalloc="yes"
>> >>    ;;
>> >> +  --disable-vxhs) vxhs="no"
>> >> +  ;;
>> >> +  --enable-vxhs) vxhs="yes"
>> >> +  ;;
>> >> +
>> >>    *)
>> >>        echo "ERROR: unknown option $opt"
>> >>        echo "Try '$0 --help' for more information"
>> >> @@ -1380,6 +1386,7 @@ disabled with --disable-FEATURE, default is enabled if available:
>> >>    numa            libnuma support
>> >>    tcmalloc        tcmalloc support
>> >>    jemalloc        jemalloc support
>> >> +  vxhs            Veritas HyperScale vDisk backend support
>> >>
>> >>  NOTE: The object files are built at the place where configure is launched
>> >>  EOF
>> >> @@ -4543,6 +4550,43 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO $TMPO; then
>> >>  fi
>> >>
>> >>  ##########################################
>> >> +# Veritas HyperScale block driver VxHS
>> >> +# Check if libqnio is installed
>> >> +if test "$vxhs" != "no" ; then
>> >> +  cat > $TMPC <<EOF
>> >> +#include <stdio.h>
>> >> +#include <qnio/qnio_api.h>
>> >> +
>> >> +void vxhs_inc_acb_segment_count(void *acb, int count);
>> >> +void vxhs_dec_acb_segment_count(void *acb, int count);
>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
>> >> +
>> >> +void vxhs_inc_acb_segment_count(void *ptr, int count)
>> >> +{
>> >> +}
>> >> +void vxhs_dec_acb_segment_count(void *ptr, int count)
>> >> +{
>> >> +}
>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
>> >> +{
>> >> +}
>> >> +int main(void) {
>> >> +    qemu_ck_initialize_lock();
>> >> +    return 0;
>> >> +}
>> >> +EOF
>> >> +  vxhs_libs="-lqnioshim -lqnio"
>> >> +  if compile_prog "" "$vxhs_libs" ; then
>> >> +    vxhs=yes
>> >> +  else
>> >> +    if test "$vxhs" = "yes" ; then
>> >> +      feature_not_found "vxhs block device" "Install libqnio. See github"
>> >> +    fi
>> >> +    vxhs=no
>> >> +  fi
>> >> +fi
>> >> +
>> >> +##########################################
>> >>  # End of CC checks
>> >>  # After here, no more $cc or $ld runs
>> >>
>> >> @@ -5488,6 +5532,12 @@ if test "$pthread_setname_np" = "yes" ; then
>> >>    echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak
>> >>  fi
>> >>
>> >> +if test "$vxhs" = "yes" ; then
>> >> +  echo "CONFIG_VXHS=y" >> $config_host_mak
>> >> +  echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak
>> >> +  echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak
>> >> +fi
>> >> +
>> >>  if test "$tcg_interpreter" = "yes"; then
>> >>    QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
>> >>  elif test "$ARCH" = "sparc64" ; then
>> >
>> > Your configure file should indicate if vxhs is enabled or not. For instance,
>> > here is a snippet from where this occurs in the current configure file:
>> >
>> > [...]
>> > echo "NUMA host support $numa"
>> > echo "tcmalloc support  $tcmalloc"
>> > echo "jemalloc support  $jemalloc"
>> > echo "avx2 optimization $avx2_opt"
>> >
>> >
>>
>> Fixed.
>>
>> >> --
>> >> 2.5.5
>> >>
>> >>
Kevin Wolf Sept. 9, 2016, 10:38 a.m. UTC | #16
Am 08.09.2016 um 22:46 hat ashish mittal geschrieben:
> Hi Kevin,
> 
> By design, our writeback cache is on a non-volatile SSD device. We do
> async writes to this cache and also maintain a persistent index map of
> the data written. This gives us the capability to recover the write-back
> cache if needed.

So your server application uses something like O_SYNC to write data to
the cache SSD, so that data is not left sitting only in the kernel
page cache or in a volatile write cache of the SSD?

Kevin
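
To illustrate the distinction being asked about, a minimal sketch of a hypothetical server-side cache write (illustration only, not code from this patch or from libqnio):

    #include <fcntl.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* hypothetical helper: write one cache block so that completion implies
     * the data is on stable media, not just in a volatile cache */
    static ssize_t cache_write_durable(const char *path, const void *buf,
                                       size_t len, off_t off)
    {
        /* O_SYNC: pwrite() returns only after the data (and required
         * metadata) has reached the underlying device */
        int fd = open(path, O_WRONLY | O_CREAT | O_SYNC, 0600);
        if (fd < 0) {
            return -1;
        }
        ssize_t ret = pwrite(fd, buf, len, off);
        /* without O_SYNC, an explicit fsync(fd) before acknowledging the
         * write would be needed; otherwise the data may still sit in the
         * kernel page cache or the SSD's volatile write cache */
        close(fd);
        return ret;
    }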

> On Thu, Sep 8, 2016 at 7:20 AM, Kevin Wolf <kwolf@redhat.com> wrote:
> > Am 08.09.2016 um 16:00 hat Jeff Cody geschrieben:
> >> > >> +/*
> >> > >> + * This is called by QEMU when a flush gets triggered from within
> >> > >> + * a guest at the block layer, either for IDE or SCSI disks.
> >> > >> + */
> >> > >> +int vxhs_co_flush(BlockDriverState *bs)
> >> > >> +{
> >> > >> +    BDRVVXHSState *s = bs->opaque;
> >> > >> +    uint64_t size = 0;
> >> > >> +    int ret = 0;
> >> > >> +
> >> > >> +    ret = qemu_iio_ioctl(s->qnio_ctx,
> >> > >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
> >> > >> +            VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
> >> > >> +
> >> > >> +    if (ret < 0) {
> >> > >> +        /*
> >> > >> +         * Currently not handling the flush ioctl
> >> > >> +         * failure because of network connection
> >> > >> +         * disconnect. Since all the writes are
> >> > >> +         * committed to persistent storage, this
> >> > >> +         * flush call is a noop and we can safely
> >> > >> +         * return success status to the caller.
> >> > >
> >> > > I'm not sure I understand here.  Are you saying the qemu_iio_ioctl() call
> >> > > above is a noop?
> >> > >
> >> >
> >> > Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a place-holder at present
> >> > in case we later want to add some functionality to it. I have now
> >> > added a comment to this effect to avoid any confusion.
> >> >
> >>
> >> The problem is you don't know which version of the qnio library any given
> >> QEMU binary will be using, since it is a shared library.  Future versions
> >> may implement the flush ioctl as expressed above, in which case we may hide
> >> a valid error.
> >>
> >> Am I correct in assuming that this call suppresses errors because an error
> >> is returned for an unknown ioctl operation of VDISK_AIO_FLUSH?  If so, and
> >> you want a placeholder here for flushing, you should go all the way and stub
> >> out the underlying ioctl call to return success.  Then QEMU can at least
> >> rely on the error return from the flush operation.
> >
> > So what's the story behind the missing flush command?
> >
> > Does the server always use something like O_SYNC, i.e. all potential
> > write caches in the stack operate in a writethrough mode? So each write
> > request is only completed successfully if it is ensured that the data is
> > safe on disk rather than in a volatile writeback cache?
> >
> > As soon as any writeback cache can be involved (e.g. the kernel page
> > cache or a volatile disk cache) and there is no flush command (a real
> > one, not just stubbed), the driver is not operating correctly and
> > therefore not ready for inclusion.
> >
> > So Ashish, can you tell us something about caching behaviour across the
> > storage stack when vxhs is involved?
> >
> > Kevin
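
For reference, a minimal sketch of what vxhs_co_flush() could look like with the error propagated instead of masked (same qemu_iio_ioctl() call as in the patch; this assumes the library's VDISK_AIO_FLUSH either does real work or is stubbed to return success, as suggested above):

    int vxhs_co_flush(BlockDriverState *bs)
    {
        BDRVVXHSState *s = bs->opaque;
        uint64_t size = 0;
        int ret;

        ret = qemu_iio_ioctl(s->qnio_ctx,
                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
                VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
        if (ret < 0) {
            trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
            ret = -EIO;    /* surface the failure to the block layer */
        }
        return ret;
    }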
Ashish Mittal Sept. 21, 2016, 6:39 a.m. UTC | #17
Hi,

I have submitted a new patch (V6) after fixing the problem of
libqnioshim referencing symbols from inside qemu vxhs.c code.

(1) libqnioshim code has now been brought inside vxhs.c
(2) vxhs.c now needs to link only with libqnio.so.
(3) libqnio.so no longer has any unresolved symbols that come from the QEMU
code. This can also be seen from the code in configure that
checks for the presence of libqnio. It does not have to provide any
empty function definitions (as before) for the test code to compile.

Have also fixed a couple of other points raised during review. Will
reply to pending questions soon.

Regards,
Ashish

On Thu, Sep 8, 2016 at 4:15 PM, ashish mittal <ashmit602@gmail.com> wrote:
>>> Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a place-holder at present
>>> in case we later want to add some functionality to it. I have now
>>> added a comment to this effect to avoid any confusion.
>>>
>>
>> The problem is you don't know which version of the qnio library any given
>> QEMU binary will be using, since it is a shared library.  Future versions
>> may implement the flush ioctl as expressed above, in which case we may hide
>> a valid error.
>>
>> Am I correct in assuming that this call suppresses errors because an error
>> is returned for an unknown ioctl operation of VDISK_AIO_FLUSH?  If so, and
>> you want a placeholder here for flushing, you should go all the way and stub
>> out the underlying ioctl call to return success.  Then QEMU can at least
>> rely on the error return from the flush operation.
>>
>>
>
> I agree. Will change accordingly. I also completely agree on the
> discussion of libqnio calling a qemu function. Will check on the best
> way to resolve that and get back.
>
> Thanks,
> Ashish
>
>
> On Thu, Sep 8, 2016 at 7:00 AM, Jeff Cody <jcody@redhat.com> wrote:
>> On Wed, Sep 07, 2016 at 03:32:47PM -0700, ashish mittal wrote:
>>> Hi Jeff,
>>>
>>> Thank you for the comments and corrections.
>>>
>>> I have submitted a v5 patch after incorporating most of your review
>>> comments. Please find details in-line.
>>>
>>> Thanks,
>>> Ashish
>>>
>>> On Tue, Aug 30, 2016 at 10:35 AM, Jeff Cody <jcody@redhat.com> wrote:
>>> >
>>> > First-pass-over-the-code review:
>>> >
>>> > On Mon, Aug 22, 2016 at 11:56:30PM -0700, Ashish Mittal wrote:
>>> >> This patch adds support for a new block device type called "vxhs".
>>> >> Source code for the library that this code loads can be downloaded from:
>>> >> https://github.com/MittalAshish/libqnio.git
>>> >>
>>> >> Sample command line using JSON syntax:
>>> >> ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"},{"host":"172.172.17.2","port":"9999"}]}'
>>> >>
>>> >> Sample command line using URI syntax:
>>> >> qemu-img convert -f raw -O raw -n /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D
>>> >>
>>> >
>>> > Is there a reference specification for vxhs available?  If so, the commit
>>> > message would be a good place to drop a link.
>>> >
>>>
>>> We don't have a reference specification doc yet. However, I will
>>> shortly be adding a test program to libqnio that can be used to
>>> test/demo basic libqnio IO transfer.
>>>
>>> >> Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com>
>>> >> ---
>>> >> v3 changelog:
>>> >> =============
>>> >> (1) Reworked QAPI/JSON parsing.
>>> >> (2) Reworked URI parsing as suggested by Kevin.
>>> >> (3) Fixes per review comments from Stefan on v1.
>>> >> (4) Fixes per review comments from Daniel on v3.
>>> >>
>>> >>  block/Makefile.objs |    2 +
>>> >>  block/trace-events  |   41 ++
>>> >>  block/vxhs.c        | 1304 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>> >>  block/vxhs.h        |  237 ++++++++++
>>> >>  configure           |   50 ++
>>> >>  5 files changed, 1634 insertions(+)
>>> >>  create mode 100644 block/vxhs.c
>>> >>  create mode 100644 block/vxhs.h
>>> >>
>>> >> diff --git a/block/Makefile.objs b/block/Makefile.objs
>>> >> index 2593a2f..5f83305 100644
>>> >> --- a/block/Makefile.objs
>>> >> +++ b/block/Makefile.objs
>>> >> @@ -20,6 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o
>>> >>  block-obj-$(CONFIG_GLUSTERFS) += gluster.o
>>> >>  block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
>>> >>  block-obj-$(CONFIG_LIBSSH2) += ssh.o
>>> >> +block-obj-$(CONFIG_VXHS) += vxhs.o
>>> >>  block-obj-y += accounting.o dirty-bitmap.o
>>> >>  block-obj-y += write-threshold.o
>>> >>
>>> >> @@ -43,3 +44,4 @@ block-obj-m        += dmg.o
>>> >>  dmg.o-libs         := $(BZIP2_LIBS)
>>> >>  qcow.o-libs        := -lz
>>> >>  linux-aio.o-libs   := -laio
>>> >> +vxhs.o-libs        := $(VXHS_LIBS)
>>> >> diff --git a/block/trace-events b/block/trace-events
>>> >> index 05fa13c..06c6d8c 100644
>>> >> --- a/block/trace-events
>>> >> +++ b/block/trace-events
>>> >> @@ -114,3 +114,44 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
>>> >>  qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>>> >>  qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>>> >>  qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
>>> >> +
>>> >> +# block/vxhs.c
>>> >> +vxhs_bdrv_init(const char c) "Registering VxHS AIO driver%c"
>>> >> +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, reason %d"
>>> >> +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p"
>>> >> +vxhs_setup_qnio_nwerror(char c) "Could not initialize the network channel. Bailing out%c"
>>> >> +vxhs_iio_callback_iofail(int err, int reason, void *acb, int seg) "Read/Write failed: error %d, reason %d, acb %p, segment %d"
>>> >> +vxhs_iio_callback_retry(char *guid, void *acb) "vDisk %s, added acb %p to retry queue (5)"
>>> >> +vxhs_iio_callback_chnlfail(int error) "QNIO channel failed, no i/o (%d)"
>>> >> +vxhs_iio_callback_fail(int r, void *acb, int seg, uint64_t size, int err) " ALERT: reason = %d , acb = %p, acb->segments = %d, acb->size = %lu Error = %d"
>>> >> +vxhs_fail_aio(char * guid, void *acb) "vDisk %s, failing acb %p"
>>> >> +vxhs_iio_callback_ready(char *vd, int err) "async vxhs_iio_callback: IRP_VDISK_CHECK_IO_FAILOVER_READY completed for vdisk %s with error %d"
>>> >> +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d"
>>> >> +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d"
>>> >> +vxhs_open_fail(int ret) "Could not open the device. Error = %d"
>>> >> +vxhs_open_epipe(char c) "Could not create a pipe for device. Bailing out%c"
>>> >> +vxhs_aio_rw(char *guid, int iodir, uint64_t size, uint64_t offset) "vDisk %s, vDisk device is in failed state iodir = %d size = %lu offset = %lu"
>>> >> +vxhs_aio_rw_retry(char *guid, void *acb, int queue) "vDisk %s, added acb %p to retry queue(%d)"
>>> >> +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d"
>>> >> +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int seg, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %lu offset = %lu ACB = %p Segments = %d. Error = %d, errno = %d"
>>> >> +vxhs_co_flush(char *guid, int ret, int err) "vDisk (%s) Flush ioctl failed ret = %d errno = %d"
>>> >> +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d"
>>> >> +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %lu"
>>> >> +vxhs_switch_storage_agent(char *ip, char *guid) "Query host %s for vdisk %s"
>>> >> +vxhs_switch_storage_agent_failed(char *ip, char *guid, int res, int err) "Query to host %s for vdisk %s failed, res = %d, errno = %d"
>>> >> +vxhs_failover_ioctl_cb(char *ip, char *guid) "Switched to storage server host-IP %s for vdisk %s"
>>> >> +vxhs_failover_ioctl_cb_retry(char *guid) "failover_ioctl_cb: keep looking for io target for vdisk %s"
>>> >> +vxhs_failover_io(char *vdisk) "I/O Failover starting for vDisk %s"
>>> >> +vxhs_reopen_vdisk(char *ip) "Failed to connect to storage agent on host-ip %s"
>>> >> +vxhs_reopen_vdisk_openfail(char *fname) "Failed to open vdisk device: %s"
>>> >> +vxhs_handle_queued_ios(void *acb, int res) "Restarted acb %p res %d"
>>> >> +vxhs_restart_aio(int dir, int res, int err) "IO ERROR FOR: Read/Write = %d Error = %d, errno = %d"
>>> >> +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld"
>>> >> +vxhs_aio_rw_iofail(char *guid) "vDisk %s, I/O operation failed."
>>> >> +vxhs_aio_rw_devfail(char *guid, int dir, uint64_t size, uint64_t off) "vDisk %s, vDisk device failed iodir = %d size = %lu offset = %lu"
>>> >> +vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s"
>>> >> +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s"
>>> >> +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = %d"
>>> >> +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP %s, Port %d"
>>> >> +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState"
>>> >> +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s"
>>> >> diff --git a/block/vxhs.c b/block/vxhs.c
>>> >> new file mode 100644
>>> >> index 0000000..32c37c5
>>> >> --- /dev/null
>>> >> +++ b/block/vxhs.c
>>> >> @@ -0,0 +1,1304 @@
>>> >> +/*
>>> >> + * QEMU Block driver for Veritas HyperScale (VxHS)
>>> >> + *
>>> >
>>> > Another good place for a link to the specification, if available.
>>> >
>>> >
>>> >> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>> >> + * See the COPYING file in the top-level directory.
>>> >> + *
>>> >> + */
>>> >> +
>>> >> +#include "vxhs.h"
>>> >> +#include <qnio/qnio_api.h>
>>> >> +#include "trace.h"
>>> >> +#include "qapi/qmp/qerror.h"
>>> >> +#include "qapi/qmp/qdict.h"
>>> >> +#include "qapi/qmp/qstring.h"
>>> >> +
>>> >> +#define VXHS_OPT_FILENAME        "filename"
>>> >> +#define VXHS_OPT_VDISK_ID        "vdisk_id"
>>> >> +#define VXHS_OPT_SERVER          "server."
>>> >> +#define VXHS_OPT_HOST            "host"
>>> >> +#define VXHS_OPT_PORT            "port"
>>> >> +
>>> >> +/* qnio client ioapi_ctx */
>>> >> +static void *global_qnio_ctx;
>>> >> +
>>> >> +/* vdisk prefix to pass to qnio */
>>> >> +static const char vdisk_prefix[] = "/dev/of/vdisk";
>>> >> +
>>> >> +void vxhs_inc_acb_segment_count(void *ptr, int count)
>>> >> +{
>>> >> +    VXHSAIOCB *acb = ptr;
>>> >> +    BDRVVXHSState *s = acb->common.bs->opaque;
>>> >> +
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>>> >> +    acb->segments += count;
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>>> >> +}
>>> >> +
>>> >> +void vxhs_dec_acb_segment_count(void *ptr, int count)
>>> >> +{
>>> >> +    VXHSAIOCB *acb = ptr;
>>> >> +    BDRVVXHSState *s = acb->common.bs->opaque;
>>> >> +
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>>> >> +    acb->segments -= count;
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>>> >> +}
>>> >> +
>>> >> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count)
>>> >> +{
>>> >> +    VXHSAIOCB *acb = ptr;
>>> >> +    BDRVVXHSState *s = acb->common.bs->opaque;
>>> >> +    int segcount = 0;
>>> >> +
>>> >> +
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>>> >> +    acb->segments -= count;
>>> >> +    segcount = acb->segments;
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>>> >> +
>>> >> +    return segcount;
>>> >> +}
>>> >
>>> > Unused function?
>>> >
>>>
>>> Yes, it is unused. Sorry I forgot to remove it from V5. Will remember
>>> to remove in the next iteration.
>>>
>>> >> +
>>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
>>> >> +{
>>> >> +    VXHSAIOCB *acb = ptr;
>>> >> +
>>> >> +    acb->buffer = buffer;
>>> >> +}
>>> >> +
>>> >
>>> > Unused function?
>>> >
>>>
>>> This is called from within libqnio.
>>>
>>> > (There are several unused functions - are there other patches that use these
>>> > functions?  Maybe I missed them on the list.  If not, and they are for
>>> > future work, maybe it would be best to introduce these functions in the
>>> > patch series that will actually use them. It will cut down on the amount of
>>> > code to be reviewed, and since it is not exercised it can't be verified
>>> > anyway).
>>> >
>>>
>>> Removed all unused functions, except vxhs_dec_and_get_acb_segment_count().
>>>
>>> >> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t count)
>>> >> +{
>>> >> +    BDRVVXHSState *s = ptr;
>>> >> +
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +    s->vdisk_aio_count += count;
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +}
>>> >> +
>>> >> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t count)
>>> >> +{
>>> >> +    BDRVVXHSState *s = ptr;
>>> >> +
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +    s->vdisk_aio_count -= count;
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +}
>>> >> +
>>> >> +uint32_t vxhs_get_vdisk_iocount(void *ptr)
>>> >> +{
>>> >> +    BDRVVXHSState *s = ptr;
>>> >> +    uint32_t count = 0;
>>> >> +
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +    count = s->vdisk_aio_count;
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +
>>> >> +    return count;
>>> >> +}
>>> >
>>> > Function only used by another function that is not called from anywhere.
>>> >
>>> >
>>>
>>> Removed.
>>>
>>> >> +
>>> >> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m)
>>> >> +{
>>> >> +    VXHSAIOCB *acb = NULL;
>>> >> +    BDRVVXHSState *s = NULL;
>>> >> +    int rv = 0;
>>> >> +    int segcount = 0;
>>> >> +    uint32_t error = 0;
>>> >> +    uint32_t opcode = 0;
>>> >> +
>>> >> +    assert(m);
>>> >> +    if (m) {
>>> >> +        /* TODO: need common get message attrs, not two separate lib calls */
>>> >> +        error = qemu_iio_extract_msg_error(m);
>>> >> +        opcode = qemu_iio_extract_msg_opcode(m);
>>> >> +    }
>>> >> +    switch (opcode) {
>>> >> +    case IRP_READ_REQUEST:
>>> >> +    case IRP_WRITE_REQUEST:
>>> >> +
>>> >> +    /*
>>> >> +     * ctx is VXHSAIOCB*
>>> >> +     * ctx is NULL if error is VXERROR_CHANNEL_HUP or reason is IIO_REASON_HUP
>>> >> +     */
>>> >> +    if (ctx) {
>>> >> +        acb = ctx;
>>> >> +        s = acb->common.bs->opaque;
>>> >> +    } else {
>>> >> +        trace_vxhs_iio_callback(error, reason);
>>> >> +        goto out;
>>> >> +    }
>>> >> +
>>> >> +    if (error) {
>>> >> +        trace_vxhs_iio_callback_iofail(error, reason, acb, acb->segments);
>>> >> +
>>> >> +        if (reason == IIO_REASON_DONE || reason == IIO_REASON_EVENT) {
>>> >> +            /*
>>> >> +             * Storage agent failed while I/O was in progress
>>> >> +             * Fail over only if the qnio channel dropped, indicating
>>> >> +             * storage agent failure. Don't fail over in response to other
>>> >> +             * I/O errors such as disk failure.
>>> >> +             */
>>> >> +            if (error == VXERROR_RETRY_ON_SOURCE || error == VXERROR_HUP ||
>>> >> +                error == VXERROR_CHANNEL_HUP || error == -1) {
>>> >> +                /*
>>> >> +                 * Start vDisk IO failover once the callback has
>>> >> +                 * been called for all the pending IOs.
>>> >> +                 * If the vDisk has no redundancy enabled
>>> >> +                 * then the IO failover routine will mark
>>> >> +                 * the vDisk as failed and fail all the
>>> >> +                 * AIOs without retry (stateless vDisk).
>>> >> +                 */
>>> >> +                VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +                if (!OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>>> >> +                    OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
>>> >> +                }
>>> >> +                /*
>>> >> +                 * Check whether this acb has already been queued.
>>> >> +                 * This is possible if I/Os are submitted
>>> >> +                 * in multiple segments (QNIO_MAX_IO_SIZE).
>>> >> +                 */
>>> >> +                VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>>> >> +                if (!OF_AIOCB_FLAGS_QUEUED(acb)) {
>>> >> +                    QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
>>> >> +                                         acb, retry_entry);
>>> >> +                    OF_AIOCB_FLAGS_SET_QUEUED(acb);
>>> >> +                    s->vdisk_aio_retry_qd++;
>>> >> +                    trace_vxhs_iio_callback_retry(s->vdisk_guid, acb);
>>> >> +                }
>>> >> +                segcount = --acb->segments;
>>> >> +                VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>>> >> +                /*
>>> >> +                 * Decrement AIO count only when callback is called
>>> >> +                 * against all the segments of aiocb.
>>> >> +                 */
>>> >> +                if (segcount == 0 && --s->vdisk_aio_count == 0) {
>>> >> +                    /*
>>> >> +                     * Start vDisk I/O failover
>>> >> +                     */
>>> >> +                    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +                    /*
>>> >> +                     * TODO:
>>> >> +                     * Need to explore further if it is possible to optimize
>>> >> +                     * the failover operation on Virtual-Machine (global)
>>> >> +                     * specific rather vDisk specific.
>>> >> +                     */
>>> >> +                    vxhs_failover_io(s);
>>> >> +                    goto out;
>>> >> +                }
>>> >> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +                goto out;
>>> >> +            }
>>> >> +        } else if (reason == IIO_REASON_HUP) {
>>> >> +            /*
>>> >> +             * Channel failed, spontaneous notification,
>>> >> +             * not in response to I/O
>>> >> +             */
>>> >> +            trace_vxhs_iio_callback_chnlfail(error);
>>> >> +            /*
>>> >> +             * TODO: Start channel failover when no I/O is outstanding
>>> >> +             */
>>> >> +            goto out;
>>> >> +        } else {
>>> >> +            trace_vxhs_iio_callback_fail(reason, acb, acb->segments,
>>> >> +                                         acb->size, error);
>>> >> +        }
>>> >> +    }
>>> >> +    /*
>>> >> +     * Set the error into the acb if not already set. If the acb is
>>> >> +     * being submitted in multiple segments then the error needs to
>>> >> +     * be set only once.
>>> >> +     *
>>> >> +     * Once acb done callback is called for the last segment
>>> >> +     * then acb->ret return status will be sent back to the
>>> >> +     * caller.
>>> >> +     */
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>>> >> +    if (error && !acb->ret) {
>>> >> +        acb->ret = error;
>>> >> +    }
>>> >> +    --acb->segments;
>>> >> +    segcount = acb->segments;
>>> >> +    assert(segcount >= 0);
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>>> >> +    /*
>>> >> +     * Check if all the outstanding I/Os are done against acb.
>>> >> +     * If yes then send signal for AIO completion.
>>> >> +     */
>>> >> +    if (segcount == 0) {
>>> >> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
>>> >> +        if (rv != sizeof(acb)) {
>>> >> +            error_report("VXHS AIO completion failed: %s", strerror(errno));
>>> >> +            abort();
>>> >> +        }
>>> >> +    }
>>> >> +    break;
>>> >> +
>>> >> +    case IRP_VDISK_CHECK_IO_FAILOVER_READY:
>>> >> +        /* ctx is BDRVVXHSState* */
>>> >> +        assert(ctx);
>>> >> +        trace_vxhs_iio_callback_ready(((BDRVVXHSState *)ctx)->vdisk_guid,
>>> >> +                                      error);
>>> >> +        vxhs_failover_ioctl_cb(error, ctx);
>>> >> +        break;
>>> >> +
>>> >> +    default:
>>> >> +        if (reason == IIO_REASON_HUP) {
>>> >> +            /*
>>> >> +             * Channel failed, spontaneous notification,
>>> >> +             * not in response to I/O
>>> >> +             */
>>> >> +            trace_vxhs_iio_callback_chnfail(error, errno);
>>> >> +            /*
>>> >> +             * TODO: Start channel failover when no I/O is outstanding
>>> >> +             */
>>> >> +        } else {
>>> >> +            trace_vxhs_iio_callback_unknwn(opcode, error);
>>> >> +        }
>>> >> +        break;
>>> >> +    }
>>> >> +out:
>>> >> +    return;
>>> >> +}
>>> >> +
>>> >> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s)
>>> >> +{
>>> >> +    BlockCompletionFunc *cb = acb->common.cb;
>>> >> +    void *opaque = acb->common.opaque;
>>> >> +    int ret = 0;
>>> >> +
>>> >> +    if (acb->ret != 0) {
>>> >> +        trace_vxhs_complete_aio(acb, acb->ret);
>>> >> +    /*
>>> >> +     * We mask all the IO errors generically as EIO for upper layers.
>>> >> +     * Right now our IO Manager uses non-standard error codes. Instead
>>> >> +     * of confusing upper layers with an incorrect interpretation we
>>> >> +     * use this workaround.
>>> >> +     */
>>> >> +        ret = (-EIO);
>>> >> +    }
>>> >> +    /*
>>> >> +     * Copy back contents from the stabilization buffer into the
>>> >> +     * original iovector before returning the IO.
>>> >> +     */
>>> >> +    if (acb->buffer != NULL) {
>>> >> +        qemu_iovec_from_buf(acb->qiov, 0, acb->buffer, acb->qiov->size);
>>> >> +        free(acb->buffer);
>>> >> +        acb->buffer = NULL;
>>> >> +    }
>>> >> +    vxhs_dec_vdisk_iocount(s, 1);
>>> >> +    acb->aio_done = VXHS_IO_COMPLETED;
>>> >> +    qemu_aio_unref(acb);
>>> >> +    cb(opaque, ret);
>>> >> +}
>>> >> +
>>> >> +/*
>>> >> + * This is the HyperScale event handler registered to QEMU.
>>> >> + * It is invoked when any IO completes and its acb is written to the
>>> >> + * pipe by the callback running in QNIO thread context. It then marks
>>> >> + * the AIO as completed, and releases the HyperScale AIO callbacks.
>>> >> + */
>>> >> +void vxhs_aio_event_reader(void *opaque)
>>> >> +{
>>> >> +    BDRVVXHSState *s = opaque;
>>> >> +    ssize_t ret;
>>> >> +
>>> >> +    do {
>>> >> +        char *p = (char *)&s->qnio_event_acb;
>>> >> +
>>> >> +        ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos,
>>> >> +                   sizeof(s->qnio_event_acb) - s->event_reader_pos);
>>> >> +        if (ret > 0) {
>>> >> +            s->event_reader_pos += ret;
>>> >> +            if (s->event_reader_pos == sizeof(s->qnio_event_acb)) {
>>> >> +                s->event_reader_pos = 0;
>>> >> +                vxhs_complete_aio(s->qnio_event_acb, s);
>>> >> +            }
>>> >> +        }
>>> >> +    } while (ret < 0 && errno == EINTR);
>>> >> +}
>>> >> +
>>> >> +/*
>>> >> + * QEMU calls this to check if there are any pending IO on vDisk.
>>> >> + * It will wait in a loop until all the AIOs are completed.
>>> >> + */
>>> >> +int vxhs_aio_flush_cb(void *opaque)
>>> >> +{
>>> >> +    BDRVVXHSState *s = opaque;
>>> >> +
>>> >> +    return vxhs_get_vdisk_iocount(s);
>>> >> +}
>>> >
>>> > Unused function.
>>> >
>>> >
>>>
>>> Removed.
>>>
>>> >> +
>>> >> +/*
>>> >> + * This will be called by QEMU while booting, once for each vDisk.
>>> >> + * bs->opaque will be allocated by the QEMU upper block layer before
>>> >> + * calling open. It loads all the QNIO operations from the
>>> >> + * qemuqnio library and calls the QNIO operation to create a channel
>>> >> + * for doing IO on the vDisk. It parses the URI, gets the hostname and
>>> >> + * vDisk path, and then registers the HyperScale event handler with QEMU.
>>> >> + */
>>> >> +void *vxhs_setup_qnio(void)
>>> >> +{
>>> >> +    void *qnio_ctx = NULL;
>>> >> +
>>> >> +    qnio_ctx = qemu_iio_init(vxhs_iio_callback);
>>> >
>>> > One concern with this callback function - when does it get called?  I am
>>> > assuming that qemu_iio_init() is running threads / processes in the
>>> > background, so this callback could happen at anytime.
>>> >
>>> > (I'll discuss my specific concern later, in vxhs_failover_ioctl_cb())
>>> >
>>> >
>>> >
>>>
>>> Yes, that is correct. vxhs_iio_callback() is invoked from the threads
>>> running in libqnio.
>>>
>>>
>>>
>>> > An aside:
>>> >
>>> > It is odd that the qnio library is using a qemu_ prefix for functions.
>>> > Seems like a good way to run into issues later, especially with relatively
>>> > generic function names such as 'qemu_ck_spin_lock()'. Consider that most
>>> > developers / maintainers are probably not going to compile in vxhs given the
>>> > external library dependence without a specific reason, so name collisions
>>> > could sneak in.
>>> >
>>> > 'nm -g libqnioshim.so | grep qemu_' shows a few that give me pause:
>>> >
>>> > 0000000000004d07 T qemu_calculate_iovec_size
>>> > 000000000000663d T qemu_ck_destroy_lock
>>> > 00000000000065cf T qemu_ck_initialize_lock
>>> > 00000000000065f7 T qemu_ck_spin_lock
>>> > 000000000000661a T qemu_ck_spin_unlock
>>> > 0000000000004dfb T qemu_convert_iovector_to_buffer
>>> > 0000000000004d63 T qemu_copy_iov_to_buffer
>>> > 0000000000005f2f T qemu_extract_flush_response
>>> > 0000000000005c89 T qemu_extract_geometry_from_json
>>> > 0000000000005a78 T qemu_extract_size_from_json
>>> > 0000000000004f40 T qemu_is_iovector_read_aligned
>>> >
>>> >
>>>
>>> Replaced all qemu_ prefixed symbols within qnio.
>>>
>>> >
>>> >> +
>>> >> +    if (qnio_ctx != NULL) {
>>> >> +        trace_vxhs_setup_qnio(qnio_ctx);
>>> >> +    } else {
>>> >> +        trace_vxhs_setup_qnio_nwerror('.');
>>> >> +    }
>>> >> +
>>> >> +    return qnio_ctx;
>>> >> +}
>>> >> +
>>> >> +static QemuOptsList runtime_opts = {
>>> >> +    .name = "vxhs",
>>> >> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
>>> >> +    .desc = {
>>> >> +        {
>>> >> +            .name = VXHS_OPT_FILENAME,
>>> >> +            .type = QEMU_OPT_STRING,
>>> >> +            .help = "URI to the Veritas HyperScale image",
>>> >> +        },
>>> >> +        {
>>> >> +            .name = VXHS_OPT_VDISK_ID,
>>> >> +            .type = QEMU_OPT_STRING,
>>> >> +            .help = "UUID of the VxHS vdisk",
>>> >> +        },
>>> >> +        { /* end of list */ }
>>> >> +    },
>>> >> +};
>>> >> +
>>> >> +static QemuOptsList runtime_tcp_opts = {
>>> >> +    .name = "vxhs_tcp",
>>> >> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head),
>>> >> +    .desc = {
>>> >> +        {
>>> >> +            .name = VXHS_OPT_HOST,
>>> >> +            .type = QEMU_OPT_STRING,
>>> >> +            .help = "host address (ipv4 addresses)",
>>> >> +        },
>>> >> +        {
>>> >> +            .name = VXHS_OPT_PORT,
>>> >> +            .type = QEMU_OPT_NUMBER,
>>> >> +            .help = "port number on which VxHSD is listening (default 9999)",
>>> >> +            .def_value_str = "9999"
>>> >> +        },
>>> >> +        {
>>> >> +            .name = "to",
>>> >> +            .type = QEMU_OPT_NUMBER,
>>> >> +            .help = "max port number, not supported by VxHS",
>>> >> +        },
>>> >> +        {
>>> >> +            .name = "ipv4",
>>> >> +            .type = QEMU_OPT_BOOL,
>>> >> +            .help = "ipv4 bool value, not supported by VxHS",
>>> >> +        },
>>> >> +        {
>>> >> +            .name = "ipv6",
>>> >> +            .type = QEMU_OPT_BOOL,
>>> >> +            .help = "ipv6 bool value, not supported by VxHS",
>>> >> +        },
>>> >> +        { /* end of list */ }
>>> >> +    },
>>> >> +};
>>> >> +
>>> >> +/*
>>> >> + * Parse the incoming URI and populate *options with all the host(s)
>>> >> + * information. Host at index 0 is local storage agent.
>>> >> + * Remaining are the reflection target storage agents. The local storage agent
>>> >> + * ip is the efficient internal address in the uri, e.g. 192.168.0.2.
>>> >> + * The local storage agent address is stored at index 0. The reflection target
>>> >> + * ips, are the E-W data network addresses of the reflection node agents, also
>>> >> + * extracted from the uri.
>>> >> + */
>>> >> +static int vxhs_parse_uri(const char *filename, QDict *options)
>>> >> +{
>>> >> +    gchar **target_list;
>>> >> +    URI *uri = NULL;
>>> >> +    char *hoststr, *portstr;
>>> >> +    char *port;
>>> >> +    int ret = 0;
>>> >> +    int i = 0;
>>> >> +
>>> >> +    trace_vxhs_parse_uri_filename(filename);
>>> >> +    target_list = g_strsplit(filename, "%7D", 0);
>>> >> +    assert(target_list != NULL && target_list[0] != NULL);
>>> >> +
>>> >> +    for (i = 0; target_list[i] != NULL && *target_list[i]; i++) {
>>> >> +        uri = uri_parse(target_list[i]);
>>> >> +        if (!uri || !uri->server) {
>>> >> +            uri_free(uri);
>>> >> +            ret = -EINVAL;
>>> >> +            break;
>>> >> +        }
>>> >> +
>>> >> +        hoststr = g_strdup_printf(VXHS_OPT_SERVER"%d.host", i);
>>> >> +        qdict_put(options, hoststr, qstring_from_str(uri->server));
>>> >> +
>>> >> +        portstr = g_strdup_printf(VXHS_OPT_SERVER"%d.port", i);
>>> >> +        if (uri->port) {
>>> >> +            port = g_strdup_printf("%d", uri->port);
>>> >> +            qdict_put(options, portstr, qstring_from_str(port));
>>> >> +            g_free(port);
>>> >> +        }
>>> >> +
>>> >> +        if (i == 0 && (strstr(uri->path, "vxhs") == NULL)) {
>>> >> +            qdict_put(options, "vdisk_id", qstring_from_str(uri->path));
>>> >> +        }
>>> >> +
>>> >> +        trace_vxhs_parse_uri_hostinfo(i + 1, uri->server, uri->port);
>>> >> +        g_free(hoststr);
>>> >> +        g_free(portstr);
>>> >> +        uri_free(uri);
>>> >> +    }
>>> >> +
>>> >> +    g_strfreev(target_list);
>>> >> +    return ret;
>>> >> +}
>>> >> +
>>> >> +static void vxhs_parse_filename(const char *filename, QDict *options,
>>> >> +                               Error **errp)
>>> >> +{
>>> >> +    if (qdict_haskey(options, "host")
>>> >> +        || qdict_haskey(options, "port")
>>> >> +        || qdict_haskey(options, "path"))
>>> >> +    {
>>> >> +        error_setg(errp, "host/port/path and a file name may not be specified "
>>> >> +                         "at the same time");
>>> >> +        return;
>>> >> +    }
>>> >> +
>>> >> +    if (strstr(filename, "://")) {
>>> >> +        int ret = vxhs_parse_uri(filename, options);
>>> >> +        if (ret < 0) {
>>> >> +            error_setg(errp, "Invalid URI. URI should be of the form "
>>> >> +                       "  vxhs://<host_ip>:<port>/{<vdisk_id>}");
>>> >> +        }
>>> >> +    }
>>> >> +}
>>> >> +
>>> >> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
>>> >> +                              int *cfd, int *rfd, Error **errp)
>>> >> +{
>>> >> +    QDict *backing_options = NULL;
>>> >> +    const char *vxhs_filename;
>>> >> +    char *of_vsa_addr = NULL;
>>> >> +    Error *local_err = NULL;
>>> >> +    const char *ptr = NULL;
>>> >
>>> > ptr does not need to be initialized to NULL here.  I also recommend using a
>>> > more descriptive name like 'vdisk_id_opt' over something generic like 'ptr'.
>>> >
>>>
>>> Fixed.
>>>
>>>
>>> >> +    char *file_name = NULL;
>>> >> +    size_t num_servers;
>>> >> +    char *str = NULL;
>>> >> +    QemuOpts *opts;
>>> >> +    int ret = 0;
>>> >> +    int i;
>>> >> +
>>> >> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
>>> >> +    qemu_opts_absorb_qdict(opts, options, &local_err);
>>> >> +    if (local_err) {
>>> >> +        ret = -EINVAL;
>>> >> +        goto out;
>>> >> +    }
>>> >> +
>>> >> +    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
>>> >> +    if (vxhs_filename) {
>>> >> +        trace_vxhs_qemu_init_filename(vxhs_filename);
>>> >> +    }
>>> >> +
>>> >> +    ptr = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
>>> >> +    if (!ptr) {
>>> >> +        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
>>> >> +        ret = -EINVAL;
>>> >> +        goto out;
>>> >> +    }
>>> >> +    s->vdisk_guid = g_strdup(ptr);
>>> >
>>> > This is never freed anywhere.
>>> >
>>>
>>> Now freed on error conditions and in vxhs_close().
>>>
>>> >> +    trace_vxhs_qemu_init_vdisk(ptr);
>>> >> +
>>> >> +    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
>>> >> +    if (num_servers < 1) {
>>> >> +        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
>>> >> +        ret = -EINVAL;
>>> >> +        goto out;
>>> >> +    }
>>> >> +    trace_vxhs_qemu_init_numservers(num_servers);
>>> >> +
>>> >> +    qemu_opts_del(opts);
>>> >> +
>>> >> +    for (i = 0; i < num_servers; i++) {
>>> >
>>> > s->vdisk_hostinfo[] is an array with MAX_HOSTS (4) as a size.  There is no
>>> > bounds checking on num_servers, which is an issue: we can easily blow
>>> > past this.
>>> >
>>> > In general, for any parameter supplied by a user, or read from a file /
>>> > protocol / etc, we need to do appropriate bounds checking to prevent
>>> > potential CVEs.
>>> >
>>>
>>> Fixed.
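>>>
>>> The check is roughly along the lines below (sketch only; the exact
>>> error wording in v4 may differ), reusing the existing MAX_HOSTS limit:
>>>
>>>     num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
>>>     if (num_servers < 1) {
>>>         error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
>>>         ret = -EINVAL;
>>>         goto out;
>>>     } else if (num_servers > MAX_HOSTS) {
>>>         /* bound user-supplied server count to the hostinfo array size */
>>>         error_setg(&local_err, "Invalid number of servers (maximum %d)",
>>>                    MAX_HOSTS);
>>>         ret = -EINVAL;
>>>         goto out;
>>>     }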
>>>
>>> >> +        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
>>> >> +        qdict_extract_subqdict(options, &backing_options, str);
>>> >> +
>>> >> +        /* create opts info from runtime_tcp_opts list */
>>> >> +        opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
>>> >> +        qemu_opts_absorb_qdict(opts, backing_options, &local_err);
>>> >> +        if (local_err) {
>>> >> +            ret = -EINVAL;
>>> >> +            qemu_opts_del(opts);
>>> >> +            goto out;
>>> >> +        }
>>> >> +
>>> >> +        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(opts,
>>> >> +                                                            VXHS_OPT_HOST));
>>> >
>>> > This is never freed anywhere.
>>> >
>>>
>>> Now freed on error and vxhs_close().
>>>
>>> >> +        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(opts,
>>> >> +                                                                 VXHS_OPT_PORT),
>>> >> +                                                    NULL, 0);
>>> >> +
>>> >> +        s->vdisk_hostinfo[i].qnio_cfd = -1;
>>> >> +        s->vdisk_hostinfo[i].vdisk_rfd = -1;
>>> >> +        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
>>> >> +                             s->vdisk_hostinfo[i].port);
>>> >> +        qemu_opts_del(opts);
>>> >> +    }
>>> >> +
>>> >> +    s->vdisk_nhosts = i;
>>> >> +    s->vdisk_cur_host_idx = 0;
>>> >> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
>>> >> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
>>> >> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>>> >> +                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);
>>> >> +
>>> >> +    /*
>>> >> +     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
>>> >> +     */
>>> >> +    if (global_qnio_ctx == NULL) {
>>> >> +        global_qnio_ctx = vxhs_setup_qnio();
>>> >> +        if (global_qnio_ctx == NULL) {
>>> >> +            error_setg(&local_err, "Failed vxhs_setup_qnio");
>>> >> +            ret = -EINVAL;
>>> >> +            goto out;
>>> >> +        }
>>> >> +    }
>>> >> +
>>> >> +    *cfd = qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
>>> >> +    if (*cfd < 0) {
>>> >> +        error_setg(&local_err, "Failed qemu_open_iio_conn");
>>> >> +        ret = -EIO;
>>> >> +        goto out;
>>> >> +    }
>>> >> +    *rfd = qemu_iio_devopen(global_qnio_ctx, *cfd, file_name, 0);
>>> >> +    if (*rfd < 0) {
>>> >> +        error_setg(&local_err, "Failed qemu_iio_devopen");
>>> >> +        ret = -EIO;
>>> >> +        goto out;
>>> >> +    }
>>> >> +
>>> >> +out:
>>> >> +    g_free(file_name);
>>> >> +    g_free(of_vsa_addr);
>>> >> +    if (str) {
>>> >> +        qdict_del(backing_options, str);
>>> >> +        g_free(str);
>>> >> +    }
>>> >> +
>>> >> +    if (ret < 0) {
>>> >> +        errno = -ret;
>>> >> +    }
>>> >> +    error_propagate(errp, local_err);
>>> >> +    return ret;
>>> >> +}
>>> >> +
>>> >> +int vxhs_open(BlockDriverState *bs, QDict *options,
>>> >> +              int bdrv_flags, Error **errp)
>>> >> +{
>>> >> +    BDRVVXHSState *s = bs->opaque;
>>> >> +    AioContext *aio_context;
>>> >> +    int qemu_qnio_cfd = -1;
>>> >> +    int qemu_rfd = -1;
>>> >> +    int ret = 0;
>>> >> +
>>> >> +    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
>>> >> +    if (ret < 0) {
>>> >> +        trace_vxhs_open_fail(ret);
>>> >> +        return ret;
>>> >> +    }
>>> >> +
>>> >> +    s->qnio_ctx = global_qnio_ctx;
>>> >> +    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
>>> >> +    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
>>> >> +    s->vdisk_size = 0;
>>> >> +    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
>>> >> +
>>> >> +    /*
>>> >> +     * Create a pipe for communicating between two threads in different
>>> >> +     * context. Set handler for read event, which gets triggered when
>>> >> +     * IO completion is done by non-QEMU context.
>>> >> +     */
>>> >> +    ret = qemu_pipe(s->fds);
>>> >> +    if (ret < 0) {
>>> >> +        trace_vxhs_open_epipe('.');
>>> >> +        ret = -errno;
>>> >> +        goto out;
>>> >> +    }
>>> >> +    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
>>> >> +
>>> >> +    aio_context = bdrv_get_aio_context(bs);
>>> >> +    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
>>> >> +                       false, vxhs_aio_event_reader, NULL, s);
>>> >> +
>>> >> +    /*
>>> >> +     * Allocate/Initialize the spin-locks.
>>> >> +     *
>>> >> +     * NOTE:
>>> >> +     *      Since spin lock is being allocated
>>> >> +     *      dynamically hence moving acb struct
>>> >> +     *      specific lock to BDRVVXHSState
>>> >> +     *      struct. The reason being,
>>> >> +     *      we don't want the overhead of spin
>>> >> +     *      lock being dynamically allocated and
>>> >> +     *      freed for every AIO.
>>> >> +     */
>>> >> +    s->vdisk_lock = VXHS_SPIN_LOCK_ALLOC;
>>> >> +    s->vdisk_acb_lock = VXHS_SPIN_LOCK_ALLOC;
>>> >> +
>>> >> +    return 0;
>>> >> +
>>> >> +out:
>>> >> +    if (s->vdisk_hostinfo[0].vdisk_rfd >= 0) {
>>> >> +        qemu_iio_devclose(s->qnio_ctx, 0,
>>> >> +                                 s->vdisk_hostinfo[0].vdisk_rfd);
>>> >> +    }
>>> >> +    /* never close qnio_cfd */
>>> >
>>> > Surely we want to be able to close it at some point... or does the qnio
>>> > library take care of that cleanup somewhere/sometime?
>>> >
>>> >
>>>
>>> Changed to close channel fd on error conditions, vxhs_close() and
>>> vxhs_reopen_vdisk().
>>>
>>>
>>> >> +    trace_vxhs_open_fail(ret);
>>> >> +    return ret;
>>> >> +}
>>> >> +
>>> >> +static const AIOCBInfo vxhs_aiocb_info = {
>>> >> +    .aiocb_size = sizeof(VXHSAIOCB)
>>> >> +};
>>> >> +
>>> >> +/*
>>> >> + * This is called in QNIO thread context when IO is done
>>> >> + * on the IO Manager and the QNIO client has received the data or
>>> >> + * ACK. It notifies another event handler thread running in QEMU context
>>> >> + * by writing on the pipe.
>>> >> + */
>>> >> +void vxhs_finish_aiocb(ssize_t ret, void *arg)
>>> >> +{
>>> >> +    VXHSAIOCB *acb = arg;
>>> >> +    BlockDriverState *bs = acb->common.bs;
>>> >> +    BDRVVXHSState *s = bs->opaque;
>>> >> +    int rv;
>>> >> +
>>> >> +    acb->ret = ret;
>>> >> +    rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
>>> >> +    if (rv != sizeof(acb)) {
>>> >> +        error_report("VXHS AIO completion failed: %s",
>>> >> +                     strerror(errno));
>>> >> +        abort();
>>> >> +    }
>>> >> +}
>>> >
>>> > Another unused function.
>>> >
>>>
>>> Removed.
>>>
>>>
>>> >> +
>>> >> +/*
>>> >> + * This allocates QEMU-VXHS callback for each IO
>>> >> + * and is passed to QNIO. When QNIO completes the work,
>>> >> + * it will be passed back through the callback.
>>> >> + */
>>> >> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
>>> >> +                                int64_t sector_num, QEMUIOVector *qiov,
>>> >> +                                int nb_sectors,
>>> >> +                                BlockCompletionFunc *cb,
>>> >> +                                void *opaque, int iodir)
>>> >> +{
>>> >> +    VXHSAIOCB *acb = NULL;
>>> >> +    BDRVVXHSState *s = bs->opaque;
>>> >> +    size_t size;
>>> >> +    uint64_t offset;
>>> >> +    int iio_flags = 0;
>>> >> +    int ret = 0;
>>> >> +
>>> >> +    offset = sector_num * BDRV_SECTOR_SIZE;
>>> >> +    size = nb_sectors * BDRV_SECTOR_SIZE;
>>> >> +
>>> >> +    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
>>> >> +    /*
>>> >> +     * Setup or initialize VXHSAIOCB.
>>> >> +     * Every single field should be initialized since
>>> >> +     * acb will be picked up from the slab without
>>> >> +     * initializing with zero.
>>> >> +     */
>>> >> +    acb->io_offset = offset;
>>> >> +    acb->size = size;
>>> >> +    acb->ret = 0;
>>> >> +    acb->flags = 0;
>>> >> +    acb->aio_done = VXHS_IO_INPROGRESS;
>>> >> +    acb->segments = 0;
>>> >> +    acb->buffer = 0;
>>> >> +    acb->qiov = qiov;
>>> >> +    acb->direction = iodir;
>>> >> +
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +    if (OF_VDISK_FAILED(s)) {
>>> >> +        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
>>> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +        goto errout;
>>> >> +    }
>>> >> +    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>>> >> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>>> >> +        s->vdisk_aio_retry_qd++;
>>> >> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
>>> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
>>> >> +        goto out;
>>> >> +    }
>>> >> +    s->vdisk_aio_count++;
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +
>>> >> +    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>>> >> +
>>> >> +    switch (iodir) {
>>> >> +    case VDISK_AIO_WRITE:
>>> >> +            vxhs_inc_acb_segment_count(acb, 1);
>>> >> +            ret = qemu_iio_writev(s->qnio_ctx,
>>> >> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>>> >> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
>>> >> +            break;
>>> >> +    case VDISK_AIO_READ:
>>> >> +            vxhs_inc_acb_segment_count(acb, 1);
>>> >> +            ret = qemu_iio_readv(s->qnio_ctx,
>>> >> +                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>>> >> +                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
>>> >> +            break;
>>> >> +    default:
>>> >> +            trace_vxhs_aio_rw_invalid(iodir);
>>> >> +            goto errout;
>>> >> +    }
>>> >> +
>>> >> +    if (ret != 0) {
>>> >> +        trace_vxhs_aio_rw_ioerr(
>>> >> +                  s->vdisk_guid, iodir, size, offset,
>>> >> +                  acb, acb->segments, ret, errno);
>>> >> +        /*
>>> >> +         * Don't retry I/Os against vDisk having no
>>> >> +         * redundancy or stateful storage on compute
>>> >> +         *
>>> >> +         * TODO: Revisit this code path to see if any
>>> >> +         *       particular error needs to be handled.
>>> >> +         *       At this moment failing the I/O.
>>> >> +         */
>>> >> +        VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +        if (s->vdisk_nhosts == 1) {
>>> >> +            trace_vxhs_aio_rw_iofail(s->vdisk_guid);
>>> >> +            s->vdisk_aio_count--;
>>> >> +            vxhs_dec_acb_segment_count(acb, 1);
>>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +            goto errout;
>>> >> +        }
>>> >> +        if (OF_VDISK_FAILED(s)) {
>>> >> +            trace_vxhs_aio_rw_devfail(
>>> >> +                      s->vdisk_guid, iodir, size, offset);
>>> >> +            s->vdisk_aio_count--;
>>> >> +            vxhs_dec_acb_segment_count(acb, 1);
>>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +            goto errout;
>>> >> +        }
>>> >> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>>> >> +            /*
>>> >> +             * Queue all incoming io requests after failover starts.
>>> >> +             * Number of requests that can arrive is limited by io queue depth
>>> >> +             * so an app blasting independent ios will not exhaust memory.
>>> >> +             */
>>> >> +            QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>>> >> +            s->vdisk_aio_retry_qd++;
>>> >> +            OF_AIOCB_FLAGS_SET_QUEUED(acb);
>>> >> +            s->vdisk_aio_count--;
>>> >> +            vxhs_dec_acb_segment_count(acb, 1);
>>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +            trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 2);
>>> >> +            goto out;
>>> >> +        }
>>> >> +        OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
>>> >> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>>> >> +        s->vdisk_aio_retry_qd++;
>>> >> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
>>> >> +        vxhs_dec_acb_segment_count(acb, 1);
>>> >> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 3);
>>> >> +        /*
>>> >> +         * Start I/O failover if there is no active
>>> >> +         * AIO within vxhs block driver.
>>> >> +         */
>>> >> +        if (--s->vdisk_aio_count == 0) {
>>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +            /*
>>> >> +             * Start IO failover
>>> >> +             */
>>> >> +            vxhs_failover_io(s);
>>> >> +            goto out;
>>> >> +        }
>>> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +    }
>>> >> +
>>> >> +out:
>>> >> +    return &acb->common;
>>> >> +
>>> >> +errout:
>>> >> +    qemu_aio_unref(acb);
>>> >> +    return NULL;
>>> >> +}
>>> >> +
>>> >> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
>>> >> +                                   int64_t sector_num, QEMUIOVector *qiov,
>>> >> +                                   int nb_sectors,
>>> >> +                                   BlockCompletionFunc *cb, void *opaque)
>>> >> +{
>>> >> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
>>> >> +                         cb, opaque, VDISK_AIO_READ);
>>> >> +}
>>> >> +
>>> >> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
>>> >> +                                    int64_t sector_num, QEMUIOVector *qiov,
>>> >> +                                    int nb_sectors,
>>> >> +                                    BlockCompletionFunc *cb, void *opaque)
>>> >> +{
>>> >> +    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
>>> >> +                         cb, opaque, VDISK_AIO_WRITE);
>>> >> +}
>>> >> +
>>> >> +/*
>>> >> + * This is called by QEMU when a flush gets triggered from within
>>> >> + * a guest at the block layer, either for IDE or SCSI disks.
>>> >> + */
>>> >> +int vxhs_co_flush(BlockDriverState *bs)
>>> >> +{
>>> >> +    BDRVVXHSState *s = bs->opaque;
>>> >> +    uint64_t size = 0;
>>> >> +    int ret = 0;
>>> >> +
>>> >> +    ret = qemu_iio_ioctl(s->qnio_ctx,
>>> >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>>> >> +            VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
>>> >> +
>>> >> +    if (ret < 0) {
>>> >> +        /*
>>> >> +         * Currently not handling the flush ioctl
>>> >> +         * failure because of network connection
>>> >> +         * disconnect. Since all the writes are
>>> >> +         * committed into persistent storage hence
>>> >> +         * this flush call is noop and we can safely
>>> >> +         * return success status to the caller.
>>> >
>>> > I'm not sure I understand here.  Are you saying the qemu_iio_ioctl() call
>>> > above is a noop?
>>> >
>>>
>>> Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a placeholder at present,
>>> in case we later want to add some functionality to it. I have now
>>> added a comment to this effect to avoid any confusion.
>>>
>>
>> The problem is you don't know which version of the qnio library any given
>> QEMU binary will be using, since it is a shared library.  Future versions
>> may implement the flush ioctl as expressed above, in which case we may hide
>> a valid error.
>>
>> Am I correct in assuming that this call suppresses errors because an error
>> is returned for an unknown ioctl operation of VDISK_AIO_FLUSH?  If so, and
>> you want a placeholder here for flushing, you should go all the way and stub
>> out the underlying ioctl call to return success.  Then QEMU can at least
>> rely on the error return from the flush operation.
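>>
>> That way the QEMU side stays simple and still propagates real failures,
>> e.g. (rough sketch, not a drop-in patch):
>>
>>     ret = qemu_iio_ioctl(s->qnio_ctx,
>>             s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>>             VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
>>     if (ret < 0) {
>>         trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
>>         return -EIO;    /* report the failure instead of masking it */
>>     }
>>     return 0;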
>>
>>
>>> >> +         *
>>> >> +         * If any write failure occurs for inflight
>>> >> +         * write AIO because of network disconnect
>>> >> +         * then anyway IO failover will be triggered.
>>> >> +         */
>>> >> +        trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
>>> >> +        ret = 0;
>>> >> +    }
>>> >> +
>>> >> +    return ret;
>>> >> +}
>>> >> +
>>> >> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s)
>>> >> +{
>>> >> +    void *ctx = NULL;
>>> >> +    int flags = 0;
>>> >> +    unsigned long vdisk_size = 0;
>>> >> +    int ret = 0;
>>> >> +
>>> >> +    ret = qemu_iio_ioctl(s->qnio_ctx,
>>> >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>>> >> +            VDISK_STAT, &vdisk_size, ctx, flags);
>>> >> +
>>> >> +    if (ret < 0) {
>>> >> +        trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno);
>>> >> +        return 0;
>>> >> +    }
>>> >> +
>>> >> +    trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size);
>>> >> +    return vdisk_size;
>>> >> +}
>>> >> +
>>> >> +/*
>>> >> + * Returns the size of vDisk in bytes. This is required
>>> >> + * by the QEMU upper block layer so that it is visible
>>> >> + * to the guest.
>>> >> + */
>>> >> +int64_t vxhs_getlength(BlockDriverState *bs)
>>> >> +{
>>> >> +    BDRVVXHSState *s = bs->opaque;
>>> >> +    unsigned long vdisk_size = 0;
>>> >> +
>>> >> +    if (s->vdisk_size > 0) {
>>> >> +        vdisk_size = s->vdisk_size;
>>> >
>>> > s->vdisk_size is int64_t, vdisk_size is unsigned long.  On 32-bit platforms,
>>> > I believe unsigned long is only 32-bits, so this assignment is problematic.
>>> >
>>> > What is the maximum disk size for vxhs?
>>> >
>>> > (this comment applies to all the vdisk_size usage, but I'll only mention it
>>> > here)
>>> >
>>>
>>> Changed to int64_t in all places. The code does not place any
>>> restriction on the maximum size of the vdisk.
>>>
>>> >> +    } else {
>>> >> +        /*
>>> >> +         * Fetch the vDisk size using stat ioctl
>>> >> +         */
>>> >> +        vdisk_size = vxhs_get_vdisk_stat(s);
>>> >> +        if (vdisk_size > 0) {
>>> >> +            s->vdisk_size = vdisk_size;
>>> >> +        }
>>> >> +    }
>>> >> +
>>> >> +    if (vdisk_size > 0) {
>>> >> +        return (int64_t)vdisk_size; /* return size in bytes */
>>> >> +    } else {
>>> >> +        return -EIO;
>>> >> +    }
>>> >> +}
>>> >> +
>>> >> +/*
>>> >> + * Returns actual blocks allocated for the vDisk.
>>> >> + * This is required by qemu-img utility.
>>> >> + */
>>> >> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs)
>>> >> +{
>>> >> +    BDRVVXHSState *s = bs->opaque;
>>> >> +    unsigned long vdisk_size = 0;
>>> >> +
>>> >> +    if (s->vdisk_size > 0) {
>>> >> +        vdisk_size = s->vdisk_size;
>>> >> +    } else {
>>> >> +        /*
>>> >> +         * TODO:
>>> >> +         * Once HyperScale storage-virtualizer provides
>>> >> +         * actual physical allocation of blocks then
>>> >> +         * fetch that information and return back to the
>>> >> +         * caller but for now just get the full size.
>>> >> +         */
>>> >> +        vdisk_size = vxhs_get_vdisk_stat(s);
>>> >> +        if (vdisk_size > 0) {
>>> >> +            s->vdisk_size = vdisk_size;
>>> >> +        }
>>> >> +    }
>>> >> +
>>> >> +    if (vdisk_size > 0) {
>>> >> +        return (int64_t)vdisk_size; /* return size in bytes */
>>> >> +    } else {
>>> >> +        return -EIO;
>>> >> +    }
>>> >> +}
>>> >> +
>>> >> +void vxhs_close(BlockDriverState *bs)
>>> >> +{
>>> >> +    BDRVVXHSState *s = bs->opaque;
>>> >> +
>>> >> +    close(s->fds[VDISK_FD_READ]);
>>> >> +    close(s->fds[VDISK_FD_WRITE]);
>>> >> +
>>> >> +    /*
>>> >> +     * never close channel - not ref counted, will
>>> >> +     * close for all vdisks
>>> >> +     */
>>> >> +    if (s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd >= 0) {
>>> >> +        qemu_iio_devclose(s->qnio_ctx, 0,
>>> >> +            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd);
>>> >> +    }
>>> >> +    if (s->vdisk_lock) {
>>> >> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_lock);
>>> >> +        s->vdisk_lock = NULL;
>>> >> +    }
>>> >> +    if (s->vdisk_acb_lock) {
>>> >> +        VXHS_SPIN_LOCK_DESTROY(s->vdisk_acb_lock);
>>> >> +        s->vdisk_acb_lock = NULL;
>>> >> +    }
>>> >> +
>>> >> +    /*
>>> >> +     * TODO: Verify that all the resources were relinquished.
>>> >> +     */
>>> >
>>> > Which resources?  Allocated here in the driver, or stuff in libqnio?
>>> >
>>>
>>> The comment was for all resources allocated within vxhs.c.
>>> Added code to free all allocated memory and fds on close.
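>>>
>>> Roughly the following in vxhs_close() (a sketch of the intended cleanup;
>>> the actual v4 code may differ slightly):
>>>
>>>     for (i = 0; i < MAX_HOSTS; i++) {
>>>         g_free(s->vdisk_hostinfo[i].hostip);
>>>         s->vdisk_hostinfo[i].hostip = NULL;
>>>     }
>>>     g_free(s->vdisk_guid);
>>>     s->vdisk_guid = NULL;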
>>>
>>>
>>> >> +}
>>> >> +
>>> >> +/*
>>> >> + * If errors are consistent with storage agent failure:
>>> >> + *  - Try to reconnect in case error is transient or storage agent restarted.
>>> >> + *  - Currently failover is being triggered on per vDisk basis. There is
>>> >> + *    a scope of further optimization where failover can be global (per VM).
>>> >> + *  - In case of network (storage agent) failure, for all the vDisks, having
>>> >> + *    no redundancy, I/Os will be failed without attempting for I/O failover
>>> >> + *    because of stateless nature of vDisk.
>>> >> + *  - If local or source storage agent is down then send an ioctl to remote
>>> >> + *    storage agent to check if remote storage agent in a state to accept
>>> >> + *    application I/Os.
>>> >> + *  - Once remote storage agent is ready to accept I/O, start I/O shipping.
>>> >> + *  - If I/Os cannot be serviced then vDisk will be marked failed so that
>>> >> + *    new incoming I/Os are returned with failure immediately.
>>> >> + *  - If vDisk I/O failover is in progress then all new/inflight I/Os will be
>>> >> + *    queued and will be restarted or failed depending on whether the
>>> >> + *    failover operation is successful or not.
>>> >> + *  - I/O failover can be started either in I/O forward or I/O backward
>>> >> + *    path.
>>> >> + *  - I/O failover will be started as soon as all the pending acb(s)
>>> >> + *    are queued and there is no pending I/O count.
>>> >> + *  - If I/O failover couldn't be completed within QNIO_CONNECT_TIMOUT_SECS
>>> >> + *    then vDisk will be marked failed and all I/Os will be completed with
>>> >> + *    error.
>>> >> + */
>>> >> +
>>> >> +int vxhs_switch_storage_agent(BDRVVXHSState *s)
>>> >> +{
>>> >> +    int res = 0;
>>> >> +    int flags = (IIO_FLAG_ASYNC | IIO_FLAG_DONE);
>>> >> +
>>> >> +    trace_vxhs_switch_storage_agent(
>>> >> +              s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
>>> >> +              s->vdisk_guid);
>>> >> +
>>> >> +    res = vxhs_reopen_vdisk(s, s->vdisk_ask_failover_idx);
>>> >> +    if (res == 0) {
>>> >> +        res = qemu_iio_ioctl(s->qnio_ctx,
>>> >> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].vdisk_rfd,
>>> >> +                  VDISK_CHECK_IO_FAILOVER_READY, NULL, s, flags);
>>> >> +    } else {
>>> >> +        trace_vxhs_switch_storage_agent_failed(
>>> >> +                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
>>> >> +                  s->vdisk_guid, res, errno);
>>> >> +        /*
>>> >> +         * TODO: calling vxhs_failover_ioctl_cb from here ties up the qnio epoll
>>> >> +         * loop if qemu_iio_ioctl fails synchronously (-1) for all hosts in io
>>> >> +         * target list.
>>> >> +         */
>>> >> +
>>> >> +        /* try next host */
>>> >> +        vxhs_failover_ioctl_cb(res, s);
>>> >> +    }
>>> >> +    return res;
>>> >> +}
>>> >> +
>>> >> +void vxhs_failover_ioctl_cb(int res, void *ctx)
>>> >> +{
>>> >> +    BDRVVXHSState *s = ctx;
>>> >> +
>>> >> +    if (res == 0) {
>>> >> +        /* found failover target */
>>> >> +        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
>>> >> +        s->vdisk_ask_failover_idx = 0;
>>> >> +        trace_vxhs_failover_ioctl_cb(
>>> >> +                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>>> >> +                   s->vdisk_guid);
>>> >> +        VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
>>> >> +        VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +        vxhs_handle_queued_ios(s);
>>> >> +    } else {
>>> >> +        /* keep looking */
>>> >> +        trace_vxhs_failover_ioctl_cb_retry(s->vdisk_guid);
>>> >> +        s->vdisk_ask_failover_idx++;
>>> >> +        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
>>> >
>>> > This function, vxhs_failover_ioctrl_cb(), is called from the external
>>> > library callback, in addition to QEMU itself (i.e., vxhs_aio_rw).  This
>>> > makes me concerned that we may have (or be able to induce) an unsafe race
>>> > condition when incrementing the vdisk_ask_failover_idx array index.
>>> >
>>>
>>> vxhs_failover_ioctl_cb()  can be called from the callback
>>> vxhs_iio_callback() case IRP_VDISK_CHECK_IO_FAILOVER_READY in response
>>> to the qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY)
>>>
>>> This ioctl qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY) is only
>>> called from within vxhs_switch_storage_agent()
>>>
>>> A direct call to vxhs_failover_ioctl_cb() is only issued from the same
>>> function vxhs_switch_storage_agent() on an "else" condition. So
>>> vxhs_failover_ioctl_cb() can only be called from a single thread
>>> context either in response to the ioctl, or directly. We should be
>>> safe here, but thanks for pointing it out!
>>
>>
>> The callback function 'vxhs_iio_callback' also calls
>> vxhs_failover_ioctl_cb().
>>
>> vxhs_failover_ioctl_cb() is also called from vxhs_aio_rw() by QEMU (via
>> 'vxhs_failover_io').
>>
>> The above two paths make me concerned about the array pointer being
>> incremented.
>>
>> (BTW: 'vxhs_failover_ioctl_cb' seems to be misnamed, because it is not a
>> callback function, but just a helper function).
>>
>> Also related: this code reads like it will hang QEMU if QNIO goes offline;
>> we will be caught in perpetual retries, in which case vxhs_aio_rw() may
>> never return.
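>>
>> One way to bound that would be to give up after roughly
>> QNIO_CONNECT_TIMOUT_SECS of cycling through the host list and mark the
>> vDisk failed. Rough sketch only — 'retry_pass' here is a hypothetical
>> counter that would live in BDRVVXHSState:
>>
>>     if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
>>         if (++s->retry_pass * QNIO_CONNECT_RETRY_SECS >=
>>             QNIO_CONNECT_TIMOUT_SECS) {
>>             VXHS_SPIN_LOCK(s->vdisk_lock);
>>             OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
>>             OF_VDISK_SET_FAILED(s);
>>             VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>             vxhs_handle_queued_ios(s);  /* fail the queued acbs with EIO */
>>             return;
>>         }
>>         /* pause and cycle through list again */
>>         sleep(QNIO_CONNECT_RETRY_SECS);
>>         s->vdisk_ask_failover_idx = 0;
>>     }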
>>
>>>
>>> >> +            /* pause and cycle through list again */
>>> >> +            sleep(QNIO_CONNECT_RETRY_SECS);
>>> >> +            s->vdisk_ask_failover_idx = 0;
>>> >> +        }
>>> >> +        res = vxhs_switch_storage_agent(s);
>>> >> +    }
>>> >> +}
>>> >> +
>>> >> +int vxhs_failover_io(BDRVVXHSState *s)
>>> >> +{
>>> >> +    int res = 0;
>>> >> +
>>> >> +    trace_vxhs_failover_io(s->vdisk_guid);
>>> >> +
>>> >> +    s->vdisk_ask_failover_idx = 0;
>>> >> +    res = vxhs_switch_storage_agent(s);
>>> >> +
>>> >> +    return res;
>>> >> +}
>>> >> +
>>> >> +/*
>>> >> + * Try to reopen the vDisk on one of the available hosts
>>> >> + * If vDisk reopen is successful on any of the host then
>>> >> + * check if that node is ready to accept I/O.
>>> >> + */
>>> >> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int index)
>>> >> +{
>>> >> +    char *of_vsa_addr = NULL;
>>> >> +    char *file_name = NULL;
>>> >> +    int  res = 0;
>>> >> +
>>> >> +    /*
>>> >> +     * Don't close the channel if it was opened
>>> >> +     * before successfully. It will be handled
>>> >> +     * within iio* api if the same channel open
>>> >> +     * fd is reused.
>>> >> +     *
>>> >> +     * close stale vdisk device remote fd since
>>> >> +     * it is invalid after channel disconnect.
>>> >> +     */
>>> >> +    if (s->vdisk_hostinfo[index].vdisk_rfd >= 0) {
>>> >> +        qemu_iio_devclose(s->qnio_ctx, 0,
>>> >> +                                 s->vdisk_hostinfo[index].vdisk_rfd);
>>> >> +        s->vdisk_hostinfo[index].vdisk_rfd = -1;
>>> >> +    }
>>> >> +
>>> >> +    /*
>>> >> +     * build storage agent address and vdisk device name strings
>>> >> +     */
>>> >> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
>>> >> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
>>> >> +             s->vdisk_hostinfo[index].hostip, s->vdisk_hostinfo[index].port);
>>> >> +    /*
>>> >> +     * open qnio channel to storage agent if not opened before.
>>> >> +     */
>>> >> +    if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
>>> >> +        s->vdisk_hostinfo[index].qnio_cfd =
>>> >> +                qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
>>> >> +        if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
>>> >> +            trace_vxhs_reopen_vdisk(s->vdisk_hostinfo[index].hostip);
>>> >> +            res = ENODEV;
>>> >> +            goto out;
>>> >> +        }
>>> >> +    }
>>> >> +    /*
>>> >> +     * open vdisk device
>>> >> +     */
>>> >> +    s->vdisk_hostinfo[index].vdisk_rfd =
>>> >> +            qemu_iio_devopen(global_qnio_ctx,
>>> >> +             s->vdisk_hostinfo[index].qnio_cfd, file_name, 0);
>>> >> +    if (s->vdisk_hostinfo[index].vdisk_rfd < 0) {
>>> >> +        trace_vxhs_reopen_vdisk_openfail(file_name);
>>> >> +        res = EIO;
>>> >> +        goto out;
>>> >> +    }
>>> >> +
>>> >> +out:
>>> >> +    g_free(of_vsa_addr);
>>> >> +    g_free(file_name);
>>> >> +    return res;
>>> >> +}
>>> >> +
>>> >> +int vxhs_handle_queued_ios(BDRVVXHSState *s)
>>> >> +{
>>> >> +    VXHSAIOCB *acb = NULL;
>>> >> +    int res = 0;
>>> >> +
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +    while ((acb = QSIMPLEQ_FIRST(&s->vdisk_aio_retryq)) != NULL) {
>>> >> +        /*
>>> >> +         * Before we process the acb, check whether I/O failover
>>> >> +         * started again due to failback or cascading failure.
>>> >> +         */
>>> >> +        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +            goto out;
>>> >> +        }
>>> >> +        QSIMPLEQ_REMOVE_HEAD(&s->vdisk_aio_retryq, retry_entry);
>>> >> +        s->vdisk_aio_retry_qd--;
>>> >> +        OF_AIOCB_FLAGS_RESET_QUEUED(acb);
>>> >> +        if (OF_VDISK_FAILED(s)) {
>>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +            vxhs_fail_aio(acb, EIO);
>>> >> +            VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +        } else {
>>> >> +            VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +            res = vxhs_restart_aio(acb);
>>> >> +            trace_vxhs_handle_queued_ios(acb, res);
>>> >> +            VXHS_SPIN_LOCK(s->vdisk_lock);
>>> >> +            if (res) {
>>> >> +                QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
>>> >> +                                     acb, retry_entry);
>>> >> +                OF_AIOCB_FLAGS_SET_QUEUED(acb);
>>> >> +                VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +                goto out;
>>> >> +            }
>>> >> +        }
>>> >> +    }
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_lock);
>>> >> +out:
>>> >> +    return res;
>>> >> +}
>>> >> +
>>> >> +int vxhs_restart_aio(VXHSAIOCB *acb)
>>> >> +{
>>> >> +    BDRVVXHSState *s = NULL;
>>> >> +    int iio_flags = 0;
>>> >> +    int res = 0;
>>> >> +
>>> >> +    s = acb->common.bs->opaque;
>>> >> +
>>> >> +    if (acb->direction == VDISK_AIO_WRITE) {
>>> >> +        vxhs_inc_vdisk_iocount(s, 1);
>>> >> +        vxhs_inc_acb_segment_count(acb, 1);
>>> >> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>>> >> +        res = qemu_iio_writev(s->qnio_ctx,
>>> >> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>>> >> +                acb->qiov->iov, acb->qiov->niov,
>>> >> +                acb->io_offset, (void *)acb, iio_flags);
>>> >> +    }
>>> >> +
>>> >> +    if (acb->direction == VDISK_AIO_READ) {
>>> >> +        vxhs_inc_vdisk_iocount(s, 1);
>>> >> +        vxhs_inc_acb_segment_count(acb, 1);
>>> >> +        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>>> >> +        res = qemu_iio_readv(s->qnio_ctx,
>>> >> +                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
>>> >> +                acb->qiov->iov, acb->qiov->niov,
>>> >> +                acb->io_offset, (void *)acb, iio_flags);
>>> >> +    }
>>> >> +
>>> >> +    if (res != 0) {
>>> >> +        vxhs_dec_vdisk_iocount(s, 1);
>>> >> +        vxhs_dec_acb_segment_count(acb, 1);
>>> >> +        trace_vxhs_restart_aio(acb->direction, res, errno);
>>> >> +    }
>>> >> +
>>> >> +    return res;
>>> >> +}
>>> >> +
>>> >> +void vxhs_fail_aio(VXHSAIOCB *acb, int err)
>>> >> +{
>>> >> +    BDRVVXHSState *s = NULL;
>>> >> +    int segcount = 0;
>>> >> +    int rv = 0;
>>> >> +
>>> >> +    s = acb->common.bs->opaque;
>>> >> +
>>> >> +    trace_vxhs_fail_aio(s->vdisk_guid, acb);
>>> >> +    if (!acb->ret) {
>>> >> +        acb->ret = err;
>>> >> +    }
>>> >> +    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
>>> >> +    segcount = acb->segments;
>>> >> +    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
>>> >> +    if (segcount == 0) {
>>> >> +        /*
>>> >> +         * Complete the io request
>>> >> +         */
>>> >> +        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
>>> >> +        if (rv != sizeof(acb)) {
>>> >> +            error_report("VXHS AIO completion failed: %s",
>>> >> +                         strerror(errno));
>>> >> +            abort();
>>> >> +        }
>>> >> +    }
>>> >> +}
>>> >> +
>>> >> +static void vxhs_detach_aio_context(BlockDriverState *bs)
>>> >> +{
>>> >> +    BDRVVXHSState *s = bs->opaque;
>>> >> +
>>> >> +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
>>> >> +                       false, NULL, NULL, NULL);
>>> >> +
>>> >> +}
>>> >> +
>>> >> +static void vxhs_attach_aio_context(BlockDriverState *bs,
>>> >> +                                   AioContext *new_context)
>>> >> +{
>>> >> +    BDRVVXHSState *s = bs->opaque;
>>> >> +
>>> >> +    aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ],
>>> >> +                       false, vxhs_aio_event_reader, NULL, s);
>>> >> +}
>>> >> +
>>> >> +static BlockDriver bdrv_vxhs = {
>>> >> +    .format_name                  = "vxhs",
>>> >> +    .protocol_name                = "vxhs",
>>> >> +    .instance_size                = sizeof(BDRVVXHSState),
>>> >> +    .bdrv_file_open               = vxhs_open,
>>> >> +    .bdrv_parse_filename          = vxhs_parse_filename,
>>> >> +    .bdrv_close                   = vxhs_close,
>>> >> +    .bdrv_getlength               = vxhs_getlength,
>>> >> +    .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks,
>>> >> +    .bdrv_aio_readv               = vxhs_aio_readv,
>>> >> +    .bdrv_aio_writev              = vxhs_aio_writev,
>>> >> +    .bdrv_co_flush_to_disk        = vxhs_co_flush,
>>> >> +    .bdrv_detach_aio_context      = vxhs_detach_aio_context,
>>> >> +    .bdrv_attach_aio_context      = vxhs_attach_aio_context,
>>> >> +};
>>> >> +
>>> >> +void bdrv_vxhs_init(void)
>>> >> +{
>>> >> +    trace_vxhs_bdrv_init('.');
>>> >> +    bdrv_register(&bdrv_vxhs);
>>> >> +}
>>> >> +
>>> >> +block_init(bdrv_vxhs_init);
>>> >> diff --git a/block/vxhs.h b/block/vxhs.h
>>> >> new file mode 100644
>>> >> index 0000000..38e0165
>>> >> --- /dev/null
>>> >> +++ b/block/vxhs.h
>>> >> @@ -0,0 +1,237 @@
>>> >> +/*
>>> >> + * QEMU Block driver for Veritas HyperScale (VxHS)
>>> >> + *
>>> >> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>> >> + * See the COPYING file in the top-level directory.
>>> >> + *
>>> >> + */
>>> >> +
>>> >> +#ifndef VXHSD_H
>>> >> +#define VXHSD_H
>>> >> +
>>> >> +#include <gmodule.h>
>>> >> +#include <inttypes.h>
>>> >> +#include <pthread.h>
>>> >> +
>>> >> +#include "qemu/osdep.h"
>>> >> +#include "qapi/error.h"
>>> >> +#include "qemu/error-report.h"
>>> >> +#include "block/block_int.h"
>>> >> +#include "qemu/uri.h"
>>> >> +#include "qemu/queue.h"
>>> >> +
>>> >> +#define OF_GUID_STR_LEN             40
>>> >> +#define OF_GUID_STR_SZ              (OF_GUID_STR_LEN + 1)
>>> >> +#define QNIO_CONNECT_RETRY_SECS     5
>>> >> +#define QNIO_CONNECT_TIMOUT_SECS    120
>>> >> +
>>> >> +/* constants from io_qnio.h */
>>> >> +#define IIO_REASON_DONE     0x00000004
>>> >> +#define IIO_REASON_EVENT    0x00000008
>>> >> +#define IIO_REASON_HUP      0x00000010
>>> >> +
>>> >> +/*
>>> >> + * IO specific flags
>>> >> + */
>>> >> +#define IIO_FLAG_ASYNC      0x00000001
>>> >> +#define IIO_FLAG_DONE       0x00000010
>>> >> +#define IIO_FLAG_SYNC       0
>>> >> +
>>> >> +/* constants from error.h */
>>> >> +#define VXERROR_RETRY_ON_SOURCE     44
>>> >> +#define VXERROR_HUP                 901
>>> >> +#define VXERROR_CHANNEL_HUP         903
>>> >> +
>>> >> +/* constants from iomgr.h and opcode.h */
>>> >> +#define IRP_READ_REQUEST                    0x1FFF
>>> >> +#define IRP_WRITE_REQUEST                   0x2FFF
>>> >> +#define IRP_VDISK_CHECK_IO_FAILOVER_READY   2020
>>> >> +
>>> >> +/* Lock specific macros */
>>> >> +#define VXHS_SPIN_LOCK_ALLOC                  \
>>> >> +    (qemu_ck_initialize_lock())
>>> >> +#define VXHS_SPIN_LOCK(lock)                  \
>>> >> +    (qemu_ck_spin_lock(lock))
>>> >> +#define VXHS_SPIN_UNLOCK(lock)                \
>>> >> +    (qemu_ck_spin_unlock(lock))
>>> >> +#define VXHS_SPIN_LOCK_DESTROY(lock)          \
>>> >> +    (qemu_ck_destroy_lock(lock))
>>> >
>>> > What's the value of these macros, rather than just calling the underlying
>>> > function directly in code?
>>> >
>>>
>>> This way we can easily swap out CK locks with something else like
>>> pthread_mutex if required.
>>>
>>> >> +
>>> >> +typedef enum {
>>> >> +    VXHS_IO_INPROGRESS,
>>> >> +    VXHS_IO_COMPLETED,
>>> >> +    VXHS_IO_ERROR
>>> >> +} VXHSIOState;
>>> >> +
>>> >> +
>>> >> +typedef void (*qnio_callback_t)(ssize_t retval, void *arg);
>>> >> +
>>> >> +#define VDISK_FD_READ 0
>>> >> +#define VDISK_FD_WRITE 1
>>> >> +
>>> >> +#define QNIO_VDISK_NONE        0x00
>>> >> +#define QNIO_VDISK_CREATE      0x01
>>> >> +
>>> >> +/* max IO size supported by QEMU NIO lib */
>>> >> +#define QNIO_MAX_IO_SIZE       4194304
>>> >> +
>>> >> +#define MAX_HOSTS               4
>>> >> +
>>> >> +/*
>>> >> + * Opcodes for making IOCTL on QEMU NIO library
>>> >> + */
>>> >> +#define BASE_OPCODE_SHARED     1000
>>> >> +#define BASE_OPCODE_DAL        2000
>>> >> +#define IRP_VDISK_STAT                  (BASE_OPCODE_SHARED + 5)
>>> >> +#define IRP_VDISK_GET_GEOMETRY          (BASE_OPCODE_DAL + 17)
>>> >> +#define IRP_VDISK_READ_PARTITION        (BASE_OPCODE_DAL + 18)
>>> >> +#define IRP_VDISK_FLUSH                 (BASE_OPCODE_DAL + 19)
>>> >> +
>>> >> +/*
>>> >> + * BDRVVXHSState specific flags
>>> >> + */
>>> >> +#define OF_VDISK_FLAGS_STATE_ACTIVE             0x0000000000000001
>>> >> +#define OF_VDISK_FLAGS_STATE_FAILED             0x0000000000000002
>>> >> +#define OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS   0x0000000000000004
>>> >> +
>>> >> +#define OF_VDISK_ACTIVE(s)                                              \
>>> >> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_ACTIVE)
>>> >> +#define OF_VDISK_SET_ACTIVE(s)                                          \
>>> >> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_ACTIVE)
>>> >> +#define OF_VDISK_RESET_ACTIVE(s)                                        \
>>> >> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_ACTIVE)
>>> >> +
>>> >> +#define OF_VDISK_FAILED(s)                                              \
>>> >> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_FAILED)
>>> >> +#define OF_VDISK_SET_FAILED(s)                                          \
>>> >> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_FAILED)
>>> >> +#define OF_VDISK_RESET_FAILED(s)                                        \
>>> >> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_FAILED)
>>> >> +
>>> >> +#define OF_VDISK_IOFAILOVER_IN_PROGRESS(s)                              \
>>> >> +        ((s)->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>>> >> +#define OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s)                          \
>>> >> +        ((s)->vdisk_flags |= OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>>> >> +#define OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s)                        \
>>> >> +        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>>> >> +
>>> >> +/*
>>> >> + * VXHSAIOCB specific flags
>>> >> + */
>>> >> +#define OF_ACB_QUEUED       0x00000001
>>> >> +
>>> >> +#define OF_AIOCB_FLAGS_QUEUED(a)            \
>>> >> +        ((a)->flags & OF_ACB_QUEUED)
>>> >> +#define OF_AIOCB_FLAGS_SET_QUEUED(a)        \
>>> >> +        ((a)->flags |= OF_ACB_QUEUED)
>>> >> +#define OF_AIOCB_FLAGS_RESET_QUEUED(a)      \
>>> >> +        ((a)->flags &= ~OF_ACB_QUEUED)
>>> >> +
>>> >
>>> > Many helper macros here... IMO, it decreases the readability of the code,
>>> > and my preference is to just have the underlying code without the use of the
>>> > macros.
>>> >
>>>
>>> Would like to keep this unchanged if that's OK!
>>>
>>> >> +typedef struct qemu2qnio_ctx {
>>> >> +    uint32_t            qnio_flag;
>>> >> +    uint64_t            qnio_size;
>>> >> +    char                *qnio_channel;
>>> >> +    char                *target;
>>> >> +    qnio_callback_t     qnio_cb;
>>> >> +} qemu2qnio_ctx_t;
>>> >> +
>>> >> +typedef qemu2qnio_ctx_t qnio2qemu_ctx_t;
>>> >> +
>>> >> +typedef struct LibQNIOSymbol {
>>> >> +        const char *name;
>>> >> +        gpointer *addr;
>>> >> +} LibQNIOSymbol;
>>> >> +
>>> >> +typedef void (*iio_cb_t) (uint32_t rfd, uint32_t reason, void *ctx,
>>> >> +                          void *reply);
>>> >> +
>>> >> +/*
>>> >> + * HyperScale AIO callbacks structure
>>> >> + */
>>> >> +typedef struct VXHSAIOCB {
>>> >> +    BlockAIOCB          common;
>>> >> +    size_t              ret;
>>> >> +    size_t              size;
>>> >> +    QEMUBH              *bh;
>>> >> +    int                 aio_done;
>>> >> +    int                 segments;
>>> >> +    int                 flags;
>>> >> +    size_t              io_offset;
>>> >> +    QEMUIOVector        *qiov;
>>> >> +    void                *buffer;
>>> >> +    int                 direction;  /* IO direction (r/w) */
>>> >> +    QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry;
>>> >> +} VXHSAIOCB;
>>> >> +
>>> >> +typedef struct VXHSvDiskHostsInfo {
>>> >> +    int     qnio_cfd;        /* Channel FD */
>>> >> +    int     vdisk_rfd;      /* vDisk remote FD */
>>> >> +    char    *hostip;        /* Host's IP addresses */
>>> >> +    int     port;           /* Host's port number */
>>> >> +} VXHSvDiskHostsInfo;
>>> >> +
>>> >> +/*
>>> >> + * Structure per vDisk maintained for state
>>> >> + */
>>> >> +typedef struct BDRVVXHSState {
>>> >> +    int                     fds[2];
>>> >> +    int64_t                 vdisk_size;
>>> >> +    int64_t                 vdisk_blocks;
>>> >> +    int64_t                 vdisk_flags;
>>> >> +    int                     vdisk_aio_count;
>>> >> +    int                     event_reader_pos;
>>> >> +    VXHSAIOCB               *qnio_event_acb;
>>> >> +    void                    *qnio_ctx;
>>> >> +    void                    *vdisk_lock; /* Lock to protect BDRVVXHSState */
>>> >> +    void                    *vdisk_acb_lock;  /* Protects ACB */
>>> >> +    VXHSvDiskHostsInfo      vdisk_hostinfo[MAX_HOSTS]; /* Per host info */
>>> >> +    int                     vdisk_nhosts;   /* Total number of hosts */
>>> >> +    int                     vdisk_cur_host_idx; /* IOs are being shipped to */
>>> >> +    int                     vdisk_ask_failover_idx; /*asking permsn to ship io*/
>>> >> +    QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
>>> >> +    int                     vdisk_aio_retry_qd;
>>> >
>>> > Is vdisk_aio_retry_qd just used for debugging purposes?  I see it
>>> > incremented/decremented throughout the code, but no one checks it.
>>> >
>>>
>>> Yes, it is for debugging only. Added a comment to state the same.
>>>
>>>
>>> >> +    char                    *vdisk_guid;
>>> >> +} BDRVVXHSState;
>>> >> +
>>> >> +void bdrv_vxhs_init(void);
>>> >> +void *vxhs_setup_qnio(void);
>>> >> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m);
>>> >> +void vxhs_aio_event_reader(void *opaque);
>>> >> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s);
>>> >> +int vxhs_aio_flush_cb(void *opaque);
>>> >> +void vxhs_finish_aiocb(ssize_t ret, void *arg);
>>> >> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s);
>>> >> +int vxhs_open(BlockDriverState *bs, QDict *options,
>>> >> +              int bdrv_flags, Error **errp);
>>> >> +void vxhs_close(BlockDriverState *bs);
>>> >> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, int64_t sector_num,
>>> >> +                                   QEMUIOVector *qiov, int nb_sectors,
>>> >> +                                   BlockCompletionFunc *cb, void *opaque);
>>> >> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>> >> +                                    QEMUIOVector *qiov, int nb_sectors,
>>> >> +                                    BlockCompletionFunc *cb,
>>> >> +                                    void *opaque);
>>> >> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs);
>>> >> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
>>> >> +                                QEMUIOVector *qiov, int nb_sectors,
>>> >> +                                BlockCompletionFunc *cb,
>>> >> +                                void *opaque, int write);
>>> >> +int vxhs_co_flush(BlockDriverState *bs);
>>> >> +int64_t vxhs_getlength(BlockDriverState *bs);
>>> >> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t delta);
>>> >> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t delta);
>>> >> +uint32_t vxhs_get_vdisk_iocount(void *ptr);
>>> >> +void vxhs_inc_acb_segment_count(void *ptr, int count);
>>> >> +void vxhs_dec_acb_segment_count(void *ptr, int count);
>>> >> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count);
>>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
>>> >> +int vxhs_failover_io(BDRVVXHSState *s);
>>> >> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int hostinfo_index);
>>> >> +int vxhs_switch_storage_agent(BDRVVXHSState *s);
>>> >> +int vxhs_handle_queued_ios(BDRVVXHSState *s);
>>> >> +int vxhs_restart_aio(VXHSAIOCB *acb);
>>> >> +void vxhs_fail_aio(VXHSAIOCB *acb, int err);
>>> >> +void vxhs_failover_ioctl_cb(int res, void *ctx);
>>> >> +
>>> >> +
>>> >> +#endif
>>> >> diff --git a/configure b/configure
>>> >> index 4b808f9..c71e270 100755
>>> >> --- a/configure
>>> >> +++ b/configure
>>> >> @@ -320,6 +320,7 @@ vhdx=""
>>> >>  numa=""
>>> >>  tcmalloc="no"
>>> >>  jemalloc="no"
>>> >> +vxhs=""
>>> >>
>>> >>  # parse CC options first
>>> >>  for opt do
>>> >> @@ -1150,6 +1151,11 @@ for opt do
>>> >>    ;;
>>> >>    --enable-jemalloc) jemalloc="yes"
>>> >>    ;;
>>> >> +  --disable-vxhs) vxhs="no"
>>> >> +  ;;
>>> >> +  --enable-vxhs) vxhs="yes"
>>> >> +  ;;
>>> >> +
>>> >>    *)
>>> >>        echo "ERROR: unknown option $opt"
>>> >>        echo "Try '$0 --help' for more information"
>>> >> @@ -1380,6 +1386,7 @@ disabled with --disable-FEATURE, default is enabled if available:
>>> >>    numa            libnuma support
>>> >>    tcmalloc        tcmalloc support
>>> >>    jemalloc        jemalloc support
>>> >> +  vxhs            Veritas HyperScale vDisk backend support
>>> >>
>>> >>  NOTE: The object files are built at the place where configure is launched
>>> >>  EOF
>>> >> @@ -4543,6 +4550,43 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO $TMPO; then
>>> >>  fi
>>> >>
>>> >>  ##########################################
>>> >> +# Veritas HyperScale block driver VxHS
>>> >> +# Check if libqnio is installed
>>> >> +if test "$vxhs" != "no" ; then
>>> >> +  cat > $TMPC <<EOF
>>> >> +#include <stdio.h>
>>> >> +#include <qnio/qnio_api.h>
>>> >> +
>>> >> +void vxhs_inc_acb_segment_count(void *acb, int count);
>>> >> +void vxhs_dec_acb_segment_count(void *acb, int count);
>>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
>>> >> +
>>> >> +void vxhs_inc_acb_segment_count(void *ptr, int count)
>>> >> +{
>>> >> +}
>>> >> +void vxhs_dec_acb_segment_count(void *ptr, int count)
>>> >> +{
>>> >> +}
>>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
>>> >> +{
>>> >> +}
>>> >> +int main(void) {
>>> >> +    qemu_ck_initialize_lock();
>>> >> +    return 0;
>>> >> +}
>>> >> +EOF
>>> >> +  vxhs_libs="-lqnioshim -lqnio"
>>> >> +  if compile_prog "" "$vxhs_libs" ; then
>>> >> +    vxhs=yes
>>> >> +  else
>>> >> +    if test "$vxhs" = "yes" ; then
>>> >> +      feature_not_found "vxhs block device" "Install libqnio. See github"
>>> >> +    fi
>>> >> +    vxhs=no
>>> >> +  fi
>>> >> +fi
>>> >> +
>>> >> +##########################################
>>> >>  # End of CC checks
>>> >>  # After here, no more $cc or $ld runs
>>> >>
>>> >> @@ -5488,6 +5532,12 @@ if test "$pthread_setname_np" = "yes" ; then
>>> >>    echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak
>>> >>  fi
>>> >>
>>> >> +if test "$vxhs" = "yes" ; then
>>> >> +  echo "CONFIG_VXHS=y" >> $config_host_mak
>>> >> +  echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak
>>> >> +  echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak
>>> >> +fi
>>> >> +
>>> >>  if test "$tcg_interpreter" = "yes"; then
>>> >>    QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
>>> >>  elif test "$ARCH" = "sparc64" ; then
>>> >
>>> > Your configure file should indicate if vxhs is enabled or not. For instance,
>>> > here is a snippet from where this occurs in the current configure file:
>>> >
>>> > [...]
>>> > echo "NUMA host support $numa"
>>> > echo "tcmalloc support  $tcmalloc"
>>> > echo "jemalloc support  $jemalloc"
>>> > echo "avx2 optimization $avx2_opt"
>>> >
>>> >
>>>
>>> Fixed.
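>>>
>>> A minimal sketch of the added summary line, modelled on the echo
>>> entries quoted above (hypothetical; the exact wording used in the
>>> respin may differ):
>>>
>>>     echo "VxHS block driver support $vxhs"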
>>>
>>> >> --
>>> >> 2.5.5
>>> >>
>>> >>

Patch

=============
(1) Reworked QAPI/JSON parsing.
(2) Reworked URI parsing as suggested by Kevin.
(3) Fixes per review comments from Stefan on v1.
(4) Fixes per review comments from Daniel on v3.

 block/Makefile.objs |    2 +
 block/trace-events  |   41 ++
 block/vxhs.c        | 1304 +++++++++++++++++++++++++++++++++++++++++++++++++++
 block/vxhs.h        |  237 ++++++++++
 configure           |   50 ++
 5 files changed, 1634 insertions(+)
 create mode 100644 block/vxhs.c
 create mode 100644 block/vxhs.h

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 2593a2f..5f83305 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -20,6 +20,7 @@  block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
 block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
+block-obj-$(CONFIG_VXHS) += vxhs.o
 block-obj-y += accounting.o dirty-bitmap.o
 block-obj-y += write-threshold.o
 
@@ -43,3 +44,4 @@  block-obj-m        += dmg.o
 dmg.o-libs         := $(BZIP2_LIBS)
 qcow.o-libs        := -lz
 linux-aio.o-libs   := -laio
+vxhs.o-libs        := $(VXHS_LIBS)
diff --git a/block/trace-events b/block/trace-events
index 05fa13c..06c6d8c 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -114,3 +114,44 @@  qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
 qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
 qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
 qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
+
+# block/vxhs.c
+vxhs_bdrv_init(const char c) "Registering VxHS AIO driver%c"
+vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, reason %d"
+vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p"
+vxhs_setup_qnio_nwerror(char c) "Could not initialize the network channel. Bailing out%c"
+vxhs_iio_callback_iofail(int err, int reason, void *acb, int seg) "Read/Write failed: error %d, reason %d, acb %p, segment %d"
+vxhs_iio_callback_retry(char *guid, void *acb) "vDisk %s, added acb %p to retry queue (5)"
+vxhs_iio_callback_chnlfail(int error) "QNIO channel failed, no i/o (%d)"
+vxhs_iio_callback_fail(int r, void *acb, int seg, uint64_t size, int err) " ALERT: reason = %d , acb = %p, acb->segments = %d, acb->size = %lu Error = %d"
+vxhs_fail_aio(char * guid, void *acb) "vDisk %s, failing acb %p"
+vxhs_iio_callback_ready(char *vd, int err) "async vxhs_iio_callback: IRP_VDISK_CHECK_IO_FAILOVER_READY completed for vdisk %s with error %d"
+vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d"
+vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d"
+vxhs_open_fail(int ret) "Could not open the device. Error = %d"
+vxhs_open_epipe(char c) "Could not create a pipe for device. Bailing out%c"
+vxhs_aio_rw(char *guid, int iodir, uint64_t size, uint64_t offset) "vDisk %s, vDisk device is in failed state iodir = %d size = %lu offset = %lu"
+vxhs_aio_rw_retry(char *guid, void *acb, int queue) "vDisk %s, added acb %p to retry queue(%d)"
+vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d"
+vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int seg, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %lu offset = %lu ACB = %p Segments = %d. Error = %d, errno = %d"
+vxhs_co_flush(char *guid, int ret, int err) "vDisk (%s) Flush ioctl failed ret = %d errno = %d"
+vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d"
+vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %lu"
+vxhs_switch_storage_agent(char *ip, char *guid) "Query host %s for vdisk %s"
+vxhs_switch_storage_agent_failed(char *ip, char *guid, int res, int err) "Query to host %s for vdisk %s failed, res = %d, errno = %d"
+vxhs_failover_ioctl_cb(char *ip, char *guid) "Switched to storage server host-IP %s for vdisk %s"
+vxhs_failover_ioctl_cb_retry(char *guid) "failover_ioctl_cb: keep looking for io target for vdisk %s"
+vxhs_failover_io(char *vdisk) "I/O Failover starting for vDisk %s"
+vxhs_reopen_vdisk(char *ip) "Failed to connect to storage agent on host-ip %s"
+vxhs_reopen_vdisk_openfail(char *fname) "Failed to open vdisk device: %s"
+vxhs_handle_queued_ios(void *acb, int res) "Restarted acb %p res %d"
+vxhs_restart_aio(int dir, int res, int err) "IO ERROR FOR: Read/Write = %d Error = %d, errno = %d"
+vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld"
+vxhs_aio_rw_iofail(char *guid) "vDisk %s, I/O operation failed."
+vxhs_aio_rw_devfail(char *guid, int dir, uint64_t size, uint64_t off) "vDisk %s, vDisk device failed iodir = %d size = %lu offset = %lu"
+vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s"
+vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s"
+vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = %d"
+vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP %s, Port %d"
+vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState"
+vxhs_qemu_init_filename(const char *filename) "Filename passed as %s"
diff --git a/block/vxhs.c b/block/vxhs.c
new file mode 100644
index 0000000..32c37c5
--- /dev/null
+++ b/block/vxhs.c
@@ -0,0 +1,1304 @@ 
+/*
+ * QEMU Block driver for Veritas HyperScale (VxHS)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "vxhs.h"
+#include <qnio/qnio_api.h>
+#include "trace.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+
+#define VXHS_OPT_FILENAME        "filename"
+#define VXHS_OPT_VDISK_ID        "vdisk_id"
+#define VXHS_OPT_SERVER          "server."
+#define VXHS_OPT_HOST            "host"
+#define VXHS_OPT_PORT            "port"
+
+/* qnio client ioapi_ctx */
+static void *global_qnio_ctx;
+
+/* vdisk prefix to pass to qnio */
+static const char vdisk_prefix[] = "/dev/of/vdisk";
+
+void vxhs_inc_acb_segment_count(void *ptr, int count)
+{
+    VXHSAIOCB *acb = ptr;
+    BDRVVXHSState *s = acb->common.bs->opaque;
+
+    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
+    acb->segments += count;
+    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
+}
+
+void vxhs_dec_acb_segment_count(void *ptr, int count)
+{
+    VXHSAIOCB *acb = ptr;
+    BDRVVXHSState *s = acb->common.bs->opaque;
+
+    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
+    acb->segments -= count;
+    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
+}
+
+int vxhs_dec_and_get_acb_segment_count(void *ptr, int count)
+{
+    VXHSAIOCB *acb = ptr;
+    BDRVVXHSState *s = acb->common.bs->opaque;
+    int segcount = 0;
+
+
+    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
+    acb->segments -= count;
+    segcount = acb->segments;
+    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
+
+    return segcount;
+}
+
+void vxhs_set_acb_buffer(void *ptr, void *buffer)
+{
+    VXHSAIOCB *acb = ptr;
+
+    acb->buffer = buffer;
+}
+
+void vxhs_inc_vdisk_iocount(void *ptr, uint32_t count)
+{
+    BDRVVXHSState *s = ptr;
+
+    VXHS_SPIN_LOCK(s->vdisk_lock);
+    s->vdisk_aio_count += count;
+    VXHS_SPIN_UNLOCK(s->vdisk_lock);
+}
+
+void vxhs_dec_vdisk_iocount(void *ptr, uint32_t count)
+{
+    BDRVVXHSState *s = ptr;
+
+    VXHS_SPIN_LOCK(s->vdisk_lock);
+    s->vdisk_aio_count -= count;
+    VXHS_SPIN_UNLOCK(s->vdisk_lock);
+}
+
+uint32_t vxhs_get_vdisk_iocount(void *ptr)
+{
+    BDRVVXHSState *s = ptr;
+    uint32_t count = 0;
+
+    VXHS_SPIN_LOCK(s->vdisk_lock);
+    count = s->vdisk_aio_count;
+    VXHS_SPIN_UNLOCK(s->vdisk_lock);
+
+    return count;
+}
+
+void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m)
+{
+    VXHSAIOCB *acb = NULL;
+    BDRVVXHSState *s = NULL;
+    int rv = 0;
+    int segcount = 0;
+    uint32_t error = 0;
+    uint32_t opcode = 0;
+
+    assert(m);
+    if (m) {
+        /* TODO: need common get message attrs, not two separate lib calls */
+        error = qemu_iio_extract_msg_error(m);
+        opcode = qemu_iio_extract_msg_opcode(m);
+    }
+    switch (opcode) {
+    case IRP_READ_REQUEST:
+    case IRP_WRITE_REQUEST:
+
+    /*
+     * ctx is VXHSAIOCB*
+     * ctx is NULL if error is VXERROR_CHANNEL_HUP or reason is IIO_REASON_HUP
+     */
+    if (ctx) {
+        acb = ctx;
+        s = acb->common.bs->opaque;
+    } else {
+        trace_vxhs_iio_callback(error, reason);
+        goto out;
+    }
+
+    if (error) {
+        trace_vxhs_iio_callback_iofail(error, reason, acb, acb->segments);
+
+        if (reason == IIO_REASON_DONE || reason == IIO_REASON_EVENT) {
+            /*
+             * Storage agent failed while I/O was in progress
+             * Fail over only if the qnio channel dropped, indicating
+             * storage agent failure. Don't fail over in response to other
+             * I/O errors such as disk failure.
+             */
+            if (error == VXERROR_RETRY_ON_SOURCE || error == VXERROR_HUP ||
+                error == VXERROR_CHANNEL_HUP || error == -1) {
+                /*
+                 * Start vDisk I/O failover once the callback has
+                 * been called for all the pending I/Os.
+                 * If the vDisk has no redundancy enabled, the I/O
+                 * failover routine will mark the vDisk failed and
+                 * fail all the AIOs without retry (stateless vDisk).
+                 */
+                VXHS_SPIN_LOCK(s->vdisk_lock);
+                if (!OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
+                    OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
+                }
+                /*
+                 * Check whether this acb has already been queued.
+                 * This can happen when an I/O is submitted in
+                 * multiple segments (QNIO_MAX_IO_SIZE).
+                 */
+                VXHS_SPIN_LOCK(s->vdisk_acb_lock);
+                if (!OF_AIOCB_FLAGS_QUEUED(acb)) {
+                    QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
+                                         acb, retry_entry);
+                    OF_AIOCB_FLAGS_SET_QUEUED(acb);
+                    s->vdisk_aio_retry_qd++;
+                    trace_vxhs_iio_callback_retry(s->vdisk_guid, acb);
+                }
+                segcount = --acb->segments;
+                VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
+                /*
+                 * Decrement AIO count only when callback is called
+                 * against all the segments of aiocb.
+                 */
+                if (segcount == 0 && --s->vdisk_aio_count == 0) {
+                    /*
+                     * Start vDisk I/O failover
+                     */
+                    VXHS_SPIN_UNLOCK(s->vdisk_lock);
+                    /*
+                     * TODO:
+                     * Explore whether the failover operation can be
+                     * optimized per virtual machine (globally) rather
+                     * than per vDisk.
+                     */
+                    vxhs_failover_io(s);
+                    goto out;
+                }
+                VXHS_SPIN_UNLOCK(s->vdisk_lock);
+                goto out;
+            }
+        } else if (reason == IIO_REASON_HUP) {
+            /*
+             * Channel failed, spontaneous notification,
+             * not in response to I/O
+             */
+            trace_vxhs_iio_callback_chnlfail(error);
+            /*
+             * TODO: Start channel failover when no I/O is outstanding
+             */
+            goto out;
+        } else {
+            trace_vxhs_iio_callback_fail(reason, acb, acb->segments,
+                                         acb->size, error);
+        }
+    }
+    /*
+     * Set the error in the acb if it is not already set. When an acb
+     * is submitted in multiple segments, the error must be set only
+     * once.
+     *
+     * Once the completion callback has run for the last segment,
+     * acb->ret is returned to the caller.
+     */
+    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
+    if (error && !acb->ret) {
+        acb->ret = error;
+    }
+    --acb->segments;
+    segcount = acb->segments;
+    assert(segcount >= 0);
+    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
+    /*
+     * Check whether all outstanding I/Os for this acb are done.
+     * If so, signal AIO completion.
+     */
+    if (segcount == 0) {
+        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
+        if (rv != sizeof(acb)) {
+            error_report("VXHS AIO completion failed: %s", strerror(errno));
+            abort();
+        }
+    }
+    break;
+
+    case IRP_VDISK_CHECK_IO_FAILOVER_READY:
+        /* ctx is BDRVVXHSState* */
+        assert(ctx);
+        trace_vxhs_iio_callback_ready(((BDRVVXHSState *)ctx)->vdisk_guid,
+                                      error);
+        vxhs_failover_ioctl_cb(error, ctx);
+        break;
+
+    default:
+        if (reason == IIO_REASON_HUP) {
+            /*
+             * Channel failed, spontaneous notification,
+             * not in response to I/O
+             */
+            trace_vxhs_iio_callback_chnfail(error, errno);
+            /*
+             * TODO: Start channel failover when no I/O is outstanding
+             */
+        } else {
+            trace_vxhs_iio_callback_unknwn(opcode, error);
+        }
+        break;
+    }
+out:
+    return;
+}
+
+void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s)
+{
+    BlockCompletionFunc *cb = acb->common.cb;
+    void *opaque = acb->common.opaque;
+    int ret = 0;
+
+    if (acb->ret != 0) {
+        trace_vxhs_complete_aio(acb, acb->ret);
+        /*
+         * Mask all I/O errors generically as EIO for the upper layers.
+         * The IO Manager currently uses non-standard error codes;
+         * mapping them to EIO avoids the upper layers misinterpreting
+         * them.
+         */
+        ret = (-EIO);
+    }
+    /*
+     * Copy the contents of the stabilization buffer back into the
+     * original I/O vector before returning the I/O.
+     */
+    if (acb->buffer != NULL) {
+        qemu_iovec_from_buf(acb->qiov, 0, acb->buffer, acb->qiov->size);
+        free(acb->buffer);
+        acb->buffer = NULL;
+    }
+    vxhs_dec_vdisk_iocount(s, 1);
+    acb->aio_done = VXHS_IO_COMPLETED;
+    qemu_aio_unref(acb);
+    cb(opaque, ret);
+}
+
+/*
+ * This is the HyperScale event handler registered with QEMU.
+ * It is invoked whenever an I/O completion is written to the pipe
+ * by the callback running in QNIO thread context. It then marks
+ * the AIO as completed and releases the HyperScale AIO callback.
+ */
+void vxhs_aio_event_reader(void *opaque)
+{
+    BDRVVXHSState *s = opaque;
+    ssize_t ret;
+
+    do {
+        char *p = (char *)&s->qnio_event_acb;
+
+        ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos,
+                   sizeof(s->qnio_event_acb) - s->event_reader_pos);
+        if (ret > 0) {
+            s->event_reader_pos += ret;
+            if (s->event_reader_pos == sizeof(s->qnio_event_acb)) {
+                s->event_reader_pos = 0;
+                vxhs_complete_aio(s->qnio_event_acb, s);
+            }
+        }
+    } while (ret < 0 && errno == EINTR);
+}
+
+/*
+ * QEMU calls this to check whether any I/O is pending on the vDisk.
+ * QEMU polls it in a loop until all AIOs have completed.
+ */
+int vxhs_aio_flush_cb(void *opaque)
+{
+    BDRVVXHSState *s = opaque;
+
+    return vxhs_get_vdisk_iocount(s);
+}
+
+/*
+ * This is called by QEMU for each vDisk while booting.
+ * bs->opaque is allocated by the QEMU upper block layer before
+ * open is called. It loads the QNIO operations from the qemuqnio
+ * library and uses them to create the channel for doing I/O on the
+ * vDisk. The URI is parsed to obtain the hostname and vDisk path,
+ * and the HyperScale event handler is registered with QEMU.
+ */
+void *vxhs_setup_qnio(void)
+{
+    void *qnio_ctx = NULL;
+
+    qnio_ctx = qemu_iio_init(vxhs_iio_callback);
+
+    if (qnio_ctx != NULL) {
+        trace_vxhs_setup_qnio(qnio_ctx);
+    } else {
+        trace_vxhs_setup_qnio_nwerror('.');
+    }
+
+    return qnio_ctx;
+}
+
+static QemuOptsList runtime_opts = {
+    .name = "vxhs",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = VXHS_OPT_FILENAME,
+            .type = QEMU_OPT_STRING,
+            .help = "URI to the Veritas HyperScale image",
+        },
+        {
+            .name = VXHS_OPT_VDISK_ID,
+            .type = QEMU_OPT_STRING,
+            .help = "UUID of the VxHS vdisk",
+        },
+        { /* end of list */ }
+    },
+};
+
+static QemuOptsList runtime_tcp_opts = {
+    .name = "vxhs_tcp",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head),
+    .desc = {
+        {
+            .name = VXHS_OPT_HOST,
+            .type = QEMU_OPT_STRING,
+            .help = "host address (ipv4 addresses)",
+        },
+        {
+            .name = VXHS_OPT_PORT,
+            .type = QEMU_OPT_NUMBER,
+            .help = "port number on which VxHSD is listening (default 9999)",
+            .def_value_str = "9999"
+        },
+        {
+            .name = "to",
+            .type = QEMU_OPT_NUMBER,
+            .help = "max port number, not supported by VxHS",
+        },
+        {
+            .name = "ipv4",
+            .type = QEMU_OPT_BOOL,
+            .help = "ipv4 bool value, not supported by VxHS",
+        },
+        {
+            .name = "ipv6",
+            .type = QEMU_OPT_BOOL,
+            .help = "ipv6 bool value, not supported by VxHS",
+        },
+        { /* end of list */ }
+    },
+};
+
+/*
+ * Parse the incoming URI and populate *options with the host
+ * information. The host at index 0 is the local storage agent; the
+ * remaining hosts are the reflection target storage agents. The local
+ * storage agent IP is the efficient internal address in the URI,
+ * e.g. 192.168.0.2. The reflection target IPs are the E-W data
+ * network addresses of the reflection node agents, also taken from
+ * the URI.
+ */
+static int vxhs_parse_uri(const char *filename, QDict *options)
+{
+    gchar **target_list;
+    URI *uri = NULL;
+    char *hoststr, *portstr;
+    char *port;
+    int ret = 0;
+    int i = 0;
+
+    trace_vxhs_parse_uri_filename(filename);
+    target_list = g_strsplit(filename, "%7D", 0);
+    assert(target_list != NULL && target_list[0] != NULL);
+
+    for (i = 0; target_list[i] != NULL && *target_list[i]; i++) {
+        uri = uri_parse(target_list[i]);
+        if (!uri || !uri->server) {
+            uri_free(uri);
+            ret = -EINVAL;
+            break;
+        }
+
+        hoststr = g_strdup_printf(VXHS_OPT_SERVER"%d.host", i);
+        qdict_put(options, hoststr, qstring_from_str(uri->server));
+
+        portstr = g_strdup_printf(VXHS_OPT_SERVER"%d.port", i);
+        if (uri->port) {
+            port = g_strdup_printf("%d", uri->port);
+            qdict_put(options, portstr, qstring_from_str(port));
+            g_free(port);
+        }
+
+        if (i == 0 && (strstr(uri->path, "vxhs") == NULL)) {
+            qdict_put(options, "vdisk_id", qstring_from_str(uri->path));
+        }
+
+        trace_vxhs_parse_uri_hostinfo(i + 1, uri->server, uri->port);
+        g_free(hoststr);
+        g_free(portstr);
+        uri_free(uri);
+    }
+
+    g_strfreev(target_list);
+    return ret;
+}
+
+static void vxhs_parse_filename(const char *filename, QDict *options,
+                               Error **errp)
+{
+    if (qdict_haskey(options, "host")
+        || qdict_haskey(options, "port")
+        || qdict_haskey(options, "path"))
+    {
+        error_setg(errp, "host/port/path and a file name may not be specified "
+                         "at the same time");
+        return;
+    }
+
+    if (strstr(filename, "://")) {
+        int ret = vxhs_parse_uri(filename, options);
+        if (ret < 0) {
+            error_setg(errp, "Invalid URI. URI should be of the form "
+                       "  vxhs://<host_ip>:<port>/{<vdisk_id>}");
+        }
+    }
+}
+
+static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
+                              int *cfd, int *rfd, Error **errp)
+{
+    QDict *backing_options = NULL;
+    const char *vxhs_filename;
+    char *of_vsa_addr = NULL;
+    Error *local_err = NULL;
+    const char *ptr = NULL;
+    char *file_name = NULL;
+    size_t num_servers;
+    char *str = NULL;
+    QemuOpts *opts;
+    int ret = 0;
+    int i;
+
+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
+    if (vxhs_filename) {
+        trace_vxhs_qemu_init_filename(vxhs_filename);
+    }
+
+    ptr = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
+    if (!ptr) {
+        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
+        ret = -EINVAL;
+        goto out;
+    }
+    s->vdisk_guid = g_strdup(ptr);
+    trace_vxhs_qemu_init_vdisk(ptr);
+
+    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
+    if (num_servers < 1) {
+        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
+        ret = -EINVAL;
+        goto out;
+    }
+    trace_vxhs_qemu_init_numservers(num_servers);
+
+    qemu_opts_del(opts);
+
+    for (i = 0; i < num_servers; i++) {
+        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
+        qdict_extract_subqdict(options, &backing_options, str);
+
+        /* create opts info from runtime_tcp_opts list */
+        opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
+        qemu_opts_absorb_qdict(opts, backing_options, &local_err);
+        if (local_err) {
+            ret = -EINVAL;
+            qemu_opts_del(opts);
+            goto out;
+        }
+
+        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(opts,
+                                                            VXHS_OPT_HOST));
+        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(opts,
+                                                                 VXHS_OPT_PORT),
+                                                    NULL, 0);
+
+        s->vdisk_hostinfo[i].qnio_cfd = -1;
+        s->vdisk_hostinfo[i].vdisk_rfd = -1;
+        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
+                             s->vdisk_hostinfo[i].port);
+        qemu_opts_del(opts);
+    }
+
+    s->vdisk_nhosts = i;
+    s->vdisk_cur_host_idx = 0;
+    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
+    of_vsa_addr = g_strdup_printf("of://%s:%d",
+                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
+                                s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);
+
+    /*
+     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
+     */
+    if (global_qnio_ctx == NULL) {
+        global_qnio_ctx = vxhs_setup_qnio();
+        if (global_qnio_ctx == NULL) {
+            error_setg(&local_err, "Failed vxhs_setup_qnio");
+            ret = -EINVAL;
+            goto out;
+        }
+    }
+
+    *cfd = qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
+    if (*cfd < 0) {
+        error_setg(&local_err, "Failed qemu_open_iio_conn");
+        ret = -EIO;
+        goto out;
+    }
+    *rfd = qemu_iio_devopen(global_qnio_ctx, *cfd, file_name, 0);
+    if (*rfd < 0) {
+        error_setg(&local_err, "Failed qemu_iio_devopen");
+        ret = -EIO;
+        goto out;
+    }
+
+out:
+    g_free(file_name);
+    g_free(of_vsa_addr);
+    if (str) {
+        qdict_del(backing_options, str);
+        g_free(str);
+    }
+
+    if (ret < 0) {
+        errno = -ret;
+    }
+    error_propagate(errp, local_err);
+    return ret;
+}
+
+int vxhs_open(BlockDriverState *bs, QDict *options,
+              int bdrv_flags, Error **errp)
+{
+    BDRVVXHSState *s = bs->opaque;
+    AioContext *aio_context;
+    int qemu_qnio_cfd = -1;
+    int qemu_rfd = -1;
+    int ret = 0;
+
+    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
+    if (ret < 0) {
+        trace_vxhs_open_fail(ret);
+        return ret;
+    }
+
+    s->qnio_ctx = global_qnio_ctx;
+    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
+    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
+    s->vdisk_size = 0;
+    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
+
+    /*
+     * Create a pipe for communication between threads running in
+     * different contexts. Set a handler for the read event, which is
+     * triggered when an I/O completion arrives from the non-QEMU
+     * context.
+     */
+    ret = qemu_pipe(s->fds);
+    if (ret < 0) {
+        trace_vxhs_open_epipe('.');
+        ret = -errno;
+        goto out;
+    }
+    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
+
+    aio_context = bdrv_get_aio_context(bs);
+    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
+                       false, vxhs_aio_event_reader, NULL, s);
+
+    /*
+     * Allocate and initialize the spin-locks.
+     *
+     * NOTE:
+     *      Because the spin lock is allocated dynamically, the
+     *      acb-specific lock lives in the BDRVVXHSState struct
+     *      rather than in each acb; this avoids the overhead of
+     *      allocating and freeing a spin lock for every AIO.
+     */
+    s->vdisk_lock = VXHS_SPIN_LOCK_ALLOC;
+    s->vdisk_acb_lock = VXHS_SPIN_LOCK_ALLOC;
+
+    return 0;
+
+out:
+    if (s->vdisk_hostinfo[0].vdisk_rfd >= 0) {
+        qemu_iio_devclose(s->qnio_ctx, 0,
+                                 s->vdisk_hostinfo[0].vdisk_rfd);
+    }
+    /* never close qnio_cfd */
+    trace_vxhs_open_fail(ret);
+    return ret;
+}
+
+static const AIOCBInfo vxhs_aiocb_info = {
+    .aiocb_size = sizeof(VXHSAIOCB)
+};
+
+/*
+ * This is called in QNIO thread context once the I/O is done on the
+ * IO Manager and the QNIO client has received the data or the ACK.
+ * It notifies the event handler thread running in QEMU context by
+ * writing to the pipe.
+ */
+void vxhs_finish_aiocb(ssize_t ret, void *arg)
+{
+    VXHSAIOCB *acb = arg;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVVXHSState *s = bs->opaque;
+    int rv;
+
+    acb->ret = ret;
+    rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
+    if (rv != sizeof(acb)) {
+        error_report("VXHS AIO completion failed: %s",
+                     strerror(errno));
+        abort();
+    }
+}
+
+/*
+ * This allocates a QEMU-VXHS callback (acb) for each I/O and passes
+ * it to QNIO. When QNIO completes the work, the acb is returned
+ * through the callback.
+ */
+BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
+                                int64_t sector_num, QEMUIOVector *qiov,
+                                int nb_sectors,
+                                BlockCompletionFunc *cb,
+                                void *opaque, int iodir)
+{
+    VXHSAIOCB *acb = NULL;
+    BDRVVXHSState *s = bs->opaque;
+    size_t size;
+    uint64_t offset;
+    int iio_flags = 0;
+    int ret = 0;
+
+    offset = sector_num * BDRV_SECTOR_SIZE;
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+
+    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
+    /*
+     * Initialize the VXHSAIOCB.
+     * Every field must be initialized because the acb is taken from
+     * the slab without being zeroed.
+     */
+    acb->io_offset = offset;
+    acb->size = size;
+    acb->ret = 0;
+    acb->flags = 0;
+    acb->aio_done = VXHS_IO_INPROGRESS;
+    acb->segments = 0;
+    acb->buffer = 0;
+    acb->qiov = qiov;
+    acb->direction = iodir;
+
+    VXHS_SPIN_LOCK(s->vdisk_lock);
+    if (OF_VDISK_FAILED(s)) {
+        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
+        VXHS_SPIN_UNLOCK(s->vdisk_lock);
+        goto errout;
+    }
+    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
+        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
+        s->vdisk_aio_retry_qd++;
+        OF_AIOCB_FLAGS_SET_QUEUED(acb);
+        VXHS_SPIN_UNLOCK(s->vdisk_lock);
+        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
+        goto out;
+    }
+    s->vdisk_aio_count++;
+    VXHS_SPIN_UNLOCK(s->vdisk_lock);
+
+    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
+
+    switch (iodir) {
+    case VDISK_AIO_WRITE:
+            vxhs_inc_acb_segment_count(acb, 1);
+            ret = qemu_iio_writev(s->qnio_ctx,
+                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
+                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
+            break;
+    case VDISK_AIO_READ:
+            vxhs_inc_acb_segment_count(acb, 1);
+            ret = qemu_iio_readv(s->qnio_ctx,
+                    s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
+                    qiov->iov, qiov->niov, offset, (void *)acb, iio_flags);
+            break;
+    default:
+            trace_vxhs_aio_rw_invalid(iodir);
+            goto errout;
+    }
+
+    if (ret != 0) {
+        trace_vxhs_aio_rw_ioerr(
+                  s->vdisk_guid, iodir, size, offset,
+                  acb, acb->segments, ret, errno);
+        /*
+         * Don't retry I/Os against a vDisk having no
+         * redundancy or stateful storage on the compute node.
+         *
+         * TODO: Revisit this code path to see if any
+         *       particular error needs to be handled.
+         *       For the moment, fail the I/O.
+         */
+        VXHS_SPIN_LOCK(s->vdisk_lock);
+        if (s->vdisk_nhosts == 1) {
+            trace_vxhs_aio_rw_iofail(s->vdisk_guid);
+            s->vdisk_aio_count--;
+            vxhs_dec_acb_segment_count(acb, 1);
+            VXHS_SPIN_UNLOCK(s->vdisk_lock);
+            goto errout;
+        }
+        if (OF_VDISK_FAILED(s)) {
+            trace_vxhs_aio_rw_devfail(
+                      s->vdisk_guid, iodir, size, offset);
+            s->vdisk_aio_count--;
+            vxhs_dec_acb_segment_count(acb, 1);
+            VXHS_SPIN_UNLOCK(s->vdisk_lock);
+            goto errout;
+        }
+        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
+            /*
+             * Queue all incoming io requests after failover starts.
+             * Number of requests that can arrive is limited by io queue depth
+             * so an app blasting independent ios will not exhaust memory.
+             */
+            QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
+            s->vdisk_aio_retry_qd++;
+            OF_AIOCB_FLAGS_SET_QUEUED(acb);
+            s->vdisk_aio_count--;
+            vxhs_dec_acb_segment_count(acb, 1);
+            VXHS_SPIN_UNLOCK(s->vdisk_lock);
+            trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 2);
+            goto out;
+        }
+        OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s);
+        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
+        s->vdisk_aio_retry_qd++;
+        OF_AIOCB_FLAGS_SET_QUEUED(acb);
+        vxhs_dec_acb_segment_count(acb, 1);
+        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 3);
+        /*
+         * Start I/O failover if there is no active
+         * AIO within vxhs block driver.
+         */
+        if (--s->vdisk_aio_count == 0) {
+            VXHS_SPIN_UNLOCK(s->vdisk_lock);
+            /*
+             * Start IO failover
+             */
+            vxhs_failover_io(s);
+            goto out;
+        }
+        VXHS_SPIN_UNLOCK(s->vdisk_lock);
+    }
+
+out:
+    return &acb->common;
+
+errout:
+    qemu_aio_unref(acb);
+    return NULL;
+}
+
+BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs,
+                                   int64_t sector_num, QEMUIOVector *qiov,
+                                   int nb_sectors,
+                                   BlockCompletionFunc *cb, void *opaque)
+{
+    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
+                         cb, opaque, VDISK_AIO_READ);
+}
+
+BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs,
+                                    int64_t sector_num, QEMUIOVector *qiov,
+                                    int nb_sectors,
+                                    BlockCompletionFunc *cb, void *opaque)
+{
+    return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors,
+                         cb, opaque, VDISK_AIO_WRITE);
+}
+
+/*
+ * This is called by QEMU when a flush gets triggered from within
+ * a guest at the block layer, either for IDE or SCSI disks.
+ */
+int vxhs_co_flush(BlockDriverState *bs)
+{
+    BDRVVXHSState *s = bs->opaque;
+    uint64_t size = 0;
+    int ret = 0;
+
+    ret = qemu_iio_ioctl(s->qnio_ctx,
+            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
+            VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
+
+    if (ret < 0) {
+        /*
+         * A flush ioctl failure caused by a network disconnect is
+         * currently not handled. Since all writes are committed to
+         * persistent storage, the flush call is a no-op and success
+         * can safely be returned to the caller.
+         *
+         * If an inflight write AIO fails because of a network
+         * disconnect, I/O failover is triggered anyway.
+         */
+        trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
+        ret = 0;
+    }
+
+    return ret;
+}
+
+unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s)
+{
+    void *ctx = NULL;
+    int flags = 0;
+    unsigned long vdisk_size = 0;
+    int ret = 0;
+
+    ret = qemu_iio_ioctl(s->qnio_ctx,
+            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
+            VDISK_STAT, &vdisk_size, ctx, flags);
+
+    if (ret < 0) {
+        trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno);
+        return 0;
+    }
+
+    trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size);
+    return vdisk_size;
+}
+
+/*
+ * Returns the size of the vDisk in bytes. The QEMU upper block
+ * layer requires this so that the size is visible to the guest.
+ */
+int64_t vxhs_getlength(BlockDriverState *bs)
+{
+    BDRVVXHSState *s = bs->opaque;
+    unsigned long vdisk_size = 0;
+
+    if (s->vdisk_size > 0) {
+        vdisk_size = s->vdisk_size;
+    } else {
+        /*
+         * Fetch the vDisk size using stat ioctl
+         */
+        vdisk_size = vxhs_get_vdisk_stat(s);
+        if (vdisk_size > 0) {
+            s->vdisk_size = vdisk_size;
+        }
+    }
+
+    if (vdisk_size > 0) {
+        return (int64_t)vdisk_size; /* return size in bytes */
+    } else {
+        return -EIO;
+    }
+}
+
+/*
+ * Returns the blocks actually allocated for the vDisk.
+ * This is required by the qemu-img utility.
+ */
+int64_t vxhs_get_allocated_blocks(BlockDriverState *bs)
+{
+    BDRVVXHSState *s = bs->opaque;
+    unsigned long vdisk_size = 0;
+
+    if (s->vdisk_size > 0) {
+        vdisk_size = s->vdisk_size;
+    } else {
+        /*
+         * TODO:
+         * Once the HyperScale storage virtualizer reports the actual
+         * physical block allocation, fetch that information and
+         * return it to the caller. For now just return the full size.
+         */
+        vdisk_size = vxhs_get_vdisk_stat(s);
+        if (vdisk_size > 0) {
+            s->vdisk_size = vdisk_size;
+        }
+    }
+
+    if (vdisk_size > 0) {
+        return (int64_t)vdisk_size; /* return size in bytes */
+    } else {
+        return -EIO;
+    }
+}
+
+void vxhs_close(BlockDriverState *bs)
+{
+    BDRVVXHSState *s = bs->opaque;
+
+    close(s->fds[VDISK_FD_READ]);
+    close(s->fds[VDISK_FD_WRITE]);
+
+    /*
+     * Never close the channel: it is not ref counted, so closing it
+     * would close it for all vDisks.
+     */
+    if (s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd >= 0) {
+        qemu_iio_devclose(s->qnio_ctx, 0,
+            s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd);
+    }
+    if (s->vdisk_lock) {
+        VXHS_SPIN_LOCK_DESTROY(s->vdisk_lock);
+        s->vdisk_lock = NULL;
+    }
+    if (s->vdisk_acb_lock) {
+        VXHS_SPIN_LOCK_DESTROY(s->vdisk_acb_lock);
+        s->vdisk_acb_lock = NULL;
+    }
+
+    /*
+     * TODO: Verify that all the resources were relinquished.
+     */
+}
+
+/*
+ * If errors are consistent with storage agent failure:
+ *  - Try to reconnect in case error is transient or storage agent restarted.
+ *  - Currently failover is being triggered on per vDisk basis. There is
+ *    a scope of further optimization where failover can be global (per VM).
+ *  - In case of a network (storage agent) failure, I/Os for all vDisks
+ *    with no redundancy are failed without attempting I/O failover,
+ *    because of the stateless nature of such vDisks.
+ *  - If the local or source storage agent is down then send an ioctl to
+ *    the remote storage agent to check whether it is in a state to accept
+ *    application I/Os.
+ *  - Once remote storage agent is ready to accept I/O, start I/O shipping.
+ *  - If I/Os cannot be serviced then vDisk will be marked failed so that
+ *    new incoming I/Os are returned with failure immediately.
+ *  - If vDisk I/O failover is in progress then all new/inflight I/Os are
+ *    queued and are restarted or failed depending on whether the failover
+ *    operation succeeds.
+ *  - I/O failover can be started either in I/O forward or I/O backward
+ *    path.
+ *  - I/O failover is started as soon as all the pending acb(s) are
+ *    queued and the pending I/O count drops to zero.
+ *  - If I/O failover couldn't be completed within QNIO_CONNECT_TIMOUT_SECS
+ *    then vDisk will be marked failed and all I/Os will be completed with
+ *    error.
+ */
+
+int vxhs_switch_storage_agent(BDRVVXHSState *s)
+{
+    int res = 0;
+    int flags = (IIO_FLAG_ASYNC | IIO_FLAG_DONE);
+
+    trace_vxhs_switch_storage_agent(
+              s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
+              s->vdisk_guid);
+
+    res = vxhs_reopen_vdisk(s, s->vdisk_ask_failover_idx);
+    if (res == 0) {
+        res = qemu_iio_ioctl(s->qnio_ctx,
+                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].vdisk_rfd,
+                  VDISK_CHECK_IO_FAILOVER_READY, NULL, s, flags);
+    } else {
+        trace_vxhs_switch_storage_agent_failed(
+                  s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip,
+                  s->vdisk_guid, res, errno);
+        /*
+         * TODO: calling vxhs_failover_ioctl_cb from here ties up the qnio epoll
+         * loop if qemu_iio_ioctl fails synchronously (-1) for all hosts in io
+         * target list.
+         */
+
+        /* try next host */
+        vxhs_failover_ioctl_cb(res, s);
+    }
+    return res;
+}
+
+void vxhs_failover_ioctl_cb(int res, void *ctx)
+{
+    BDRVVXHSState *s = ctx;
+
+    if (res == 0) {
+        /* found failover target */
+        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
+        s->vdisk_ask_failover_idx = 0;
+        trace_vxhs_failover_ioctl_cb(
+                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
+                   s->vdisk_guid);
+        VXHS_SPIN_LOCK(s->vdisk_lock);
+        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
+        VXHS_SPIN_UNLOCK(s->vdisk_lock);
+        vxhs_handle_queued_ios(s);
+    } else {
+        /* keep looking */
+        trace_vxhs_failover_ioctl_cb_retry(s->vdisk_guid);
+        s->vdisk_ask_failover_idx++;
+        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
+            /* pause and cycle through list again */
+            sleep(QNIO_CONNECT_RETRY_SECS);
+            s->vdisk_ask_failover_idx = 0;
+        }
+        res = vxhs_switch_storage_agent(s);
+    }
+}
+
+int vxhs_failover_io(BDRVVXHSState *s)
+{
+    int res = 0;
+
+    trace_vxhs_failover_io(s->vdisk_guid);
+
+    s->vdisk_ask_failover_idx = 0;
+    res = vxhs_switch_storage_agent(s);
+
+    return res;
+}
+
+/*
+ * Try to reopen the vDisk on one of the available hosts.
+ * If the vDisk reopen succeeds on any host, check whether that node
+ * is ready to accept I/O.
+ */
+int vxhs_reopen_vdisk(BDRVVXHSState *s, int index)
+{
+    char *of_vsa_addr = NULL;
+    char *file_name = NULL;
+    int  res = 0;
+
+    /*
+     * Don't close the channel if it was previously opened
+     * successfully. The iio* API handles the case where the same
+     * open channel fd is reused.
+     *
+     * Close the stale vdisk device remote fd, since it is invalid
+     * after a channel disconnect.
+     */
+    if (s->vdisk_hostinfo[index].vdisk_rfd >= 0) {
+        qemu_iio_devclose(s->qnio_ctx, 0,
+                                 s->vdisk_hostinfo[index].vdisk_rfd);
+        s->vdisk_hostinfo[index].vdisk_rfd = -1;
+    }
+
+    /*
+     * build storage agent address and vdisk device name strings
+     */
+    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
+    of_vsa_addr = g_strdup_printf("of://%s:%d",
+             s->vdisk_hostinfo[index].hostip, s->vdisk_hostinfo[index].port);
+    /*
+     * open qnio channel to storage agent if not opened before.
+     */
+    if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
+        s->vdisk_hostinfo[index].qnio_cfd =
+                qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
+        if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
+            trace_vxhs_reopen_vdisk(s->vdisk_hostinfo[index].hostip);
+            res = ENODEV;
+            goto out;
+        }
+    }
+    /*
+     * open vdisk device
+     */
+    s->vdisk_hostinfo[index].vdisk_rfd =
+            qemu_iio_devopen(global_qnio_ctx,
+             s->vdisk_hostinfo[index].qnio_cfd, file_name, 0);
+    if (s->vdisk_hostinfo[index].vdisk_rfd < 0) {
+        trace_vxhs_reopen_vdisk_openfail(file_name);
+        res = EIO;
+        goto out;
+    }
+
+out:
+    g_free(of_vsa_addr);
+    g_free(file_name);
+    return res;
+}
+
+int vxhs_handle_queued_ios(BDRVVXHSState *s)
+{
+    VXHSAIOCB *acb = NULL;
+    int res = 0;
+
+    VXHS_SPIN_LOCK(s->vdisk_lock);
+    while ((acb = QSIMPLEQ_FIRST(&s->vdisk_aio_retryq)) != NULL) {
+        /*
+         * Before we process the acb, check whether I/O failover
+         * started again due to failback or cascading failure.
+         */
+        if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
+            VXHS_SPIN_UNLOCK(s->vdisk_lock);
+            goto out;
+        }
+        QSIMPLEQ_REMOVE_HEAD(&s->vdisk_aio_retryq, retry_entry);
+        s->vdisk_aio_retry_qd--;
+        OF_AIOCB_FLAGS_RESET_QUEUED(acb);
+        if (OF_VDISK_FAILED(s)) {
+            VXHS_SPIN_UNLOCK(s->vdisk_lock);
+            vxhs_fail_aio(acb, EIO);
+            VXHS_SPIN_LOCK(s->vdisk_lock);
+        } else {
+            VXHS_SPIN_UNLOCK(s->vdisk_lock);
+            res = vxhs_restart_aio(acb);
+            trace_vxhs_handle_queued_ios(acb, res);
+            VXHS_SPIN_LOCK(s->vdisk_lock);
+            if (res) {
+                QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq,
+                                     acb, retry_entry);
+                OF_AIOCB_FLAGS_SET_QUEUED(acb);
+                VXHS_SPIN_UNLOCK(s->vdisk_lock);
+                goto out;
+            }
+        }
+    }
+    VXHS_SPIN_UNLOCK(s->vdisk_lock);
+out:
+    return res;
+}
+
+int vxhs_restart_aio(VXHSAIOCB *acb)
+{
+    BDRVVXHSState *s = NULL;
+    int iio_flags = 0;
+    int res = 0;
+
+    s = acb->common.bs->opaque;
+
+    if (acb->direction == VDISK_AIO_WRITE) {
+        vxhs_inc_vdisk_iocount(s, 1);
+        vxhs_inc_acb_segment_count(acb, 1);
+        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
+        res = qemu_iio_writev(s->qnio_ctx,
+                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
+                acb->qiov->iov, acb->qiov->niov,
+                acb->io_offset, (void *)acb, iio_flags);
+    }
+
+    if (acb->direction == VDISK_AIO_READ) {
+        vxhs_inc_vdisk_iocount(s, 1);
+        vxhs_inc_acb_segment_count(acb, 1);
+        iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
+        res = qemu_iio_readv(s->qnio_ctx,
+                s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
+                acb->qiov->iov, acb->qiov->niov,
+                acb->io_offset, (void *)acb, iio_flags);
+    }
+
+    if (res != 0) {
+        vxhs_dec_vdisk_iocount(s, 1);
+        vxhs_dec_acb_segment_count(acb, 1);
+        trace_vxhs_restart_aio(acb->direction, res, errno);
+    }
+
+    return res;
+}
+
+void vxhs_fail_aio(VXHSAIOCB *acb, int err)
+{
+    BDRVVXHSState *s = NULL;
+    int segcount = 0;
+    int rv = 0;
+
+    s = acb->common.bs->opaque;
+
+    trace_vxhs_fail_aio(s->vdisk_guid, acb);
+    if (!acb->ret) {
+        acb->ret = err;
+    }
+    VXHS_SPIN_LOCK(s->vdisk_acb_lock);
+    segcount = acb->segments;
+    VXHS_SPIN_UNLOCK(s->vdisk_acb_lock);
+    if (segcount == 0) {
+        /*
+         * Complete the io request
+         */
+        rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb));
+        if (rv != sizeof(acb)) {
+            error_report("VXHS AIO completion failed: %s",
+                         strerror(errno));
+            abort();
+        }
+    }
+}
+
+static void vxhs_detach_aio_context(BlockDriverState *bs)
+{
+    BDRVVXHSState *s = bs->opaque;
+
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
+                       false, NULL, NULL, NULL);
+
+}
+
+static void vxhs_attach_aio_context(BlockDriverState *bs,
+                                   AioContext *new_context)
+{
+    BDRVVXHSState *s = bs->opaque;
+
+    aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ],
+                       false, vxhs_aio_event_reader, NULL, s);
+}
+
+static BlockDriver bdrv_vxhs = {
+    .format_name                  = "vxhs",
+    .protocol_name                = "vxhs",
+    .instance_size                = sizeof(BDRVVXHSState),
+    .bdrv_file_open               = vxhs_open,
+    .bdrv_parse_filename          = vxhs_parse_filename,
+    .bdrv_close                   = vxhs_close,
+    .bdrv_getlength               = vxhs_getlength,
+    .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks,
+    .bdrv_aio_readv               = vxhs_aio_readv,
+    .bdrv_aio_writev              = vxhs_aio_writev,
+    .bdrv_co_flush_to_disk        = vxhs_co_flush,
+    .bdrv_detach_aio_context      = vxhs_detach_aio_context,
+    .bdrv_attach_aio_context      = vxhs_attach_aio_context,
+};
+
+void bdrv_vxhs_init(void)
+{
+    trace_vxhs_bdrv_init('.');
+    bdrv_register(&bdrv_vxhs);
+}
+
+block_init(bdrv_vxhs_init);
diff --git a/block/vxhs.h b/block/vxhs.h
new file mode 100644
index 0000000..38e0165
--- /dev/null
+++ b/block/vxhs.h
@@ -0,0 +1,237 @@ 
+/*
+ * QEMU Block driver for Veritas HyperScale (VxHS)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef VXHSD_H
+#define VXHSD_H
+
+#include <gmodule.h>
+#include <inttypes.h>
+#include <pthread.h>
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "block/block_int.h"
+#include "qemu/uri.h"
+#include "qemu/queue.h"
+
+#define OF_GUID_STR_LEN             40
+#define OF_GUID_STR_SZ              (OF_GUID_STR_LEN + 1)
+#define QNIO_CONNECT_RETRY_SECS     5
+#define QNIO_CONNECT_TIMOUT_SECS    120
+
+/* constants from io_qnio.h */
+#define IIO_REASON_DONE     0x00000004
+#define IIO_REASON_EVENT    0x00000008
+#define IIO_REASON_HUP      0x00000010
+
+/*
+ * IO specific flags
+ */
+#define IIO_FLAG_ASYNC      0x00000001
+#define IIO_FLAG_DONE       0x00000010
+#define IIO_FLAG_SYNC       0
+
+/* constants from error.h */
+#define VXERROR_RETRY_ON_SOURCE     44
+#define VXERROR_HUP                 901
+#define VXERROR_CHANNEL_HUP         903
+
+/* constants from iomgr.h and opcode.h */
+#define IRP_READ_REQUEST                    0x1FFF
+#define IRP_WRITE_REQUEST                   0x2FFF
+#define IRP_VDISK_CHECK_IO_FAILOVER_READY   2020
+
+/* Lock specific macros */
+#define VXHS_SPIN_LOCK_ALLOC                  \
+    (qemu_ck_initialize_lock())
+#define VXHS_SPIN_LOCK(lock)                  \
+    (qemu_ck_spin_lock(lock))
+#define VXHS_SPIN_UNLOCK(lock)                \
+    (qemu_ck_spin_unlock(lock))
+#define VXHS_SPIN_LOCK_DESTROY(lock)          \
+    (qemu_ck_destroy_lock(lock))
+
+typedef enum {
+    VXHS_IO_INPROGRESS,
+    VXHS_IO_COMPLETED,
+    VXHS_IO_ERROR
+} VXHSIOState;
+
+
+typedef void (*qnio_callback_t)(ssize_t retval, void *arg);
+
+#define VDISK_FD_READ 0
+#define VDISK_FD_WRITE 1
+
+#define QNIO_VDISK_NONE        0x00
+#define QNIO_VDISK_CREATE      0x01
+
+/* max IO size supported by QEMU NIO lib */
+#define QNIO_MAX_IO_SIZE       4194304
+
+#define MAX_HOSTS               4
+
+/*
+ * Opcodes for making IOCTL on QEMU NIO library
+ */
+#define BASE_OPCODE_SHARED     1000
+#define BASE_OPCODE_DAL        2000
+#define IRP_VDISK_STAT                  (BASE_OPCODE_SHARED + 5)
+#define IRP_VDISK_GET_GEOMETRY          (BASE_OPCODE_DAL + 17)
+#define IRP_VDISK_READ_PARTITION        (BASE_OPCODE_DAL + 18)
+#define IRP_VDISK_FLUSH                 (BASE_OPCODE_DAL + 19)
+
+/*
+ * BDRVVXHSState specific flags
+ */
+#define OF_VDISK_FLAGS_STATE_ACTIVE             0x0000000000000001
+#define OF_VDISK_FLAGS_STATE_FAILED             0x0000000000000002
+#define OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS   0x0000000000000004
+
+#define OF_VDISK_ACTIVE(s)                                              \
+        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_ACTIVE)
+#define OF_VDISK_SET_ACTIVE(s)                                          \
+        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_ACTIVE)
+#define OF_VDISK_RESET_ACTIVE(s)                                        \
+        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_ACTIVE)
+
+#define OF_VDISK_FAILED(s)                                              \
+        ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_FAILED)
+#define OF_VDISK_SET_FAILED(s)                                          \
+        ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_FAILED)
+#define OF_VDISK_RESET_FAILED(s)                                        \
+        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_FAILED)
+
+#define OF_VDISK_IOFAILOVER_IN_PROGRESS(s)                              \
+        ((s)->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
+#define OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s)                          \
+        ((s)->vdisk_flags |= OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
+#define OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s)                        \
+        ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
+
+/*
+ * VXHSAIOCB specific flags
+ */
+#define OF_ACB_QUEUED       0x00000001
+
+#define OF_AIOCB_FLAGS_QUEUED(a)            \
+        ((a)->flags & OF_ACB_QUEUED)
+#define OF_AIOCB_FLAGS_SET_QUEUED(a)        \
+        ((a)->flags |= OF_ACB_QUEUED)
+#define OF_AIOCB_FLAGS_RESET_QUEUED(a)      \
+        ((a)->flags &= ~OF_ACB_QUEUED)
+
+typedef struct qemu2qnio_ctx {
+    uint32_t            qnio_flag;
+    uint64_t            qnio_size;
+    char                *qnio_channel;
+    char                *target;
+    qnio_callback_t     qnio_cb;
+} qemu2qnio_ctx_t;
+
+typedef qemu2qnio_ctx_t qnio2qemu_ctx_t;
+
+typedef struct LibQNIOSymbol {
+        const char *name;
+        gpointer *addr;
+} LibQNIOSymbol;
+
+typedef void (*iio_cb_t) (uint32_t rfd, uint32_t reason, void *ctx,
+                          void *reply);
+
+/*
+ * HyperScale AIO callbacks structure
+ */
+typedef struct VXHSAIOCB {
+    BlockAIOCB          common;
+    size_t              ret;
+    size_t              size;
+    QEMUBH              *bh;
+    int                 aio_done;
+    int                 segments;
+    int                 flags;
+    size_t              io_offset;
+    QEMUIOVector        *qiov;
+    void                *buffer;
+    int                 direction;  /* IO direction (r/w) */
+    QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry;
+} VXHSAIOCB;
+
+typedef struct VXHSvDiskHostsInfo {
+    int     qnio_cfd;        /* Channel FD */
+    int     vdisk_rfd;      /* vDisk remote FD */
+    char    *hostip;        /* Host's IP addresses */
+    int     port;           /* Host's port number */
+} VXHSvDiskHostsInfo;
+
+/*
+ * Structure per vDisk maintained for state
+ */
+typedef struct BDRVVXHSState {
+    int                     fds[2];
+    int64_t                 vdisk_size;
+    int64_t                 vdisk_blocks;
+    int64_t                 vdisk_flags;
+    int                     vdisk_aio_count;
+    int                     event_reader_pos;
+    VXHSAIOCB               *qnio_event_acb;
+    void                    *qnio_ctx;
+    void                    *vdisk_lock; /* Lock to protect BDRVVXHSState */
+    void                    *vdisk_acb_lock;  /* Protects ACB */
+    VXHSvDiskHostsInfo      vdisk_hostinfo[MAX_HOSTS]; /* Per host info */
+    int                     vdisk_nhosts;   /* Total number of hosts */
+    int                     vdisk_cur_host_idx; /* host I/Os are shipped to */
+    int                     vdisk_ask_failover_idx; /* asking permission to ship I/O */
+    QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
+    int                     vdisk_aio_retry_qd;
+    char                    *vdisk_guid;
+} BDRVVXHSState;
+
+void bdrv_vxhs_init(void);
+void *vxhs_setup_qnio(void);
+void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m);
+void vxhs_aio_event_reader(void *opaque);
+void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s);
+int vxhs_aio_flush_cb(void *opaque);
+void vxhs_finish_aiocb(ssize_t ret, void *arg);
+unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s);
+int vxhs_open(BlockDriverState *bs, QDict *options,
+              int bdrv_flags, Error **errp);
+void vxhs_close(BlockDriverState *bs);
+BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, int64_t sector_num,
+                                   QEMUIOVector *qiov, int nb_sectors,
+                                   BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, int64_t sector_num,
+                                    QEMUIOVector *qiov, int nb_sectors,
+                                    BlockCompletionFunc *cb,
+                                    void *opaque);
+int64_t vxhs_get_allocated_blocks(BlockDriverState *bs);
+BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
+                                QEMUIOVector *qiov, int nb_sectors,
+                                BlockCompletionFunc *cb,
+                                void *opaque, int write);
+int vxhs_co_flush(BlockDriverState *bs);
+int64_t vxhs_getlength(BlockDriverState *bs);
+void vxhs_inc_vdisk_iocount(void *ptr, uint32_t delta);
+void vxhs_dec_vdisk_iocount(void *ptr, uint32_t delta);
+uint32_t vxhs_get_vdisk_iocount(void *ptr);
+void vxhs_inc_acb_segment_count(void *ptr, int count);
+void vxhs_dec_acb_segment_count(void *ptr, int count);
+int vxhs_dec_and_get_acb_segment_count(void *ptr, int count);
+void vxhs_set_acb_buffer(void *ptr, void *buffer);
+int vxhs_failover_io(BDRVVXHSState *s);
+int vxhs_reopen_vdisk(BDRVVXHSState *s, int hostinfo_index);
+int vxhs_switch_storage_agent(BDRVVXHSState *s);
+int vxhs_handle_queued_ios(BDRVVXHSState *s);
+int vxhs_restart_aio(VXHSAIOCB *acb);
+void vxhs_fail_aio(VXHSAIOCB *acb, int err);
+void vxhs_failover_ioctl_cb(int res, void *ctx);
+
+
+#endif
diff --git a/configure b/configure
index 4b808f9..c71e270 100755
--- a/configure
+++ b/configure
@@ -320,6 +320,7 @@  vhdx=""
 numa=""
 tcmalloc="no"
 jemalloc="no"
+vxhs=""
 
 # parse CC options first
 for opt do
@@ -1150,6 +1151,11 @@  for opt do
   ;;
   --enable-jemalloc) jemalloc="yes"
   ;;
+  --disable-vxhs) vxhs="no"
+  ;;
+  --enable-vxhs) vxhs="yes"
+  ;;
+
   *)
       echo "ERROR: unknown option $opt"
       echo "Try '$0 --help' for more information"
@@ -1380,6 +1386,7 @@  disabled with --disable-FEATURE, default is enabled if available:
   numa            libnuma support
   tcmalloc        tcmalloc support
   jemalloc        jemalloc support
+  vxhs            Veritas HyperScale vDisk backend support
 
 NOTE: The object files are built at the place where configure is launched
 EOF
@@ -4543,6 +4550,43 @@  if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO $TMPO; then
 fi
 
 ##########################################
+# Veritas HyperScale block driver VxHS
+# Check if libqnio is installed
+if test "$vxhs" != "no" ; then
+  cat > $TMPC <<EOF
+#include <stdio.h>
+#include <qnio/qnio_api.h>
+
+void vxhs_inc_acb_segment_count(void *acb, int count);
+void vxhs_dec_acb_segment_count(void *acb, int count);
+void vxhs_set_acb_buffer(void *ptr, void *buffer);
+
+void vxhs_inc_acb_segment_count(void *ptr, int count)
+{
+}
+void vxhs_dec_acb_segment_count(void *ptr, int count)
+{
+}
+void vxhs_set_acb_buffer(void *ptr, void *buffer)
+{
+}
+int main(void) {
+    qemu_ck_initialize_lock();
+    return 0;
+}
+EOF
+  vxhs_libs="-lqnioshim -lqnio"
+  if compile_prog "" "$vxhs_libs" ; then
+    vxhs=yes
+  else
+    if test "$vxhs" = "yes" ; then
+      feature_not_found "vxhs block device" "Install libqnio. See github"
+    fi
+    vxhs=no
+  fi
+fi
+
+##########################################
 # End of CC checks
 # After here, no more $cc or $ld runs
 
@@ -5488,6 +5532,12 @@  if test "$pthread_setname_np" = "yes" ; then
   echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak
 fi
 
+if test "$vxhs" = "yes" ; then
+  echo "CONFIG_VXHS=y" >> $config_host_mak
+  echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak
+  echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak
+fi
+
 if test "$tcg_interpreter" = "yes"; then
   QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
 elif test "$ARCH" = "sparc64" ; then