Message ID | 1477432927-5451-1-git-send-email-ashish.mittal@veritas.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Oct 25, 2016 at 03:02:07PM -0700, Ashish Mittal wrote: > This patch adds support for a new block device type called "vxhs". > Source code for the library that this code loads can be downloaded from: > https://github.com/MittalAshish/libqnio.git > I grabbed the latest of libqnio, compiled it (had to disable -Werror), and tried it out. I was able to do a qemu-img info on a raw file, but it would just hang when trying a format such as qcow2. I am assuming this is a limitation of test_server, and not libqnio. This will make qemu-iotests more difficult however. I haven't looked at the latest qnio code yet (other than compiling the test-server to test), so the rest of this review is on the qemu driver. > Sample command line using JSON syntax: > ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"}]}' > > Sample command line using URI syntax: > qemu-img convert -f raw -O raw -n /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D > > Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com> > --- > block/Makefile.objs | 2 + > block/trace-events | 22 ++ > block/vxhs.c | 736 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > configure | 41 +++jj > 4 files changed, 801 insertions(+) > create mode 100644 block/vxhs.c I think this version still does not address Daniel's concerns regarding a QAPI schema for vxhs. We are also still needing qemu-iotests, and a test-server suitable to run the tests. 
> > diff --git a/block/Makefile.objs b/block/Makefile.objs > index 67a036a..58313a2 100644 > --- a/block/Makefile.objs > +++ b/block/Makefile.objs > @@ -18,6 +18,7 @@ block-obj-$(CONFIG_LIBNFS) += nfs.o > block-obj-$(CONFIG_CURL) += curl.o > block-obj-$(CONFIG_RBD) += rbd.o > block-obj-$(CONFIG_GLUSTERFS) += gluster.o > +block-obj-$(CONFIG_VXHS) += vxhs.o > block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o > block-obj-$(CONFIG_LIBSSH2) += ssh.o > block-obj-y += accounting.o dirty-bitmap.o > @@ -38,6 +39,7 @@ rbd.o-cflags := $(RBD_CFLAGS) > rbd.o-libs := $(RBD_LIBS) > gluster.o-cflags := $(GLUSTERFS_CFLAGS) > gluster.o-libs := $(GLUSTERFS_LIBS) > +vxhs.o-libs := $(VXHS_LIBS) > ssh.o-cflags := $(LIBSSH2_CFLAGS) > ssh.o-libs := $(LIBSSH2_LIBS) > archipelago.o-libs := $(ARCHIPELAGO_LIBS) > diff --git a/block/trace-events b/block/trace-events > index 05fa13c..aea97cb 100644 > --- a/block/trace-events > +++ b/block/trace-events > @@ -114,3 +114,25 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s > qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 > qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 > qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" > + > +# block/vxhs.c > +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, reason %d" > +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p" > +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d" > +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d" > +vxhs_open_fail(int ret) "Could not open the device. Error = %d" > +vxhs_open_epipe(int ret) "Could not create a pipe for device. Bailing out. 
Error=%d" > +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d" > +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %lu offset = %lu ACB = %p. Error = %d, errno = %d" > +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d" > +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %lu" > +vxhs_qnio_iio_open(const char *ip) "Failed to connect to storage agent on host-ip %s" > +vxhs_qnio_iio_devopen(const char *fname) "Failed to open vdisk device: %s" > +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld" > +vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s" > +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s" > +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = %d" > +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP %s, Port %d" > +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState" > +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s" > +vxhs_close(char *vdisk_guid) "Closing vdisk %s" > diff --git a/block/vxhs.c b/block/vxhs.c > new file mode 100644 > index 0000000..97fb804 > --- /dev/null > +++ b/block/vxhs.c > @@ -0,0 +1,736 @@ > +/* > + * QEMU Block driver for Veritas HyperScale (VxHS) > + * > + * This work is licensed under the terms of the GNU GPL, version 2 or later. > + * See the COPYING file in the top-level directory. 
> + * > + */ > + > +#include "qemu/osdep.h" > +#include "block/block_int.h" > +#include <qnio/qnio_api.h> > +#include "qapi/qmp/qerror.h" > +#include "qapi/qmp/qdict.h" > +#include "qapi/qmp/qstring.h" > +#include "trace.h" > +#include "qemu/uri.h" > +#include "qapi/error.h" > +#include "qemu/error-report.h" > + > +#define VDISK_FD_READ 0 > +#define VDISK_FD_WRITE 1 > + > +#define VXHS_OPT_FILENAME "filename" > +#define VXHS_OPT_VDISK_ID "vdisk_id" > +#define VXHS_OPT_SERVER "server." > +#define VXHS_OPT_HOST "host" > +#define VXHS_OPT_PORT "port" > + > +/* qnio client ioapi_ctx */ > +static void *global_qnio_ctx; > + Why create a qnio ctx as a static global, rather than confining it to BDRVVXHSState, so that it is unique to each disk instance? Is the following usage supported (i.e. two different vxhs remote servers): qemu-system-x86_64 -drive file=vxhs://server1:9999/test1 \ -drive file=vxhs://server2:9999/test2 Why store the qnio ctx into this global briefly, and just to move it to where it belongs in the BDRVVXHSState struct? It makes more sense to be in the state struct the whole time, unless I am missing something fundamental about the protocol. > +/* vdisk prefix to pass to qnio */ > +static const char vdisk_prefix[] = "/dev/of/vdisk"; > + > +typedef enum { > + VXHS_IO_INPROGRESS, > + VXHS_IO_COMPLETED, > + VXHS_IO_ERROR > +} VXHSIOState; > + > +typedef enum { > + VDISK_AIO_READ, > + VDISK_AIO_WRITE, > + VDISK_STAT > +} VDISKAIOCmd; > + > +/* > + * HyperScale AIO callbacks structure > + */ > +typedef struct VXHSAIOCB { > + BlockAIOCB common; > + int err; > + int state; > + int direction; /* IO direction (r/w) */ > + size_t io_offset; > + size_t size; > + QEMUBH *bh; *bh is unused. > + QEMUIOVector *qiov; > + QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry; retry_entry is unused. 
> +} VXHSAIOCB; > + > +typedef struct VXHSvDiskHostsInfo { > + int qnio_cfd; /* Channel FD */ > + int vdisk_rfd; /* vDisk remote FD */ > + char *hostip; /* Host's IP addresses */ > + int port; /* Host's port number */ > +} VXHSvDiskHostsInfo; > + > +/* > + * Structure per vDisk maintained for state > + */ > +typedef struct BDRVVXHSState { > + int fds[2]; > + int64_t vdisk_size; > + int64_t vdisk_blocks; vdisk_blocks is unused. > + int event_reader_pos; > + VXHSAIOCB *qnio_event_acb; > + void *qnio_ctx; > + VXHSvDiskHostsInfo vdisk_hostinfo; /* Per host info */ > + char *vdisk_guid; > +} BDRVVXHSState; > + > +static void vxhs_qnio_iio_close(BDRVVXHSState *s) > +{ > + /* > + * Close vDisk device > + */ > + if (s->vdisk_hostinfo.vdisk_rfd >= 0) { > + iio_devclose(s->qnio_ctx, 0, s->vdisk_hostinfo.vdisk_rfd); > + s->vdisk_hostinfo.vdisk_rfd = -1; > + } > + > + /* > + * Close QNIO channel against cached channel-fd > + */ > + if (s->vdisk_hostinfo.qnio_cfd >= 0) { > + iio_close(s->qnio_ctx, s->vdisk_hostinfo.qnio_cfd); > + s->vdisk_hostinfo.qnio_cfd = -1; > + } > +} > + > +static int vxhs_qnio_iio_open(int *cfd, const char *of_vsa_addr, > + int *rfd, const char *file_name) > +{ > + /* > + * Open qnio channel to storage agent if not opened before. 
> + */ > + if (*cfd < 0) { > + *cfd = iio_open(global_qnio_ctx, of_vsa_addr, 0); > + if (*cfd < 0) { > + trace_vxhs_qnio_iio_open(of_vsa_addr); > + return -ENODEV; > + } > + } > + > + /* > + * Open vdisk device > + */ > + *rfd = iio_devopen(global_qnio_ctx, *cfd, file_name, 0); > + if (*rfd < 0) { > + if (*cfd >= 0) { > + iio_close(global_qnio_ctx, *cfd); > + *cfd = -1; > + *rfd = -1; > + } > + > + trace_vxhs_qnio_iio_devopen(file_name); > + return -ENODEV; > + } > + > + return 0; > +} > + > +static void vxhs_iio_callback(int32_t rfd, uint32_t reason, void *ctx, > + uint32_t error, uint32_t opcode) > +{ > + VXHSAIOCB *acb = NULL; > + BDRVVXHSState *s = NULL; > + ssize_t ret; > + > + switch (opcode) { > + case IRP_READ_REQUEST: > + case IRP_WRITE_REQUEST: > + > + /* > + * ctx is VXHSAIOCB* > + * ctx is NULL if error is QNIOERROR_CHANNEL_HUP or > + * reason is IIO_REASON_HUP > + */ > + if (ctx) { > + acb = ctx; > + s = acb->common.bs->opaque; > + } else { > + trace_vxhs_iio_callback(error, reason); > + goto out; > + } > + > + if (error) { > + if (!acb->err) { > + acb->err = error; > + } > + trace_vxhs_iio_callback(error, reason); > + } > + > + ret = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb)); > + g_assert(ret == sizeof(acb)); > + break; > + > + default: > + if (error == QNIOERROR_CHANNEL_HUP) { > + /* > + * Channel failed, spontaneous notification, > + * not in response to I/O > + */ > + trace_vxhs_iio_callback_chnfail(error, errno); > + } else { > + trace_vxhs_iio_callback_unknwn(opcode, error); > + } > + break; > + } > +out: > + return; > +} > + > +static void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s) > +{ > + BlockCompletionFunc *cb = acb->common.cb; > + void *opaque = acb->common.opaque; > + int ret = 0; > + > + if (acb->err != 0) { > + trace_vxhs_complete_aio(acb, acb->err); > + /* > + * We mask all the IO errors generically as EIO for upper layers > + * Right now our IO Manager uses non standard error codes. 
Instead > + * of confusing upper layers with incorrect interpretation we are > + * doing this workaround. > + */ > + ret = (-EIO); > + } > + > + acb->state = VXHS_IO_COMPLETED; What is this state used for? I see it also set to VXHS_IO_INPROGRESS at the beginning of the request. > + qemu_aio_unref(acb); > + cb(opaque, ret); > +} > + > +/* > + * This is the HyperScale event handler registered to QEMU. > + * It is invoked when any IO gets completed and written on pipe > + * by callback called from QNIO thread context. Then it marks > + * the AIO as completed, and releases HyperScale AIO callbacks. > + */ > +static void vxhs_aio_event_reader(void *opaque) > +{ > + BDRVVXHSState *s = opaque; > + char *p; > + ssize_t ret; > + > + do { > + p = (char *)&s->qnio_event_acb; > + ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos, > + sizeof(s->qnio_event_acb) - s->event_reader_pos); > + if (ret > 0) { > + s->event_reader_pos += ret; > + if (s->event_reader_pos == sizeof(s->qnio_event_acb)) { > + s->event_reader_pos = 0; > + vxhs_complete_aio(s->qnio_event_acb, s); > + } > + } > + } while (ret < 0 && errno == EINTR); I'm bit confused on this event reader loop. If ret is > 0, but s->event_reader_pos != sizeof(s->qnio_event_acb), we'll exit the event reader leaving event_read_pos non-zero. And since we are reading in a pointer for the acb, what does that mean for the next time we come we have an aio reader event? > +} > + > +/* > + * Call QNIO operation to create channels to do IO on vDisk. 
> + */ > + > +static void *vxhs_setup_qnio(void) > +{ > + void *qnio_ctx = NULL; > + > + qnio_ctx = iio_init(vxhs_iio_callback); > + if (qnio_ctx != NULL) { > + trace_vxhs_setup_qnio(qnio_ctx); > + } > + return qnio_ctx; > +} > + > +static QemuOptsList runtime_opts = { > + .name = "vxhs", > + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), > + .desc = { > + { > + .name = VXHS_OPT_FILENAME, > + .type = QEMU_OPT_STRING, > + .help = "URI to the Veritas HyperScale image", > + }, > + { > + .name = VXHS_OPT_VDISK_ID, > + .type = QEMU_OPT_STRING, > + .help = "UUID of the VxHS vdisk", > + }, > + { /* end of list */ } > + }, > +}; > + > +static QemuOptsList runtime_tcp_opts = { > + .name = "vxhs_tcp", > + .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head), > + .desc = { > + { > + .name = VXHS_OPT_HOST, > + .type = QEMU_OPT_STRING, > + .help = "host address (ipv4 addresses)", > + }, > + { > + .name = VXHS_OPT_PORT, > + .type = QEMU_OPT_NUMBER, > + .help = "port number on which VxHSD is listening (default 9999)", > + .def_value_str = "9999" > + }, > + { /* end of list */ } > + }, > +}; > + > +/* > + * Parse the incoming URI and populate *options with the host information. > + * URI syntax has the limitation of supporting only one host info. > + * To pass multiple host information, use the JSON syntax. 
> + */ > +static int vxhs_parse_uri(const char *filename, QDict *options) > +{ > + URI *uri = NULL; > + char *hoststr, *portstr; > + char *port; > + int ret = 0; > + > + trace_vxhs_parse_uri_filename(filename); > + uri = uri_parse(filename); > + if (!uri || !uri->server || !uri->path) { > + uri_free(uri); > + return -EINVAL; > + } > + > + hoststr = g_strdup(VXHS_OPT_SERVER"0.host"); > + qdict_put(options, hoststr, qstring_from_str(uri->server)); > + g_free(hoststr); > + > + portstr = g_strdup(VXHS_OPT_SERVER"0.port"); > + if (uri->port) { > + port = g_strdup_printf("%d", uri->port); > + qdict_put(options, portstr, qstring_from_str(port)); > + g_free(port); > + } > + g_free(portstr); > + > + if (strstr(uri->path, "vxhs") == NULL) { > + qdict_put(options, "vdisk_id", qstring_from_str(uri->path)); > + } > + > + trace_vxhs_parse_uri_hostinfo(1, uri->server, uri->port); > + uri_free(uri); > + > + return ret; > +} > + > +static void vxhs_parse_filename(const char *filename, QDict *options, > + Error **errp) > +{ > + if (qdict_haskey(options, "vdisk_id") || qdict_haskey(options, "server")) { > + error_setg(errp, "vdisk_id/server and a file name may not be specified " > + "at the same time"); > + return; > + } > + > + if (strstr(filename, "://")) { > + int ret = vxhs_parse_uri(filename, options); > + if (ret < 0) { > + error_setg(errp, "Invalid URI. 
URI should be of the form " > + " vxhs://<host_ip>:<port>/{<vdisk_id>}"); > + } > + } > +} > + > +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s, > + int *cfd, int *rfd, Error **errp) > +{ > + QDict *backing_options = NULL; > + QemuOpts *opts, *tcp_opts; > + const char *vxhs_filename; > + char *of_vsa_addr = NULL; > + Error *local_err = NULL; > + const char *vdisk_id_opt; > + char *file_name = NULL; > + size_t num_servers = 0; > + char *str = NULL; > + int ret = 0; > + > + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); > + qemu_opts_absorb_qdict(opts, options, &local_err); > + if (local_err) { > + error_propagate(errp, local_err); You are already propagating this error in 'out'. > + ret = -EINVAL; > + goto out; > + } > + > + vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME); > + if (vxhs_filename) { > + trace_vxhs_qemu_init_filename(vxhs_filename); > + } > + > + vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID); > + if (!vdisk_id_opt) { > + error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID); > + ret = -EINVAL; > + goto out; > + } > + s->vdisk_guid = g_strdup(vdisk_id_opt); > + trace_vxhs_qemu_init_vdisk(vdisk_id_opt); > + > + num_servers = qdict_array_entries(options, VXHS_OPT_SERVER); > + if (num_servers < 1) { > + error_setg(&local_err, QERR_MISSING_PARAMETER, "server"); > + ret = -EINVAL; > + goto out; > + } else if (num_servers > 1) { > + error_setg(&local_err, QERR_INVALID_PARAMETER, "server"); > + error_append_hint(errp, "Only one server allowed.\n"); s/errp/&local_err > + ret = -EINVAL; > + goto out; > + } > + trace_vxhs_qemu_init_numservers(num_servers); > + > + str = g_strdup_printf(VXHS_OPT_SERVER"0."); > + qdict_extract_subqdict(options, &backing_options, str); > + > + /* Create opts info from runtime_tcp_opts list */ > + tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort); > + qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err); > + if (local_err) { > + 
qdict_del(backing_options, str); > + qemu_opts_del(tcp_opts); > + g_free(str); Rather than free str here, you can just do it in the 'out'. (g_free is NULL-safe) > + ret = -EINVAL; > + goto out; > + } > + > + s->vdisk_hostinfo.hostip = g_strdup(qemu_opt_get(tcp_opts, > + VXHS_OPT_HOST)); > + s->vdisk_hostinfo.port = g_ascii_strtoll(qemu_opt_get(tcp_opts, > + VXHS_OPT_PORT), > + NULL, 0); > + > + s->vdisk_hostinfo.qnio_cfd = -1; > + s->vdisk_hostinfo.vdisk_rfd = -1; > + trace_vxhs_qemu_init(s->vdisk_hostinfo.hostip, > + s->vdisk_hostinfo.port); > + > + qdict_del(backing_options, str); > + qemu_opts_del(tcp_opts); > + g_free(str); Same here. > + > + file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid); > + of_vsa_addr = g_strdup_printf("of://%s:%d", > + s->vdisk_hostinfo.hostip, > + s->vdisk_hostinfo.port); > + > + /* > + * .bdrv_open() and .bdrv_create() run under the QEMU global mutex. > + */ > + if (global_qnio_ctx == NULL) { > + global_qnio_ctx = vxhs_setup_qnio(); I assume this would prevent the scenario I described in the beginning (two different drives using two different vxhs servers) from working properly? Is that by design? If so, it needs to fail gracefully; I don't think the current implementation will. But it seems like you should just be able to do s->qnio_ctx = vxhs_setup_qnio() here, right? > + if (global_qnio_ctx == NULL) { > + error_setg(&local_err, "Failed vxhs_setup_qnio"); > + ret = -EINVAL; > + goto out; > + } > + } > + > + ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name); This overwrites the host / port, and I presume other info, of other drives in the case of another vxhs drive being opened. This seems wrong. 
> + if (ret) { > + error_setg(&local_err, "Failed qnio_iio_open"); > + ret = -EIO; > + } > + > +out: > + g_free(file_name); > + g_free(of_vsa_addr); > + qemu_opts_del(opts); > + > + if (ret < 0) { > + g_free(s->vdisk_hostinfo.hostip); > + g_free(s->vdisk_guid); > + s->vdisk_guid = NULL; > + errno = -ret; > + } > + error_propagate(errp, local_err); This can be moved up into the 'if' block above it. > + > + return ret; > +} > + > +static int vxhs_open(BlockDriverState *bs, QDict *options, > + int bdrv_flags, Error **errp) > +{ > + BDRVVXHSState *s = bs->opaque; > + AioContext *aio_context; > + int qemu_qnio_cfd = -1; > + int device_opened = 0; A bool makes more sense here. > + int qemu_rfd = -1; > + int ret = 0; > + > + ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp); > + if (ret < 0) { > + trace_vxhs_open_fail(ret); > + return ret; > + } > + > + device_opened = 1; > + s->qnio_ctx = global_qnio_ctx; > + s->vdisk_hostinfo.qnio_cfd = qemu_qnio_cfd; > + s->vdisk_hostinfo.vdisk_rfd = qemu_rfd; > + s->vdisk_size = 0; > + > + /* > + * Create a pipe for communicating between two threads in different > + * context. Set handler for read event, which gets triggered when > + * IO completion is done by non-QEMU context. > + */ > + ret = qemu_pipe(s->fds); > + if (ret < 0) { > + trace_vxhs_open_epipe(ret); > + ret = -errno; > + goto errout; > + } > + fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK); > + > + aio_context = bdrv_get_aio_context(bs); > + aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ], > + false, vxhs_aio_event_reader, NULL, s); > + return 0; > + > +errout: > + /* > + * Close remote vDisk device if it was opened earlier > + */ > + if (device_opened) { > + vxhs_qnio_iio_close(s); > + } > + trace_vxhs_open_fail(ret); > + return ret; > +} > + > +static const AIOCBInfo vxhs_aiocb_info = { > + .aiocb_size = sizeof(VXHSAIOCB) > +}; > + > +/* > + * This allocates QEMU-VXHS callback for each IO > + * and is passed to QNIO. 
When QNIO completes the work, > + * it will be passed back through the callback. > + */ > +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num, > + QEMUIOVector *qiov, int nb_sectors, > + BlockCompletionFunc *cb, void *opaque, int iodir) > +{ > + VXHSAIOCB *acb = NULL; > + BDRVVXHSState *s = bs->opaque; > + size_t size; > + uint64_t offset; > + int iio_flags = 0; > + int ret = 0; > + void *qnio_ctx = s->qnio_ctx; > + uint32_t rfd = s->vdisk_hostinfo.vdisk_rfd; > + > + offset = sector_num * BDRV_SECTOR_SIZE; > + size = nb_sectors * BDRV_SECTOR_SIZE; > + acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque); > + /* > + * Setup or initialize VXHSAIOCB. > + * Every single field should be initialized since > + * acb will be picked up from the slab without > + * initializing with zero. > + */ > + acb->io_offset = offset; > + acb->size = size; > + acb->err = 0; > + acb->state = VXHS_IO_INPROGRESS; > + acb->qiov = qiov; > + acb->direction = iodir; > + > + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); > + > + switch (iodir) { > + case VDISK_AIO_WRITE: > + ret = iio_writev(qnio_ctx, rfd, qiov->iov, qiov->niov, > + offset, (uint64_t)size, (void *)acb, iio_flags); > + break; > + case VDISK_AIO_READ: > + ret = iio_readv(qnio_ctx, rfd, qiov->iov, qiov->niov, > + offset, (uint64_t)size, (void *)acb, iio_flags); > + break; > + default: > + trace_vxhs_aio_rw_invalid(iodir); > + goto errout; > + } > + > + if (ret != 0) { > + trace_vxhs_aio_rw_ioerr(s->vdisk_guid, iodir, size, offset, > + acb, ret, errno); > + goto errout; > + } > + return &acb->common; > + > +errout: > + qemu_aio_unref(acb); > + return NULL; > +} > + > +static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, > + int nb_sectors, > + BlockCompletionFunc *cb, void *opaque) > +{ > + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb, > + opaque, VDISK_AIO_READ); > +} > + > +static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, > + int64_t sector_num, 
QEMUIOVector *qiov, > + int nb_sectors, > + BlockCompletionFunc *cb, void *opaque) > +{ > + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, > + cb, opaque, VDISK_AIO_WRITE); > +} > + > +static void vxhs_close(BlockDriverState *bs) > +{ > + BDRVVXHSState *s = bs->opaque; > + > + trace_vxhs_close(s->vdisk_guid); > + close(s->fds[VDISK_FD_READ]); > + close(s->fds[VDISK_FD_WRITE]); > + > + /* > + * Clearing all the event handlers for oflame registered to QEMU > + */ > + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], > + false, NULL, NULL, NULL); > + g_free(s->vdisk_guid); > + s->vdisk_guid = NULL; > + vxhs_qnio_iio_close(s); > + > + /* > + * Free the dynamically allocated hostip string > + */ > + g_free(s->vdisk_hostinfo.hostip); > + s->vdisk_hostinfo.hostip = NULL; > + s->vdisk_hostinfo.port = 0; > +} > + > +static unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s) The functions that call this use its value as an int64_t, although this is returning an unsigned long. It should return int64_t. > +{ > + int64_t vdisk_size = 0; > + int ret = 0; > + uint32_t rfd = s->vdisk_hostinfo.vdisk_rfd; > + > + ret = iio_ioctl(s->qnio_ctx, rfd, IOR_VDISK_STAT, &vdisk_size, NULL, 0); > + if (ret < 0) { > + trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno); > + return 0; Repeat comment from RFC v7: Why is is this returning 0 here, rather then 'ret'? Does IOR_VDIST_STAT return sane error codes? If not, instead of returning zero, map it to -EIO here. > + } > + > + trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size); > + return vdisk_size; > +} > + > +/* > + * Returns the size of vDisk in bytes. This is required > + * by QEMU block upper block layer so that it is visible > + * to guest. 
> + */ > +static int64_t vxhs_getlength(BlockDriverState *bs) > +{ > + BDRVVXHSState *s = bs->opaque; > + int64_t vdisk_size = 0; > + > + if (s->vdisk_size > 0) { > + vdisk_size = s->vdisk_size; > + } else { > + /* > + * Fetch the vDisk size using stat ioctl > + */ > + vdisk_size = vxhs_get_vdisk_stat(s); > + if (vdisk_size > 0) { > + s->vdisk_size = vdisk_size; > + } > + } > + > + if (vdisk_size > 0) { > + return vdisk_size; /* return size in bytes */ > + } > + Repeat comment from RFC v7: If vxhs_get_vdisk_stat() returned error rather than 0, then this check is unnecessary. You can just return vdisk_size. (Also: I don't think you need to return -EIO for a zero-sized image, do you?) > + return -EIO; > +} > + > +/* > + * Returns actual blocks allocated for the vDisk. s/blocks/bytes > + * This is required by qemu-img utility. > + */ > +static int64_t vxhs_get_allocated_blocks(BlockDriverState *bs) s/blocks/bytes > +{ > + BDRVVXHSState *s = bs->opaque; > + int64_t vdisk_size = 0; > + > + if (s->vdisk_size > 0) { > + vdisk_size = s->vdisk_size; > + } else { > + /* > + * TODO: > + * Once HyperScale storage-virtualizer provides > + * actual physical allocation of blocks then > + * fetch that information and return back to the > + * caller but for now just get the full size. > + */ > + vdisk_size = vxhs_get_vdisk_stat(s); > + if (vdisk_size > 0) { > + s->vdisk_size = vdisk_size; > + } > + } > + > + if (vdisk_size > 0) { > + return vdisk_size; /* return size in bytes */ > + } > + > + return -EIO; Repeat comment from RFC v7: Since this is just identical to vxhs_getlength(), why not just call it here, and reduce code duplication? Or even better: do not implement this function. It isn't required. If it is not implemented, the block layer above this will just return -ENOTSUP, which in qapi will be interpreted as "unavailable". This function should be dropped, until it is something supported by VXHS. 
> +} > + > +static void vxhs_detach_aio_context(BlockDriverState *bs) > +{ > + BDRVVXHSState *s = bs->opaque; > + > + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], > + false, NULL, NULL, NULL); > + > +} > + > +static void vxhs_attach_aio_context(BlockDriverState *bs, > + AioContext *new_context) > +{ > + BDRVVXHSState *s = bs->opaque; > + > + aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ], > + false, vxhs_aio_event_reader, NULL, s); > +} > + > +static BlockDriver bdrv_vxhs = { > + .format_name = "vxhs", > + .protocol_name = "vxhs", > + .instance_size = sizeof(BDRVVXHSState), > + .bdrv_file_open = vxhs_open, > + .bdrv_parse_filename = vxhs_parse_filename, > + .bdrv_close = vxhs_close, > + .bdrv_getlength = vxhs_getlength, > + .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks, > + .bdrv_aio_readv = vxhs_aio_readv, > + .bdrv_aio_writev = vxhs_aio_writev, > + .bdrv_detach_aio_context = vxhs_detach_aio_context, > + .bdrv_attach_aio_context = vxhs_attach_aio_context, > +}; > + > +static void bdrv_vxhs_init(void) > +{ > + bdrv_register(&bdrv_vxhs); > +} > + > +block_init(bdrv_vxhs_init); > diff --git a/configure b/configure > index d3dafcb..4413e88 100755 > --- a/configure > +++ b/configure > @@ -321,6 +321,7 @@ numa="" > tcmalloc="no" > jemalloc="no" > replication="yes" > +vxhs="" > > # parse CC options first > for opt do > @@ -1162,6 +1163,11 @@ for opt do > ;; > --enable-replication) replication="yes" > ;; > + --disable-vxhs) vxhs="no" > + ;; > + --enable-vxhs) vxhs="yes" > + ;; > + > *) > echo "ERROR: unknown option $opt" > echo "Try '$0 --help' for more information" > @@ -1391,6 +1397,7 @@ disabled with --disable-FEATURE, default is enabled if available: > tcmalloc tcmalloc support > jemalloc jemalloc support > replication replication support > + vxhs Veritas HyperScale vDisk backend support > > NOTE: The object files are built at the place where configure is launched > EOF > @@ -4625,6 +4632,33 @@ if do_cc -nostdlib -Wl,-r 
-Wl,--no-relax -o $TMPMO $TMPO; then > fi > > ########################################## > +# Veritas HyperScale block driver VxHS > +# Check if libqnio is installed > + > +if test "$vxhs" != "no" ; then > + cat > $TMPC <<EOF > +#include <stdint.h> > +#include <qnio/qnio_api.h> > + > +void *vxhs_callback; > + > +int main(void) { > + iio_init(vxhs_callback); > + return 0; > +} > +EOF > + vxhs_libs="-lqnio" > + if compile_prog "" "$vxhs_libs" ; then > + vxhs=yes > + else > + if test "$vxhs" = "yes" ; then > + feature_not_found "vxhs block device" "Install libqnio. See github" > + fi > + vxhs=no > + fi > +fi How is libqnio going to be distributed normally? Should we be checking for a minimum version here to compile against? > + > +########################################## > # End of CC checks > # After here, no more $cc or $ld runs > > @@ -4990,6 +5024,7 @@ echo "tcmalloc support $tcmalloc" > echo "jemalloc support $jemalloc" > echo "avx2 optimization $avx2_opt" > echo "replication support $replication" > +echo "VxHS block device $vxhs" > > if test "$sdl_too_old" = "yes"; then > echo "-> Your SDL version is too old - please upgrade to have SDL support" > @@ -5590,6 +5625,12 @@ if test "$pthread_setname_np" = "yes" ; then > echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak > fi > > +if test "$vxhs" = "yes" ; then > + echo "CONFIG_VXHS=y" >> $config_host_mak > + echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak > + echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak > +fi > + > if test "$tcg_interpreter" = "yes"; then > QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES" > elif test "$ARCH" = "sparc64" ; then > -- > 2.5.5 >
On Wed, Oct 26, 2016 at 12:41:28AM -0400, Jeff Cody wrote: > On Tue, Oct 25, 2016 at 03:02:07PM -0700, Ashish Mittal wrote: > > This patch adds support for a new block device type called "vxhs". > > Source code for the library that this code loads can be downloaded from: > > https://github.com/MittalAshish/libqnio.git > > > > I grabbed the latest of libqnio, compiled it (had to disable -Werror), and > tried it out. I was able to do a qemu-img info on a raw file, but it would > just hang when trying a format such as qcow2. I am assuming > this is a limitation of test_server, and not libqnio. > > This will make qemu-iotests more difficult however. > > I haven't looked at the latest qnio code yet (other than compiling the > test-server to test), so the rest of this review is on the qemu driver. > > > Sample command line using JSON syntax: > > ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"}]}' > > > > Sample command line using URI syntax: > > qemu-img convert -f raw -O raw -n /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D > > > > Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com> > > --- > > block/Makefile.objs | 2 + > > block/trace-events | 22 ++ > > block/vxhs.c | 736 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > > configure | 41 +++jj > > 4 files changed, 801 insertions(+) > > create mode 100644 block/vxhs.c > > I think this version still does not address Daniel's concerns regarding a > QAPI schema for vxhs. Yes, indeed, which makes me question what level of testing this patch has had - hotplugging for devices is not testable without having the QAPI schema there. Regards, Daniel
On 10/25/16, 9:41 PM, "Jeff Cody" <jcody@redhat.com> wrote: >On Tue, Oct 25, 2016 at 03:02:07PM -0700, Ashish Mittal wrote: >> This patch adds support for a new block device type called "vxhs". >> Source code for the library that this code loads can be downloaded from: >> https://github.com/MittalAshish/libqnio.git >> > >I grabbed the latest of libqnio, compiled it (had to disable -Werror), and >tried it out. I was able to do a qemu-img info on a raw file, but it >would >just hang when trying a format such as qcow2. I am assuming >this is a limitation of test_server, and not libqnio. On my build I did not get any build errors. The qcow2 issue is to do with the limitation in the test server, which we will fix in a seperate patch. > >This will make qemu-iotests more difficult however. > >I haven't looked at the latest qnio code yet (other than compiling the >test-server to test), so the rest of this review is on the qemu driver. > >> Sample command line using JSON syntax: >> ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us >>-vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 >>-msg timestamp=on >>'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410} >>","server":[{"host":"172.172.17.4","port":"9999"}]}' >> >> Sample command line using URI syntax: >> qemu-img convert -f raw -O raw -n >>/var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad >>vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D >> >> Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com> >> --- >> block/Makefile.objs | 2 + >> block/trace-events | 22 ++ >> block/vxhs.c | 736 >>++++++++++++++++++++++++++++++++++++++++++++++++++++ >> configure | 41 +++jj >> 4 files changed, 801 insertions(+) >> create mode 100644 block/vxhs.c > >I think this version still does not address Daniel's concerns regarding a >QAPI schema for vxhs. We are working on QAPI schema changes and will submit them in separate patch. 
> >We are also still needing qemu-iotests, and a test-server suitable to run >the tests. > >> >> diff --git a/block/Makefile.objs b/block/Makefile.objs >> index 67a036a..58313a2 100644 >> --- a/block/Makefile.objs >> +++ b/block/Makefile.objs >> @@ -18,6 +18,7 @@ block-obj-$(CONFIG_LIBNFS) += nfs.o >> block-obj-$(CONFIG_CURL) += curl.o >> block-obj-$(CONFIG_RBD) += rbd.o >> block-obj-$(CONFIG_GLUSTERFS) += gluster.o >> +block-obj-$(CONFIG_VXHS) += vxhs.o >> block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o >> block-obj-$(CONFIG_LIBSSH2) += ssh.o >> block-obj-y += accounting.o dirty-bitmap.o >> @@ -38,6 +39,7 @@ rbd.o-cflags := $(RBD_CFLAGS) >> rbd.o-libs := $(RBD_LIBS) >> gluster.o-cflags := $(GLUSTERFS_CFLAGS) >> gluster.o-libs := $(GLUSTERFS_LIBS) >> +vxhs.o-libs := $(VXHS_LIBS) >> ssh.o-cflags := $(LIBSSH2_CFLAGS) >> ssh.o-libs := $(LIBSSH2_LIBS) >> archipelago.o-libs := $(ARCHIPELAGO_LIBS) >> diff --git a/block/trace-events b/block/trace-events >> index 05fa13c..aea97cb 100644 >> --- a/block/trace-events >> +++ b/block/trace-events >> @@ -114,3 +114,25 @@ qed_aio_write_data(void *s, void *acb, int ret, >>uint64_t offset, size_t len) "s >> qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, >>uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 >> qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, >>uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 >> qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, >>size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" >> + >> +# block/vxhs.c >> +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, >>reason %d" >> +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p" >> +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no >>i/o %d, %d" >> +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, >>errno %d" >> +vxhs_open_fail(int ret) "Could not open the device. 
Error = %d" >> +vxhs_open_epipe(int ret) "Could not create a pipe for device. Bailing >>out. Error=%d" >> +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d" >> +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, >>void *acb, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d >>size = %lu offset = %lu ACB = %p. Error = %d, errno = %d" >> +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat >>ioctl failed, ret = %d, errno = %d" >> +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s >>stat ioctl returned size %lu" >> +vxhs_qnio_iio_open(const char *ip) "Failed to connect to storage agent >>on host-ip %s" >> +vxhs_qnio_iio_devopen(const char *fname) "Failed to open vdisk device: >>%s" >> +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld" >> +vxhs_parse_uri_filename(const char *filename) "URI passed via >>bdrv_parse_filename %s" >> +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s" >> +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = >>%d" >> +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP >>%s, Port %d" >> +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to >>BDRVVXHSState" >> +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s" >> +vxhs_close(char *vdisk_guid) "Closing vdisk %s" >> diff --git a/block/vxhs.c b/block/vxhs.c >> new file mode 100644 >> index 0000000..97fb804 >> --- /dev/null >> +++ b/block/vxhs.c >> @@ -0,0 +1,736 @@ >> +/* >> + * QEMU Block driver for Veritas HyperScale (VxHS) >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2 or >>later. >> + * See the COPYING file in the top-level directory. 
>> + * >> + */ >> + >> +#include "qemu/osdep.h" >> +#include "block/block_int.h" >> +#include <qnio/qnio_api.h> >> +#include "qapi/qmp/qerror.h" >> +#include "qapi/qmp/qdict.h" >> +#include "qapi/qmp/qstring.h" >> +#include "trace.h" >> +#include "qemu/uri.h" >> +#include "qapi/error.h" >> +#include "qemu/error-report.h" >> + >> +#define VDISK_FD_READ 0 >> +#define VDISK_FD_WRITE 1 >> + >> +#define VXHS_OPT_FILENAME "filename" >> +#define VXHS_OPT_VDISK_ID "vdisk_id" >> +#define VXHS_OPT_SERVER "server." >> +#define VXHS_OPT_HOST "host" >> +#define VXHS_OPT_PORT "port" >> + >> +/* qnio client ioapi_ctx */ >> +static void *global_qnio_ctx; >> + > > >Why create a qnio ctx as a static global, rather than confining it to >BDRVVXHSState, so that it is unique to each disk instance? > >Is the following usage supported (i.e. two different vxhs remote servers): > >qemu-system-x86_64 -drive file=vxhs://server1:9999/test1 \ > -drive file=vxhs://server2:9999/test2 "global_qnio_ctx" is a global context, not a per-disk context. Removed the qnio_ctx field from the "BDRVVXHSState" structure. The above command should work. > > >Why store the qnio ctx into this global briefly, and just to move it to >where it belongs in the BDRVVXHSState struct? It makes more sense to be >in >the state struct the whole time, unless I am missing something fundamental >about the protocol. > > >> +/* vdisk prefix to pass to qnio */ >> +static const char vdisk_prefix[] = "/dev/of/vdisk"; >> + >> +typedef enum { >> + VXHS_IO_INPROGRESS, >> + VXHS_IO_COMPLETED, >> + VXHS_IO_ERROR >> +} VXHSIOState; >> + >> +typedef enum { >> + VDISK_AIO_READ, >> + VDISK_AIO_WRITE, >> + VDISK_STAT >> +} VDISKAIOCmd; >> + >> +/* >> + * HyperScale AIO callbacks structure >> + */ >> +typedef struct VXHSAIOCB { >> + BlockAIOCB common; >> + int err; >> + int state; >> + int direction; /* IO direction (r/w) */ >> + size_t io_offset; >> + size_t size; >> + QEMUBH *bh; > >*bh is unused. Removed the unused variable. 
> >> + QEMUIOVector *qiov; >> + QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry; > >retry_entry is unused. Removed the unused variable. > >> +} VXHSAIOCB; >> + >> +typedef struct VXHSvDiskHostsInfo { >> + int qnio_cfd; /* Channel FD */ >> + int vdisk_rfd; /* vDisk remote FD */ >> + char *hostip; /* Host's IP addresses */ >> + int port; /* Host's port number */ >> +} VXHSvDiskHostsInfo; >> + >> +/* >> + * Structure per vDisk maintained for state >> + */ >> +typedef struct BDRVVXHSState { >> + int fds[2]; >> + int64_t vdisk_size; >> + int64_t vdisk_blocks; > >vdisk_blocks is unused. Removed the unused variable. > >> + int event_reader_pos; >> + VXHSAIOCB *qnio_event_acb; >> + void *qnio_ctx; >> + VXHSvDiskHostsInfo vdisk_hostinfo; /* Per host info */ >> + char *vdisk_guid; >> +} BDRVVXHSState; >> + >> +static void vxhs_qnio_iio_close(BDRVVXHSState *s) >> +{ >> + /* >> + * Close vDisk device >> + */ >> + if (s->vdisk_hostinfo.vdisk_rfd >= 0) { >> + iio_devclose(s->qnio_ctx, 0, s->vdisk_hostinfo.vdisk_rfd); >> + s->vdisk_hostinfo.vdisk_rfd = -1; >> + } >> + >> + /* >> + * Close QNIO channel against cached channel-fd >> + */ >> + if (s->vdisk_hostinfo.qnio_cfd >= 0) { >> + iio_close(s->qnio_ctx, s->vdisk_hostinfo.qnio_cfd); >> + s->vdisk_hostinfo.qnio_cfd = -1; >> + } >> +} >> + >> +static int vxhs_qnio_iio_open(int *cfd, const char *of_vsa_addr, >> + int *rfd, const char *file_name) >> +{ >> + /* >> + * Open qnio channel to storage agent if not opened before. 
>> + */ >> + if (*cfd < 0) { >> + *cfd = iio_open(global_qnio_ctx, of_vsa_addr, 0); >> + if (*cfd < 0) { >> + trace_vxhs_qnio_iio_open(of_vsa_addr); >> + return -ENODEV; >> + } >> + } >> + >> + /* >> + * Open vdisk device >> + */ >> + *rfd = iio_devopen(global_qnio_ctx, *cfd, file_name, 0); >> + if (*rfd < 0) { >> + if (*cfd >= 0) { >> + iio_close(global_qnio_ctx, *cfd); >> + *cfd = -1; >> + *rfd = -1; >> + } >> + >> + trace_vxhs_qnio_iio_devopen(file_name); >> + return -ENODEV; >> + } >> + >> + return 0; >> +} >> + >> +static void vxhs_iio_callback(int32_t rfd, uint32_t reason, void *ctx, >> + uint32_t error, uint32_t opcode) >> +{ >> + VXHSAIOCB *acb = NULL; >> + BDRVVXHSState *s = NULL; >> + ssize_t ret; >> + >> + switch (opcode) { >> + case IRP_READ_REQUEST: >> + case IRP_WRITE_REQUEST: >> + >> + /* >> + * ctx is VXHSAIOCB* >> + * ctx is NULL if error is QNIOERROR_CHANNEL_HUP or >> + * reason is IIO_REASON_HUP >> + */ >> + if (ctx) { >> + acb = ctx; >> + s = acb->common.bs->opaque; >> + } else { >> + trace_vxhs_iio_callback(error, reason); >> + goto out; >> + } >> + >> + if (error) { >> + if (!acb->err) { >> + acb->err = error; >> + } >> + trace_vxhs_iio_callback(error, reason); >> + } >> + >> + ret = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, >>sizeof(acb)); >> + g_assert(ret == sizeof(acb)); >> + break; >> + >> + default: >> + if (error == QNIOERROR_CHANNEL_HUP) { >> + /* >> + * Channel failed, spontaneous notification, >> + * not in response to I/O >> + */ >> + trace_vxhs_iio_callback_chnfail(error, errno); >> + } else { >> + trace_vxhs_iio_callback_unknwn(opcode, error); >> + } >> + break; >> + } >> +out: >> + return; >> +} >> + >> +static void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s) >> +{ >> + BlockCompletionFunc *cb = acb->common.cb; >> + void *opaque = acb->common.opaque; >> + int ret = 0; >> + >> + if (acb->err != 0) { >> + trace_vxhs_complete_aio(acb, acb->err); >> + /* >> + * We mask all the IO errors generically as EIO for upper 
>>layers >> + * Right now our IO Manager uses non standard error codes. >>Instead >> + * of confusing upper layers with incorrect interpretation we >>are >> + * doing this workaround. >> + */ >> + ret = (-EIO); >> + } >> + >> + acb->state = VXHS_IO_COMPLETED; > >What is this state used for? I see it also set to VXHS_IO_INPROGRESS at >the >beginning of the request. This was useful when failover code was implemented. It is no longer used. So removed the ³state² filed. > >> + qemu_aio_unref(acb); >> + cb(opaque, ret); >> +} >> + >> +/* >> + * This is the HyperScale event handler registered to QEMU. >> + * It is invoked when any IO gets completed and written on pipe >> + * by callback called from QNIO thread context. Then it marks >> + * the AIO as completed, and releases HyperScale AIO callbacks. >> + */ >> +static void vxhs_aio_event_reader(void *opaque) >> +{ >> + BDRVVXHSState *s = opaque; >> + char *p; >> + ssize_t ret; >> + >> + do { >> + p = (char *)&s->qnio_event_acb; >> + ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos, >> + sizeof(s->qnio_event_acb) - s->event_reader_pos); >> + if (ret > 0) { >> + s->event_reader_pos += ret; >> + if (s->event_reader_pos == sizeof(s->qnio_event_acb)) { >> + s->event_reader_pos = 0; >> + vxhs_complete_aio(s->qnio_event_acb, s); >> + } >> + } >> + } while (ret < 0 && errno == EINTR); > >I'm bit confused on this event reader loop. If ret is > 0, but >s->event_reader_pos != sizeof(s->qnio_event_acb), we'll exit the event >reader leaving event_read_pos non-zero. And since we are reading in a >pointer for the acb, what does that mean for the next time we come we have >an aio reader event? On the next reader event, it will read the rest of ³acb² structure(from s->event_reader_pos), and calls vxhs_complete_aio(). > >> +} >> + >> +/* >> + * Call QNIO operation to create channels to do IO on vDisk. 
>> + */ >> + >> +static void *vxhs_setup_qnio(void) >> +{ >> + void *qnio_ctx = NULL; >> + >> + qnio_ctx = iio_init(vxhs_iio_callback); >> + if (qnio_ctx != NULL) { >> + trace_vxhs_setup_qnio(qnio_ctx); >> + } >> + return qnio_ctx; >> +} >> + >> +static QemuOptsList runtime_opts = { >> + .name = "vxhs", >> + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), >> + .desc = { >> + { >> + .name = VXHS_OPT_FILENAME, >> + .type = QEMU_OPT_STRING, >> + .help = "URI to the Veritas HyperScale image", >> + }, >> + { >> + .name = VXHS_OPT_VDISK_ID, >> + .type = QEMU_OPT_STRING, >> + .help = "UUID of the VxHS vdisk", >> + }, >> + { /* end of list */ } >> + }, >> +}; >> + >> +static QemuOptsList runtime_tcp_opts = { >> + .name = "vxhs_tcp", >> + .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head), >> + .desc = { >> + { >> + .name = VXHS_OPT_HOST, >> + .type = QEMU_OPT_STRING, >> + .help = "host address (ipv4 addresses)", >> + }, >> + { >> + .name = VXHS_OPT_PORT, >> + .type = QEMU_OPT_NUMBER, >> + .help = "port number on which VxHSD is listening (default >>9999)", >> + .def_value_str = "9999" >> + }, >> + { /* end of list */ } >> + }, >> +}; >> + >> +/* >> + * Parse the incoming URI and populate *options with the host >>information. >> + * URI syntax has the limitation of supporting only one host info. >> + * To pass multiple host information, use the JSON syntax. 
>> + */ >> +static int vxhs_parse_uri(const char *filename, QDict *options) >> +{ >> + URI *uri = NULL; >> + char *hoststr, *portstr; >> + char *port; >> + int ret = 0; >> + >> + trace_vxhs_parse_uri_filename(filename); >> + uri = uri_parse(filename); >> + if (!uri || !uri->server || !uri->path) { >> + uri_free(uri); >> + return -EINVAL; >> + } >> + >> + hoststr = g_strdup(VXHS_OPT_SERVER"0.host"); >> + qdict_put(options, hoststr, qstring_from_str(uri->server)); >> + g_free(hoststr); >> + >> + portstr = g_strdup(VXHS_OPT_SERVER"0.port"); >> + if (uri->port) { >> + port = g_strdup_printf("%d", uri->port); >> + qdict_put(options, portstr, qstring_from_str(port)); >> + g_free(port); >> + } >> + g_free(portstr); >> + >> + if (strstr(uri->path, "vxhs") == NULL) { >> + qdict_put(options, "vdisk_id", qstring_from_str(uri->path)); >> + } >> + >> + trace_vxhs_parse_uri_hostinfo(1, uri->server, uri->port); >> + uri_free(uri); >> + >> + return ret; >> +} >> + >> +static void vxhs_parse_filename(const char *filename, QDict *options, >> + Error **errp) >> +{ >> + if (qdict_haskey(options, "vdisk_id") || qdict_haskey(options, >>"server")) { >> + error_setg(errp, "vdisk_id/server and a file name may not be >>specified " >> + "at the same time"); >> + return; >> + } >> + >> + if (strstr(filename, "://")) { >> + int ret = vxhs_parse_uri(filename, options); >> + if (ret < 0) { >> + error_setg(errp, "Invalid URI. 
URI should be of the form " >> + " vxhs://<host_ip>:<port>/{<vdisk_id>}"); >> + } >> + } >> +} >> + >> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s, >> + int *cfd, int *rfd, Error **errp) >> +{ >> + QDict *backing_options = NULL; >> + QemuOpts *opts, *tcp_opts; >> + const char *vxhs_filename; >> + char *of_vsa_addr = NULL; >> + Error *local_err = NULL; >> + const char *vdisk_id_opt; >> + char *file_name = NULL; >> + size_t num_servers = 0; >> + char *str = NULL; >> + int ret = 0; >> + >> + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); >> + qemu_opts_absorb_qdict(opts, options, &local_err); >> + if (local_err) { >> + error_propagate(errp, local_err); > >You are already propagating this error in 'out'. Removed ³error_propagate()² call. > >> + ret = -EINVAL; >> + goto out; >> + } >> + >> + vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME); >> + if (vxhs_filename) { >> + trace_vxhs_qemu_init_filename(vxhs_filename); >> + } >> + >> + vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID); >> + if (!vdisk_id_opt) { >> + error_setg(&local_err, QERR_MISSING_PARAMETER, >>VXHS_OPT_VDISK_ID); >> + ret = -EINVAL; >> + goto out; >> + } >> + s->vdisk_guid = g_strdup(vdisk_id_opt); >> + trace_vxhs_qemu_init_vdisk(vdisk_id_opt); >> + >> + num_servers = qdict_array_entries(options, VXHS_OPT_SERVER); >> + if (num_servers < 1) { >> + error_setg(&local_err, QERR_MISSING_PARAMETER, "server"); >> + ret = -EINVAL; >> + goto out; >> + } else if (num_servers > 1) { >> + error_setg(&local_err, QERR_INVALID_PARAMETER, "server"); >> + error_append_hint(errp, "Only one server allowed.\n"); > >s/errp/&local_err Replaced as suggested. 
> >> + ret = -EINVAL; >> + goto out; >> + } >> + trace_vxhs_qemu_init_numservers(num_servers); >> + >> + str = g_strdup_printf(VXHS_OPT_SERVER"0."); >> + qdict_extract_subqdict(options, &backing_options, str); >> + >> + /* Create opts info from runtime_tcp_opts list */ >> + tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, >>&error_abort); >> + qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err); >> + if (local_err) { >> + qdict_del(backing_options, str); >> + qemu_opts_del(tcp_opts); >> + g_free(str); > >Rather than free str here, you can just do it in the 'out'. (g_free is >NULL-safe) Moved g_free() call to ³out² label. > >> + ret = -EINVAL; >> + goto out; >> + } >> + >> + s->vdisk_hostinfo.hostip = g_strdup(qemu_opt_get(tcp_opts, >> + VXHS_OPT_HOST)); >> + s->vdisk_hostinfo.port = g_ascii_strtoll(qemu_opt_get(tcp_opts, >> + >>VXHS_OPT_PORT), >> + NULL, 0); >> + >> + s->vdisk_hostinfo.qnio_cfd = -1; >> + s->vdisk_hostinfo.vdisk_rfd = -1; >> + trace_vxhs_qemu_init(s->vdisk_hostinfo.hostip, >> + s->vdisk_hostinfo.port); >> + >> + qdict_del(backing_options, str); >> + qemu_opts_del(tcp_opts); >> + g_free(str); > >Same here. Moved g_free() call to ³out² label. > >> + >> + file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid); >> + of_vsa_addr = g_strdup_printf("of://%s:%d", >> + s->vdisk_hostinfo.hostip, >> + s->vdisk_hostinfo.port); >> + >> + /* >> + * .bdrv_open() and .bdrv_create() run under the QEMU global mutex. >> + */ >> + if (global_qnio_ctx == NULL) { >> + global_qnio_ctx = vxhs_setup_qnio(); > >I assume this would prevent the scenario I described in the beginning >(two different >drives using two different vxhs servers) from working properly? Is that >by >design? > >If so, it needs to fail gracefully; I don't think the current >implementation >will. > > >But it seems like you should just be able to do >s->qnio_ctx = vxhs_setup_qnio() here, right? 
As mentioned before ³qnio_ctx² is global context, and gets initialized on first disk open. > > >> + if (global_qnio_ctx == NULL) { >> + error_setg(&local_err, "Failed vxhs_setup_qnio"); >> + ret = -EINVAL; >> + goto out; >> + } >> + } >> + >> + ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name); > >This overwrites the host / port, and I presume other info, of other drives >in the case of another vxhs drive being opened. This seems wrong. > >> + if (ret) { >> + error_setg(&local_err, "Failed qnio_iio_open"); >> + ret = -EIO; >> + } >> + >> +out: >> + g_free(file_name); >> + g_free(of_vsa_addr); >> + qemu_opts_del(opts); >> + >> + if (ret < 0) { >> + g_free(s->vdisk_hostinfo.hostip); >> + g_free(s->vdisk_guid); >> + s->vdisk_guid = NULL; >> + errno = -ret; >> + } >> + error_propagate(errp, local_err); > >This can be moved up into the 'if' block above it. Moved the code as suggested. > >> + >> + return ret; >> +} >> + >> +static int vxhs_open(BlockDriverState *bs, QDict *options, >> + int bdrv_flags, Error **errp) >> +{ >> + BDRVVXHSState *s = bs->opaque; >> + AioContext *aio_context; >> + int qemu_qnio_cfd = -1; >> + int device_opened = 0; > >A bool makes more sense here. Modified the type to ³bool² as suggested. > >> + int qemu_rfd = -1; >> + int ret = 0; >> + >> + ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp); >> + if (ret < 0) { >> + trace_vxhs_open_fail(ret); >> + return ret; >> + } >> + >> + device_opened = 1; >> + s->qnio_ctx = global_qnio_ctx; >> + s->vdisk_hostinfo.qnio_cfd = qemu_qnio_cfd; >> + s->vdisk_hostinfo.vdisk_rfd = qemu_rfd; >> + s->vdisk_size = 0; >> + >> + /* >> + * Create a pipe for communicating between two threads in different >> + * context. Set handler for read event, which gets triggered when >> + * IO completion is done by non-QEMU context. 
>> + */ >> + ret = qemu_pipe(s->fds); >> + if (ret < 0) { >> + trace_vxhs_open_epipe(ret); >> + ret = -errno; >> + goto errout; >> + } >> + fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK); >> + >> + aio_context = bdrv_get_aio_context(bs); >> + aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ], >> + false, vxhs_aio_event_reader, NULL, s); >> + return 0; >> + >> +errout: >> + /* >> + * Close remote vDisk device if it was opened earlier >> + */ >> + if (device_opened) { >> + vxhs_qnio_iio_close(s); >> + } >> + trace_vxhs_open_fail(ret); >> + return ret; >> +} >> + >> +static const AIOCBInfo vxhs_aiocb_info = { >> + .aiocb_size = sizeof(VXHSAIOCB) >> +}; >> + >> +/* >> + * This allocates QEMU-VXHS callback for each IO >> + * and is passed to QNIO. When QNIO completes the work, >> + * it will be passed back through the callback. >> + */ >> +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t >>sector_num, >> + QEMUIOVector *qiov, int nb_sectors, >> + BlockCompletionFunc *cb, void *opaque, >>int iodir) >> +{ >> + VXHSAIOCB *acb = NULL; >> + BDRVVXHSState *s = bs->opaque; >> + size_t size; >> + uint64_t offset; >> + int iio_flags = 0; >> + int ret = 0; >> + void *qnio_ctx = s->qnio_ctx; >> + uint32_t rfd = s->vdisk_hostinfo.vdisk_rfd; >> + >> + offset = sector_num * BDRV_SECTOR_SIZE; >> + size = nb_sectors * BDRV_SECTOR_SIZE; >> + acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque); >> + /* >> + * Setup or initialize VXHSAIOCB. >> + * Every single field should be initialized since >> + * acb will be picked up from the slab without >> + * initializing with zero. 
>> + */ >> + acb->io_offset = offset; >> + acb->size = size; >> + acb->err = 0; >> + acb->state = VXHS_IO_INPROGRESS; >> + acb->qiov = qiov; >> + acb->direction = iodir; >> + >> + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); >> + >> + switch (iodir) { >> + case VDISK_AIO_WRITE: >> + ret = iio_writev(qnio_ctx, rfd, qiov->iov, qiov->niov, >> + offset, (uint64_t)size, (void *)acb, >>iio_flags); >> + break; >> + case VDISK_AIO_READ: >> + ret = iio_readv(qnio_ctx, rfd, qiov->iov, qiov->niov, >> + offset, (uint64_t)size, (void *)acb, >>iio_flags); >> + break; >> + default: >> + trace_vxhs_aio_rw_invalid(iodir); >> + goto errout; >> + } >> + >> + if (ret != 0) { >> + trace_vxhs_aio_rw_ioerr(s->vdisk_guid, iodir, size, offset, >> + acb, ret, errno); >> + goto errout; >> + } >> + return &acb->common; >> + >> +errout: >> + qemu_aio_unref(acb); >> + return NULL; >> +} >> + >> +static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, >> + int64_t sector_num, QEMUIOVector >>*qiov, >> + int nb_sectors, >> + BlockCompletionFunc *cb, void >>*opaque) >> +{ >> + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb, >> + opaque, VDISK_AIO_READ); >> +} >> + >> +static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, >> + int64_t sector_num, QEMUIOVector >>*qiov, >> + int nb_sectors, >> + BlockCompletionFunc *cb, void >>*opaque) >> +{ >> + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, >> + cb, opaque, VDISK_AIO_WRITE); >> +} >> + >> +static void vxhs_close(BlockDriverState *bs) >> +{ >> + BDRVVXHSState *s = bs->opaque; >> + >> + trace_vxhs_close(s->vdisk_guid); >> + close(s->fds[VDISK_FD_READ]); >> + close(s->fds[VDISK_FD_WRITE]); >> + >> + /* >> + * Clearing all the event handlers for oflame registered to QEMU >> + */ >> + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], >> + false, NULL, NULL, NULL); >> + g_free(s->vdisk_guid); >> + s->vdisk_guid = NULL; >> + vxhs_qnio_iio_close(s); >> + >> + /* >> + * Free the dynamically allocated hostip string >> + */ 
> >> + g_free(s->vdisk_hostinfo.hostip); >> + s->vdisk_hostinfo.hostip = NULL; >> + s->vdisk_hostinfo.port = 0; >> +} >> + >> +static unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s) > >The functions that call this use its value as an int64_t, although this is >returning an unsigned long. It should return int64_t. Modified the return value as suggested. > >> +{ >> + int64_t vdisk_size = 0; >> + int ret = 0; >> + uint32_t rfd = s->vdisk_hostinfo.vdisk_rfd; >> + >> + ret = iio_ioctl(s->qnio_ctx, rfd, IOR_VDISK_STAT, &vdisk_size, >>NULL, 0); >> + if (ret < 0) { >> + trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno); >> + return 0; > >Repeat comment from RFC v7: > >Why is is this returning 0 here, rather then 'ret'? Does IOR_VDIST_STAT >return sane error codes? If not, instead of returning zero, map it to >-EIO >here. Changed the return value to "-EIO" in case of error. > >> + } >> + >> + trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size); >> + return vdisk_size; >> +} >> + >> +/* >> + * Returns the size of vDisk in bytes. This is required >> + * by QEMU block upper block layer so that it is visible >> + * to guest. >> + */ >> +static int64_t vxhs_getlength(BlockDriverState *bs) >> +{ >> + BDRVVXHSState *s = bs->opaque; >> + int64_t vdisk_size = 0; >> + >> + if (s->vdisk_size > 0) { >> + vdisk_size = s->vdisk_size; >> + } else { >> + /* >> + * Fetch the vDisk size using stat ioctl >> + */ >> + vdisk_size = vxhs_get_vdisk_stat(s); >> + if (vdisk_size > 0) { >> + s->vdisk_size = vdisk_size; >> + } >> + } >> + >> + if (vdisk_size > 0) { >> + return vdisk_size; /* return size in bytes */ >> + } >> + > >Repeat comment from RFC v7: > >If vxhs_get_vdisk_stat() returned error rather than 0, then this check >is unnecessary. You can just return vdisk_size. > >(Also: I don't think you need to return -EIO for a zero-sized image, do >you?) Modified the code as suggested. A zero-size disk will return zero. 
> >> + return -EIO; >> +} >> + >> +/* >> + * Returns actual blocks allocated for the vDisk. > >s/blocks/bytes > >> + * This is required by qemu-img utility. >> + */ >> +static int64_t vxhs_get_allocated_blocks(BlockDriverState *bs) > >s/blocks/bytes > >> +{ >> + BDRVVXHSState *s = bs->opaque; >> + int64_t vdisk_size = 0; >> + >> + if (s->vdisk_size > 0) { >> + vdisk_size = s->vdisk_size; >> + } else { >> + /* >> + * TODO: >> + * Once HyperScale storage-virtualizer provides >> + * actual physical allocation of blocks then >> + * fetch that information and return back to the >> + * caller but for now just get the full size. >> + */ >> + vdisk_size = vxhs_get_vdisk_stat(s); >> + if (vdisk_size > 0) { >> + s->vdisk_size = vdisk_size; >> + } >> + } >> + >> + if (vdisk_size > 0) { >> + return vdisk_size; /* return size in bytes */ >> + } >> + >> + return -EIO; > >Repeat comment from RFC v7: > >Since this is just identical to vxhs_getlength(), why not just call it >here, >and reduce code duplication? > >Or even better: do not implement this function. It isn't required. If it >is not implemented, the block layer above this will just return -ENOTSUP, >which in qapi will be interpreted as "unavailable". This function should >be >dropped, until it is something supported by VXHS. Removed vxhs_get_allocated_blocks() function. 
> >> +} >> + >> +static void vxhs_detach_aio_context(BlockDriverState *bs) >> +{ >> + BDRVVXHSState *s = bs->opaque; >> + >> + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], >> + false, NULL, NULL, NULL); >> + >> +} >> + >> +static void vxhs_attach_aio_context(BlockDriverState *bs, >> + AioContext *new_context) >> +{ >> + BDRVVXHSState *s = bs->opaque; >> + >> + aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ], >> + false, vxhs_aio_event_reader, NULL, s); >> +} >> + >> +static BlockDriver bdrv_vxhs = { >> + .format_name = "vxhs", >> + .protocol_name = "vxhs", >> + .instance_size = sizeof(BDRVVXHSState), >> + .bdrv_file_open = vxhs_open, >> + .bdrv_parse_filename = vxhs_parse_filename, >> + .bdrv_close = vxhs_close, >> + .bdrv_getlength = vxhs_getlength, >> + .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks, >> + .bdrv_aio_readv = vxhs_aio_readv, >> + .bdrv_aio_writev = vxhs_aio_writev, >> + .bdrv_detach_aio_context = vxhs_detach_aio_context, >> + .bdrv_attach_aio_context = vxhs_attach_aio_context, >> +}; >> + >> +static void bdrv_vxhs_init(void) >> +{ >> + bdrv_register(&bdrv_vxhs); >> +} >> + >> +block_init(bdrv_vxhs_init); >> diff --git a/configure b/configure >> index d3dafcb..4413e88 100755 >> --- a/configure >> +++ b/configure >> @@ -321,6 +321,7 @@ numa="" >> tcmalloc="no" >> jemalloc="no" >> replication="yes" >> +vxhs="" >> >> # parse CC options first >> for opt do >> @@ -1162,6 +1163,11 @@ for opt do >> ;; >> --enable-replication) replication="yes" >> ;; >> + --disable-vxhs) vxhs="no" >> + ;; >> + --enable-vxhs) vxhs="yes" >> + ;; >> + >> *) >> echo "ERROR: unknown option $opt" >> echo "Try '$0 --help' for more information" >> @@ -1391,6 +1397,7 @@ disabled with --disable-FEATURE, default is >>enabled if available: >> tcmalloc tcmalloc support >> jemalloc jemalloc support >> replication replication support >> + vxhs Veritas HyperScale vDisk backend support >> >> NOTE: The object files are built at the place where configure 
is >>launched >> EOF >> @@ -4625,6 +4632,33 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o >>$TMPMO $TMPO; then >> fi >> >> ########################################## >> +# Veritas HyperScale block driver VxHS >> +# Check if libqnio is installed >> + >> +if test "$vxhs" != "no" ; then >> + cat > $TMPC <<EOF >> +#include <stdint.h> >> +#include <qnio/qnio_api.h> >> + >> +void *vxhs_callback; >> + >> +int main(void) { >> + iio_init(vxhs_callback); >> + return 0; >> +} >> +EOF >> + vxhs_libs="-lqnio" >> + if compile_prog "" "$vxhs_libs" ; then >> + vxhs=yes >> + else >> + if test "$vxhs" = "yes" ; then >> + feature_not_found "vxhs block device" "Install libqnio. See >>github" >> + fi >> + vxhs=no >> + fi >> +fi > >How is libqnio going to be distributed normally? Should we be checking >for >a minimum version here to compile against? Yes, version check is required. We will add the changes in separate patch. > >> + >> +########################################## >> # End of CC checks >> # After here, no more $cc or $ld runs >> >> @@ -4990,6 +5024,7 @@ echo "tcmalloc support $tcmalloc" >> echo "jemalloc support $jemalloc" >> echo "avx2 optimization $avx2_opt" >> echo "replication support $replication" >> +echo "VxHS block device $vxhs" >> >> if test "$sdl_too_old" = "yes"; then >> echo "-> Your SDL version is too old - please upgrade to have SDL >>support" >> @@ -5590,6 +5625,12 @@ if test "$pthread_setname_np" = "yes" ; then >> echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak >> fi >> >> +if test "$vxhs" = "yes" ; then >> + echo "CONFIG_VXHS=y" >> $config_host_mak >> + echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak >> + echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak >> +fi >> + >> if test "$tcg_interpreter" = "yes"; then >> QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES" >> elif test "$ARCH" = "sparc64" ; then >> -- >> 2.5.5 >> >
On Wed, Oct 26, 2016 at 09:33:30PM +0000, Buddhi Madhav wrote: > > > On 10/25/16, 9:41 PM, "Jeff Cody" <jcody@redhat.com> wrote: > > >On Tue, Oct 25, 2016 at 03:02:07PM -0700, Ashish Mittal wrote: > >> This patch adds support for a new block device type called "vxhs". > >> Source code for the library that this code loads can be downloaded from: > >> https://github.com/MittalAshish/libqnio.git > >> > > > >I grabbed the latest of libqnio, compiled it (had to disable -Werror), and > >tried it out. I was able to do a qemu-img info on a raw file, but it > >would > >just hang when trying a format such as qcow2. I am assuming > >this is a limitation of test_server, and not libqnio. > > On my build I did not get any build errors. > Likely a difference in gcc versions; I am compiling with gcc 6.2.1. Here are the warnings I get (this is just for your benefit, not really part of this driver review): lib/qnio/cJSON.c: In function ‘cJSON_strcasecmp’: lib/qnio/cJSON.c:41:5: warning: this ‘if’ clause does not guard... [-Wmisleading-indentation] if (!s1) return (s1==s2)?0:1;if (!s2) return 1; ^~ lib/qnio/cJSON.c:41:34: note: ...this statement, but the latter is misleadingly indented as if it is guarded by the ‘if’ if (!s1) return (s1==s2)?0:1;if (!s2) return 1; ^~ lib/qnio/cJSON.c: In function ‘print_object’: lib/qnio/cJSON.c:440:9: warning: this ‘if’ clause does not guard... [-Wmisleading-indentation] if (fmt) *ptr++='\n';*ptr=0; ^~ lib/qnio/cJSON.c:440:30: note: ...this statement, but the latter is misleadingly indented as if it is guarded by the ‘if’ if (fmt) *ptr++='\n';*ptr=0; ^ lib/qnio/cJSON.c: In function ‘cJSON_DetachItemFromArray’: lib/qnio/cJSON.c:467:5: warning: this ‘if’ clause does not guard... 
[-Wmisleading-indentation] if (c->prev) c->prev->next=c->next;if (c->next) c->next->prev=c->prev;if (c==array->child) array->child=c->next;c->prev=c->next=0;return c;} ^~ lib/qnio/cJSON.c:467:40: note: ...this statement, but the latter is misleadingly indented as if it is guarded by the ‘if’ if (c->prev) c->prev->next=c->next;if (c->next) c->next->prev=c->prev;if (c==array->child) array->child=c->next;c->prev=c->next=0;return c;} > The qcow2 issue is to do with the limitation in the test server, which we > will fix in a seperate patch. > > > > >This will make qemu-iotests more difficult however. > > > >I haven't looked at the latest qnio code yet (other than compiling the > >test-server to test), so the rest of this review is on the qemu driver. > > > >> Sample command line using JSON syntax: > >> ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us > >>-vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 > >>-msg timestamp=on > >>'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410} > >>","server":[{"host":"172.172.17.4","port":"9999"}]}' > >> > >> Sample command line using URI syntax: > >> qemu-img convert -f raw -O raw -n > >>/var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad > >>vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D > >> > >> Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com> > >> --- > >> block/Makefile.objs | 2 + > >> block/trace-events | 22 ++ > >> block/vxhs.c | 736 > >>++++++++++++++++++++++++++++++++++++++++++++++++++++ > >> configure | 41 +++jj > >> 4 files changed, 801 insertions(+) > >> create mode 100644 block/vxhs.c > > > >I think this version still does not address Daniel's concerns regarding a > >QAPI schema for vxhs. > > We are working on QAPI schema changes and will submit them in separate > patch. > When you submit them, since there is a dependency with the patches, it should be submitted as a patch series along with this patch. 
Same with the qemu-iotests, once those are ready. > > > >We are also still needing qemu-iotests, and a test-server suitable to run > >the tests. > > > >> > >> diff --git a/block/Makefile.objs b/block/Makefile.objs > >> index 67a036a..58313a2 100644 > >> --- a/block/Makefile.objs > >> +++ b/block/Makefile.objs > >> @@ -18,6 +18,7 @@ block-obj-$(CONFIG_LIBNFS) += nfs.o > >> block-obj-$(CONFIG_CURL) += curl.o > >> block-obj-$(CONFIG_RBD) += rbd.o > >> block-obj-$(CONFIG_GLUSTERFS) += gluster.o > >> +block-obj-$(CONFIG_VXHS) += vxhs.o > >> block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o > >> block-obj-$(CONFIG_LIBSSH2) += ssh.o > >> block-obj-y += accounting.o dirty-bitmap.o > >> @@ -38,6 +39,7 @@ rbd.o-cflags := $(RBD_CFLAGS) > >> rbd.o-libs := $(RBD_LIBS) > >> gluster.o-cflags := $(GLUSTERFS_CFLAGS) > >> gluster.o-libs := $(GLUSTERFS_LIBS) > >> +vxhs.o-libs := $(VXHS_LIBS) > >> ssh.o-cflags := $(LIBSSH2_CFLAGS) > >> ssh.o-libs := $(LIBSSH2_LIBS) > >> archipelago.o-libs := $(ARCHIPELAGO_LIBS) > >> diff --git a/block/trace-events b/block/trace-events > >> index 05fa13c..aea97cb 100644 > >> --- a/block/trace-events > >> +++ b/block/trace-events > >> @@ -114,3 +114,25 @@ qed_aio_write_data(void *s, void *acb, int ret, > >>uint64_t offset, size_t len) "s > >> qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, > >>uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 > >> qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, > >>uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 > >> qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, > >>size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" > >> + > >> +# block/vxhs.c > >> +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, > >>reason %d" > >> +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p" > >> +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no > >>i/o %d, %d" > >> 
+vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, > >>errno %d" > >> +vxhs_open_fail(int ret) "Could not open the device. Error = %d" > >> +vxhs_open_epipe(int ret) "Could not create a pipe for device. Bailing > >>out. Error=%d" > >> +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d" > >> +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, > >>void *acb, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d > >>size = %lu offset = %lu ACB = %p. Error = %d, errno = %d" > >> +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat > >>ioctl failed, ret = %d, errno = %d" > >> +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s > >>stat ioctl returned size %lu" > >> +vxhs_qnio_iio_open(const char *ip) "Failed to connect to storage agent > >>on host-ip %s" > >> +vxhs_qnio_iio_devopen(const char *fname) "Failed to open vdisk device: > >>%s" > >> +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld" > >> +vxhs_parse_uri_filename(const char *filename) "URI passed via > >>bdrv_parse_filename %s" > >> +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s" > >> +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = > >>%d" > >> +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP > >>%s, Port %d" > >> +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to > >>BDRVVXHSState" > >> +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s" > >> +vxhs_close(char *vdisk_guid) "Closing vdisk %s" > >> diff --git a/block/vxhs.c b/block/vxhs.c > >> new file mode 100644 > >> index 0000000..97fb804 > >> --- /dev/null > >> +++ b/block/vxhs.c > >> @@ -0,0 +1,736 @@ > >> +/* > >> + * QEMU Block driver for Veritas HyperScale (VxHS) > >> + * > >> + * This work is licensed under the terms of the GNU GPL, version 2 or > >>later. > >> + * See the COPYING file in the top-level directory. 
> >> + * > >> + */ > >> + > >> +#include "qemu/osdep.h" > >> +#include "block/block_int.h" > >> +#include <qnio/qnio_api.h> > >> +#include "qapi/qmp/qerror.h" > >> +#include "qapi/qmp/qdict.h" > >> +#include "qapi/qmp/qstring.h" > >> +#include "trace.h" > >> +#include "qemu/uri.h" > >> +#include "qapi/error.h" > >> +#include "qemu/error-report.h" > >> + > >> +#define VDISK_FD_READ 0 > >> +#define VDISK_FD_WRITE 1 > >> + > >> +#define VXHS_OPT_FILENAME "filename" > >> +#define VXHS_OPT_VDISK_ID "vdisk_id" > >> +#define VXHS_OPT_SERVER "server." > >> +#define VXHS_OPT_HOST "host" > >> +#define VXHS_OPT_PORT "port" > >> + > >> +/* qnio client ioapi_ctx */ > >> +static void *global_qnio_ctx; > >> + > > > > > >Why create a qnio ctx as a static global, rather than confining it to > >BDRVVXHSState, so that it is unique to each disk instance? > > > >Is the following usage supported (i.e. two different vxhs remote servers): > > > >qemu-system-x86_64 -drive file=vxhs://server1:9999/test1 \ > > -drive file=vxhs://server2:9999/test2 > > ³global_qnio_ctx² is a global context, not per-disk context. Removed the > qnio_ctx field > From ³BDRVVXHSState² structure. The above command should work. > OK, great, thanks for the clarification. I assume then there is no drive-specific or host-specific stateful information in the global_qnio_ctx structure. How about cleanup on close? Looking at the libqnio code, it allocs memory during the qnio_ctx creation. Currently, I don't think there is anything in the vxhs driver that frees the global_qnio_ctx and its internal structures. Looking at qnio_api.h, there also doesn't seem to be an api that will do that, either. We need to be able to free all resources once we close the last vxhs drive. Since multiple drives could all be using the global_qnio_ctx, I think you'll likely need to wrap it in a struct and create a ref counter. Increment the ref on every open, and decrement the ref on close (and if zero, free the resources). 
(If my interpretation is incorrect, and it is already freed somewhere, just let me know). > > > > > >Why store the qnio ctx into this global briefly, and just to move it to > >where it belongs in the BDRVVXHSState struct? It makes more sense to be > >in > >the state struct the whole time, unless I am missing something fundamental > >about the protocol. > > > > > >> +/* vdisk prefix to pass to qnio */ > >> +static const char vdisk_prefix[] = "/dev/of/vdisk"; > >> + > >> +typedef enum { > >> + VXHS_IO_INPROGRESS, > >> + VXHS_IO_COMPLETED, > >> + VXHS_IO_ERROR > >> +} VXHSIOState; > >> + > >> +typedef enum { > >> + VDISK_AIO_READ, > >> + VDISK_AIO_WRITE, > >> + VDISK_STAT > >> +} VDISKAIOCmd; > >> + > >> +/* > >> + * HyperScale AIO callbacks structure > >> + */ > >> +typedef struct VXHSAIOCB { > >> + BlockAIOCB common; > >> + int err; > >> + int state; > >> + int direction; /* IO direction (r/w) */ > >> + size_t io_offset; > >> + size_t size; > >> + QEMUBH *bh; > > > >*bh is unused. > > Removed the unused variable. > > > >> + QEMUIOVector *qiov; > >> + QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry; > > > >retry_entry is unused. > > Removed the unused variable. > > > >> +} VXHSAIOCB; > >> + > >> +typedef struct VXHSvDiskHostsInfo { > >> + int qnio_cfd; /* Channel FD */ > >> + int vdisk_rfd; /* vDisk remote FD */ > >> + char *hostip; /* Host's IP addresses */ > >> + int port; /* Host's port number */ > >> +} VXHSvDiskHostsInfo; > >> + > >> +/* > >> + * Structure per vDisk maintained for state > >> + */ > >> +typedef struct BDRVVXHSState { > >> + int fds[2]; > >> + int64_t vdisk_size; > >> + int64_t vdisk_blocks; > > > >vdisk_blocks is unused. > > Removed the unused variable. 
> > > >> + int event_reader_pos; > >> + VXHSAIOCB *qnio_event_acb; > >> + void *qnio_ctx; > >> + VXHSvDiskHostsInfo vdisk_hostinfo; /* Per host info */ > >> + char *vdisk_guid; > >> +} BDRVVXHSState; > >> + > >> +static void vxhs_qnio_iio_close(BDRVVXHSState *s) > >> +{ > >> + /* > >> + * Close vDisk device > >> + */ > >> + if (s->vdisk_hostinfo.vdisk_rfd >= 0) { > >> + iio_devclose(s->qnio_ctx, 0, s->vdisk_hostinfo.vdisk_rfd); > >> + s->vdisk_hostinfo.vdisk_rfd = -1; > >> + } > >> + > >> + /* > >> + * Close QNIO channel against cached channel-fd > >> + */ > >> + if (s->vdisk_hostinfo.qnio_cfd >= 0) { > >> + iio_close(s->qnio_ctx, s->vdisk_hostinfo.qnio_cfd); > >> + s->vdisk_hostinfo.qnio_cfd = -1; > >> + } > >> +} > >> + > >> +static int vxhs_qnio_iio_open(int *cfd, const char *of_vsa_addr, > >> + int *rfd, const char *file_name) > >> +{ > >> + /* > >> + * Open qnio channel to storage agent if not opened before. > >> + */ > >> + if (*cfd < 0) { > >> + *cfd = iio_open(global_qnio_ctx, of_vsa_addr, 0); > >> + if (*cfd < 0) { > >> + trace_vxhs_qnio_iio_open(of_vsa_addr); > >> + return -ENODEV; > >> + } > >> + } > >> + > >> + /* > >> + * Open vdisk device > >> + */ > >> + *rfd = iio_devopen(global_qnio_ctx, *cfd, file_name, 0); > >> + if (*rfd < 0) { > >> + if (*cfd >= 0) { > >> + iio_close(global_qnio_ctx, *cfd); > >> + *cfd = -1; > >> + *rfd = -1; > >> + } > >> + > >> + trace_vxhs_qnio_iio_devopen(file_name); > >> + return -ENODEV; > >> + } > >> + > >> + return 0; > >> +} > >> + > >> +static void vxhs_iio_callback(int32_t rfd, uint32_t reason, void *ctx, > >> + uint32_t error, uint32_t opcode) > >> +{ > >> + VXHSAIOCB *acb = NULL; > >> + BDRVVXHSState *s = NULL; > >> + ssize_t ret; > >> + > >> + switch (opcode) { > >> + case IRP_READ_REQUEST: > >> + case IRP_WRITE_REQUEST: > >> + > >> + /* > >> + * ctx is VXHSAIOCB* > >> + * ctx is NULL if error is QNIOERROR_CHANNEL_HUP or > >> + * reason is IIO_REASON_HUP > >> + */ > >> + if (ctx) { > >> + acb = ctx; > >> + s = 
acb->common.bs->opaque; > >> + } else { > >> + trace_vxhs_iio_callback(error, reason); > >> + goto out; > >> + } > >> + > >> + if (error) { > >> + if (!acb->err) { > >> + acb->err = error; > >> + } > >> + trace_vxhs_iio_callback(error, reason); > >> + } > >> + > >> + ret = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, > >>sizeof(acb)); > >> + g_assert(ret == sizeof(acb)); > >> + break; > >> + > >> + default: > >> + if (error == QNIOERROR_CHANNEL_HUP) { > >> + /* > >> + * Channel failed, spontaneous notification, > >> + * not in response to I/O > >> + */ > >> + trace_vxhs_iio_callback_chnfail(error, errno); > >> + } else { > >> + trace_vxhs_iio_callback_unknwn(opcode, error); > >> + } > >> + break; > >> + } > >> +out: > >> + return; > >> +} > >> + > >> +static void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s) > >> +{ > >> + BlockCompletionFunc *cb = acb->common.cb; > >> + void *opaque = acb->common.opaque; > >> + int ret = 0; > >> + > >> + if (acb->err != 0) { > >> + trace_vxhs_complete_aio(acb, acb->err); > >> + /* > >> + * We mask all the IO errors generically as EIO for upper > >>layers > >> + * Right now our IO Manager uses non standard error codes. > >>Instead > >> + * of confusing upper layers with incorrect interpretation we > >>are > >> + * doing this workaround. > >> + */ > >> + ret = (-EIO); > >> + } > >> + > >> + acb->state = VXHS_IO_COMPLETED; > > > >What is this state used for? I see it also set to VXHS_IO_INPROGRESS at > >the > >beginning of the request. > > This was useful when failover code was implemented. It is no longer used. > So removed the "state" field. > > > > >> + qemu_aio_unref(acb); > >> + cb(opaque, ret); > >> +} > >> + > >> +/* > >> + * This is the HyperScale event handler registered to QEMU. > >> + * It is invoked when any IO gets completed and written on pipe > >> + * by callback called from QNIO thread context. Then it marks > >> + * the AIO as completed, and releases HyperScale AIO callbacks. 
> >> + */ > >> +static void vxhs_aio_event_reader(void *opaque) > >> +{ > >> + BDRVVXHSState *s = opaque; > >> + char *p; > >> + ssize_t ret; > >> + > >> + do { > >> + p = (char *)&s->qnio_event_acb; > >> + ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos, > >> + sizeof(s->qnio_event_acb) - s->event_reader_pos); > >> + if (ret > 0) { > >> + s->event_reader_pos += ret; > >> + if (s->event_reader_pos == sizeof(s->qnio_event_acb)) { > >> + s->event_reader_pos = 0; > >> + vxhs_complete_aio(s->qnio_event_acb, s); > >> + } > >> + } > >> + } while (ret < 0 && errno == EINTR); > > > >I'm bit confused on this event reader loop. If ret is > 0, but > >s->event_reader_pos != sizeof(s->qnio_event_acb), we'll exit the event > >reader leaving event_read_pos non-zero. And since we are reading in a > >pointer for the acb, what does that mean for the next time we come we have > >an aio reader event? > On the next reader event, it will read the rest of ³acb² structure(from > s->event_reader_pos), and calls vxhs_complete_aio(). > > > > > >> +} > >> + > >> +/* > >> + * Call QNIO operation to create channels to do IO on vDisk. 
> >> + */ > >> + > >> +static void *vxhs_setup_qnio(void) > >> +{ > >> + void *qnio_ctx = NULL; > >> + > >> + qnio_ctx = iio_init(vxhs_iio_callback); > >> + if (qnio_ctx != NULL) { > >> + trace_vxhs_setup_qnio(qnio_ctx); > >> + } > >> + return qnio_ctx; > >> +} > >> + > >> +static QemuOptsList runtime_opts = { > >> + .name = "vxhs", > >> + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), > >> + .desc = { > >> + { > >> + .name = VXHS_OPT_FILENAME, > >> + .type = QEMU_OPT_STRING, > >> + .help = "URI to the Veritas HyperScale image", > >> + }, > >> + { > >> + .name = VXHS_OPT_VDISK_ID, > >> + .type = QEMU_OPT_STRING, > >> + .help = "UUID of the VxHS vdisk", > >> + }, > >> + { /* end of list */ } > >> + }, > >> +}; > >> + > >> +static QemuOptsList runtime_tcp_opts = { > >> + .name = "vxhs_tcp", > >> + .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head), > >> + .desc = { > >> + { > >> + .name = VXHS_OPT_HOST, > >> + .type = QEMU_OPT_STRING, > >> + .help = "host address (ipv4 addresses)", > >> + }, > >> + { > >> + .name = VXHS_OPT_PORT, > >> + .type = QEMU_OPT_NUMBER, > >> + .help = "port number on which VxHSD is listening (default > >>9999)", > >> + .def_value_str = "9999" > >> + }, > >> + { /* end of list */ } > >> + }, > >> +}; > >> + > >> +/* > >> + * Parse the incoming URI and populate *options with the host > >>information. > >> + * URI syntax has the limitation of supporting only one host info. > >> + * To pass multiple host information, use the JSON syntax. 
> >> + */ > >> +static int vxhs_parse_uri(const char *filename, QDict *options) > >> +{ > >> + URI *uri = NULL; > >> + char *hoststr, *portstr; > >> + char *port; > >> + int ret = 0; > >> + > >> + trace_vxhs_parse_uri_filename(filename); > >> + uri = uri_parse(filename); > >> + if (!uri || !uri->server || !uri->path) { > >> + uri_free(uri); > >> + return -EINVAL; > >> + } > >> + > >> + hoststr = g_strdup(VXHS_OPT_SERVER"0.host"); > >> + qdict_put(options, hoststr, qstring_from_str(uri->server)); > >> + g_free(hoststr); > >> + > >> + portstr = g_strdup(VXHS_OPT_SERVER"0.port"); > >> + if (uri->port) { > >> + port = g_strdup_printf("%d", uri->port); > >> + qdict_put(options, portstr, qstring_from_str(port)); > >> + g_free(port); > >> + } > >> + g_free(portstr); > >> + > >> + if (strstr(uri->path, "vxhs") == NULL) { > >> + qdict_put(options, "vdisk_id", qstring_from_str(uri->path)); > >> + } > >> + > >> + trace_vxhs_parse_uri_hostinfo(1, uri->server, uri->port); > >> + uri_free(uri); > >> + > >> + return ret; > >> +} > >> + > >> +static void vxhs_parse_filename(const char *filename, QDict *options, > >> + Error **errp) > >> +{ > >> + if (qdict_haskey(options, "vdisk_id") || qdict_haskey(options, > >>"server")) { > >> + error_setg(errp, "vdisk_id/server and a file name may not be > >>specified " > >> + "at the same time"); > >> + return; > >> + } > >> + > >> + if (strstr(filename, "://")) { > >> + int ret = vxhs_parse_uri(filename, options); > >> + if (ret < 0) { > >> + error_setg(errp, "Invalid URI. 
URI should be of the form " > >> + " vxhs://<host_ip>:<port>/{<vdisk_id>}"); > >> + } > >> + } > >> +} > >> + > >> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s, > >> + int *cfd, int *rfd, Error **errp) > >> +{ > >> + QDict *backing_options = NULL; > >> + QemuOpts *opts, *tcp_opts; > >> + const char *vxhs_filename; > >> + char *of_vsa_addr = NULL; > >> + Error *local_err = NULL; > >> + const char *vdisk_id_opt; > >> + char *file_name = NULL; > >> + size_t num_servers = 0; > >> + char *str = NULL; > >> + int ret = 0; > >> + > >> + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); > >> + qemu_opts_absorb_qdict(opts, options, &local_err); > >> + if (local_err) { > >> + error_propagate(errp, local_err); > > > >You are already propagating this error in 'out'. > Removed ³error_propagate()² call. > > > > >> + ret = -EINVAL; > >> + goto out; > >> + } > >> + > >> + vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME); > >> + if (vxhs_filename) { > >> + trace_vxhs_qemu_init_filename(vxhs_filename); > >> + } > >> + > >> + vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID); > >> + if (!vdisk_id_opt) { > >> + error_setg(&local_err, QERR_MISSING_PARAMETER, > >>VXHS_OPT_VDISK_ID); > >> + ret = -EINVAL; > >> + goto out; > >> + } > >> + s->vdisk_guid = g_strdup(vdisk_id_opt); > >> + trace_vxhs_qemu_init_vdisk(vdisk_id_opt); > >> + > >> + num_servers = qdict_array_entries(options, VXHS_OPT_SERVER); > >> + if (num_servers < 1) { > >> + error_setg(&local_err, QERR_MISSING_PARAMETER, "server"); > >> + ret = -EINVAL; > >> + goto out; > >> + } else if (num_servers > 1) { > >> + error_setg(&local_err, QERR_INVALID_PARAMETER, "server"); > >> + error_append_hint(errp, "Only one server allowed.\n"); > > > >s/errp/&local_err > > Replaced as suggested. 
> > > >> + ret = -EINVAL; > >> + goto out; > >> + } > >> + trace_vxhs_qemu_init_numservers(num_servers); > >> + > >> + str = g_strdup_printf(VXHS_OPT_SERVER"0."); > >> + qdict_extract_subqdict(options, &backing_options, str); > >> + > >> + /* Create opts info from runtime_tcp_opts list */ > >> + tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, > >>&error_abort); > >> + qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err); > >> + if (local_err) { > >> + qdict_del(backing_options, str); > >> + qemu_opts_del(tcp_opts); > >> + g_free(str); > > > >Rather than free str here, you can just do it in the 'out'. (g_free is > >NULL-safe) > > Moved g_free() call to ³out² label. > > > >> + ret = -EINVAL; > >> + goto out; > >> + } > >> + > >> + s->vdisk_hostinfo.hostip = g_strdup(qemu_opt_get(tcp_opts, > >> + VXHS_OPT_HOST)); > >> + s->vdisk_hostinfo.port = g_ascii_strtoll(qemu_opt_get(tcp_opts, > >> + > >>VXHS_OPT_PORT), > >> + NULL, 0); > >> + > >> + s->vdisk_hostinfo.qnio_cfd = -1; > >> + s->vdisk_hostinfo.vdisk_rfd = -1; > >> + trace_vxhs_qemu_init(s->vdisk_hostinfo.hostip, > >> + s->vdisk_hostinfo.port); > >> + > >> + qdict_del(backing_options, str); > >> + qemu_opts_del(tcp_opts); > >> + g_free(str); > > > >Same here. > > Moved g_free() call to ³out² label. > > > > >> + > >> + file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid); > >> + of_vsa_addr = g_strdup_printf("of://%s:%d", > >> + s->vdisk_hostinfo.hostip, > >> + s->vdisk_hostinfo.port); > >> + > >> + /* > >> + * .bdrv_open() and .bdrv_create() run under the QEMU global mutex. > >> + */ > >> + if (global_qnio_ctx == NULL) { > >> + global_qnio_ctx = vxhs_setup_qnio(); > > > >I assume this would prevent the scenario I described in the beginning > >(two different > >drives using two different vxhs servers) from working properly? Is that > >by > >design? > > > >If so, it needs to fail gracefully; I don't think the current > >implementation > >will. 
> > > > > >But it seems like you should just be able to do > >s->qnio_ctx = vxhs_setup_qnio() here, right? > > As mentioned before ³qnio_ctx² is global context, and gets initialized on > first disk open. > > > > > > >> + if (global_qnio_ctx == NULL) { > >> + error_setg(&local_err, "Failed vxhs_setup_qnio"); > >> + ret = -EINVAL; > >> + goto out; > >> + } > >> + } > >> + > >> + ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name); > > > >This overwrites the host / port, and I presume other info, of other drives > >in the case of another vxhs drive being opened. This seems wrong. > > > >> + if (ret) { > >> + error_setg(&local_err, "Failed qnio_iio_open"); > >> + ret = -EIO; > >> + } > >> + > >> +out: > >> + g_free(file_name); > >> + g_free(of_vsa_addr); > >> + qemu_opts_del(opts); > >> + > >> + if (ret < 0) { > >> + g_free(s->vdisk_hostinfo.hostip); > >> + g_free(s->vdisk_guid); > >> + s->vdisk_guid = NULL; > >> + errno = -ret; > >> + } > >> + error_propagate(errp, local_err); > > > >This can be moved up into the 'if' block above it. > > Moved the code as suggested. > > > > >> + > >> + return ret; > >> +} > >> + > >> +static int vxhs_open(BlockDriverState *bs, QDict *options, > >> + int bdrv_flags, Error **errp) > >> +{ > >> + BDRVVXHSState *s = bs->opaque; > >> + AioContext *aio_context; > >> + int qemu_qnio_cfd = -1; > >> + int device_opened = 0; > > > >A bool makes more sense here. > > Modified the type to ³bool² as suggested. > > > >> + int qemu_rfd = -1; > >> + int ret = 0; > >> + > >> + ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp); > >> + if (ret < 0) { > >> + trace_vxhs_open_fail(ret); > >> + return ret; > >> + } > >> + > >> + device_opened = 1; > >> + s->qnio_ctx = global_qnio_ctx; > >> + s->vdisk_hostinfo.qnio_cfd = qemu_qnio_cfd; > >> + s->vdisk_hostinfo.vdisk_rfd = qemu_rfd; > >> + s->vdisk_size = 0; > >> + > >> + /* > >> + * Create a pipe for communicating between two threads in different > >> + * context. 
Set handler for read event, which gets triggered when > >> + * IO completion is done by non-QEMU context. > >> + */ > >> + ret = qemu_pipe(s->fds); > >> + if (ret < 0) { > >> + trace_vxhs_open_epipe(ret); > >> + ret = -errno; > >> + goto errout; > >> + } > >> + fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK); > >> + > >> + aio_context = bdrv_get_aio_context(bs); > >> + aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ], > >> + false, vxhs_aio_event_reader, NULL, s); > >> + return 0; > >> + > >> +errout: > >> + /* > >> + * Close remote vDisk device if it was opened earlier > >> + */ > >> + if (device_opened) { > >> + vxhs_qnio_iio_close(s); > >> + } > >> + trace_vxhs_open_fail(ret); > >> + return ret; > >> +} > >> + > >> +static const AIOCBInfo vxhs_aiocb_info = { > >> + .aiocb_size = sizeof(VXHSAIOCB) > >> +}; > >> + > >> +/* > >> + * This allocates QEMU-VXHS callback for each IO > >> + * and is passed to QNIO. When QNIO completes the work, > >> + * it will be passed back through the callback. > >> + */ > >> +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t > >>sector_num, > >> + QEMUIOVector *qiov, int nb_sectors, > >> + BlockCompletionFunc *cb, void *opaque, > >>int iodir) > >> +{ > >> + VXHSAIOCB *acb = NULL; > >> + BDRVVXHSState *s = bs->opaque; > >> + size_t size; > >> + uint64_t offset; > >> + int iio_flags = 0; > >> + int ret = 0; > >> + void *qnio_ctx = s->qnio_ctx; > >> + uint32_t rfd = s->vdisk_hostinfo.vdisk_rfd; > >> + > >> + offset = sector_num * BDRV_SECTOR_SIZE; > >> + size = nb_sectors * BDRV_SECTOR_SIZE; > >> + acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque); > >> + /* > >> + * Setup or initialize VXHSAIOCB. > >> + * Every single field should be initialized since > >> + * acb will be picked up from the slab without > >> + * initializing with zero. 
> >> + */ > >> + acb->io_offset = offset; > >> + acb->size = size; > >> + acb->err = 0; > >> + acb->state = VXHS_IO_INPROGRESS; > >> + acb->qiov = qiov; > >> + acb->direction = iodir; > >> + > >> + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); > >> + > >> + switch (iodir) { > >> + case VDISK_AIO_WRITE: > >> + ret = iio_writev(qnio_ctx, rfd, qiov->iov, qiov->niov, > >> + offset, (uint64_t)size, (void *)acb, > >>iio_flags); > >> + break; > >> + case VDISK_AIO_READ: > >> + ret = iio_readv(qnio_ctx, rfd, qiov->iov, qiov->niov, > >> + offset, (uint64_t)size, (void *)acb, > >>iio_flags); > >> + break; > >> + default: > >> + trace_vxhs_aio_rw_invalid(iodir); > >> + goto errout; > >> + } > >> + > >> + if (ret != 0) { > >> + trace_vxhs_aio_rw_ioerr(s->vdisk_guid, iodir, size, offset, > >> + acb, ret, errno); > >> + goto errout; > >> + } > >> + return &acb->common; > >> + > >> +errout: > >> + qemu_aio_unref(acb); > >> + return NULL; > >> +} > >> + > >> +static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, > >> + int64_t sector_num, QEMUIOVector > >>*qiov, > >> + int nb_sectors, > >> + BlockCompletionFunc *cb, void > >>*opaque) > >> +{ > >> + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb, > >> + opaque, VDISK_AIO_READ); > >> +} > >> + > >> +static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, > >> + int64_t sector_num, QEMUIOVector > >>*qiov, > >> + int nb_sectors, > >> + BlockCompletionFunc *cb, void > >>*opaque) > >> +{ > >> + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, > >> + cb, opaque, VDISK_AIO_WRITE); > >> +} > >> + > >> +static void vxhs_close(BlockDriverState *bs) > >> +{ > >> + BDRVVXHSState *s = bs->opaque; > >> + > >> + trace_vxhs_close(s->vdisk_guid); > >> + close(s->fds[VDISK_FD_READ]); > >> + close(s->fds[VDISK_FD_WRITE]); > >> + > >> + /* > >> + * Clearing all the event handlers for oflame registered to QEMU > >> + */ > >> + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], > >> + false, NULL, NULL, NULL); > >> + 
g_free(s->vdisk_guid); > >> + s->vdisk_guid = NULL; > >> + vxhs_qnio_iio_close(s); > >> + > >> + /* > >> + * Free the dynamically allocated hostip string > >> + */ > >> + g_free(s->vdisk_hostinfo.hostip); > >> + s->vdisk_hostinfo.hostip = NULL; > >> + s->vdisk_hostinfo.port = 0; > >> +} > >> + > >> +static unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s) > > > >The functions that call this use its value as an int64_t, although this is > >returning an unsigned long. It should return int64_t. > > Modified the return value as suggested. > > > >> +{ > >> + int64_t vdisk_size = 0; > >> + int ret = 0; > >> + uint32_t rfd = s->vdisk_hostinfo.vdisk_rfd; > >> + > >> + ret = iio_ioctl(s->qnio_ctx, rfd, IOR_VDISK_STAT, &vdisk_size, > >>NULL, 0); > >> + if (ret < 0) { > >> + trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno); > >> + return 0; > > > >Repeat comment from RFC v7: > > > >Why is this returning 0 here, rather than 'ret'? Does IOR_VDIST_STAT > >return sane error codes? If not, instead of returning zero, map it to > >-EIO > >here. > Changed the return value to "-EIO" in case of error. > > > > >> + } > >> + > >> + trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size); > >> + return vdisk_size; > >> +} > >> + > >> +/* > >> + * Returns the size of vDisk in bytes. This is required > >> + * by QEMU block upper block layer so that it is visible > >> + * to guest. 
> >> + */ > >> +static int64_t vxhs_getlength(BlockDriverState *bs) > >> +{ > >> + BDRVVXHSState *s = bs->opaque; > >> + int64_t vdisk_size = 0; > >> + > >> + if (s->vdisk_size > 0) { > >> + vdisk_size = s->vdisk_size; > >> + } else { > >> + /* > >> + * Fetch the vDisk size using stat ioctl > >> + */ > >> + vdisk_size = vxhs_get_vdisk_stat(s); > >> + if (vdisk_size > 0) { > >> + s->vdisk_size = vdisk_size; > >> + } > >> + } > >> + > >> + if (vdisk_size > 0) { > >> + return vdisk_size; /* return size in bytes */ > >> + } > >> + > > > >Repeat comment from RFC v7: > > > >If vxhs_get_vdisk_stat() returned error rather than 0, then this check > >is unnecessary. You can just return vdisk_size. > > > >(Also: I don't think you need to return -EIO for a zero-sized image, do > >you?) > Modified the code as suggested. Zero size disk will return zero. > > > > >> + return -EIO; > >> +} > >> + > >> +/* > >> + * Returns actual blocks allocated for the vDisk. > > > >s/blocks/bytes > > > >> + * This is required by qemu-img utility. > >> + */ > >> +static int64_t vxhs_get_allocated_blocks(BlockDriverState *bs) > > > >s/blocks/bytes > > > >> +{ > >> + BDRVVXHSState *s = bs->opaque; > >> + int64_t vdisk_size = 0; > >> + > >> + if (s->vdisk_size > 0) { > >> + vdisk_size = s->vdisk_size; > >> + } else { > >> + /* > >> + * TODO: > >> + * Once HyperScale storage-virtualizer provides > >> + * actual physical allocation of blocks then > >> + * fetch that information and return back to the > >> + * caller but for now just get the full size. > >> + */ > >> + vdisk_size = vxhs_get_vdisk_stat(s); > >> + if (vdisk_size > 0) { > >> + s->vdisk_size = vdisk_size; > >> + } > >> + } > >> + > >> + if (vdisk_size > 0) { > >> + return vdisk_size; /* return size in bytes */ > >> + } > >> + > >> + return -EIO; > > > >Repeat comment from RFC v7: > > > >Since this is just identical to vxhs_getlength(), why not just call it > >here, > >and reduce code duplication? 
> > > >Or even better: do not implement this function. It isn't required. If it > >is not implemented, the block layer above this will just return -ENOTSUP, > >which in qapi will be interpreted as "unavailable". This function should > >be > >dropped, until it is something supported by VXHS. > > Removed vxhs_get_allocated_blocks() function. > > > >> +} > >> + > >> +static void vxhs_detach_aio_context(BlockDriverState *bs) > >> +{ > >> + BDRVVXHSState *s = bs->opaque; > >> + > >> + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], > >> + false, NULL, NULL, NULL); > >> + > >> +} > >> + > >> +static void vxhs_attach_aio_context(BlockDriverState *bs, > >> + AioContext *new_context) > >> +{ > >> + BDRVVXHSState *s = bs->opaque; > >> + > >> + aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ], > >> + false, vxhs_aio_event_reader, NULL, s); > >> +} > >> + > >> +static BlockDriver bdrv_vxhs = { > >> + .format_name = "vxhs", > >> + .protocol_name = "vxhs", > >> + .instance_size = sizeof(BDRVVXHSState), > >> + .bdrv_file_open = vxhs_open, > >> + .bdrv_parse_filename = vxhs_parse_filename, > >> + .bdrv_close = vxhs_close, > >> + .bdrv_getlength = vxhs_getlength, > >> + .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks, > >> + .bdrv_aio_readv = vxhs_aio_readv, > >> + .bdrv_aio_writev = vxhs_aio_writev, > >> + .bdrv_detach_aio_context = vxhs_detach_aio_context, > >> + .bdrv_attach_aio_context = vxhs_attach_aio_context, > >> +}; > >> + > >> +static void bdrv_vxhs_init(void) > >> +{ > >> + bdrv_register(&bdrv_vxhs); > >> +} > >> + > >> +block_init(bdrv_vxhs_init); > >> diff --git a/configure b/configure > >> index d3dafcb..4413e88 100755 > >> --- a/configure > >> +++ b/configure > >> @@ -321,6 +321,7 @@ numa="" > >> tcmalloc="no" > >> jemalloc="no" > >> replication="yes" > >> +vxhs="" > >> > >> # parse CC options first > >> for opt do > >> @@ -1162,6 +1163,11 @@ for opt do > >> ;; > >> --enable-replication) replication="yes" > >> ;; > >> + 
--disable-vxhs) vxhs="no" > >> + ;; > >> + --enable-vxhs) vxhs="yes" > >> + ;; > >> + > >> *) > >> echo "ERROR: unknown option $opt" > >> echo "Try '$0 --help' for more information" > >> @@ -1391,6 +1397,7 @@ disabled with --disable-FEATURE, default is > >>enabled if available: > >> tcmalloc tcmalloc support > >> jemalloc jemalloc support > >> replication replication support > >> + vxhs Veritas HyperScale vDisk backend support > >> > >> NOTE: The object files are built at the place where configure is > >>launched > >> EOF > >> @@ -4625,6 +4632,33 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o > >>$TMPMO $TMPO; then > >> fi > >> > >> ########################################## > >> +# Veritas HyperScale block driver VxHS > >> +# Check if libqnio is installed > >> + > >> +if test "$vxhs" != "no" ; then > >> + cat > $TMPC <<EOF > >> +#include <stdint.h> > >> +#include <qnio/qnio_api.h> > >> + > >> +void *vxhs_callback; > >> + > >> +int main(void) { > >> + iio_init(vxhs_callback); > >> + return 0; > >> +} > >> +EOF > >> + vxhs_libs="-lqnio" > >> + if compile_prog "" "$vxhs_libs" ; then > >> + vxhs=yes > >> + else > >> + if test "$vxhs" = "yes" ; then > >> + feature_not_found "vxhs block device" "Install libqnio. See > >>github" > >> + fi > >> + vxhs=no > >> + fi > >> +fi > > > >How is libqnio going to be distributed normally? Should we be checking > >for > >a minimum version here to compile against? > > Yes, version check is required. We will add the changes in separate patch. 
> > > >> + > >> +########################################## > >> # End of CC checks > >> # After here, no more $cc or $ld runs > >> > >> @@ -4990,6 +5024,7 @@ echo "tcmalloc support $tcmalloc" > >> echo "jemalloc support $jemalloc" > >> echo "avx2 optimization $avx2_opt" > >> echo "replication support $replication" > >> +echo "VxHS block device $vxhs" > >> > >> if test "$sdl_too_old" = "yes"; then > >> echo "-> Your SDL version is too old - please upgrade to have SDL > >>support" > >> @@ -5590,6 +5625,12 @@ if test "$pthread_setname_np" = "yes" ; then > >> echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak > >> fi > >> > >> +if test "$vxhs" = "yes" ; then > >> + echo "CONFIG_VXHS=y" >> $config_host_mak > >> + echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak > >> + echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak > >> +fi > >> + > >> if test "$tcg_interpreter" = "yes"; then > >> QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES" > >> elif test "$ARCH" = "sparc64" ; then > >> -- > >> 2.5.5 > >> > > > >
Jeff Cody <jcody@redhat.com> writes: > On Wed, Oct 26, 2016 at 09:33:30PM +0000, Buddhi Madhav wrote: >> >> >> On 10/25/16, 9:41 PM, "Jeff Cody" <jcody@redhat.com> wrote: >> >> >On Tue, Oct 25, 2016 at 03:02:07PM -0700, Ashish Mittal wrote: >> >> This patch adds support for a new block device type called "vxhs". >> >> Source code for the library that this code loads can be downloaded from: >> >> https://github.com/MittalAshish/libqnio.git >> >> >> > >> >I grabbed the latest of libqnio, compiled it (had to disable -Werror), and >> >tried it out. I was able to do a qemu-img info on a raw file, but it >> >would >> >just hang when trying a format such as qcow2. I am assuming >> >this is a limitation of test_server, and not libqnio. >> >> On my build I did not get any build errors. >> > > Likely a difference in gcc versions; I am compiling with gcc 6.2.1. Here > are the warnings I get (this is just for your benefit, not really part of > this driver review): > > lib/qnio/cJSON.c: In function ‘cJSON_strcasecmp’: > lib/qnio/cJSON.c:41:5: warning: this ‘if’ clause does not guard... [-Wmisleading-indentation] > if (!s1) return (s1==s2)?0:1;if (!s2) return 1; > ^~ > lib/qnio/cJSON.c:41:34: note: ...this statement, but the latter is misleadingly indented as if it is guarded by the ‘if’ > if (!s1) return (s1==s2)?0:1;if (!s2) return 1; > ^~ > lib/qnio/cJSON.c: In function ‘print_object’: > lib/qnio/cJSON.c:440:9: warning: this ‘if’ clause does not guard... [-Wmisleading-indentation] > if (fmt) *ptr++='\n';*ptr=0; > ^~ > lib/qnio/cJSON.c:440:30: note: ...this statement, but the latter is misleadingly indented as if it is guarded by the ‘if’ > if (fmt) *ptr++='\n';*ptr=0; > ^ > lib/qnio/cJSON.c: In function ‘cJSON_DetachItemFromArray’: > lib/qnio/cJSON.c:467:5: warning: this ‘if’ clause does not guard... 
[-Wmisleading-indentation] > if (c->prev) c->prev->next=c->next;if (c->next) c->next->prev=c->prev;if (c==array->child) array->child=c->next;c->prev=c->next=0;return c;} > ^~ > lib/qnio/cJSON.c:467:40: note: ...this statement, but the latter is misleadingly indented as if it is guarded by the ‘if’ > if (c->prev) c->prev->next=c->next;if (c->next) c->next->prev=c->prev;if (c==array->child) array->child=c->next;c->prev=c->next=0;return c;} Looks like this code hasn't been competently reviewed, not even fed to a state-of-the-art static analyzer. As long as that remains the case, linking with it outside experimental settings feels unadvisable. [...]
On Wed, Oct 26, 2016 at 09:33:30PM +0000, Buddhi Madhav wrote: > > > On 10/25/16, 9:41 PM, "Jeff Cody" <jcody@redhat.com> wrote: > > >On Tue, Oct 25, 2016 at 03:02:07PM -0700, Ashish Mittal wrote: > >> This patch adds support for a new block device type called "vxhs". > >> Source code for the library that this code loads can be downloaded from: > >> https://github.com/MittalAshish/libqnio.git > >> > > > >I grabbed the latest of libqnio, compiled it (had to disable -Werror), and > >tried it out. I was able to do a qemu-img info on a raw file, but it > >would > >just hang when trying a format such as qcow2. I am assuming > >this is a limitation of test_server, and not libqnio. > > On my build I did not get any build errors. > > The qcow2 issue is to do with the limitation in the test server, which we > will fix in a separate patch. > > > > >This will make qemu-iotests more difficult however. > > > >I haven't looked at the latest qnio code yet (other than compiling the > >test-server to test), so the rest of this review is on the qemu driver. 
> > > >> Sample command line using JSON syntax: > >> ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us > >>-vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 > >>-msg timestamp=on > >>'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410} > >>","server":[{"host":"172.172.17.4","port":"9999"}]}' > >> > >> Sample command line using URI syntax: > >> qemu-img convert -f raw -O raw -n > >>/var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad > >>vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D > >> > >> Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com> > >> --- > >> block/Makefile.objs | 2 + > >> block/trace-events | 22 ++ > >> block/vxhs.c | 736 > >>++++++++++++++++++++++++++++++++++++++++++++++++++++ > >> configure | 41 +++jj > >> 4 files changed, 801 insertions(+) > >> create mode 100644 block/vxhs.c > > > >I think this version still does not address Daniel's concerns regarding a > >QAPI schema for vxhs. > > We are working on QAPI schema changes and will submit them in separate > patch. They should be part of the main patch, not a separate patch as they are a pre-requisite for any new block driver. Regards, Daniel
diff --git a/block/Makefile.objs b/block/Makefile.objs index 67a036a..58313a2 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -18,6 +18,7 @@ block-obj-$(CONFIG_LIBNFS) += nfs.o block-obj-$(CONFIG_CURL) += curl.o block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o +block-obj-$(CONFIG_VXHS) += vxhs.o block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o block-obj-$(CONFIG_LIBSSH2) += ssh.o block-obj-y += accounting.o dirty-bitmap.o @@ -38,6 +39,7 @@ rbd.o-cflags := $(RBD_CFLAGS) rbd.o-libs := $(RBD_LIBS) gluster.o-cflags := $(GLUSTERFS_CFLAGS) gluster.o-libs := $(GLUSTERFS_LIBS) +vxhs.o-libs := $(VXHS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) archipelago.o-libs := $(ARCHIPELAGO_LIBS) diff --git a/block/trace-events b/block/trace-events index 05fa13c..aea97cb 100644 --- a/block/trace-events +++ b/block/trace-events @@ -114,3 +114,25 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" + +# block/vxhs.c +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, reason %d" +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p" +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d" +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d" +vxhs_open_fail(int ret) "Could not open the device. Error = %d" +vxhs_open_epipe(int ret) "Could not create a pipe for device. Bailing out. 
Error=%d" +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d" +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %lu offset = %lu ACB = %p. Error = %d, errno = %d" +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d" +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %lu" +vxhs_qnio_iio_open(const char *ip) "Failed to connect to storage agent on host-ip %s" +vxhs_qnio_iio_devopen(const char *fname) "Failed to open vdisk device: %s" +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld" +vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s" +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s" +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = %d" +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP %s, Port %d" +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState" +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s" +vxhs_close(char *vdisk_guid) "Closing vdisk %s" diff --git a/block/vxhs.c b/block/vxhs.c new file mode 100644 index 0000000..97fb804 --- /dev/null +++ b/block/vxhs.c @@ -0,0 +1,736 @@ +/* + * QEMU Block driver for Veritas HyperScale (VxHS) + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "block/block_int.h" +#include <qnio/qnio_api.h> +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qstring.h" +#include "trace.h" +#include "qemu/uri.h" +#include "qapi/error.h" +#include "qemu/error-report.h" + +#define VDISK_FD_READ 0 +#define VDISK_FD_WRITE 1 + +#define VXHS_OPT_FILENAME "filename" +#define VXHS_OPT_VDISK_ID "vdisk_id" +#define VXHS_OPT_SERVER "server." +#define VXHS_OPT_HOST "host" +#define VXHS_OPT_PORT "port" + +/* qnio client ioapi_ctx */ +static void *global_qnio_ctx; + +/* vdisk prefix to pass to qnio */ +static const char vdisk_prefix[] = "/dev/of/vdisk"; + +typedef enum { + VXHS_IO_INPROGRESS, + VXHS_IO_COMPLETED, + VXHS_IO_ERROR +} VXHSIOState; + +typedef enum { + VDISK_AIO_READ, + VDISK_AIO_WRITE, + VDISK_STAT +} VDISKAIOCmd; + +/* + * HyperScale AIO callbacks structure + */ +typedef struct VXHSAIOCB { + BlockAIOCB common; + int err; + int state; + int direction; /* IO direction (r/w) */ + size_t io_offset; + size_t size; + QEMUBH *bh; + QEMUIOVector *qiov; + QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry; +} VXHSAIOCB; + +typedef struct VXHSvDiskHostsInfo { + int qnio_cfd; /* Channel FD */ + int vdisk_rfd; /* vDisk remote FD */ + char *hostip; /* Host's IP addresses */ + int port; /* Host's port number */ +} VXHSvDiskHostsInfo; + +/* + * Structure per vDisk maintained for state + */ +typedef struct BDRVVXHSState { + int fds[2]; + int64_t vdisk_size; + int64_t vdisk_blocks; + int event_reader_pos; + VXHSAIOCB *qnio_event_acb; + void *qnio_ctx; + VXHSvDiskHostsInfo vdisk_hostinfo; /* Per host info */ + char *vdisk_guid; +} BDRVVXHSState; + +static void vxhs_qnio_iio_close(BDRVVXHSState *s) +{ + /* + * Close vDisk device + */ + if (s->vdisk_hostinfo.vdisk_rfd >= 0) { + iio_devclose(s->qnio_ctx, 0, s->vdisk_hostinfo.vdisk_rfd); + s->vdisk_hostinfo.vdisk_rfd = -1; + } + + /* + * Close QNIO channel against cached channel-fd + */ + if (s->vdisk_hostinfo.qnio_cfd >= 0) { + 
iio_close(s->qnio_ctx, s->vdisk_hostinfo.qnio_cfd); + s->vdisk_hostinfo.qnio_cfd = -1; + } +} + +static int vxhs_qnio_iio_open(int *cfd, const char *of_vsa_addr, + int *rfd, const char *file_name) +{ + /* + * Open qnio channel to storage agent if not opened before. + */ + if (*cfd < 0) { + *cfd = iio_open(global_qnio_ctx, of_vsa_addr, 0); + if (*cfd < 0) { + trace_vxhs_qnio_iio_open(of_vsa_addr); + return -ENODEV; + } + } + + /* + * Open vdisk device + */ + *rfd = iio_devopen(global_qnio_ctx, *cfd, file_name, 0); + if (*rfd < 0) { + if (*cfd >= 0) { + iio_close(global_qnio_ctx, *cfd); + *cfd = -1; + *rfd = -1; + } + + trace_vxhs_qnio_iio_devopen(file_name); + return -ENODEV; + } + + return 0; +} + +static void vxhs_iio_callback(int32_t rfd, uint32_t reason, void *ctx, + uint32_t error, uint32_t opcode) +{ + VXHSAIOCB *acb = NULL; + BDRVVXHSState *s = NULL; + ssize_t ret; + + switch (opcode) { + case IRP_READ_REQUEST: + case IRP_WRITE_REQUEST: + + /* + * ctx is VXHSAIOCB* + * ctx is NULL if error is QNIOERROR_CHANNEL_HUP or + * reason is IIO_REASON_HUP + */ + if (ctx) { + acb = ctx; + s = acb->common.bs->opaque; + } else { + trace_vxhs_iio_callback(error, reason); + goto out; + } + + if (error) { + if (!acb->err) { + acb->err = error; + } + trace_vxhs_iio_callback(error, reason); + } + + ret = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb)); + g_assert(ret == sizeof(acb)); + break; + + default: + if (error == QNIOERROR_CHANNEL_HUP) { + /* + * Channel failed, spontaneous notification, + * not in response to I/O + */ + trace_vxhs_iio_callback_chnfail(error, errno); + } else { + trace_vxhs_iio_callback_unknwn(opcode, error); + } + break; + } +out: + return; +} + +static void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s) +{ + BlockCompletionFunc *cb = acb->common.cb; + void *opaque = acb->common.opaque; + int ret = 0; + + if (acb->err != 0) { + trace_vxhs_complete_aio(acb, acb->err); + /* + * We mask all the IO errors generically as EIO for upper layers 
+ * Right now our IO Manager uses non standard error codes. Instead + * of confusing upper layers with incorrect interpretation we are + * doing this workaround. + */ + ret = (-EIO); + } + + acb->state = VXHS_IO_COMPLETED; + qemu_aio_unref(acb); + cb(opaque, ret); +} + +/* + * This is the HyperScale event handler registered to QEMU. + * It is invoked when any IO gets completed and written on pipe + * by callback called from QNIO thread context. Then it marks + * the AIO as completed, and releases HyperScale AIO callbacks. + */ +static void vxhs_aio_event_reader(void *opaque) +{ + BDRVVXHSState *s = opaque; + char *p; + ssize_t ret; + + do { + p = (char *)&s->qnio_event_acb; + ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos, + sizeof(s->qnio_event_acb) - s->event_reader_pos); + if (ret > 0) { + s->event_reader_pos += ret; + if (s->event_reader_pos == sizeof(s->qnio_event_acb)) { + s->event_reader_pos = 0; + vxhs_complete_aio(s->qnio_event_acb, s); + } + } + } while (ret < 0 && errno == EINTR); +} + +/* + * Call QNIO operation to create channels to do IO on vDisk. 
+ */ + +static void *vxhs_setup_qnio(void) +{ + void *qnio_ctx = NULL; + + qnio_ctx = iio_init(vxhs_iio_callback); + if (qnio_ctx != NULL) { + trace_vxhs_setup_qnio(qnio_ctx); + } + return qnio_ctx; +} + +static QemuOptsList runtime_opts = { + .name = "vxhs", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = VXHS_OPT_FILENAME, + .type = QEMU_OPT_STRING, + .help = "URI to the Veritas HyperScale image", + }, + { + .name = VXHS_OPT_VDISK_ID, + .type = QEMU_OPT_STRING, + .help = "UUID of the VxHS vdisk", + }, + { /* end of list */ } + }, +}; + +static QemuOptsList runtime_tcp_opts = { + .name = "vxhs_tcp", + .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head), + .desc = { + { + .name = VXHS_OPT_HOST, + .type = QEMU_OPT_STRING, + .help = "host address (ipv4 addresses)", + }, + { + .name = VXHS_OPT_PORT, + .type = QEMU_OPT_NUMBER, + .help = "port number on which VxHSD is listening (default 9999)", + .def_value_str = "9999" + }, + { /* end of list */ } + }, +}; + +/* + * Parse the incoming URI and populate *options with the host information. + * URI syntax has the limitation of supporting only one host info. + * To pass multiple host information, use the JSON syntax. 
+ */ +static int vxhs_parse_uri(const char *filename, QDict *options) +{ + URI *uri = NULL; + char *hoststr, *portstr; + char *port; + int ret = 0; + + trace_vxhs_parse_uri_filename(filename); + uri = uri_parse(filename); + if (!uri || !uri->server || !uri->path) { + uri_free(uri); + return -EINVAL; + } + + hoststr = g_strdup(VXHS_OPT_SERVER"0.host"); + qdict_put(options, hoststr, qstring_from_str(uri->server)); + g_free(hoststr); + + portstr = g_strdup(VXHS_OPT_SERVER"0.port"); + if (uri->port) { + port = g_strdup_printf("%d", uri->port); + qdict_put(options, portstr, qstring_from_str(port)); + g_free(port); + } + g_free(portstr); + + if (strstr(uri->path, "vxhs") == NULL) { + qdict_put(options, "vdisk_id", qstring_from_str(uri->path)); + } + + trace_vxhs_parse_uri_hostinfo(1, uri->server, uri->port); + uri_free(uri); + + return ret; +} + +static void vxhs_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + if (qdict_haskey(options, "vdisk_id") || qdict_haskey(options, "server")) { + error_setg(errp, "vdisk_id/server and a file name may not be specified " + "at the same time"); + return; + } + + if (strstr(filename, "://")) { + int ret = vxhs_parse_uri(filename, options); + if (ret < 0) { + error_setg(errp, "Invalid URI. 
URI should be of the form " + " vxhs://<host_ip>:<port>/{<vdisk_id>}"); + } + } +} + +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s, + int *cfd, int *rfd, Error **errp) +{ + QDict *backing_options = NULL; + QemuOpts *opts, *tcp_opts; + const char *vxhs_filename; + char *of_vsa_addr = NULL; + Error *local_err = NULL; + const char *vdisk_id_opt; + char *file_name = NULL; + size_t num_servers = 0; + char *str = NULL; + int ret = 0; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } + + vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME); + if (vxhs_filename) { + trace_vxhs_qemu_init_filename(vxhs_filename); + } + + vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID); + if (!vdisk_id_opt) { + error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID); + ret = -EINVAL; + goto out; + } + s->vdisk_guid = g_strdup(vdisk_id_opt); + trace_vxhs_qemu_init_vdisk(vdisk_id_opt); + + num_servers = qdict_array_entries(options, VXHS_OPT_SERVER); + if (num_servers < 1) { + error_setg(&local_err, QERR_MISSING_PARAMETER, "server"); + ret = -EINVAL; + goto out; + } else if (num_servers > 1) { + error_setg(&local_err, QERR_INVALID_PARAMETER, "server"); + error_append_hint(errp, "Only one server allowed.\n"); + ret = -EINVAL; + goto out; + } + trace_vxhs_qemu_init_numservers(num_servers); + + str = g_strdup_printf(VXHS_OPT_SERVER"0."); + qdict_extract_subqdict(options, &backing_options, str); + + /* Create opts info from runtime_tcp_opts list */ + tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err); + if (local_err) { + qdict_del(backing_options, str); + qemu_opts_del(tcp_opts); + g_free(str); + ret = -EINVAL; + goto out; + } + + s->vdisk_hostinfo.hostip = g_strdup(qemu_opt_get(tcp_opts, + VXHS_OPT_HOST)); + s->vdisk_hostinfo.port 
= g_ascii_strtoll(qemu_opt_get(tcp_opts, + VXHS_OPT_PORT), + NULL, 0); + + s->vdisk_hostinfo.qnio_cfd = -1; + s->vdisk_hostinfo.vdisk_rfd = -1; + trace_vxhs_qemu_init(s->vdisk_hostinfo.hostip, + s->vdisk_hostinfo.port); + + qdict_del(backing_options, str); + qemu_opts_del(tcp_opts); + g_free(str); + + file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid); + of_vsa_addr = g_strdup_printf("of://%s:%d", + s->vdisk_hostinfo.hostip, + s->vdisk_hostinfo.port); + + /* + * .bdrv_open() and .bdrv_create() run under the QEMU global mutex. + */ + if (global_qnio_ctx == NULL) { + global_qnio_ctx = vxhs_setup_qnio(); + if (global_qnio_ctx == NULL) { + error_setg(&local_err, "Failed vxhs_setup_qnio"); + ret = -EINVAL; + goto out; + } + } + + ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name); + if (ret) { + error_setg(&local_err, "Failed qnio_iio_open"); + ret = -EIO; + } + +out: + g_free(file_name); + g_free(of_vsa_addr); + qemu_opts_del(opts); + + if (ret < 0) { + g_free(s->vdisk_hostinfo.hostip); + g_free(s->vdisk_guid); + s->vdisk_guid = NULL; + errno = -ret; + } + error_propagate(errp, local_err); + + return ret; +} + +static int vxhs_open(BlockDriverState *bs, QDict *options, + int bdrv_flags, Error **errp) +{ + BDRVVXHSState *s = bs->opaque; + AioContext *aio_context; + int qemu_qnio_cfd = -1; + int device_opened = 0; + int qemu_rfd = -1; + int ret = 0; + + ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp); + if (ret < 0) { + trace_vxhs_open_fail(ret); + return ret; + } + + device_opened = 1; + s->qnio_ctx = global_qnio_ctx; + s->vdisk_hostinfo.qnio_cfd = qemu_qnio_cfd; + s->vdisk_hostinfo.vdisk_rfd = qemu_rfd; + s->vdisk_size = 0; + + /* + * Create a pipe for communicating between two threads in different + * context. Set handler for read event, which gets triggered when + * IO completion is done by non-QEMU context. 
+ */ + ret = qemu_pipe(s->fds); + if (ret < 0) { + trace_vxhs_open_epipe(ret); + ret = -errno; + goto errout; + } + fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK); + + aio_context = bdrv_get_aio_context(bs); + aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ], + false, vxhs_aio_event_reader, NULL, s); + return 0; + +errout: + /* + * Close remote vDisk device if it was opened earlier + */ + if (device_opened) { + vxhs_qnio_iio_close(s); + } + trace_vxhs_open_fail(ret); + return ret; +} + +static const AIOCBInfo vxhs_aiocb_info = { + .aiocb_size = sizeof(VXHSAIOCB) +}; + +/* + * This allocates QEMU-VXHS callback for each IO + * and is passed to QNIO. When QNIO completes the work, + * it will be passed back through the callback. + */ +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int iodir) +{ + VXHSAIOCB *acb = NULL; + BDRVVXHSState *s = bs->opaque; + size_t size; + uint64_t offset; + int iio_flags = 0; + int ret = 0; + void *qnio_ctx = s->qnio_ctx; + uint32_t rfd = s->vdisk_hostinfo.vdisk_rfd; + + offset = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque); + /* + * Setup or initialize VXHSAIOCB. + * Every single field should be initialized since + * acb will be picked up from the slab without + * initializing with zero. 
+ */ + acb->io_offset = offset; + acb->size = size; + acb->err = 0; + acb->state = VXHS_IO_INPROGRESS; + acb->qiov = qiov; + acb->direction = iodir; + + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); + + switch (iodir) { + case VDISK_AIO_WRITE: + ret = iio_writev(qnio_ctx, rfd, qiov->iov, qiov->niov, + offset, (uint64_t)size, (void *)acb, iio_flags); + break; + case VDISK_AIO_READ: + ret = iio_readv(qnio_ctx, rfd, qiov->iov, qiov->niov, + offset, (uint64_t)size, (void *)acb, iio_flags); + break; + default: + trace_vxhs_aio_rw_invalid(iodir); + goto errout; + } + + if (ret != 0) { + trace_vxhs_aio_rw_ioerr(s->vdisk_guid, iodir, size, offset, + acb, ret, errno); + goto errout; + } + return &acb->common; + +errout: + qemu_aio_unref(acb); + return NULL; +} + +static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb, + opaque, VDISK_AIO_READ); +} + +static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, + cb, opaque, VDISK_AIO_WRITE); +} + +static void vxhs_close(BlockDriverState *bs) +{ + BDRVVXHSState *s = bs->opaque; + + trace_vxhs_close(s->vdisk_guid); + close(s->fds[VDISK_FD_READ]); + close(s->fds[VDISK_FD_WRITE]); + + /* + * Clearing all the event handlers for oflame registered to QEMU + */ + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], + false, NULL, NULL, NULL); + g_free(s->vdisk_guid); + s->vdisk_guid = NULL; + vxhs_qnio_iio_close(s); + + /* + * Free the dynamically allocated hostip string + */ + g_free(s->vdisk_hostinfo.hostip); + s->vdisk_hostinfo.hostip = NULL; + s->vdisk_hostinfo.port = 0; +} + +static unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s) +{ + int64_t vdisk_size = 0; + int ret = 0; + uint32_t rfd = 
s->vdisk_hostinfo.vdisk_rfd; + + ret = iio_ioctl(s->qnio_ctx, rfd, IOR_VDISK_STAT, &vdisk_size, NULL, 0); + if (ret < 0) { + trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno); + return 0; + } + + trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size); + return vdisk_size; +} + +/* + * Returns the size of vDisk in bytes. This is required + * by QEMU block upper block layer so that it is visible + * to guest. + */ +static int64_t vxhs_getlength(BlockDriverState *bs) +{ + BDRVVXHSState *s = bs->opaque; + int64_t vdisk_size = 0; + + if (s->vdisk_size > 0) { + vdisk_size = s->vdisk_size; + } else { + /* + * Fetch the vDisk size using stat ioctl + */ + vdisk_size = vxhs_get_vdisk_stat(s); + if (vdisk_size > 0) { + s->vdisk_size = vdisk_size; + } + } + + if (vdisk_size > 0) { + return vdisk_size; /* return size in bytes */ + } + + return -EIO; +} + +/* + * Returns actual blocks allocated for the vDisk. + * This is required by qemu-img utility. + */ +static int64_t vxhs_get_allocated_blocks(BlockDriverState *bs) +{ + BDRVVXHSState *s = bs->opaque; + int64_t vdisk_size = 0; + + if (s->vdisk_size > 0) { + vdisk_size = s->vdisk_size; + } else { + /* + * TODO: + * Once HyperScale storage-virtualizer provides + * actual physical allocation of blocks then + * fetch that information and return back to the + * caller but for now just get the full size. 
+ */ + vdisk_size = vxhs_get_vdisk_stat(s); + if (vdisk_size > 0) { + s->vdisk_size = vdisk_size; + } + } + + if (vdisk_size > 0) { + return vdisk_size; /* return size in bytes */ + } + + return -EIO; +} + +static void vxhs_detach_aio_context(BlockDriverState *bs) +{ + BDRVVXHSState *s = bs->opaque; + + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], + false, NULL, NULL, NULL); + +} + +static void vxhs_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVVXHSState *s = bs->opaque; + + aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ], + false, vxhs_aio_event_reader, NULL, s); +} + +static BlockDriver bdrv_vxhs = { + .format_name = "vxhs", + .protocol_name = "vxhs", + .instance_size = sizeof(BDRVVXHSState), + .bdrv_file_open = vxhs_open, + .bdrv_parse_filename = vxhs_parse_filename, + .bdrv_close = vxhs_close, + .bdrv_getlength = vxhs_getlength, + .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks, + .bdrv_aio_readv = vxhs_aio_readv, + .bdrv_aio_writev = vxhs_aio_writev, + .bdrv_detach_aio_context = vxhs_detach_aio_context, + .bdrv_attach_aio_context = vxhs_attach_aio_context, +}; + +static void bdrv_vxhs_init(void) +{ + bdrv_register(&bdrv_vxhs); +} + +block_init(bdrv_vxhs_init); diff --git a/configure b/configure index d3dafcb..4413e88 100755 --- a/configure +++ b/configure @@ -321,6 +321,7 @@ numa="" tcmalloc="no" jemalloc="no" replication="yes" +vxhs="" # parse CC options first for opt do @@ -1162,6 +1163,11 @@ for opt do ;; --enable-replication) replication="yes" ;; + --disable-vxhs) vxhs="no" + ;; + --enable-vxhs) vxhs="yes" + ;; + *) echo "ERROR: unknown option $opt" echo "Try '$0 --help' for more information" @@ -1391,6 +1397,7 @@ disabled with --disable-FEATURE, default is enabled if available: tcmalloc tcmalloc support jemalloc jemalloc support replication replication support + vxhs Veritas HyperScale vDisk backend support NOTE: The object files are built at the place where configure is launched EOF @@ 
-4625,6 +4632,33 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO $TMPO; then fi ########################################## +# Veritas HyperScale block driver VxHS +# Check if libqnio is installed + +if test "$vxhs" != "no" ; then + cat > $TMPC <<EOF +#include <stdint.h> +#include <qnio/qnio_api.h> + +void *vxhs_callback; + +int main(void) { + iio_init(vxhs_callback); + return 0; +} +EOF + vxhs_libs="-lqnio" + if compile_prog "" "$vxhs_libs" ; then + vxhs=yes + else + if test "$vxhs" = "yes" ; then + feature_not_found "vxhs block device" "Install libqnio. See github" + fi + vxhs=no + fi +fi + +########################################## # End of CC checks # After here, no more $cc or $ld runs @@ -4990,6 +5024,7 @@ echo "tcmalloc support $tcmalloc" echo "jemalloc support $jemalloc" echo "avx2 optimization $avx2_opt" echo "replication support $replication" +echo "VxHS block device $vxhs" if test "$sdl_too_old" = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -5590,6 +5625,12 @@ if test "$pthread_setname_np" = "yes" ; then echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak fi +if test "$vxhs" = "yes" ; then + echo "CONFIG_VXHS=y" >> $config_host_mak + echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak + echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak +fi + if test "$tcg_interpreter" = "yes"; then QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES" elif test "$ARCH" = "sparc64" ; then
This patch adds support for a new block device type called "vxhs". Source code for the library that this code loads can be downloaded from: https://github.com/MittalAshish/libqnio.git Sample command line using JSON syntax: ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"}]}' Sample command line using URI syntax: qemu-img convert -f raw -O raw -n /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D Signed-off-by: Ashish Mittal <ashish.mittal@veritas.com> --- block/Makefile.objs | 2 + block/trace-events | 22 ++ block/vxhs.c | 736 ++++++++++++++++++++++++++++++++++++++++++++++++++++ configure | 41 +++ 4 files changed, 801 insertions(+) create mode 100644 block/vxhs.c