Message ID: 20190109112728.9214-5-xieyongji@baidu.com (mailing list archive)
State: New, archived
Series: vhost-user-blk: Add support for backend reconnecting
On 2019/1/9 7:27 PM, elohimes@gmail.com wrote:
> From: Xie Yongji <xieyongji@baidu.com>
>
> This patch adds support for VHOST_USER_GET_INFLIGHT_FD and
> VHOST_USER_SET_INFLIGHT_FD message to set/get shared memory
> to/from qemu. Then we maintain a "bitmap" of all descriptors in
> the shared memory for each queue to track inflight I/O.
>
> Signed-off-by: Xie Yongji <xieyongji@baidu.com>
> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> ---
>  Makefile                              |   2 +-
>  contrib/libvhost-user/libvhost-user.c | 258 ++++++++++++++++++++++++--
>  contrib/libvhost-user/libvhost-user.h |  29 +++
>  3 files changed, 268 insertions(+), 21 deletions(-)
>
> diff --git a/Makefile b/Makefile
> index dd53965f77..b5c9092605 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -473,7 +473,7 @@ Makefile: $(version-obj-y)
>  # Build libraries
>
>  libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
> -libvhost-user.a: $(libvhost-user-obj-y)
> +libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
>
>  ######################################################################
>
> diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> index 23bd52264c..e73ce04619 100644
> --- a/contrib/libvhost-user/libvhost-user.c
> +++ b/contrib/libvhost-user/libvhost-user.c
> @@ -41,6 +41,8 @@
>  #endif
>
>  #include "qemu/atomic.h"
> +#include "qemu/osdep.h"
> +#include "qemu/memfd.h"
>
>  #include "libvhost-user.h"
>
> @@ -53,6 +55,18 @@
>          _min1 < _min2 ? _min1 : _min2; })
>  #endif
>
> +/* Round number down to multiple */
> +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> +
> +/* Round number up to multiple */
> +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> +
> +/* Align each region to cache line size in inflight buffer */
> +#define INFLIGHT_ALIGNMENT 64
> +
> +/* The version of inflight buffer */
> +#define INFLIGHT_VERSION 1
> +
>  #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
>
>  /* The version of the protocol we support */
> @@ -66,6 +80,20 @@
>      }                                       \
>  } while (0)
>
> +static inline
> +bool has_feature(uint64_t features, unsigned int fbit)
> +{
> +    assert(fbit < 64);
> +    return !!(features & (1ULL << fbit));
> +}
> +
> +static inline
> +bool vu_has_feature(VuDev *dev,
> +                    unsigned int fbit)
> +{
> +    return has_feature(dev->features, fbit);
> +}
> +
>  static const char *
>  vu_request_to_string(unsigned int req)
>  {
> @@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
>          REQ(VHOST_USER_POSTCOPY_ADVISE),
>          REQ(VHOST_USER_POSTCOPY_LISTEN),
>          REQ(VHOST_USER_POSTCOPY_END),
> +        REQ(VHOST_USER_GET_INFLIGHT_FD),
> +        REQ(VHOST_USER_SET_INFLIGHT_FD),
>          REQ(VHOST_USER_MAX),
>      };
>  #undef REQ
> @@ -890,6 +920,41 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
>      return true;
>  }
>
> +static int
> +vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
> +{
> +    int i = 0;
> +
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +
> +    vq->used_idx = vq->vring.used->idx;
> +    vq->inflight_num = 0;
> +    for (i = 0; i < vq->vring.num; i++) {
> +        if (vq->inflight->desc[i] == 0) {
> +            continue;
> +        }
> +
> +        vq->inflight_desc[vq->inflight_num++] = i;
> +        vq->inuse++;
> +    }
> +    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
> +
> +    /* in case of I/O hang after reconnecting */
> +    if (eventfd_write(vq->kick_fd, 1) ||
> +        eventfd_write(vq->call_fd, 1)) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>  static bool
>  vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
>  {
> @@ -925,6 +990,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
>                 dev->vq[index].kick_fd, index);
>      }
>
> +    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
> +        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
> +    }
> +
>      return false;
>  }
>
> @@ -1215,6 +1284,117 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
>      return true;
>  }
>
> +static bool
> +vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> +{
> +    int fd;
> +    void *addr;
> +    uint64_t mmap_size;
> +
> +    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
> +        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
> +        vmsg->payload.inflight.mmap_size = 0;
> +        return true;
> +    }
> +
> +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n",
> +           vmsg->payload.inflight.num_queues);
> +
> +    mmap_size = vmsg->payload.inflight.num_queues *
> +                ALIGN_UP(sizeof(VuVirtqInflight), INFLIGHT_ALIGNMENT);
> +
> +    addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
> +                            F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
> +                            &fd, NULL);
> +
> +    if (!addr) {
> +        vu_panic(dev, "Failed to alloc vhost inflight area");
> +        vmsg->payload.inflight.mmap_size = 0;
> +        return true;
> +    }
> +
> +    dev->inflight_info.addr = addr;
> +    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
> +    vmsg->payload.inflight.mmap_offset = 0;
> +    vmsg->payload.inflight.align = INFLIGHT_ALIGNMENT;
> +    vmsg->payload.inflight.version = INFLIGHT_VERSION;
> +    vmsg->fd_num = 1;
> +    dev->inflight_info.fd = vmsg->fds[0] = fd;
> +
> +    DPRINT("send inflight mmap_size: %"PRId64"\n",
> +           vmsg->payload.inflight.mmap_size);
> +    DPRINT("send inflight mmap offset: %"PRId64"\n",
> +           vmsg->payload.inflight.mmap_offset);
> +    DPRINT("send inflight align: %"PRId32"\n",
> +           vmsg->payload.inflight.align);
> +    DPRINT("send inflight version: %"PRId16"\n",
> +           vmsg->payload.inflight.version);
> +
> +    return true;
> +}
> +
> +static bool
> +vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> +{
> +    int fd, i;
> +    uint64_t mmap_size, mmap_offset;
> +    uint32_t align;
> +    uint16_t num_queues, version;
> +    void *rc;
> +
> +    if (vmsg->fd_num != 1 ||
> +        vmsg->size != sizeof(vmsg->payload.inflight)) {
> +        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
> +                 vmsg->size, vmsg->fd_num);
> +        return false;
> +    }
> +
> +    fd = vmsg->fds[0];
> +    mmap_size = vmsg->payload.inflight.mmap_size;
> +    mmap_offset = vmsg->payload.inflight.mmap_offset;
> +    align = vmsg->payload.inflight.align;
> +    num_queues = vmsg->payload.inflight.num_queues;
> +    version = vmsg->payload.inflight.version;
> +
> +    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
> +    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
> +    DPRINT("set_inflight_fd align: %"PRId32"\n", align);
> +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> +    DPRINT("set_inflight_fd version: %"PRId16"\n", version);
> +
> +    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> +              fd, mmap_offset);
> +
> +    if (rc == MAP_FAILED) {
> +        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
> +        return false;
> +    }
> +
> +    if (version != INFLIGHT_VERSION) {
> +        vu_panic(dev, "Invalid set_inflight_fd version: %d", version);
> +        return false;
> +    }
> +
> +    if (dev->inflight_info.fd) {
> +        close(dev->inflight_info.fd);
> +    }
> +
> +    if (dev->inflight_info.addr) {
> +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> +    }
> +
> +    dev->inflight_info.fd = fd;
> +    dev->inflight_info.addr = rc;
> +    dev->inflight_info.size = mmap_size;
> +
> +    for (i = 0; i < num_queues; i++) {
> +        dev->vq[i].inflight = (VuVirtqInflight *)rc;
> +        rc = (void *)((char *)rc + ALIGN_UP(sizeof(VuVirtqInflight), align));
> +    }
> +
> +    return false;
> +}
> +
>  static bool
>  vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
>  {
> @@ -1292,6 +1472,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
>          return vu_set_postcopy_listen(dev, vmsg);
>      case VHOST_USER_POSTCOPY_END:
>          return vu_set_postcopy_end(dev, vmsg);
> +    case VHOST_USER_GET_INFLIGHT_FD:
> +        return vu_get_inflight_fd(dev, vmsg);
> +    case VHOST_USER_SET_INFLIGHT_FD:
> +        return vu_set_inflight_fd(dev, vmsg);
>      default:
>          vmsg_close_fds(vmsg);
>          vu_panic(dev, "Unhandled request: %d", vmsg->request);
> @@ -1359,8 +1543,18 @@ vu_deinit(VuDev *dev)
>              close(vq->err_fd);
>              vq->err_fd = -1;
>          }
> +        vq->inflight = NULL;
>      }
>
> +    if (dev->inflight_info.addr) {
> +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> +        dev->inflight_info.addr = NULL;
> +    }
> +
> +    if (dev->inflight_info.fd > 0) {
> +        close(dev->inflight_info.fd);
> +        dev->inflight_info.fd = -1;
> +    }
>
>      vu_close_log(dev);
>      if (dev->slave_fd != -1) {
> @@ -1687,20 +1881,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
>      return vring_avail_idx(vq) == vq->last_avail_idx;
>  }
>
> -static inline
> -bool has_feature(uint64_t features, unsigned int fbit)
> -{
> -    assert(fbit < 64);
> -    return !!(features & (1ULL << fbit));
> -}
> -
> -static inline
> -bool vu_has_feature(VuDev *dev,
> -                    unsigned int fbit)
> -{
> -    return has_feature(dev->features, fbit);
> -}
> -
>  static bool
>  vring_notify(VuDev *dev, VuVirtq *vq)
>  {
> @@ -1829,12 +2009,6 @@ virtqueue_map_desc(VuDev *dev,
>      *p_num_sg = num_sg;
>  }
>
> -/* Round number down to multiple */
> -#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> -
> -/* Round number up to multiple */
> -#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> -
>  static void *
>  virtqueue_alloc_element(size_t sz,
>                          unsigned out_num, unsigned in_num)
> @@ -1935,9 +2109,44 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
>      return elem;
>  }
>
> +static int
> +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +

Just wonder what happens if the backend gets killed at this point?

You want to survive the backend crash, but you still depend on the backend
to get and put inflight descriptors, which seems somewhat contradictory.

Thanks

> +    vq->inflight->desc[desc_idx] = 1;
> +
> +    return 0;
> +}
> +
> +static int
> +vu_queue_inflight_put(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +
> +    vq->inflight->desc[desc_idx] = 0;
> +
> +    return 0;
> +}
> +
>  void *
>  vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>  {
> +    int i;
>      unsigned int head;
>      VuVirtqElement *elem;
>
> @@ -1946,6 +2155,12 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>          return NULL;
>      }
>
> +    if (unlikely(vq->inflight_num > 0)) {
> +        i = (--vq->inflight_num);
> +        elem = vu_queue_map_desc(dev, vq, vq->inflight_desc[i], sz);
> +        return elem;
> +    }
> +
>      if (vu_queue_empty(dev, vq)) {
>          return NULL;
>      }
> @@ -1976,6 +2191,8 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>
>      vq->inuse++;
>
> +    vu_queue_inflight_get(dev, vq, head);
> +
>      return elem;
>  }
>
> @@ -2121,4 +2338,5 @@ vu_queue_push(VuDev *dev, VuVirtq *vq,
>  {
>      vu_queue_fill(dev, vq, elem, len, 0);
>      vu_queue_flush(dev, vq, 1);
> +    vu_queue_inflight_put(dev, vq, elem->index);
>  }
> diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
> index 4aa55b4d2d..5afb80ea5c 100644
> --- a/contrib/libvhost-user/libvhost-user.h
> +++ b/contrib/libvhost-user/libvhost-user.h
> @@ -53,6 +53,7 @@ enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_CONFIG = 9,
>      VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
>      VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> +    VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
>
>      VHOST_USER_PROTOCOL_F_MAX
>  };
> @@ -91,6 +92,8 @@ typedef enum VhostUserRequest {
>      VHOST_USER_POSTCOPY_ADVISE = 28,
>      VHOST_USER_POSTCOPY_LISTEN = 29,
>      VHOST_USER_POSTCOPY_END = 30,
> +    VHOST_USER_GET_INFLIGHT_FD = 31,
> +    VHOST_USER_SET_INFLIGHT_FD = 32,
>      VHOST_USER_MAX
>  } VhostUserRequest;
>
> @@ -138,6 +141,14 @@ typedef struct VhostUserVringArea {
>      uint64_t offset;
>  } VhostUserVringArea;
>
> +typedef struct VhostUserInflight {
> +    uint64_t mmap_size;
> +    uint64_t mmap_offset;
> +    uint32_t align;
> +    uint16_t num_queues;
> +    uint16_t version;
> +} VhostUserInflight;
> +
>  #if defined(_WIN32)
>  # define VU_PACKED __attribute__((gcc_struct, packed))
>  #else
> @@ -163,6 +174,7 @@ typedef struct VhostUserMsg {
>          VhostUserLog log;
>          VhostUserConfig config;
>          VhostUserVringArea area;
> +        VhostUserInflight inflight;
>      } payload;
>
>      int fds[VHOST_MEMORY_MAX_NREGIONS];
> @@ -234,9 +246,19 @@ typedef struct VuRing {
>      uint32_t flags;
>  } VuRing;
>
> +typedef struct VuVirtqInflight {
> +    char desc[VIRTQUEUE_MAX_SIZE];
> +} VuVirtqInflight;
> +
>  typedef struct VuVirtq {
>      VuRing vring;
>
> +    VuVirtqInflight *inflight;
> +
> +    uint16_t inflight_desc[VIRTQUEUE_MAX_SIZE];
> +
> +    uint16_t inflight_num;
> +
>      /* Next head to pop */
>      uint16_t last_avail_idx;
>
> @@ -279,11 +301,18 @@ typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
>                                   vu_watch_cb cb, void *data);
>  typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd);
>
> +typedef struct VuDevInflightInfo {
> +    int fd;
> +    void *addr;
> +    uint64_t size;
> +} VuDevInflightInfo;
> +
>  struct VuDev {
>      int sock;
>      uint32_t nregions;
>      VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
>      VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
> +    VuDevInflightInfo inflight_info;
>      int log_call_fd;
>      int slave_fd;
>      uint64_t log_size;
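For readers following the exchange below: the window Jason is asking about sits between taking a head off the avail ring and recording it in the shared inflight region. A rough sketch of that ordering (an illustrative condensation of vu_queue_pop() plus vu_queue_inflight_get(), not the actual libvhost-user code; error handling and barriers omitted):

/* Illustrative only: condensed submission path from the patch above. */
void *pop_and_track(VuDev *dev, VuVirtq *vq, size_t sz)
{
    /* step 1: consume the next head from the avail ring (backend-local state) */
    unsigned int head = vq->vring.avail->ring[vq->last_avail_idx % vq->vring.num];
    vq->last_avail_idx++;
    vq->inuse++;

    /* <-- if the backend is killed here, the descriptor is neither in the
     *     used ring nor marked in the shared inflight region; whether the
     *     recomputed last_avail_idx (used_idx + inuse) recovers it is
     *     exactly what the following messages debate */

    /* step 2: publish the descriptor as inflight in shared memory */
    vq->inflight->desc[head] = 1;

    /* step 3: map and hand the element to the request handler */
    return vu_queue_map_desc(dev, vq, head, sz);
}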
On Fri, 11 Jan 2019 at 11:56, Jason Wang <jasowang@redhat.com> wrote:
>
> On 2019/1/9 7:27 PM, elohimes@gmail.com wrote:
> > From: Xie Yongji <xieyongji@baidu.com>
> >
> > This patch adds support for VHOST_USER_GET_INFLIGHT_FD and
> > VHOST_USER_SET_INFLIGHT_FD message to set/get shared memory
> > to/from qemu. Then we maintain a "bitmap" of all descriptors in
> > the shared memory for each queue to track inflight I/O.
> >
> > [...]
> >
> > +static int
> > +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
> > +{
> > +    if (!has_feature(dev->protocol_features,
> > +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > +        return 0;
> > +    }
> > +
> > +    if (unlikely(!vq->inflight)) {
> > +        return -1;
> > +    }
> > +
>
> Just wonder what happens if the backend gets killed at this point?
>

We will re-calculate last_avail_idx like: last_avail_idx = inuse + used_idx

At this point, the backend could consume this entry correctly after reconnect.

> You want to survive the backend crash, but you still depend on the backend
> to get and put inflight descriptors, which seems somewhat contradictory.
>

But if the backend gets killed in vu_queue_inflight_put(), I think you are
right, there is a conflict there. One descriptor is consumed by the guest
but still marked as in use in the inflight buffer. Then we would re-send
this old descriptor after restart.

Maybe we can add something like this to fix the issue:

void vu_queue_push()
{
    vq->inflight->elem_idx = elem->idx;
    vu_queue_fill();
    vu_queue_flush();
    vq->inflight->desc[elem->idx] = 0;
    vq->inflight->used_idx = vq->vring.used->idx;
}

static int vu_check_queue_inflights()
{
    ....
    if (vq->inflight->used_idx != vq->vring.used->idx) {
        /* Crash in vu_queue_push() */
        vq->inflight->desc[vq->inflight->elem_idx] = 0;
    }
    ....
}

Thanks,
Yongji
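Yongji's proposal, written out a little more completely as a sketch only: elem_idx and used_idx are hypothetical new fields in the shared VuVirtqInflight region, and elem->index is used where the pseudocode above says elem->idx; this is not code from the series.

void vu_queue_push(VuDev *dev, VuVirtq *vq,
                   const VuVirtqElement *elem, unsigned int len)
{
    /* Stage the index of the element we are about to retire, so a crash
     * after the flush below can still be attributed to it on reconnect. */
    vq->inflight->elem_idx = elem->index;

    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_flush(dev, vq, 1);

    /* The element is now visible in the used ring: drop it from the
     * inflight set and record how far the used index has advanced. */
    vq->inflight->desc[elem->index] = 0;
    vq->inflight->used_idx = vq->vring.used->idx;
}

static int vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
{
    /* ... existing scan of vq->inflight->desc[] ... */

    if (vq->inflight->used_idx != vq->vring.used->idx) {
        /* Crashed inside vu_queue_push() after the flush but before the
         * bookkeeping above: that element already completed, so clear it
         * rather than resubmitting it. */
        vq->inflight->desc[vq->inflight->elem_idx] = 0;
    }

    /* ... */
    return 0;
}

Jason's follow-up below asks whether a crash between the desc[] clear and the used_idx update in this ordering is itself safe.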
On 2019/1/11 2:10 PM, Yongji Xie wrote:
> On Fri, 11 Jan 2019 at 11:56, Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2019/1/9 7:27 PM, elohimes@gmail.com wrote:
>>> From: Xie Yongji <xieyongji@baidu.com>
>>>
>>> This patch adds support for VHOST_USER_GET_INFLIGHT_FD and
>>> VHOST_USER_SET_INFLIGHT_FD message to set/get shared memory
>>> to/from qemu. Then we maintain a "bitmap" of all descriptors in
>>> the shared memory for each queue to track inflight I/O.
>>>
>>> [...]
>>>
>>> +static int
>>> +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
>>> +{
>>> +    if (!has_feature(dev->protocol_features,
>>> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    if (unlikely(!vq->inflight)) {
>>> +        return -1;
>>> +    }
>>> +
>>
>> Just wonder what happens if the backend gets killed at this point?
>>
> We will re-calculate last_avail_idx like: last_avail_idx = inuse + used_idx

I'm not sure I get you here, but it looks to me like at least one pending
descriptor is missed, since you don't set vq->inflight->desc[desc_idx] to 1?

>
> At this point, the backend could consume this entry correctly after reconnect.
>
>> You want to survive the backend crash, but you still depend on the backend
>> to get and put inflight descriptors, which seems somewhat contradictory.
>>
> But if the backend gets killed in vu_queue_inflight_put(), I think you are
> right, there is a conflict there. One descriptor is consumed by the guest
> but still marked as in use in the inflight buffer. Then we would re-send
> this old descriptor after restart.
>
> Maybe we can add something like this to fix the issue:
>
> void vu_queue_push()
> {
>     vq->inflight->elem_idx = elem->idx;
>     vu_queue_fill();
>     vu_queue_flush();
>     vq->inflight->desc[elem->idx] = 0;

Is it safe to be killed here?

>     vq->inflight->used_idx = vq->vring.used->idx;
> }
>
> static int vu_check_queue_inflights()
> {
>     ....
>     if (vq->inflight->used_idx != vq->vring.used->idx) {
>         /* Crash in vu_queue_push() */
>         vq->inflight->desc[vq->inflight->elem_idx] = 0;
>     }
>     ....
> }
>
> Thanks,
> Yongji

Well, this may work, but here are my points:

1) The code wants to recover from a backend crash by introducing extra space
to store inflight data, but it still depends on the backend to set/get the
inflight state.

2) Since the backend could be killed at any time, the backend must have the
ability to recover from a partial inflight state.

So it looks to me that 1) tends to be self-contradictory and 2) tends to be
recursive. The lines above show how tricky the code could look.

Solving this at the vhost-user level in the backend is probably wrong. It's
time to consider support from virtio itself.

Thanks
On Tue, 15 Jan 2019 at 15:52, Jason Wang <jasowang@redhat.com> wrote:
>
> On 2019/1/11 2:10 PM, Yongji Xie wrote:
> > On Fri, 11 Jan 2019 at 11:56, Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2019/1/9 7:27 PM, elohimes@gmail.com wrote:
> >>> From: Xie Yongji <xieyongji@baidu.com>
> >>>
> >>> [...]
> >
> [...]
>
> Well, this may work, but here are my points:
>
> 1) The code wants to recover from a backend crash by introducing extra space
> to store inflight data, but it still depends on the backend to set/get the
> inflight state.
>
> 2) Since the backend could be killed at any time, the backend must have the
> ability to recover from a partial inflight state.
>
> So it looks to me that 1) tends to be self-contradictory and 2) tends to be
> recursive. The lines above show how tricky the code could look.
>
> Solving this at the vhost-user level in the backend is probably wrong. It's
> time to consider support from virtio itself.
>

I agree that supporting this at the virtio level may be better. For example,
resubmitting inflight I/O once DEVICE_NEEDS_RESET is set, as in Stefan's
proposal. But I still think QEMU should be able to provide this ability too.
Suppose that one vhost-user backend needs to support multiple VMs. We can't
enable the reconnect ability until all VMs' guest drivers support the new
feature. That's limiting. But if QEMU has the ability to store the inflight
buffer, the backend could at least have a chance to support this case. Maybe
the backend could have other ways to avoid the tricky code.

Thanks,
Yongji
On Tue, Jan 15, 2019 at 03:52:21PM +0800, Jason Wang wrote:
> Well, this may work, but here are my points:
>
> 1) The code wants to recover from a backend crash by introducing extra space
> to store inflight data, but it still depends on the backend to set/get the
> inflight state.
>
> 2) Since the backend could be killed at any time, the backend must have the
> ability to recover from a partial inflight state.
>
> So it looks to me that 1) tends to be self-contradictory and 2) tends to be
> recursive. The lines above show how tricky the code could look.

This is a well studied field. Basically you make sure you commit with an
atomic write. Restartable sequences allow accelerating this even further.

> Solving this at the vhost-user level in the backend is probably wrong. It's
> time to consider support from virtio itself.
>
> Thanks

I think both approaches have their space.
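One way to read "commit with an atomic write" here, as an illustrative sketch only (the struct and field names are invented for the example, not part of the series): fill in a record first, then publish it with a single one-word store, so a recovering backend never observes a half-written entry.

#include <stdatomic.h>
#include <stdint.h>

struct inflight_record {
    uint16_t desc_head;           /* payload: descriptor being tracked     */
    uint32_t len;                 /* payload: anything else recovery needs */
    _Atomic uint8_t committed;    /* commit flag, always written last      */
};

static void record_commit(struct inflight_record *r, uint16_t head, uint32_t len)
{
    r->desc_head = head;
    r->len = len;
    /* The single-word release store is the commit point: before it the
     * record does not exist as far as recovery is concerned, after it the
     * record is complete. */
    atomic_store_explicit(&r->committed, 1, memory_order_release);
}

static void record_retire(struct inflight_record *r)
{
    /* Retiring is also one store, so there is no partially-retired state. */
    atomic_store_explicit(&r->committed, 0, memory_order_release);
}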
On 2019/1/15 10:51 PM, Yongji Xie wrote:
>> Well, this may work, but here are my points:
>>
>> 1) The code wants to recover from a backend crash by introducing extra space
>> to store inflight data, but it still depends on the backend to set/get the
>> inflight state.
>>
>> 2) Since the backend could be killed at any time, the backend must have the
>> ability to recover from a partial inflight state.
>>
>> So it looks to me that 1) tends to be self-contradictory and 2) tends to be
>> recursive. The lines above show how tricky the code could look.
>>
>> Solving this at the vhost-user level in the backend is probably wrong. It's
>> time to consider support from virtio itself.
>>
> I agree that supporting this at the virtio level may be better. For example,
> resubmitting inflight I/O once DEVICE_NEEDS_RESET is set, as in Stefan's
> proposal. But I still think QEMU should be able to provide this ability too.
> Suppose that one vhost-user backend needs to support multiple VMs. We can't
> enable the reconnect ability until all VMs' guest drivers support the new
> feature. That's limiting.

That's the way virtio evolves.

> But if QEMU has the ability to store the inflight
> buffer, the backend could at least have a chance to support this case.

The problem is, you need a carefully designed protocol described somewhere
(is vhost-user.txt a good place for this?). And this work will be (partially)
duplicated by the future support from the virtio spec itself.

> Maybe the backend could have other ways to avoid the tricky code.

I'm not sure, but it is probably not easy.

Thanks

>
> Thanks,
On 2019/1/15 11:58 PM, Michael S. Tsirkin wrote:
> On Tue, Jan 15, 2019 at 03:52:21PM +0800, Jason Wang wrote:
>> Well, this may work, but here are my points:
>>
>> 1) The code wants to recover from a backend crash by introducing extra space
>> to store inflight data, but it still depends on the backend to set/get the
>> inflight state.
>>
>> 2) Since the backend could be killed at any time, the backend must have the
>> ability to recover from a partial inflight state.
>>
>> So it looks to me that 1) tends to be self-contradictory and 2) tends to be
>> recursive. The lines above show how tricky the code could look.
> This is a well studied field. Basically you make sure you commit with an
> atomic write. Restartable sequences allow accelerating this even further.

I'm not sure I get this. But the issue is to exactly deduce all the inflight
descriptors even if the backend could be killed while doing the logging. If
we cannot be 100% accurate, it has much less value.

>
>> Solving this at the vhost-user level in the backend is probably wrong. It's
>> time to consider support from virtio itself.
>>
>> Thanks
> I think both approaches have their space.

But there will be a lot of duplicated work if we decide to support it from
virtio.

Thanks

>
> -- MST
On Thu, Jan 17, 2019 at 05:57:29PM +0800, Jason Wang wrote:
> On 2019/1/15 10:51 PM, Yongji Xie wrote:
> > > Well, this may work, but here are my points:
> > >
> > > 1) The code wants to recover from a backend crash by introducing extra
> > > space to store inflight data, but it still depends on the backend to
> > > set/get the inflight state.
> > >
> > > 2) Since the backend could be killed at any time, the backend must have
> > > the ability to recover from a partial inflight state.
> > >
> > > So it looks to me that 1) tends to be self-contradictory and 2) tends to
> > > be recursive. The lines above show how tricky the code could look.
> > >
> > > Solving this at the vhost-user level in the backend is probably wrong.
> > > It's time to consider support from virtio itself.
> > >
> > I agree that supporting this at the virtio level may be better. For example,
> > resubmitting inflight I/O once DEVICE_NEEDS_RESET is set, as in Stefan's
> > proposal. But I still think QEMU should be able to provide this ability
> > too. Suppose that one vhost-user backend needs to support multiple VMs.
> > We can't enable the reconnect ability until all VMs' guest drivers support
> > the new feature. That's limiting.
>
> That's the way virtio evolves.
>
> > But if QEMU has the ability to store the inflight
> > buffer, the backend could at least have a chance to support this case.
>
> The problem is, you need a carefully designed protocol described somewhere
> (is vhost-user.txt a good place for this?). And this work will be (partially)
> duplicated by the future support from the virtio spec itself.
>
> > Maybe the backend could have other ways to avoid the tricky code.
>
> I'm not sure, but it is probably not easy.

I see an implementation in libvhost-user. Do you see an issue there?

> Thanks
>
> > Thanks,
On Thu, Jan 17, 2019 at 06:01:16PM +0800, Jason Wang wrote:
> On 2019/1/15 11:58 PM, Michael S. Tsirkin wrote:
> > On Tue, Jan 15, 2019 at 03:52:21PM +0800, Jason Wang wrote:
> > > Well, this may work, but here are my points:
> > >
> > > 1) The code wants to recover from a backend crash by introducing extra
> > > space to store inflight data, but it still depends on the backend to
> > > set/get the inflight state.
> > >
> > > 2) Since the backend could be killed at any time, the backend must have
> > > the ability to recover from a partial inflight state.
> > >
> > > So it looks to me that 1) tends to be self-contradictory and 2) tends to
> > > be recursive. The lines above show how tricky the code could look.
> > This is a well studied field. Basically you make sure you commit with an
> > atomic write. Restartable sequences allow accelerating this even further.
>
> I'm not sure I get this. But the issue is to exactly deduce all the inflight
> descriptors even if the backend could be killed while doing the logging. If
> we cannot be 100% accurate, it has much less value.

I agree. But why discuss theoretical issues? Can you point out a problem in
the contrib/ code included here? If yes, it must be fixed I think.

I personally think it's not too hard. Consider packed ring for example -
just maintain a list of the inflight descriptors, and as the last step write
out the flags atomically.

> > > Solving this at the vhost-user level in the backend is probably wrong.
> > > It's time to consider support from virtio itself.
> > >
> > > Thanks
> > I think both approaches have their space.
>
> But there will be a lot of duplicated work if we decide to support it from
> virtio.
>
> Thanks
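A rough sketch of the "maintain a list, publish it with one atomic write at the end" idea; the layout and names are illustrative only, wrap-around and size handling are omitted, and the atomic publish here is a count rather than the packed-ring descriptor flags Michael mentions, but the commit-with-one-store idea is the same.

#include <stdatomic.h>
#include <stdint.h>

#define INFLIGHT_LOG_ENTRIES 1024          /* illustrative; >= ring size */

struct inflight_log {
    uint16_t id[INFLIGHT_LOG_ENTRIES];     /* buffer ids taken from the ring */
    _Atomic uint16_t num;                  /* publish point, written last    */
};

static void log_publish(struct inflight_log *log, const uint16_t *ids, uint16_t n)
{
    uint16_t cur = atomic_load_explicit(&log->num, memory_order_relaxed);

    /* Stage the new entries after the ones already published. */
    for (uint16_t i = 0; i < n; i++) {
        log->id[cur + i] = ids[i];
    }

    /* One atomic write makes the whole batch visible: a recovering backend
     * either sees all n entries or none of them, never a torn count. */
    atomic_store_explicit(&log->num, (uint16_t)(cur + n), memory_order_release);
}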
On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
>
> On 2019/1/15 10:51 PM, Yongji Xie wrote:
> >> Well, this may work but here're my points:
> >>
> >> 1) The code want to recover from backed crash by introducing extra space to store inflight data, but it still depends on the backend to set/get the inflight state
> >>
> >> 2) Since the backend could be killed at any time, the backend must have the ability to recover from the partial inflight state
> >>
> >> So it looks to me 1) tends to be self-contradictory and 2) tends to be recursive. The above lines show how tricky could the code looks like.
> >>
> >> Solving this at vhost-user level through at backend is probably wrong. It's time to consider the support from virtio itself.
> >>
> > I agree that supporting this in virtio level may be better. For example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in Stefan's proposal. But I still think QEMU should be able to provide this ability too. Supposed that one vhost-user backend need to support multiple VMs. We can't enable reconnect ability until all VMs' guest driver support the new feature. It's limited.
>
> That's the way virtio evolves.
>
> > But if QEMU have the ability to store inflight buffer, the backend could at least have a chance to support this case.
>
> The problem is, you need a careful designed protocol described somewhere

That's what we should discuss in detail in this series.

> (is vhost-user.txt a good place for this?). And this work will be (partial) duplicated for the future support from virtio spec itself.
>

I think the duplicated code is to maintain the inflight descriptor list which should be in backend. That's not main work in this series. And backend could choose to include it or not.

Thanks,
Yongji
On Fri, Jan 18, 2019 at 11:32:03AM +0800, Yongji Xie wrote:
> On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
> >
> > On 2019/1/15 10:51 PM, Yongji Xie wrote:
> > >> Well, this may work but here're my points:
> > >>
> > >> 1) The code want to recover from backed crash by introducing extra space to store inflight data, but it still depends on the backend to set/get the inflight state
> > >>
> > >> 2) Since the backend could be killed at any time, the backend must have the ability to recover from the partial inflight state
> > >>
> > >> So it looks to me 1) tends to be self-contradictory and 2) tends to be recursive. The above lines show how tricky could the code looks like.
> > >>
> > >> Solving this at vhost-user level through at backend is probably wrong. It's time to consider the support from virtio itself.
> > >>
> > > I agree that supporting this in virtio level may be better. For example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in Stefan's proposal. But I still think QEMU should be able to provide this ability too. Supposed that one vhost-user backend need to support multiple VMs. We can't enable reconnect ability until all VMs' guest driver support the new feature. It's limited.
> >
> > That's the way virtio evolves.
> >
> > > But if QEMU have the ability to store inflight buffer, the backend could at least have a chance to support this case.
> >
> > The problem is, you need a careful designed protocol described somewhere
>
> That's what we should discuss in detail in this series.
>
> > (is vhost-user.txt a good place for this?). And this work will be (partial) duplicated for the future support from virtio spec itself.
> >
> I think the duplicated code is to maintain the inflight descriptor list which should be in backend. That's not main work in this series. And backend could choose to include it or not.
>
> Thanks,
> Yongji

It would be if someone volunteered to rewrite the vhost user informal description that we have in qemu and make it a full spec. So far a text + implementation in contrib seems plenty to me.
On 2019/1/17 10:59 PM, Michael S. Tsirkin wrote:
> On Thu, Jan 17, 2019 at 05:57:29PM +0800, Jason Wang wrote:
>> On 2019/1/15 10:51 PM, Yongji Xie wrote:
>>>> Well, this may work but here're my points:
>>>>
>>>> 1) The code want to recover from backed crash by introducing extra space to store inflight data, but it still depends on the backend to set/get the inflight state
>>>>
>>>> 2) Since the backend could be killed at any time, the backend must have the ability to recover from the partial inflight state
>>>>
>>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be recursive. The above lines show how tricky could the code looks like.
>>>>
>>>> Solving this at vhost-user level through at backend is probably wrong. It's time to consider the support from virtio itself.
>>>>
>>> I agree that supporting this in virtio level may be better. For example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in Stefan's proposal. But I still think QEMU should be able to provide this ability too. Supposed that one vhost-user backend need to support multiple VMs. We can't enable reconnect ability until all VMs' guest driver support the new feature. It's limited.
>>
>> That's the way virtio evolves.
>>
>>> But if QEMU have the ability to store inflight buffer, the backend could at least have a chance to support this case.
>>
>> The problem is, you need a careful designed protocol described somewhere (is vhost-user.txt a good place for this?). And this work will be (partial) duplicated for the future support from virtio spec itself.
>>
>>> Maybe backend could have other way to avoid the tricky code.
>>
>> I'm not sure, but it was probably not easy.
> I see an implementation in libvhost-user.
>
> Do you see an issue there?

I've asked some questions in this thread; it looks like we can still miss some inflight descriptors.

Thanks

>
>> Thanks
>>
>>> Thanks,
On 2019/1/18 11:32 AM, Yongji Xie wrote:
> On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2019/1/15 10:51 PM, Yongji Xie wrote:
>>>> Well, this may work but here're my points:
>>>>
>>>> 1) The code want to recover from backed crash by introducing extra space to store inflight data, but it still depends on the backend to set/get the inflight state
>>>>
>>>> 2) Since the backend could be killed at any time, the backend must have the ability to recover from the partial inflight state
>>>>
>>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be recursive. The above lines show how tricky could the code looks like.
>>>>
>>>> Solving this at vhost-user level through at backend is probably wrong. It's time to consider the support from virtio itself.
>>>>
>>> I agree that supporting this in virtio level may be better. For example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in Stefan's proposal. But I still think QEMU should be able to provide this ability too. Supposed that one vhost-user backend need to support multiple VMs. We can't enable reconnect ability until all VMs' guest driver support the new feature. It's limited.
>>
>> That's the way virtio evolves.
>>
>>> But if QEMU have the ability to store inflight buffer, the backend could at least have a chance to support this case.
>>
>> The problem is, you need a careful designed protocol described somewhere
> That's what we should discuss in detail in this series.

Well, I asked some questions for this patch, but it looks like they were still not answered. No?

>
>> (is vhost-user.txt a good place for this?). And this work will be (partial) duplicated for the future support from virtio spec itself.
>>
> I think the duplicated code is to maintain the inflight descriptor list which should be in backend. That's not main work in this series. And backend could choose to include it or not.

You need to have documentation describing the protocol. Otherwise, it would be very hard for other backends to implement.

Thanks

>
> Thanks,
> Yongji
>
On Fri, Jan 18, 2019 at 11:59:50AM +0800, Jason Wang wrote:
> On 2019/1/18 11:32 AM, Yongji Xie wrote:
> > On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On 2019/1/15 10:51 PM, Yongji Xie wrote:
> > > > > Well, this may work but here're my points:
> > > > >
> > > > > 1) The code want to recover from backed crash by introducing extra space to store inflight data, but it still depends on the backend to set/get the inflight state
> > > > >
> > > > > 2) Since the backend could be killed at any time, the backend must have the ability to recover from the partial inflight state
> > > > >
> > > > > So it looks to me 1) tends to be self-contradictory and 2) tends to be recursive. The above lines show how tricky could the code looks like.
> > > > >
> > > > > Solving this at vhost-user level through at backend is probably wrong. It's time to consider the support from virtio itself.
> > > > >
> > > > I agree that supporting this in virtio level may be better. For example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in Stefan's proposal. But I still think QEMU should be able to provide this ability too. Supposed that one vhost-user backend need to support multiple VMs. We can't enable reconnect ability until all VMs' guest driver support the new feature. It's limited.
> > >
> > > That's the way virtio evolves.
> > >
> > > > But if QEMU have the ability to store inflight buffer, the backend could at least have a chance to support this case.
> > >
> > > The problem is, you need a careful designed protocol described somewhere
> > That's what we should discuss in detail in this series.
>
> Well, I asked some questions for this patch, but it looks like they were still not answered. No?

Oh absolutely. I can't say I like the implementation, I think it's both not the most robust and suboptimal.

> > > (is vhost-user.txt a good place for this?). And this work will be (partial) duplicated for the future support from virtio spec itself.
> > >
> > I think the duplicated code is to maintain the inflight descriptor list which should be in backend. That's not main work in this series. And backend could choose to include it or not.
>
> You need to have documentation describing the protocol. Otherwise, it would be very hard for other backends to implement.
>
> Thanks

Meaning how the inflight descriptors are saved in the buffer. Yes I agree.

> > Thanks,
> > Yongji
> >
On Fri, 18 Jan 2019 at 12:00, Jason Wang <jasowang@redhat.com> wrote:
>
> On 2019/1/18 11:32 AM, Yongji Xie wrote:
> > On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2019/1/15 10:51 PM, Yongji Xie wrote:
> >>>> Well, this may work but here're my points:
> >>>>
> >>>> 1) The code want to recover from backed crash by introducing extra space to store inflight data, but it still depends on the backend to set/get the inflight state
> >>>>
> >>>> 2) Since the backend could be killed at any time, the backend must have the ability to recover from the partial inflight state
> >>>>
> >>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be recursive. The above lines show how tricky could the code looks like.
> >>>>
> >>>> Solving this at vhost-user level through at backend is probably wrong. It's time to consider the support from virtio itself.
> >>>>
> >>> I agree that supporting this in virtio level may be better. For example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in Stefan's proposal. But I still think QEMU should be able to provide this ability too. Supposed that one vhost-user backend need to support multiple VMs. We can't enable reconnect ability until all VMs' guest driver support the new feature. It's limited.
> >>
> >> That's the way virtio evolves.
> >>
> >>> But if QEMU have the ability to store inflight buffer, the backend could at least have a chance to support this case.
> >>
> >> The problem is, you need a careful designed protocol described somewhere
> > That's what we should discuss in detail in this series.
>
> Well, I asked some questions for this patch, but it looks like they were still not answered. No?
>

Oh, sorry, I missed those questions. Let me try to answer them here.

Q1: If the backend gets killed in vu_queue_inflight_get() without setting vq->inflight->desc[desc_idx] to 1, is there any problem?

The entry which stores the head of this inflight descriptor is not lost in the avail ring. So we can still get this inflight descriptor from the avail ring although we didn't set vq->inflight->desc[desc_idx] to 1.

Q2:

void vu_queue_push()
{
    vq->inflight->elem_idx = elem->idx;
    vu_queue_fill();
    vu_queue_flush();
    vq->inflight->desc[elem->idx] = 0;  <-------- Is it safe to be killed here?
    vq->inflight->used_idx = vq->vring.used->idx;
}

Because there is no concurrency between vu_queue_push() and vu_queue_pop(), I don't see any problem here.

Basically we just need to make sure these two operations (vq->vring.used->idx++ and vq->inflight->desc[elem->idx] = 0) are atomic. I think there are some approaches to achieve that. I'm not sure my approach here is good enough, but it should work.

>
> >> (is vhost-user.txt a good place for this?). And this work will be (partial) duplicated for the future support from virtio spec itself.
> >>
> > I think the duplicated code is to maintain the inflight descriptor list which should be in backend. That's not main work in this series. And backend could choose to include it or not.
>
> You need to have documentation describing the protocol. Otherwise, it would be very hard for other backends to implement.
>

Yes, actually now I'm working on adding more detail to vhost-user.txt.

Thanks,
Yongji
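To spell out the recovery step that the elem_idx/used_idx bookkeeping above is meant to enable, here is one possible reconnect-time check; the function and the two extra inflight fields come from the sketch in this mail, not from the posted patch, so treat it as an illustration only:

/* Illustrative only: resolve the crash window in vu_queue_push() above. */
static void resolve_last_push(VuVirtq *vq)
{
    if (vq->inflight->used_idx != vq->vring.used->idx) {
        /*
         * We died after vu_queue_flush() bumped vring.used->idx but before
         * the bookkeeping caught up: the element recorded in elem_idx is
         * already visible to the guest, so it must not be resubmitted.
         */
        vq->inflight->desc[vq->inflight->elem_idx] = 0;
        vq->inflight->used_idx = vq->vring.used->idx;
    }
    /* Otherwise the push never reached the used ring; desc[] still marks
     * the element as inflight and it will simply be resubmitted. */
}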
On 2019/1/18 3:01 PM, Yongji Xie wrote:
> On Fri, 18 Jan 2019 at 12:00, Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2019/1/18 11:32 AM, Yongji Xie wrote:
>>> On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2019/1/15 10:51 PM, Yongji Xie wrote:
>>>>>> Well, this may work but here're my points:
>>>>>>
>>>>>> 1) The code want to recover from backed crash by introducing extra space to store inflight data, but it still depends on the backend to set/get the inflight state
>>>>>>
>>>>>> 2) Since the backend could be killed at any time, the backend must have the ability to recover from the partial inflight state
>>>>>>
>>>>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be recursive. The above lines show how tricky could the code looks like.
>>>>>>
>>>>>> Solving this at vhost-user level through at backend is probably wrong. It's time to consider the support from virtio itself.
>>>>>>
>>>>> I agree that supporting this in virtio level may be better. For example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in Stefan's proposal. But I still think QEMU should be able to provide this ability too. Supposed that one vhost-user backend need to support multiple VMs. We can't enable reconnect ability until all VMs' guest driver support the new feature. It's limited.
>>>> That's the way virtio evolves.
>>>>
>>>>> But if QEMU have the ability to store inflight buffer, the backend could at least have a chance to support this case.
>>>> The problem is, you need a careful designed protocol described somewhere
>>> That's what we should discuss in detail in this series.
>>
>> Well, I asked some questions for this patch, but it looks like they were still not answered. No?
>>
>>
> Oh, sorry, I missed those questions. Let me try to answer them here.
>
> Q1: If the backend gets killed in vu_queue_inflight_get() without setting vq->inflight->desc[desc_idx] to 1, is there any problem?
>
> The entry which stores the head of this inflight descriptor is not lost in the avail ring. So we can still get this inflight descriptor from the avail ring although we didn't set vq->inflight->desc[desc_idx] to 1.

Ok, I get this.

> Q2:
>
> void vu_queue_push()
> {
>     vq->inflight->elem_idx = elem->idx;
>     vu_queue_fill();
>     vu_queue_flush();
>     vq->inflight->desc[elem->idx] = 0;  <-------- Is it safe to be killed here?
>     vq->inflight->used_idx = vq->vring.used->idx;
> }
>
> Because there is no concurrency between vu_queue_push() and vu_queue_pop(), I don't see any problem here.
>
> Basically we just need to make sure these two operations (vq->vring.used->idx++ and vq->inflight->desc[elem->idx] = 0) are atomic. I think there are some approaches to achieve that. I'm not sure my approach here is good enough, but it should work.

Rethinking about this, some findings:

- What you suggest requires strict ordering in some part of vu_queue_push(). E.g. it looks to me you need a compiler barrier to make sure used_idx is set before clearing desc[elem->idx]? This is a side effect of introducing new metadata (inflight->elem_idx and inflight->used_idx) to recover from crashing when logging the inflight->desc[] array.

- Modern backends like dpdk do batching aggressively, which means you probably need an array of elem_idx[]. This tends to be more complex since we probably need to negotiate the size of this array and the overhead is probably noticeable.

I didn't audit all other places of the code, but I suspect it would be hard to be 100% correct. And what happens for packed virtqueue? Basically, we don't want to produce tricky code that is hard to debug in the future. Thinking in another direction: in-order will be supported by virtio 1.1. With that, the inflight descriptors could be deduced much more easily, like [used_idx, avail_idx)? So it looks to me supporting this from the virtio layer is much easier.

Thoughts?

Thanks

>
>>>> (is vhost-user.txt a good place for this?). And this work will be (partial) duplicated for the future support from virtio spec itself.
>>>>
>>> I think the duplicated code is to maintain the inflight descriptor list which should be in backend. That's not main work in this series. And backend could choose to include it or not.
>>
>> You need to have documentation describing the protocol. Otherwise, it would be very hard for other backends to implement.
>>
> Yes, actually now I'm working on adding more detail to vhost-user.txt.
>
> Thanks,
> Yongji
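As a rough illustration of the [used_idx, avail_idx) point in the mail above: with VIRTIO_F_IN_ORDER the device completes buffers in the order the driver made them available, so everything between the two indices is inflight by definition and no extra log is needed. The resubmit() helper and the exact index handling below are assumptions for the sketch, not code from this series:

/* Sketch only: in-order rings make inflight recovery a simple index walk. */
static void resubmit_inflight_in_order(VuDev *dev, VuVirtq *vq)
{
    uint16_t used  = vq->vring.used->idx;   /* first buffer not yet completed    */
    uint16_t avail = vq->vring.avail->idx;  /* first slot not yet made available */

    for (uint16_t i = used; i != avail; i++) {          /* wraps modulo 65536 */
        uint16_t head = vq->vring.avail->ring[i % vq->vring.num];
        resubmit(dev, vq, head);            /* hypothetical backend helper       */
    }
}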
On Fri, 18 Jan 2019 at 17:27, Jason Wang <jasowang@redhat.com> wrote:
>
> On 2019/1/18 3:01 PM, Yongji Xie wrote:
> > On Fri, 18 Jan 2019 at 12:00, Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2019/1/18 11:32 AM, Yongji Xie wrote:
> >>> On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2019/1/15 10:51 PM, Yongji Xie wrote:
> >>>>>> Well, this may work but here're my points:
> >>>>>>
> >>>>>> 1) The code want to recover from backed crash by introducing extra space to store inflight data, but it still depends on the backend to set/get the inflight state
> >>>>>>
> >>>>>> 2) Since the backend could be killed at any time, the backend must have the ability to recover from the partial inflight state
> >>>>>>
> >>>>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be recursive. The above lines show how tricky could the code looks like.
> >>>>>>
> >>>>>> Solving this at vhost-user level through at backend is probably wrong. It's time to consider the support from virtio itself.
> >>>>>>
> >>>>> I agree that supporting this in virtio level may be better. For example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in Stefan's proposal. But I still think QEMU should be able to provide this ability too. Supposed that one vhost-user backend need to support multiple VMs. We can't enable reconnect ability until all VMs' guest driver support the new feature. It's limited.
> >>>> That's the way virtio evolves.
> >>>>
> >>>>> But if QEMU have the ability to store inflight buffer, the backend could at least have a chance to support this case.
> >>>> The problem is, you need a careful designed protocol described somewhere
> >>> That's what we should discuss in detail in this series.
> >>
> >> Well, I asked some questions for this patch, but it looks like they were still not answered. No?
> >>
> >>
> > Oh, sorry, I missed those questions. Let me try to answer them here.
> >
> > Q1: If the backend gets killed in vu_queue_inflight_get() without setting vq->inflight->desc[desc_idx] to 1, is there any problem?
> >
> > The entry which stores the head of this inflight descriptor is not lost in the avail ring. So we can still get this inflight descriptor from the avail ring although we didn't set vq->inflight->desc[desc_idx] to 1.
>
> Ok, I get this.
>
> > Q2:
> >
> > void vu_queue_push()
> > {
> >     vq->inflight->elem_idx = elem->idx;
> >     vu_queue_fill();
> >     vu_queue_flush();
> >     vq->inflight->desc[elem->idx] = 0;  <-------- Is it safe to be killed here?
> >     vq->inflight->used_idx = vq->vring.used->idx;
> > }
> >
> > Because there is no concurrency between vu_queue_push() and vu_queue_pop(), I don't see any problem here.
> >
> > Basically we just need to make sure these two operations (vq->vring.used->idx++ and vq->inflight->desc[elem->idx] = 0) are atomic. I think there are some approaches to achieve that. I'm not sure my approach here is good enough, but it should work.
>
> Rethinking about this, some findings:
>
> - What you suggest requires strict ordering in some part of vu_queue_push(). E.g. it looks to me you need a compiler barrier to make sure used_idx is set before clearing desc[elem->idx]? This is a side effect of introducing new metadata (inflight->elem_idx and inflight->used_idx) to recover from crashing when logging the inflight->desc[] array.
>

Yes, compiler barriers are needed here.

> - Modern backends like dpdk do batching aggressively, which means you probably need an array of elem_idx[]. This tends to be more complex since we probably need to negotiate the size of this array and the overhead is probably noticeable.
>

Maybe this new approach could make things easier:

void vu_queue_flush(const VuVirtqElement *elem, unsigned int count)
{
    uint16_t old = vring.used->idx;
    uint16_t new = old + count;

    inflight->used_idx = new;

    barrier();

    for (i = 0; i < count; i++) {
        inflight->desc[elem[i].idx] = FLAG_PROCESS;
    }

    barrier();

    vring.used->idx = new;

    barrier();

    for (i = 0; i < count; i++) {
        inflight->desc[elem[i].idx] = FLAG_AVAIL;
    }
}

static int vu_check_queue_inflights()
{
    ....
    for (i = 0; i < desc_count; i++) {
        if (inflight->desc[i] == FLAG_PROCESS) {
            if (inflight->used_idx != vring.used->idx) {
                /* rollback */
                inflight->desc[i] = FLAG_INUSE;
            } else {
                /* commit */
                inflight->desc[i] = FLAG_AVAIL;
            }
        }
    }
    ....
}

> I didn't audit all other places of the code, but I suspect it would be hard to be 100% correct. And what happens for packed virtqueue? Basically, we don't want to produce tricky code that is hard to debug in the future. Thinking in another direction: in-order will be supported by virtio 1.1. With that, the inflight descriptors could be deduced much more easily, like [used_idx, avail_idx)? So it looks to me supporting this from the virtio layer is much easier.
>

Yes, I agree that we should support this in the virtio layer for packed virtqueue or newer virtio devices. And I'll be happy to do that. But this series also makes sense to me, because we can use this to support legacy virtio 1.0 or 0.9 devices.

Thanks,
Yongji
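A few names in the flush sketch above are left implicit; one plausible set of definitions, together with the crash windows the three-step flush is meant to cover (illustrative assumptions, not part of the posted patch):

/* Possible states for the shared per-descriptor desc[] array. */
enum {
    FLAG_AVAIL   = 0,   /* free, or completion already visible to the guest  */
    FLAG_INUSE   = 1,   /* popped by the backend, request still being served */
    FLAG_PROCESS = 2,   /* completion staged, used->idx update still pending */
};

/* Compiler barrier so stores above it are not reordered below it. */
#define barrier()  __asm__ __volatile__("" ::: "memory")

/*
 * Crash windows covered by the flush sketch:
 *  - before any FLAG_PROCESS store: the batch is still FLAG_INUSE and
 *    vring.used->idx is unchanged, so the whole batch is simply resubmitted.
 *  - after some FLAG_PROCESS stores but before vring.used->idx is written:
 *    inflight->used_idx differs from vring.used->idx, so the check rolls
 *    those entries back to FLAG_INUSE and they are resubmitted.
 *  - after vring.used->idx is written: the indices match, so any leftover
 *    FLAG_PROCESS entries are committed to FLAG_AVAIL; the guest already
 *    sees them in the used ring and they are never resubmitted.
 */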
diff --git a/Makefile b/Makefile index dd53965f77..b5c9092605 100644 --- a/Makefile +++ b/Makefile @@ -473,7 +473,7 @@ Makefile: $(version-obj-y) # Build libraries libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y) -libvhost-user.a: $(libvhost-user-obj-y) +libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y) ###################################################################### diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 23bd52264c..e73ce04619 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -41,6 +41,8 @@ #endif #include "qemu/atomic.h" +#include "qemu/osdep.h" +#include "qemu/memfd.h" #include "libvhost-user.h" @@ -53,6 +55,18 @@ _min1 < _min2 ? _min1 : _min2; }) #endif +/* Round number down to multiple */ +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) + +/* Round number up to multiple */ +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) + +/* Align each region to cache line size in inflight buffer */ +#define INFLIGHT_ALIGNMENT 64 + +/* The version of inflight buffer */ +#define INFLIGHT_VERSION 1 + #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) /* The version of the protocol we support */ @@ -66,6 +80,20 @@ } \ } while (0) +static inline +bool has_feature(uint64_t features, unsigned int fbit) +{ + assert(fbit < 64); + return !!(features & (1ULL << fbit)); +} + +static inline +bool vu_has_feature(VuDev *dev, + unsigned int fbit) +{ + return has_feature(dev->features, fbit); +} + static const char * vu_request_to_string(unsigned int req) { @@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req) REQ(VHOST_USER_POSTCOPY_ADVISE), REQ(VHOST_USER_POSTCOPY_LISTEN), REQ(VHOST_USER_POSTCOPY_END), + REQ(VHOST_USER_GET_INFLIGHT_FD), + REQ(VHOST_USER_SET_INFLIGHT_FD), REQ(VHOST_USER_MAX), }; #undef REQ @@ -890,6 +920,41 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) return true; } +static int +vu_check_queue_inflights(VuDev *dev, VuVirtq *vq) +{ + int i = 0; + + if (!has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { + return 0; + } + + if (unlikely(!vq->inflight)) { + return -1; + } + + vq->used_idx = vq->vring.used->idx; + vq->inflight_num = 0; + for (i = 0; i < vq->vring.num; i++) { + if (vq->inflight->desc[i] == 0) { + continue; + } + + vq->inflight_desc[vq->inflight_num++] = i; + vq->inuse++; + } + vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx; + + /* in case of I/O hang after reconnecting */ + if (eventfd_write(vq->kick_fd, 1) || + eventfd_write(vq->call_fd, 1)) { + return -1; + } + + return 0; +} + static bool vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) { @@ -925,6 +990,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) dev->vq[index].kick_fd, index); } + if (vu_check_queue_inflights(dev, &dev->vq[index])) { + vu_panic(dev, "Failed to check inflights for vq: %d\n", index); + } + return false; } @@ -1215,6 +1284,117 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg) return true; } +static bool +vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) +{ + int fd; + void *addr; + uint64_t mmap_size; + + if (vmsg->size != sizeof(vmsg->payload.inflight)) { + vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size); + vmsg->payload.inflight.mmap_size = 0; + return true; + } + + DPRINT("set_inflight_fd num_queues: %"PRId16"\n", + vmsg->payload.inflight.num_queues); + + mmap_size = vmsg->payload.inflight.num_queues * + ALIGN_UP(sizeof(VuVirtqInflight), 
INFLIGHT_ALIGNMENT); + + addr = qemu_memfd_alloc("vhost-inflight", mmap_size, + F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, + &fd, NULL); + + if (!addr) { + vu_panic(dev, "Failed to alloc vhost inflight area"); + vmsg->payload.inflight.mmap_size = 0; + return true; + } + + dev->inflight_info.addr = addr; + dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size; + vmsg->payload.inflight.mmap_offset = 0; + vmsg->payload.inflight.align = INFLIGHT_ALIGNMENT; + vmsg->payload.inflight.version = INFLIGHT_VERSION; + vmsg->fd_num = 1; + dev->inflight_info.fd = vmsg->fds[0] = fd; + + DPRINT("send inflight mmap_size: %"PRId64"\n", + vmsg->payload.inflight.mmap_size); + DPRINT("send inflight mmap offset: %"PRId64"\n", + vmsg->payload.inflight.mmap_offset); + DPRINT("send inflight align: %"PRId32"\n", + vmsg->payload.inflight.align); + DPRINT("send inflight version: %"PRId16"\n", + vmsg->payload.inflight.version); + + return true; +} + +static bool +vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) +{ + int fd, i; + uint64_t mmap_size, mmap_offset; + uint32_t align; + uint16_t num_queues, version; + void *rc; + + if (vmsg->fd_num != 1 || + vmsg->size != sizeof(vmsg->payload.inflight)) { + vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d", + vmsg->size, vmsg->fd_num); + return false; + } + + fd = vmsg->fds[0]; + mmap_size = vmsg->payload.inflight.mmap_size; + mmap_offset = vmsg->payload.inflight.mmap_offset; + align = vmsg->payload.inflight.align; + num_queues = vmsg->payload.inflight.num_queues; + version = vmsg->payload.inflight.version; + + DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size); + DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset); + DPRINT("set_inflight_fd align: %"PRId32"\n", align); + DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); + DPRINT("set_inflight_fd version: %"PRId16"\n", version); + + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, mmap_offset); + + if (rc == MAP_FAILED) { + vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno)); + return false; + } + + if (version != INFLIGHT_VERSION) { + vu_panic(dev, "Invalid set_inflight_fd version: %d", version); + return false; + } + + if (dev->inflight_info.fd) { + close(dev->inflight_info.fd); + } + + if (dev->inflight_info.addr) { + munmap(dev->inflight_info.addr, dev->inflight_info.size); + } + + dev->inflight_info.fd = fd; + dev->inflight_info.addr = rc; + dev->inflight_info.size = mmap_size; + + for (i = 0; i < num_queues; i++) { + dev->vq[i].inflight = (VuVirtqInflight *)rc; + rc = (void *)((char *)rc + ALIGN_UP(sizeof(VuVirtqInflight), align)); + } + + return false; +} + static bool vu_process_message(VuDev *dev, VhostUserMsg *vmsg) { @@ -1292,6 +1472,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg) return vu_set_postcopy_listen(dev, vmsg); case VHOST_USER_POSTCOPY_END: return vu_set_postcopy_end(dev, vmsg); + case VHOST_USER_GET_INFLIGHT_FD: + return vu_get_inflight_fd(dev, vmsg); + case VHOST_USER_SET_INFLIGHT_FD: + return vu_set_inflight_fd(dev, vmsg); default: vmsg_close_fds(vmsg); vu_panic(dev, "Unhandled request: %d", vmsg->request); @@ -1359,8 +1543,18 @@ vu_deinit(VuDev *dev) close(vq->err_fd); vq->err_fd = -1; } + vq->inflight = NULL; } + if (dev->inflight_info.addr) { + munmap(dev->inflight_info.addr, dev->inflight_info.size); + dev->inflight_info.addr = NULL; + } + + if (dev->inflight_info.fd > 0) { + close(dev->inflight_info.fd); + dev->inflight_info.fd = -1; + } vu_close_log(dev); if (dev->slave_fd != -1) { @@ 
-1687,20 +1881,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq) return vring_avail_idx(vq) == vq->last_avail_idx; } -static inline -bool has_feature(uint64_t features, unsigned int fbit) -{ - assert(fbit < 64); - return !!(features & (1ULL << fbit)); -} - -static inline -bool vu_has_feature(VuDev *dev, - unsigned int fbit) -{ - return has_feature(dev->features, fbit); -} - static bool vring_notify(VuDev *dev, VuVirtq *vq) { @@ -1829,12 +2009,6 @@ virtqueue_map_desc(VuDev *dev, *p_num_sg = num_sg; } -/* Round number down to multiple */ -#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) - -/* Round number up to multiple */ -#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) - static void * virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num) @@ -1935,9 +2109,44 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz) return elem; } +static int +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx) +{ + if (!has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { + return 0; + } + + if (unlikely(!vq->inflight)) { + return -1; + } + + vq->inflight->desc[desc_idx] = 1; + + return 0; +} + +static int +vu_queue_inflight_put(VuDev *dev, VuVirtq *vq, int desc_idx) +{ + if (!has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { + return 0; + } + + if (unlikely(!vq->inflight)) { + return -1; + } + + vq->inflight->desc[desc_idx] = 0; + + return 0; +} + void * vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz) { + int i; unsigned int head; VuVirtqElement *elem; @@ -1946,6 +2155,12 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz) return NULL; } + if (unlikely(vq->inflight_num > 0)) { + i = (--vq->inflight_num); + elem = vu_queue_map_desc(dev, vq, vq->inflight_desc[i], sz); + return elem; + } + if (vu_queue_empty(dev, vq)) { return NULL; } @@ -1976,6 +2191,8 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz) vq->inuse++; + vu_queue_inflight_get(dev, vq, head); + return elem; } @@ -2121,4 +2338,5 @@ vu_queue_push(VuDev *dev, VuVirtq *vq, { vu_queue_fill(dev, vq, elem, len, 0); vu_queue_flush(dev, vq, 1); + vu_queue_inflight_put(dev, vq, elem->index); } diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index 4aa55b4d2d..5afb80ea5c 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -53,6 +53,7 @@ enum VhostUserProtocolFeature { VHOST_USER_PROTOCOL_F_CONFIG = 9, VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, VHOST_USER_PROTOCOL_F_MAX }; @@ -91,6 +92,8 @@ typedef enum VhostUserRequest { VHOST_USER_POSTCOPY_ADVISE = 28, VHOST_USER_POSTCOPY_LISTEN = 29, VHOST_USER_POSTCOPY_END = 30, + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, VHOST_USER_MAX } VhostUserRequest; @@ -138,6 +141,14 @@ typedef struct VhostUserVringArea { uint64_t offset; } VhostUserVringArea; +typedef struct VhostUserInflight { + uint64_t mmap_size; + uint64_t mmap_offset; + uint32_t align; + uint16_t num_queues; + uint16_t version; +} VhostUserInflight; + #if defined(_WIN32) # define VU_PACKED __attribute__((gcc_struct, packed)) #else @@ -163,6 +174,7 @@ typedef struct VhostUserMsg { VhostUserLog log; VhostUserConfig config; VhostUserVringArea area; + VhostUserInflight inflight; } payload; int fds[VHOST_MEMORY_MAX_NREGIONS]; @@ -234,9 +246,19 @@ typedef struct VuRing { uint32_t flags; } VuRing; +typedef struct VuVirtqInflight { + char desc[VIRTQUEUE_MAX_SIZE]; +} 
VuVirtqInflight; + typedef struct VuVirtq { VuRing vring; + VuVirtqInflight *inflight; + + uint16_t inflight_desc[VIRTQUEUE_MAX_SIZE]; + + uint16_t inflight_num; + /* Next head to pop */ uint16_t last_avail_idx; @@ -279,11 +301,18 @@ typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition, vu_watch_cb cb, void *data); typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd); +typedef struct VuDevInflightInfo { + int fd; + void *addr; + uint64_t size; +} VuDevInflightInfo; + struct VuDev { int sock; uint32_t nregions; VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE]; + VuDevInflightInfo inflight_info; int log_call_fd; int slave_fd; uint64_t log_size;
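As a usage note for the patch above: once QEMU has sent VHOST_USER_SET_INFLIGHT_FD and vu_check_queue_inflights() has repopulated inflight_desc[], a backend needs no special reconnect path in its datapath, because vu_queue_pop() returns the recovered descriptors before it resumes reading the avail ring. A minimal sketch of such a handler follows; process_request() and its wiring are assumptions, not part of libvhost-user:

#include "libvhost-user.h"

/* Hypothetical request handler provided by the backend (e.g. vhost-user-blk). */
static void process_request(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem);

/* A typical queue handler: identical before and after a backend restart. */
static void queue_handler(VuDev *dev, int qidx)
{
    VuVirtq *vq = vu_get_queue(dev, qidx);
    VuVirtqElement *elem;

    /* After reconnecting, the first calls return the descriptors recorded in
     * the shared inflight area; later calls continue from the avail ring. */
    while ((elem = vu_queue_pop(dev, vq, sizeof(*elem))) != NULL) {
        process_request(dev, vq, elem);
        /* When done, vu_queue_push() completes the element and clears its
         * entry in the inflight bitmap; the caller then frees elem. */
    }
}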