Message ID | 20230503172121.733642-2-david@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vhost: memslot handling improvements | expand |
On Wed, May 03, 2023 at 07:21:19PM +0200, David Hildenbrand wrote: > Having multiple vhost devices, some filtering out fd-less memslots and > some not, can mess up the "used_memslot" accounting. Consequently our > "free memslot" checks become unreliable and we might run out of free > memslots at runtime later. > > An example sequence which can trigger a potential issue that involves > different vhost backends (vhost-kernel and vhost-user) and hotplugged > memory devices can be found at [1]. > > Let's make the filtering mechanism less generic and distinguish between > backends that support private memslots (without a fd) and ones that only > support shared memslots (with a fd). Track the used_memslots for both > cases separately and use the corresponding value when required. > > Note: Most probably we should filter out MAP_PRIVATE fd-based RAM regions > (for example, via memory-backend-memfd,...,shared=off or as default with > memory-backend-file) as well. When not using MAP_SHARED, it might not work > as expected. Add a TODO for now. > > [1] https://lkml.kernel.org/r/fad9136f-08d3-3fd9-71a1-502069c000cf@redhat.com > > Fixes: 988a27754bbb ("vhost: allow backends to filter memory sections") > Cc: Tiwei Bie <tiwei.bie@intel.com> > Acked-by: Igor Mammedov <imammedo@redhat.com> > Signed-off-by: David Hildenbrand <david@redhat.com> > --- > hw/virtio/vhost-user.c | 7 ++-- > hw/virtio/vhost.c | 56 ++++++++++++++++++++++++++----- > include/hw/virtio/vhost-backend.h | 5 ++- > 3 files changed, 52 insertions(+), 16 deletions(-) > > diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c > index e5285df4ba..0c3e2702b1 100644 > --- a/hw/virtio/vhost-user.c > +++ b/hw/virtio/vhost-user.c > @@ -2453,10 +2453,9 @@ vhost_user_crypto_close_session(struct vhost_dev *dev, uint64_t session_id) > return 0; > } > > -static bool vhost_user_mem_section_filter(struct vhost_dev *dev, > - MemoryRegionSection *section) > +static bool vhost_user_no_private_memslots(struct vhost_dev *dev) > { > - return memory_region_get_fd(section->mr) >= 0; > + return true; > } > > static int vhost_user_get_inflight_fd(struct vhost_dev *dev, > @@ -2686,6 +2685,7 @@ const VhostOps user_ops = { > .vhost_backend_init = vhost_user_backend_init, > .vhost_backend_cleanup = vhost_user_backend_cleanup, > .vhost_backend_memslots_limit = vhost_user_memslots_limit, > + .vhost_backend_no_private_memslots = vhost_user_no_private_memslots, > .vhost_set_log_base = vhost_user_set_log_base, > .vhost_set_mem_table = vhost_user_set_mem_table, > .vhost_set_vring_addr = vhost_user_set_vring_addr, > @@ -2712,7 +2712,6 @@ const VhostOps user_ops = { > .vhost_set_config = vhost_user_set_config, > .vhost_crypto_create_session = vhost_user_crypto_create_session, > .vhost_crypto_close_session = vhost_user_crypto_close_session, > - .vhost_backend_mem_section_filter = vhost_user_mem_section_filter, > .vhost_get_inflight_fd = vhost_user_get_inflight_fd, > .vhost_set_inflight_fd = vhost_user_set_inflight_fd, > .vhost_dev_start = vhost_user_dev_start, > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > index 746d130c74..4fe08c809f 100644 > --- a/hw/virtio/vhost.c > +++ b/hw/virtio/vhost.c > @@ -46,20 +46,33 @@ > static struct vhost_log *vhost_log; > static struct vhost_log *vhost_log_shm; > > +/* Memslots used by backends that support private memslots (without an fd). */ > static unsigned int used_memslots; > + > +/* Memslots used by backends that only support shared memslots (with an fd). */ > +static unsigned int used_shared_memslots; It's just that these vars are updated multiple times when >1 vhost is there, accessing these fields are still a bit confusing - I think it's implicitly protected by BQL so looks always safe. Since we already have the shared/private handling, maybe for the long term it'll be nicer to just keep such info per-device e.g. in vhost_dev so we can also drop vhost_backend_no_private_memslots(). Anyway the code is internal so can be done on top even if worthwhile. > + > static QLIST_HEAD(, vhost_dev) vhost_devices = > QLIST_HEAD_INITIALIZER(vhost_devices); > > bool vhost_has_free_slot(void) > { > - unsigned int slots_limit = ~0U; > + unsigned int free = UINT_MAX; > struct vhost_dev *hdev; > > QLIST_FOREACH(hdev, &vhost_devices, entry) { > unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); > - slots_limit = MIN(slots_limit, r); > + unsigned int cur_free; > + > + if (hdev->vhost_ops->vhost_backend_no_private_memslots && > + hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { > + cur_free = r - used_shared_memslots; > + } else { > + cur_free = r - used_memslots; > + } > + free = MIN(free, cur_free); > } > - return slots_limit > used_memslots; > + return free > 1; Should here be "free > 0" instead? Trivial but maybe still matter when some device used exactly the size of all memslots of a device.. Other than this the patch looks all good here. Thanks,
[...] >> --- a/hw/virtio/vhost.c >> +++ b/hw/virtio/vhost.c >> @@ -46,20 +46,33 @@ >> static struct vhost_log *vhost_log; >> static struct vhost_log *vhost_log_shm; >> >> +/* Memslots used by backends that support private memslots (without an fd). */ >> static unsigned int used_memslots; >> + >> +/* Memslots used by backends that only support shared memslots (with an fd). */ >> +static unsigned int used_shared_memslots; > > It's just that these vars are updated multiple times when >1 vhost is > there, accessing these fields are still a bit confusing - I think it's > implicitly protected by BQL so looks always safe. Yes, like the existing variable. > > Since we already have the shared/private handling, maybe for the long term > it'll be nicer to just keep such info per-device e.g. in vhost_dev so we > can also drop vhost_backend_no_private_memslots(). Anyway the code is > internal so can be done on top even if worthwhile. Might be possible, but I remember there was a catch to it when hotplugging a device. > >> + >> static QLIST_HEAD(, vhost_dev) vhost_devices = >> QLIST_HEAD_INITIALIZER(vhost_devices); >> >> bool vhost_has_free_slot(void) >> { >> - unsigned int slots_limit = ~0U; >> + unsigned int free = UINT_MAX; >> struct vhost_dev *hdev; >> >> QLIST_FOREACH(hdev, &vhost_devices, entry) { >> unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); >> - slots_limit = MIN(slots_limit, r); >> + unsigned int cur_free; >> + >> + if (hdev->vhost_ops->vhost_backend_no_private_memslots && >> + hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { >> + cur_free = r - used_shared_memslots; >> + } else { >> + cur_free = r - used_memslots; >> + } >> + free = MIN(free, cur_free); >> } >> - return slots_limit > used_memslots; >> + return free > 1; > > Should here be "free > 0" instead? > > Trivial but maybe still matter when some device used exactly the size of > all memslots of a device.. Very good catch, thanks Peter!
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index e5285df4ba..0c3e2702b1 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -2453,10 +2453,9 @@ vhost_user_crypto_close_session(struct vhost_dev *dev, uint64_t session_id) return 0; } -static bool vhost_user_mem_section_filter(struct vhost_dev *dev, - MemoryRegionSection *section) +static bool vhost_user_no_private_memslots(struct vhost_dev *dev) { - return memory_region_get_fd(section->mr) >= 0; + return true; } static int vhost_user_get_inflight_fd(struct vhost_dev *dev, @@ -2686,6 +2685,7 @@ const VhostOps user_ops = { .vhost_backend_init = vhost_user_backend_init, .vhost_backend_cleanup = vhost_user_backend_cleanup, .vhost_backend_memslots_limit = vhost_user_memslots_limit, + .vhost_backend_no_private_memslots = vhost_user_no_private_memslots, .vhost_set_log_base = vhost_user_set_log_base, .vhost_set_mem_table = vhost_user_set_mem_table, .vhost_set_vring_addr = vhost_user_set_vring_addr, @@ -2712,7 +2712,6 @@ const VhostOps user_ops = { .vhost_set_config = vhost_user_set_config, .vhost_crypto_create_session = vhost_user_crypto_create_session, .vhost_crypto_close_session = vhost_user_crypto_close_session, - .vhost_backend_mem_section_filter = vhost_user_mem_section_filter, .vhost_get_inflight_fd = vhost_user_get_inflight_fd, .vhost_set_inflight_fd = vhost_user_set_inflight_fd, .vhost_dev_start = vhost_user_dev_start, diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 746d130c74..4fe08c809f 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -46,20 +46,33 @@ static struct vhost_log *vhost_log; static struct vhost_log *vhost_log_shm; +/* Memslots used by backends that support private memslots (without an fd). */ static unsigned int used_memslots; + +/* Memslots used by backends that only support shared memslots (with an fd). */ +static unsigned int used_shared_memslots; + static QLIST_HEAD(, vhost_dev) vhost_devices = QLIST_HEAD_INITIALIZER(vhost_devices); bool vhost_has_free_slot(void) { - unsigned int slots_limit = ~0U; + unsigned int free = UINT_MAX; struct vhost_dev *hdev; QLIST_FOREACH(hdev, &vhost_devices, entry) { unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); - slots_limit = MIN(slots_limit, r); + unsigned int cur_free; + + if (hdev->vhost_ops->vhost_backend_no_private_memslots && + hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { + cur_free = r - used_shared_memslots; + } else { + cur_free = r - used_memslots; + } + free = MIN(free, cur_free); } - return slots_limit > used_memslots; + return free > 1; } static void vhost_dev_sync_region(struct vhost_dev *dev, @@ -475,8 +488,7 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev, * vhost_section: identify sections needed for vhost access * * We only care about RAM sections here (where virtqueue and guest - * internals accessed by virtio might live). If we find one we still - * allow the backend to potentially filter it out of our list. + * internals accessed by virtio might live). */ static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) { @@ -503,8 +515,16 @@ static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) return false; } - if (dev->vhost_ops->vhost_backend_mem_section_filter && - !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) { + /* + * Some backends (like vhost-user) can only handle memory regions + * that have an fd (can be mapped into a different process). Filter + * the ones without an fd out, if requested. + * + * TODO: we might have to limit to MAP_SHARED as well. + */ + if (memory_region_get_fd(section->mr) < 0 && + dev->vhost_ops->vhost_backend_no_private_memslots && + dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { trace_vhost_reject_section(mr->name, 2); return false; } @@ -569,7 +589,14 @@ static void vhost_commit(MemoryListener *listener) dev->n_mem_sections * sizeof dev->mem->regions[0]; dev->mem = g_realloc(dev->mem, regions_size); dev->mem->nregions = dev->n_mem_sections; - used_memslots = dev->mem->nregions; + + if (dev->vhost_ops->vhost_backend_no_private_memslots && + dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { + used_shared_memslots = dev->mem->nregions; + } else { + used_memslots = dev->mem->nregions; + } + for (i = 0; i < dev->n_mem_sections; i++) { struct vhost_memory_region *cur_vmr = dev->mem->regions + i; struct MemoryRegionSection *mrs = dev->mem_sections + i; @@ -1387,6 +1414,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, VhostBackendType backend_type, uint32_t busyloop_timeout, Error **errp) { + unsigned int used; uint64_t features; int i, r, n_initialized_vqs = 0; @@ -1482,7 +1510,17 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, memory_listener_register(&hdev->memory_listener, &address_space_memory); QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); - if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) { + /* + * The listener we registered properly updated the corresponding counter. + * So we can trust that these values are accurate. + */ + if (hdev->vhost_ops->vhost_backend_no_private_memslots && + hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { + used = used_shared_memslots; + } else { + used = used_memslots; + } + if (used > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) { error_setg(errp, "vhost backend memory slots limit is less" " than current number of present memory slots"); r = -EINVAL; diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h index ec3fbae58d..2349a4a7d2 100644 --- a/include/hw/virtio/vhost-backend.h +++ b/include/hw/virtio/vhost-backend.h @@ -108,8 +108,7 @@ typedef int (*vhost_crypto_create_session_op)(struct vhost_dev *dev, typedef int (*vhost_crypto_close_session_op)(struct vhost_dev *dev, uint64_t session_id); -typedef bool (*vhost_backend_mem_section_filter_op)(struct vhost_dev *dev, - MemoryRegionSection *section); +typedef bool (*vhost_backend_no_private_memslots_op)(struct vhost_dev *dev); typedef int (*vhost_get_inflight_fd_op)(struct vhost_dev *dev, uint16_t queue_size, @@ -138,6 +137,7 @@ typedef struct VhostOps { vhost_backend_init vhost_backend_init; vhost_backend_cleanup vhost_backend_cleanup; vhost_backend_memslots_limit vhost_backend_memslots_limit; + vhost_backend_no_private_memslots_op vhost_backend_no_private_memslots; vhost_net_set_backend_op vhost_net_set_backend; vhost_net_set_mtu_op vhost_net_set_mtu; vhost_scsi_set_endpoint_op vhost_scsi_set_endpoint; @@ -172,7 +172,6 @@ typedef struct VhostOps { vhost_set_config_op vhost_set_config; vhost_crypto_create_session_op vhost_crypto_create_session; vhost_crypto_close_session_op vhost_crypto_close_session; - vhost_backend_mem_section_filter_op vhost_backend_mem_section_filter; vhost_get_inflight_fd_op vhost_get_inflight_fd; vhost_set_inflight_fd_op vhost_set_inflight_fd; vhost_dev_start_op vhost_dev_start;