Message ID | 20181213101022.12475-4-jasowang@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vhost: accelerate metadata access through vmap() | expand |
On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: > It was noticed that the copy_user() friends that was used to access > virtqueue metdata tends to be very expensive for dataplane > implementation like vhost since it involves lots of software check, > speculation barrier, hardware feature toggling (e.g SMAP). The > extra cost will be more obvious when transferring small packets. > > This patch tries to eliminate those overhead by pin vq metadata pages > and access them through vmap(). During SET_VRING_ADDR, we will setup > those mappings and memory accessors are modified to use pointers to > access the metadata directly. > > Note, this was only done when device IOTLB is not enabled. We could > use similar method to optimize it in the future. > > Tests shows about ~24% improvement on TX PPS when using virtio-user + > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): > > Before: ~5.0Mpps > After: ~6.1Mpps > > Signed-off-by: Jason Wang <jasowang@redhat.com> > --- > drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ > drivers/vhost/vhost.h | 11 +++ > 2 files changed, 189 insertions(+) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index bafe39d2e637..1bd24203afb6 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, > vq->indirect = NULL; > vq->heads = NULL; > vq->dev = dev; > + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); > + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); > + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); > mutex_init(&vq->mutex); > vhost_vq_reset(dev, vq); > if (vq->handle_kick) > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) > spin_unlock(&dev->iotlb_lock); > } > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, > + size_t size, int write) > +{ > + struct page **pages; > + int npages = DIV_ROUND_UP(size, PAGE_SIZE); > + int npinned; > + void 
*vaddr; > + > + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); > + if (!pages) > + return -ENOMEM; > + > + npinned = get_user_pages_fast(uaddr, npages, write, pages); > + if (npinned != npages) > + goto err; > + As I said I have doubts about the whole approach, but this implementation in particular isn't a good idea as it keeps the page around forever. So no THP, no NUMA rebalancing, userspace-controlled amount of memory locked up and not accounted for. Don't get me wrong it's a great patch in an ideal world. But then in an ideal world no barriers smap etc are necessary at all. > + vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL); > + if (!vaddr) > + goto err; > + > + map->pages = pages; > + map->addr = vaddr + (uaddr & (PAGE_SIZE - 1)); > + map->npages = npages; > + > + return 0; > + > +err: > + if (npinned > 0) > + release_pages(pages, npinned); > + kfree(pages); > + return -EFAULT; > +} > + > +static void vhost_uninit_vmap(struct vhost_vmap *map) > +{ > + if (!map->addr) > + return; > + > + vunmap(map->addr); > + release_pages(map->pages, map->npages); > + kfree(map->pages); > + > + map->addr = NULL; > + map->pages = NULL; > + map->npages = 0; > +} > + > +static void vhost_clean_vmaps(struct vhost_virtqueue *vq) > +{ > + vhost_uninit_vmap(&vq->avail_ring); > + vhost_uninit_vmap(&vq->desc_ring); > + vhost_uninit_vmap(&vq->used_ring); > +} > + > +static int vhost_setup_vmaps(struct vhost_virtqueue *vq, unsigned long avail, > + unsigned long desc, unsigned long used) > +{ > + size_t event = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 
2 : 0; > + size_t avail_size, desc_size, used_size; > + int ret; > + > + vhost_clean_vmaps(vq); > + > + avail_size = sizeof(*vq->avail) + > + sizeof(*vq->avail->ring) * vq->num + event; > + ret = vhost_init_vmap(&vq->avail_ring, avail, avail_size, false); > + if (ret) { > + vq_err(vq, "Fail to setup vmap for avail ring!\n"); > + goto err_avail; > + } > + > + desc_size = sizeof(*vq->desc) * vq->num; > + ret = vhost_init_vmap(&vq->desc_ring, desc, desc_size, false); > + if (ret) { > + vq_err(vq, "Fail to setup vmap for desc ring!\n"); > + goto err_desc; > + } > + > + used_size = sizeof(*vq->used) + > + sizeof(*vq->used->ring) * vq->num + event; > + ret = vhost_init_vmap(&vq->used_ring, used, used_size, true); > + if (ret) { > + vq_err(vq, "Fail to setup vmap for used ring!\n"); > + goto err_used; > + } > + > + return 0; > + > +err_used: > + vhost_uninit_vmap(&vq->used_ring); > +err_desc: > + vhost_uninit_vmap(&vq->avail_ring); > +err_avail: > + return -EFAULT; > +} > + > void vhost_dev_cleanup(struct vhost_dev *dev) > { > int i; > @@ -626,6 +725,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev) > if (dev->vqs[i]->call_ctx) > eventfd_ctx_put(dev->vqs[i]->call_ctx); > vhost_vq_reset(dev, dev->vqs[i]); > + vhost_clean_vmaps(dev->vqs[i]); > } > vhost_dev_free_iovecs(dev); > if (dev->log_ctx) > @@ -873,6 +973,14 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, > > static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + *((__virtio16 *)&used->ring[vq->num]) = > + cpu_to_vhost16(vq, vq->avail_idx); > + return 0; > + } > + > return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx), > vhost_avail_event(vq)); > } > @@ -881,6 +989,13 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq, > struct vring_used_elem *head, int idx, > int count) > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + memcpy(used->ring + 
idx, head, count * sizeof(*head)); > + return 0; > + } > + > return vhost_copy_to_user(vq, vq->used->ring + idx, head, > count * sizeof(*head)); > } > @@ -888,6 +1003,13 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq, > static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) > > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + used->flags = cpu_to_vhost16(vq, vq->used_flags); > + return 0; > + } > + > return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags), > &vq->used->flags); > } > @@ -895,6 +1017,13 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) > static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) > > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + used->idx = cpu_to_vhost16(vq, vq->last_used_idx); > + return 0; > + } > + > return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx), > &vq->used->idx); > } > @@ -926,12 +1055,26 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) > static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, > __virtio16 *idx) > { > + if (!vq->iotlb) { > + struct vring_avail *avail = vq->avail_ring.addr; > + > + *idx = avail->idx; > + return 0; > + } > + > return vhost_get_avail(vq, *idx, &vq->avail->idx); > } > > static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, > __virtio16 *head, int idx) > { > + if (!vq->iotlb) { > + struct vring_avail *avail = vq->avail_ring.addr; > + > + *head = avail->ring[idx & (vq->num - 1)]; > + return 0; > + } > + > return vhost_get_avail(vq, *head, > &vq->avail->ring[idx & (vq->num - 1)]); > } > @@ -939,24 +1082,52 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, > static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, > __virtio16 *flags) > { > + if (!vq->iotlb) { > + struct vring_avail *avail = vq->avail_ring.addr; > + > + *flags = avail->flags; > + return 0; > + } > + > return 
vhost_get_avail(vq, *flags, &vq->avail->flags); > } > > static inline int vhost_get_used_event(struct vhost_virtqueue *vq, > __virtio16 *event) > { > + if (!vq->iotlb) { > + struct vring_avail *avail = vq->avail_ring.addr; > + > + *event = (__virtio16)avail->ring[vq->num]; > + return 0; > + } > + > return vhost_get_avail(vq, *event, vhost_used_event(vq)); > } > > static inline int vhost_get_used_idx(struct vhost_virtqueue *vq, > __virtio16 *idx) > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + *idx = used->idx; > + return 0; > + } > + > return vhost_get_used(vq, *idx, &vq->used->idx); > } > > static inline int vhost_get_desc(struct vhost_virtqueue *vq, > struct vring_desc *desc, int idx) > { > + if (!vq->iotlb) { > + struct vring_desc *d = vq->desc_ring.addr; > + > + *desc = *(d + idx); > + return 0; > + } > + > return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc)); > } > > @@ -1551,6 +1722,13 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg > } > } > > + if (!vq->iotlb && vhost_setup_vmaps(vq, a.avail_user_addr, > + a.desc_user_addr, > + a.used_user_addr)) { > + r = -EINVAL; > + break; > + } > + > vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); > vq->desc = (void __user *)(unsigned long)a.desc_user_addr; > vq->avail = (void __user *)(unsigned long)a.avail_user_addr; > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index 466ef7542291..89dc0ad3d055 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -80,6 +80,12 @@ enum vhost_uaddr_type { > VHOST_NUM_ADDRS = 3, > }; > > +struct vhost_vmap { > + struct page **pages; > + void *addr; > + int npages; > +}; > + > /* The virtqueue structure describes a queue attached to a device. 
*/ > struct vhost_virtqueue { > struct vhost_dev *dev; > @@ -90,6 +96,11 @@ struct vhost_virtqueue { > struct vring_desc __user *desc; > struct vring_avail __user *avail; > struct vring_used __user *used; > + > + struct vhost_vmap avail_ring; > + struct vhost_vmap desc_ring; > + struct vhost_vmap used_ring; > + > const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS]; > struct file *kick; > struct eventfd_ctx *call_ctx; > -- > 2.17.1
.giant snip.. > > + npinned = get_user_pages_fast(uaddr, npages, write, pages); > > + if (npinned != npages) > > + goto err; > > + > > As I said I have doubts about the whole approach, but this > implementation in particular isn't a good idea > as it keeps the page around forever. > So no THP, no NUMA rebalancing, userspace-controlled > amount of memory locked up and not accounted for. > > Don't get me wrong it's a great patch in an ideal world. > But then in an ideal world no barriers smap etc are necessary at all. So .. suggestions on how this could be accepted? As in other ways where we still get vmap and the issues you mentioned are not troubling you? Thanks!
On Thu, Dec 13, 2018 at 04:18:40PM -0500, Konrad Rzeszutek Wilk wrote: > .giant snip.. > > > + npinned = get_user_pages_fast(uaddr, npages, write, pages); > > > + if (npinned != npages) > > > + goto err; > > > + > > > > As I said I have doubts about the whole approach, but this > > implementation in particular isn't a good idea > > as it keeps the page around forever. > > So no THP, no NUMA rebalancing, userspace-controlled > > amount of memory locked up and not accounted for. > > > > Don't get me wrong it's a great patch in an ideal world. > > But then in an ideal world no barriers smap etc are necessary at all. > > So .. suggestions on how this could be accepted? As in other ways > where we still get vmap and the issues you mentioned are not troubling you? > > Thanks! I'd suggest leave vmap alone and find ways to speed up accesses that can fault.
On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: >> It was noticed that the copy_user() friends that was used to access >> virtqueue metdata tends to be very expensive for dataplane >> implementation like vhost since it involves lots of software check, >> speculation barrier, hardware feature toggling (e.g SMAP). The >> extra cost will be more obvious when transferring small packets. >> >> This patch tries to eliminate those overhead by pin vq metadata pages >> and access them through vmap(). During SET_VRING_ADDR, we will setup >> those mappings and memory accessors are modified to use pointers to >> access the metadata directly. >> >> Note, this was only done when device IOTLB is not enabled. We could >> use similar method to optimize it in the future. >> >> Tests shows about ~24% improvement on TX PPS when using virtio-user + >> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): >> >> Before: ~5.0Mpps >> After: ~6.1Mpps >> >> Signed-off-by: Jason Wang<jasowang@redhat.com> >> --- >> drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ >> drivers/vhost/vhost.h | 11 +++ >> 2 files changed, 189 insertions(+) >> >> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c >> index bafe39d2e637..1bd24203afb6 100644 >> --- a/drivers/vhost/vhost.c >> +++ b/drivers/vhost/vhost.c >> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, >> vq->indirect = NULL; >> vq->heads = NULL; >> vq->dev = dev; >> + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); >> + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); >> + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); >> mutex_init(&vq->mutex); >> vhost_vq_reset(dev, vq); >> if (vq->handle_kick) >> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) >> spin_unlock(&dev->iotlb_lock); >> } >> >> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, >> + size_t size, int write) >> +{ >> + 
struct page **pages; >> + int npages = DIV_ROUND_UP(size, PAGE_SIZE); >> + int npinned; >> + void *vaddr; >> + >> + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); >> + if (!pages) >> + return -ENOMEM; >> + >> + npinned = get_user_pages_fast(uaddr, npages, write, pages); >> + if (npinned != npages) >> + goto err; >> + > As I said I have doubts about the whole approach, but this > implementation in particular isn't a good idea > as it keeps the page around forever. > So no THP, no NUMA rebalancing, This is the price of all GUP users not only vhost itself. What's more important, the goal is not to be left too much behind for other backends like DPDK or AF_XDP (all of which are using GUP). > userspace-controlled > amount of memory locked up and not accounted for. It's pretty easy to add this since the slow path was still kept. If we exceed the limitation, we can switch back to slow path. > > Don't get me wrong it's a great patch in an ideal world. > But then in an ideal world no barriers smap etc are necessary at all. Again, this is only for metadata accessing not the data which has been used for years for real use cases. For SMAP, it makes sense for the address that the kernel cannot forecast. But it's not the case for the vhost metadata since we know the address will be accessed very frequently. For speculation barrier, it helps nothing for the data path of vhost which is a kthread. Packet or AF_XDP benefit from accessing metadata directly, we should do it as well. Thanks
On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: > > On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: > > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: > > > It was noticed that the copy_user() friends that was used to access > > > virtqueue metdata tends to be very expensive for dataplane > > > implementation like vhost since it involves lots of software check, > > > speculation barrier, hardware feature toggling (e.g SMAP). The > > > extra cost will be more obvious when transferring small packets. > > > > > > This patch tries to eliminate those overhead by pin vq metadata pages > > > and access them through vmap(). During SET_VRING_ADDR, we will setup > > > those mappings and memory accessors are modified to use pointers to > > > access the metadata directly. > > > > > > Note, this was only done when device IOTLB is not enabled. We could > > > use similar method to optimize it in the future. > > > > > > Tests shows about ~24% improvement on TX PPS when using virtio-user + > > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): > > > > > > Before: ~5.0Mpps > > > After: ~6.1Mpps > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com> > > > --- > > > drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ > > > drivers/vhost/vhost.h | 11 +++ > > > 2 files changed, 189 insertions(+) > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > > > index bafe39d2e637..1bd24203afb6 100644 > > > --- a/drivers/vhost/vhost.c > > > +++ b/drivers/vhost/vhost.c > > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, > > > vq->indirect = NULL; > > > vq->heads = NULL; > > > vq->dev = dev; > > > + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); > > > + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); > > > + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); > > > mutex_init(&vq->mutex); > > > vhost_vq_reset(dev, vq); > > > if (vq->handle_kick) > > > @@ -614,6 +617,102 @@ static void 
vhost_clear_msg(struct vhost_dev *dev) > > > spin_unlock(&dev->iotlb_lock); > > > } > > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, > > > + size_t size, int write) > > > +{ > > > + struct page **pages; > > > + int npages = DIV_ROUND_UP(size, PAGE_SIZE); > > > + int npinned; > > > + void *vaddr; > > > + > > > + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); > > > + if (!pages) > > > + return -ENOMEM; > > > + > > > + npinned = get_user_pages_fast(uaddr, npages, write, pages); > > > + if (npinned != npages) > > > + goto err; > > > + > > As I said I have doubts about the whole approach, but this > > implementation in particular isn't a good idea > > as it keeps the page around forever. > > So no THP, no NUMA rebalancing, > > > This is the price of all GUP users not only vhost itself. Yes. GUP is just not a great interface for vhost to use. > What's more > important, the goal is not to be left too much behind for other backends > like DPDK or AF_XDP (all of which are using GUP). So these guys assume userspace knows what it's doing. We can't assume that. > > > userspace-controlled > > amount of memory locked up and not accounted for. > > > It's pretty easy to add this since the slow path was still kept. If we > exceeds the limitation, we can switch back to slow path. > > > > > Don't get me wrong it's a great patch in an ideal world. > > But then in an ideal world no barriers smap etc are necessary at all. > > > Again, this is only for metadata accessing not the data which has been used > for years for real use cases. > > For SMAP, it makes senses for the address that kernel can not forcast. But > it's not the case for the vhost metadata since we know the address will be > accessed very frequently. For speculation barrier, it helps nothing for the > data path of vhost which is a kthread. I don't see how a kthread makes any difference. We do have a validation step which makes some difference. 
> Packet or AF_XDP benefit from > accessing metadata directly, we should do it as well. > > Thanks
Hi Jason, I love your patch! Yet something to improve: [auto build test ERROR on net-next/master] url: https://github.com/0day-ci/linux/commits/Jason-Wang/vhost-accelerate-metadata-access-through-vmap/20181214-200417 config: mips-malta_kvm_defconfig (attached as .config) compiler: mipsel-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree GCC_VERSION=7.2.0 make.cross ARCH=mips All errors (new ones prefixed by >>): drivers//vhost/vhost.c: In function 'vhost_init_vmap': >> drivers//vhost/vhost.c:648:3: error: implicit declaration of function 'release_pages'; did you mean 'release_task'? [-Werror=implicit-function-declaration] release_pages(pages, npinned); ^~~~~~~~~~~~~ release_task cc1: some warnings being treated as errors vim +648 drivers//vhost/vhost.c 619 620 static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, 621 size_t size, int write) 622 { 623 struct page **pages; 624 int npages = DIV_ROUND_UP(size, PAGE_SIZE); 625 int npinned; 626 void *vaddr; 627 628 pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); 629 if (!pages) 630 return -ENOMEM; 631 632 npinned = get_user_pages_fast(uaddr, npages, write, pages); 633 if (npinned != npages) 634 goto err; 635 636 vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL); 637 if (!vaddr) 638 goto err; 639 640 map->pages = pages; 641 map->addr = vaddr + (uaddr & (PAGE_SIZE - 1)); 642 map->npages = npages; 643 644 return 0; 645 646 err: 647 if (npinned > 0) > 648 release_pages(pages, npinned); 649 kfree(pages); 650 return -EFAULT; 651 } 652 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
From: Jason Wang <jasowang@redhat.com> Date: Fri, 14 Dec 2018 11:57:35 +0800 > This is the price of all GUP users not only vhost itself. What's more > important, the goal is not to be left too much behind for other > backends like DPDK or AF_XDP (all of which are using GUP). +1
On 2018/12/14 下午8:36, Michael S. Tsirkin wrote: > On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: >> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: >>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: >>>> It was noticed that the copy_user() friends that was used to access >>>> virtqueue metdata tends to be very expensive for dataplane >>>> implementation like vhost since it involves lots of software check, >>>> speculation barrier, hardware feature toggling (e.g SMAP). The >>>> extra cost will be more obvious when transferring small packets. >>>> >>>> This patch tries to eliminate those overhead by pin vq metadata pages >>>> and access them through vmap(). During SET_VRING_ADDR, we will setup >>>> those mappings and memory accessors are modified to use pointers to >>>> access the metadata directly. >>>> >>>> Note, this was only done when device IOTLB is not enabled. We could >>>> use similar method to optimize it in the future. >>>> >>>> Tests shows about ~24% improvement on TX PPS when using virtio-user + >>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): >>>> >>>> Before: ~5.0Mpps >>>> After: ~6.1Mpps >>>> >>>> Signed-off-by: Jason Wang<jasowang@redhat.com> >>>> --- >>>> drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ >>>> drivers/vhost/vhost.h | 11 +++ >>>> 2 files changed, 189 insertions(+) >>>> >>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c >>>> index bafe39d2e637..1bd24203afb6 100644 >>>> --- a/drivers/vhost/vhost.c >>>> +++ b/drivers/vhost/vhost.c >>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, >>>> vq->indirect = NULL; >>>> vq->heads = NULL; >>>> vq->dev = dev; >>>> + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); >>>> + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); >>>> + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); >>>> mutex_init(&vq->mutex); >>>> vhost_vq_reset(dev, vq); >>>> if (vq->handle_kick) >>>> @@ -614,6 +617,102 @@ static void 
vhost_clear_msg(struct vhost_dev *dev) >>>> spin_unlock(&dev->iotlb_lock); >>>> } >>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, >>>> + size_t size, int write) >>>> +{ >>>> + struct page **pages; >>>> + int npages = DIV_ROUND_UP(size, PAGE_SIZE); >>>> + int npinned; >>>> + void *vaddr; >>>> + >>>> + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); >>>> + if (!pages) >>>> + return -ENOMEM; >>>> + >>>> + npinned = get_user_pages_fast(uaddr, npages, write, pages); >>>> + if (npinned != npages) >>>> + goto err; >>>> + >>> As I said I have doubts about the whole approach, but this >>> implementation in particular isn't a good idea >>> as it keeps the page around forever. The pages will be released during set features. >>> So no THP, no NUMA rebalancing, For THP, we will probably miss 2 or 4 pages, but does this really matter considering the gain we have? For NUMA rebalancing, I'm even not quite sure if it can help for the case of IPC (vhost). It looks to me the worst case it may cause pages to thrash between nodes if vhost and userspace are running in two nodes. >> >> This is the price of all GUP users not only vhost itself. > Yes. GUP is just not a great interface for vhost to use. Zerocopy codes (enabled by default) use them for years. > >> What's more >> important, the goal is not to be left too much behind for other backends >> like DPDK or AF_XDP (all of which are using GUP). > > So these guys assume userspace knows what it's doing. > We can't assume that. What kind of assumptions do they have? > >>> userspace-controlled >>> amount of memory locked up and not accounted for. >> >> It's pretty easy to add this since the slow path was still kept. If we >> exceeds the limitation, we can switch back to slow path. >> >>> Don't get me wrong it's a great patch in an ideal world. >>> But then in an ideal world no barriers smap etc are necessary at all. 
>> >> Again, this is only for metadata accessing not the data which has been used >> for years for real use cases. >> >> For SMAP, it makes senses for the address that kernel can not forcast. But >> it's not the case for the vhost metadata since we know the address will be >> accessed very frequently. For speculation barrier, it helps nothing for the >> data path of vhost which is a kthread. > I don't see how a kthread makes any difference. We do have a validation > step which makes some difference. The problem is not the kthread but the userspace address. The addresses of vq metadata tend to be consistent for a while, and vhost knows they will be accessed frequently. SMAP doesn't help too much in this case. Thanks. > >> Packet or AF_XDP benefit from >> accessing metadata directly, we should do it as well. >> >> Thanks
On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote: > > On 2018/12/14 下午8:36, Michael S. Tsirkin wrote: > > On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: > > > On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: > > > > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: > > > > > It was noticed that the copy_user() friends that was used to access > > > > > virtqueue metdata tends to be very expensive for dataplane > > > > > implementation like vhost since it involves lots of software check, > > > > > speculation barrier, hardware feature toggling (e.g SMAP). The > > > > > extra cost will be more obvious when transferring small packets. > > > > > > > > > > This patch tries to eliminate those overhead by pin vq metadata pages > > > > > and access them through vmap(). During SET_VRING_ADDR, we will setup > > > > > those mappings and memory accessors are modified to use pointers to > > > > > access the metadata directly. > > > > > > > > > > Note, this was only done when device IOTLB is not enabled. We could > > > > > use similar method to optimize it in the future. 
> > > > > > > > > > Tests shows about ~24% improvement on TX PPS when using virtio-user + > > > > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): > > > > > > > > > > Before: ~5.0Mpps > > > > > After: ~6.1Mpps > > > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com> > > > > > --- > > > > > drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ > > > > > drivers/vhost/vhost.h | 11 +++ > > > > > 2 files changed, 189 insertions(+) > > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > > > > > index bafe39d2e637..1bd24203afb6 100644 > > > > > --- a/drivers/vhost/vhost.c > > > > > +++ b/drivers/vhost/vhost.c > > > > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, > > > > > vq->indirect = NULL; > > > > > vq->heads = NULL; > > > > > vq->dev = dev; > > > > > + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); > > > > > + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); > > > > > + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); > > > > > mutex_init(&vq->mutex); > > > > > vhost_vq_reset(dev, vq); > > > > > if (vq->handle_kick) > > > > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) > > > > > spin_unlock(&dev->iotlb_lock); > > > > > } > > > > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, > > > > > + size_t size, int write) > > > > > +{ > > > > > + struct page **pages; > > > > > + int npages = DIV_ROUND_UP(size, PAGE_SIZE); > > > > > + int npinned; > > > > > + void *vaddr; > > > > > + > > > > > + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); > > > > > + if (!pages) > > > > > + return -ENOMEM; > > > > > + > > > > > + npinned = get_user_pages_fast(uaddr, npages, write, pages); > > > > > + if (npinned != npages) > > > > > + goto err; > > > > > + > > > > As I said I have doubts about the whole approach, but this > > > > implementation in particular isn't a good idea > > > > as it keeps the page around forever. 
> > > The pages wil be released during set features. > > > > > > So no THP, no NUMA rebalancing, > > > For THP, we will probably miss 2 or 4 pages, but does this really matter > consider the gain we have? We as in vhost? networking isn't the only thing guest does. We don't even know if this guest does a lot of networking. You don't know what else is in this huge page. Can be something very important that guest touches all the time. > For NUMA rebalancing, I'm even not quite sure if > it can helps for the case of IPC (vhost). It looks to me the worst case it > may cause page to be thrash between nodes if vhost and userspace are running > in two nodes. So again it's a gain for vhost but has a completely unpredictable effect on other functionality of the guest. That's what bothers me with this approach. > > > > > > > This is the price of all GUP users not only vhost itself. > > Yes. GUP is just not a great interface for vhost to use. > > > Zerocopy codes (enabled by defualt) use them for years. But only for TX and temporarily. We pin, read, unpin. Your patch is different - it writes into memory and GUP has known issues with file backed memory - it keeps pages pinned forever > > > > > > What's more > > > important, the goal is not to be left too much behind for other backends > > > like DPDK or AF_XDP (all of which are using GUP). > > > > So these guys assume userspace knows what it's doing. > > We can't assume that. > > > What kind of assumption do you they have? > > > > > > > > userspace-controlled > > > > amount of memory locked up and not accounted for. > > > > > > It's pretty easy to add this since the slow path was still kept. If we > > > exceeds the limitation, we can switch back to slow path. > > > > > > > Don't get me wrong it's a great patch in an ideal world. > > > > But then in an ideal world no barriers smap etc are necessary at all. > > > > > > Again, this is only for metadata accessing not the data which has been used > > > for years for real use cases. 
> > > > > > For SMAP, it makes senses for the address that kernel can not forcast. But > > > it's not the case for the vhost metadata since we know the address will be > > > accessed very frequently. For speculation barrier, it helps nothing for the > > > data path of vhost which is a kthread. > > I don't see how a kthread makes any difference. We do have a validation > > step which makes some difference. > > > The problem is not kthread but the address of userspace address. The > addresses of vq metadata tends to be consistent for a while, and vhost knows > they will be frequently. SMAP doesn't help too much in this case. > > Thanks. It's true for a real life applications but a malicious one can call the setup ioctls any number of times. And SMAP is all about malcious applications. > > > > > > Packet or AF_XDP benefit from > > > accessing metadata directly, we should do it as well. > > > > > > Thanks
On 2018/12/25 上午2:10, Michael S. Tsirkin wrote: > On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote: >> On 2018/12/14 下午8:36, Michael S. Tsirkin wrote: >>> On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: >>>> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: >>>>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: >>>>>> It was noticed that the copy_user() friends that was used to access >>>>>> virtqueue metdata tends to be very expensive for dataplane >>>>>> implementation like vhost since it involves lots of software check, >>>>>> speculation barrier, hardware feature toggling (e.g SMAP). The >>>>>> extra cost will be more obvious when transferring small packets. >>>>>> >>>>>> This patch tries to eliminate those overhead by pin vq metadata pages >>>>>> and access them through vmap(). During SET_VRING_ADDR, we will setup >>>>>> those mappings and memory accessors are modified to use pointers to >>>>>> access the metadata directly. >>>>>> >>>>>> Note, this was only done when device IOTLB is not enabled. We could >>>>>> use similar method to optimize it in the future. 
>>>>>> >>>>>> Tests shows about ~24% improvement on TX PPS when using virtio-user + >>>>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): >>>>>> >>>>>> Before: ~5.0Mpps >>>>>> After: ~6.1Mpps >>>>>> >>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com> >>>>>> --- >>>>>> drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ >>>>>> drivers/vhost/vhost.h | 11 +++ >>>>>> 2 files changed, 189 insertions(+) >>>>>> >>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c >>>>>> index bafe39d2e637..1bd24203afb6 100644 >>>>>> --- a/drivers/vhost/vhost.c >>>>>> +++ b/drivers/vhost/vhost.c >>>>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, >>>>>> vq->indirect = NULL; >>>>>> vq->heads = NULL; >>>>>> vq->dev = dev; >>>>>> + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); >>>>>> + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); >>>>>> + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); >>>>>> mutex_init(&vq->mutex); >>>>>> vhost_vq_reset(dev, vq); >>>>>> if (vq->handle_kick) >>>>>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) >>>>>> spin_unlock(&dev->iotlb_lock); >>>>>> } >>>>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, >>>>>> + size_t size, int write) >>>>>> +{ >>>>>> + struct page **pages; >>>>>> + int npages = DIV_ROUND_UP(size, PAGE_SIZE); >>>>>> + int npinned; >>>>>> + void *vaddr; >>>>>> + >>>>>> + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); >>>>>> + if (!pages) >>>>>> + return -ENOMEM; >>>>>> + >>>>>> + npinned = get_user_pages_fast(uaddr, npages, write, pages); >>>>>> + if (npinned != npages) >>>>>> + goto err; >>>>>> + >>>>> As I said I have doubts about the whole approach, but this >>>>> implementation in particular isn't a good idea >>>>> as it keeps the page around forever. >> >> The pages wil be released during set features. 
>> >> >>>>> So no THP, no NUMA rebalancing, >> >> For THP, we will probably miss 2 or 4 pages, but does this really matter >> consider the gain we have? > We as in vhost? networking isn't the only thing guest does. > We don't even know if this guest does a lot of networking. > You don't > know what else is in this huge page. Can be something very important > that guest touches all the time. Well, the probability should be very small consider we usually give several gigabytes to guest. The rest of the pages that doesn't sit in the same hugepage with metadata can still be merged by THP. Anyway, I can test the differences. > >> For NUMA rebalancing, I'm even not quite sure if >> it can helps for the case of IPC (vhost). It looks to me the worst case it >> may cause page to be thrash between nodes if vhost and userspace are running >> in two nodes. > > So again it's a gain for vhost but has a completely unpredictable effect on > other functionality of the guest. > > That's what bothers me with this approach. So: - The rest of the pages could still be balanced to other nodes, no? - try to balance metadata pages (belongs to co-operate processes) itself is still questionable > > > > >>>> This is the price of all GUP users not only vhost itself. >>> Yes. GUP is just not a great interface for vhost to use. >> >> Zerocopy codes (enabled by defualt) use them for years. > But only for TX and temporarily. We pin, read, unpin. Probably not. For several reasons that the page will be not be released soon or held for a very long period of time or even forever. > > Your patch is different > > - it writes into memory and GUP has known issues with file > backed memory The ordinary user for vhost is anonymous pages I think? > - it keeps pages pinned forever > > > >>>> What's more >>>> important, the goal is not to be left too much behind for other backends >>>> like DPDK or AF_XDP (all of which are using GUP). >>> So these guys assume userspace knows what it's doing. 
>>> We can't assume that. >> >> What kind of assumption do you they have? >> >> >>>>> userspace-controlled >>>>> amount of memory locked up and not accounted for. >>>> It's pretty easy to add this since the slow path was still kept. If we >>>> exceeds the limitation, we can switch back to slow path. >>>> >>>>> Don't get me wrong it's a great patch in an ideal world. >>>>> But then in an ideal world no barriers smap etc are necessary at all. >>>> Again, this is only for metadata accessing not the data which has been used >>>> for years for real use cases. >>>> >>>> For SMAP, it makes senses for the address that kernel can not forcast. But >>>> it's not the case for the vhost metadata since we know the address will be >>>> accessed very frequently. For speculation barrier, it helps nothing for the >>>> data path of vhost which is a kthread. >>> I don't see how a kthread makes any difference. We do have a validation >>> step which makes some difference. >> >> The problem is not kthread but the address of userspace address. The >> addresses of vq metadata tends to be consistent for a while, and vhost knows >> they will be frequently. SMAP doesn't help too much in this case. >> >> Thanks. > It's true for a real life applications but a malicious one > can call the setup ioctls any number of times. And SMAP is > all about malcious applications. We don't do this in the path of ioctl, there's no context switch between userspace and kernel in the worker thread. SMAP is used to prevent kernel from accessing userspace pages unexpectedly which is not the case for metadata access. Thanks > >>>> Packet or AF_XDP benefit from >>>> accessing metadata directly, we should do it as well. >>>> >>>> Thanks
On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote: > > On 2018/12/25 上午2:10, Michael S. Tsirkin wrote: > > On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote: > > > On 2018/12/14 下午8:36, Michael S. Tsirkin wrote: > > > > On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: > > > > > On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: > > > > > > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: > > > > > > > It was noticed that the copy_user() friends that was used to access > > > > > > > virtqueue metdata tends to be very expensive for dataplane > > > > > > > implementation like vhost since it involves lots of software check, > > > > > > > speculation barrier, hardware feature toggling (e.g SMAP). The > > > > > > > extra cost will be more obvious when transferring small packets. > > > > > > > > > > > > > > This patch tries to eliminate those overhead by pin vq metadata pages > > > > > > > and access them through vmap(). During SET_VRING_ADDR, we will setup > > > > > > > those mappings and memory accessors are modified to use pointers to > > > > > > > access the metadata directly. > > > > > > > > > > > > > > Note, this was only done when device IOTLB is not enabled. We could > > > > > > > use similar method to optimize it in the future. 
> > > > > > > > > > > > > > Tests shows about ~24% improvement on TX PPS when using virtio-user + > > > > > > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): > > > > > > > > > > > > > > Before: ~5.0Mpps > > > > > > > After: ~6.1Mpps > > > > > > > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com> > > > > > > > --- > > > > > > > drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ > > > > > > > drivers/vhost/vhost.h | 11 +++ > > > > > > > 2 files changed, 189 insertions(+) > > > > > > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > > > > > > > index bafe39d2e637..1bd24203afb6 100644 > > > > > > > --- a/drivers/vhost/vhost.c > > > > > > > +++ b/drivers/vhost/vhost.c > > > > > > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, > > > > > > > vq->indirect = NULL; > > > > > > > vq->heads = NULL; > > > > > > > vq->dev = dev; > > > > > > > + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); > > > > > > > + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); > > > > > > > + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); > > > > > > > mutex_init(&vq->mutex); > > > > > > > vhost_vq_reset(dev, vq); > > > > > > > if (vq->handle_kick) > > > > > > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) > > > > > > > spin_unlock(&dev->iotlb_lock); > > > > > > > } > > > > > > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, > > > > > > > + size_t size, int write) > > > > > > > +{ > > > > > > > + struct page **pages; > > > > > > > + int npages = DIV_ROUND_UP(size, PAGE_SIZE); > > > > > > > + int npinned; > > > > > > > + void *vaddr; > > > > > > > + > > > > > > > + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); > > > > > > > + if (!pages) > > > > > > > + return -ENOMEM; > > > > > > > + > > > > > > > + npinned = get_user_pages_fast(uaddr, npages, write, pages); > > > > > > > + if (npinned != npages) > > > > > > > + goto err; > 
> > > > > > + > > > > > > As I said I have doubts about the whole approach, but this > > > > > > implementation in particular isn't a good idea > > > > > > as it keeps the page around forever. > > > > > > The pages wil be released during set features. > > > > > > > > > > > > So no THP, no NUMA rebalancing, > > > > > > For THP, we will probably miss 2 or 4 pages, but does this really matter > > > consider the gain we have? > > We as in vhost? networking isn't the only thing guest does. > > We don't even know if this guest does a lot of networking. > > You don't > > know what else is in this huge page. Can be something very important > > that guest touches all the time. > > > Well, the probability should be very small consider we usually give several > gigabytes to guest. The rest of the pages that doesn't sit in the same > hugepage with metadata can still be merged by THP. Anyway, I can test the > differences. Thanks! > > > > > > For NUMA rebalancing, I'm even not quite sure if > > > it can helps for the case of IPC (vhost). It looks to me the worst case it > > > may cause page to be thrash between nodes if vhost and userspace are running > > > in two nodes. > > > > So again it's a gain for vhost but has a completely unpredictable effect on > > other functionality of the guest. > > > > That's what bothers me with this approach. > > > So: > > - The rest of the pages could still be balanced to other nodes, no? > > - try to balance metadata pages (belongs to co-operate processes) itself is > still questionable I am not sure why. It should be easy enough to force the VCPU and vhost to move (e.g. start them pinned to 1 cpu, then pin them to another one). Clearly sometimes this would be necessary for load balancing reasons. With autonuma after a while (could take seconds but it will happen) the memory will migrate. > > > > > > > > > > > > > > This is the price of all GUP users not only vhost itself. > > > > Yes. GUP is just not a great interface for vhost to use. 
> > > > > > Zerocopy codes (enabled by defualt) use them for years. > > But only for TX and temporarily. We pin, read, unpin. > > > Probably not. For several reasons that the page will be not be released soon > or held for a very long period of time or even forever. With zero copy? Well it's pinned until transmit. Takes a while but could be enough for autocopy to work esp since its the packet memory so not reused immediately. > > > > > Your patch is different > > > > - it writes into memory and GUP has known issues with file > > backed memory > > > The ordinary user for vhost is anonymous pages I think? It's not the most common scenario and not the fastest one (e.g. THP does not work) but file backed is useful sometimes. It would not be nice at all to corrupt guest memory in that case. > > > - it keeps pages pinned forever > > > > > > > > > > > What's more > > > > > important, the goal is not to be left too much behind for other backends > > > > > like DPDK or AF_XDP (all of which are using GUP). > > > > So these guys assume userspace knows what it's doing. > > > > We can't assume that. > > > > > > What kind of assumption do you they have? > > > > > > > > > > > > userspace-controlled > > > > > > amount of memory locked up and not accounted for. > > > > > It's pretty easy to add this since the slow path was still kept. If we > > > > > exceeds the limitation, we can switch back to slow path. > > > > > > > > > > > Don't get me wrong it's a great patch in an ideal world. > > > > > > But then in an ideal world no barriers smap etc are necessary at all. > > > > > Again, this is only for metadata accessing not the data which has been used > > > > > for years for real use cases. > > > > > > > > > > For SMAP, it makes senses for the address that kernel can not forcast. But > > > > > it's not the case for the vhost metadata since we know the address will be > > > > > accessed very frequently. 
For speculation barrier, it helps nothing for the > > > > > data path of vhost which is a kthread. > > > > I don't see how a kthread makes any difference. We do have a validation > > > > step which makes some difference. > > > > > > The problem is not kthread but the address of userspace address. The > > > addresses of vq metadata tends to be consistent for a while, and vhost knows > > > they will be frequently. SMAP doesn't help too much in this case. > > > > > > Thanks. > > It's true for a real life applications but a malicious one > > can call the setup ioctls any number of times. And SMAP is > > all about malcious applications. > > > We don't do this in the path of ioctl, there's no context switch between > userspace and kernel in the worker thread. SMAP is used to prevent kernel > from accessing userspace pages unexpectedly which is not the case for > metadata access. > > Thanks OK let's forget smap for now. > > > > > > > > Packet or AF_XDP benefit from > > > > > accessing metadata directly, we should do it as well. > > > > > > > > > > Thanks
On 2018/12/25 下午8:50, Michael S. Tsirkin wrote: > On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote: >> On 2018/12/25 上午2:10, Michael S. Tsirkin wrote: >>> On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote: >>>> On 2018/12/14 下午8:36, Michael S. Tsirkin wrote: >>>>> On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: >>>>>> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: >>>>>>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: >>>>>>>> It was noticed that the copy_user() friends that was used to access >>>>>>>> virtqueue metdata tends to be very expensive for dataplane >>>>>>>> implementation like vhost since it involves lots of software check, >>>>>>>> speculation barrier, hardware feature toggling (e.g SMAP). The >>>>>>>> extra cost will be more obvious when transferring small packets. >>>>>>>> >>>>>>>> This patch tries to eliminate those overhead by pin vq metadata pages >>>>>>>> and access them through vmap(). During SET_VRING_ADDR, we will setup >>>>>>>> those mappings and memory accessors are modified to use pointers to >>>>>>>> access the metadata directly. >>>>>>>> >>>>>>>> Note, this was only done when device IOTLB is not enabled. We could >>>>>>>> use similar method to optimize it in the future. 
>>>>>>>> >>>>>>>> Tests shows about ~24% improvement on TX PPS when using virtio-user + >>>>>>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): >>>>>>>> >>>>>>>> Before: ~5.0Mpps >>>>>>>> After: ~6.1Mpps >>>>>>>> >>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com> >>>>>>>> --- >>>>>>>> drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ >>>>>>>> drivers/vhost/vhost.h | 11 +++ >>>>>>>> 2 files changed, 189 insertions(+) >>>>>>>> >>>>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c >>>>>>>> index bafe39d2e637..1bd24203afb6 100644 >>>>>>>> --- a/drivers/vhost/vhost.c >>>>>>>> +++ b/drivers/vhost/vhost.c >>>>>>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, >>>>>>>> vq->indirect = NULL; >>>>>>>> vq->heads = NULL; >>>>>>>> vq->dev = dev; >>>>>>>> + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); >>>>>>>> + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); >>>>>>>> + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); >>>>>>>> mutex_init(&vq->mutex); >>>>>>>> vhost_vq_reset(dev, vq); >>>>>>>> if (vq->handle_kick) >>>>>>>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) >>>>>>>> spin_unlock(&dev->iotlb_lock); >>>>>>>> } >>>>>>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, >>>>>>>> + size_t size, int write) >>>>>>>> +{ >>>>>>>> + struct page **pages; >>>>>>>> + int npages = DIV_ROUND_UP(size, PAGE_SIZE); >>>>>>>> + int npinned; >>>>>>>> + void *vaddr; >>>>>>>> + >>>>>>>> + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); >>>>>>>> + if (!pages) >>>>>>>> + return -ENOMEM; >>>>>>>> + >>>>>>>> + npinned = get_user_pages_fast(uaddr, npages, write, pages); >>>>>>>> + if (npinned != npages) >>>>>>>> + goto err; >>>>>>>> + >>>>>>> As I said I have doubts about the whole approach, but this >>>>>>> implementation in particular isn't a good idea >>>>>>> as it keeps the page around forever. 
>>>> The pages wil be released during set features. >>>> >>>> >>>>>>> So no THP, no NUMA rebalancing, >>>> For THP, we will probably miss 2 or 4 pages, but does this really matter >>>> consider the gain we have? >>> We as in vhost? networking isn't the only thing guest does. >>> We don't even know if this guest does a lot of networking. >>> You don't >>> know what else is in this huge page. Can be something very important >>> that guest touches all the time. >> >> Well, the probability should be very small consider we usually give several >> gigabytes to guest. The rest of the pages that doesn't sit in the same >> hugepage with metadata can still be merged by THP. Anyway, I can test the >> differences. > Thanks! > >>>> For NUMA rebalancing, I'm even not quite sure if >>>> it can helps for the case of IPC (vhost). It looks to me the worst case it >>>> may cause page to be thrash between nodes if vhost and userspace are running >>>> in two nodes. >>> So again it's a gain for vhost but has a completely unpredictable effect on >>> other functionality of the guest. >>> >>> That's what bothers me with this approach. >> >> So: >> >> - The rest of the pages could still be balanced to other nodes, no? >> >> - try to balance metadata pages (belongs to co-operate processes) itself is >> still questionable > I am not sure why. It should be easy enough to force the VCPU and vhost > to move (e.g. start them pinned to 1 cpu, then pin them to another one). > Clearly sometimes this would be necessary for load balancing reasons. Yes, but it looks to me the part of motivation of auto NUMA is to avoid manual pinning. > With autonuma after a while (could take seconds but it will happen) the > memory will migrate. > Yes. 
As you mentioned during the discuss, I wonder we could do it similarly through mmu notifier like APIC access page in commit c24ae0dcd3e ("kvm: x86: Unpin and remove kvm_arch->apic_access_page") > > >>> >>> >>> >>>>>> This is the price of all GUP users not only vhost itself. >>>>> Yes. GUP is just not a great interface for vhost to use. >>>> Zerocopy codes (enabled by defualt) use them for years. >>> But only for TX and temporarily. We pin, read, unpin. >> >> Probably not. For several reasons that the page will be not be released soon >> or held for a very long period of time or even forever. > > With zero copy? Well it's pinned until transmit. Takes a while > but could be enough for autocopy to work esp since > its the packet memory so not reused immediately. > >>> Your patch is different >>> >>> - it writes into memory and GUP has known issues with file >>> backed memory >> >> The ordinary user for vhost is anonymous pages I think? > > It's not the most common scenario and not the fastest one > (e.g. THP does not work) but file backed is useful sometimes. > It would not be nice at all to corrupt guest memory in that case. Ok. > >>> - it keeps pages pinned forever >>> >>> >>> >>>>>> What's more >>>>>> important, the goal is not to be left too much behind for other backends >>>>>> like DPDK or AF_XDP (all of which are using GUP). >>>>> So these guys assume userspace knows what it's doing. >>>>> We can't assume that. >>>> What kind of assumption do you they have? >>>> >>>> >>>>>>> userspace-controlled >>>>>>> amount of memory locked up and not accounted for. >>>>>> It's pretty easy to add this since the slow path was still kept. If we >>>>>> exceeds the limitation, we can switch back to slow path. >>>>>> >>>>>>> Don't get me wrong it's a great patch in an ideal world. >>>>>>> But then in an ideal world no barriers smap etc are necessary at all. >>>>>> Again, this is only for metadata accessing not the data which has been used >>>>>> for years for real use cases. 
>>>>>> >>>>>> For SMAP, it makes senses for the address that kernel can not forcast. But >>>>>> it's not the case for the vhost metadata since we know the address will be >>>>>> accessed very frequently. For speculation barrier, it helps nothing for the >>>>>> data path of vhost which is a kthread. >>>>> I don't see how a kthread makes any difference. We do have a validation >>>>> step which makes some difference. >>>> The problem is not kthread but the address of userspace address. The >>>> addresses of vq metadata tends to be consistent for a while, and vhost knows >>>> they will be frequently. SMAP doesn't help too much in this case. >>>> >>>> Thanks. >>> It's true for a real life applications but a malicious one >>> can call the setup ioctls any number of times. And SMAP is >>> all about malcious applications. >> >> We don't do this in the path of ioctl, there's no context switch between >> userspace and kernel in the worker thread. SMAP is used to prevent kernel >> from accessing userspace pages unexpectedly which is not the case for >> metadata access. >> >> Thanks > OK let's forget smap for now. Some numbers I measured: On an old Sandy bridge machine without SMAP support. Remove speculation barrier boost the performance from 4.6Mpps to 5.1Mpps On a newer Broadwell machine with SMAP support. Remove speculation barrier only gives 2%-5% improvement, disable SMAP completely through Kconfig boost 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps - 6.1Mpps, it only bypass SMAP for metadata). So it looks like for recent machine, SMAP becomes pain point when the copy is short (e.g 64B) for high PPS. Thanks > >>>>>> Packet or AF_XDP benefit from >>>>>> accessing metadata directly, we should do it as well. >>>>>> >>>>>> Thanks
On Wed, Dec 26, 2018 at 11:57:32AM +0800, Jason Wang wrote: > > On 2018/12/25 下午8:50, Michael S. Tsirkin wrote: > > On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote: > > > On 2018/12/25 上午2:10, Michael S. Tsirkin wrote: > > > > On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote: > > > > > On 2018/12/14 下午8:36, Michael S. Tsirkin wrote: > > > > > > On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: > > > > > > > On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: > > > > > > > > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: > > > > > > > > > It was noticed that the copy_user() friends that was used to access > > > > > > > > > virtqueue metdata tends to be very expensive for dataplane > > > > > > > > > implementation like vhost since it involves lots of software check, > > > > > > > > > speculation barrier, hardware feature toggling (e.g SMAP). The > > > > > > > > > extra cost will be more obvious when transferring small packets. > > > > > > > > > > > > > > > > > > This patch tries to eliminate those overhead by pin vq metadata pages > > > > > > > > > and access them through vmap(). During SET_VRING_ADDR, we will setup > > > > > > > > > those mappings and memory accessors are modified to use pointers to > > > > > > > > > access the metadata directly. > > > > > > > > > > > > > > > > > > Note, this was only done when device IOTLB is not enabled. We could > > > > > > > > > use similar method to optimize it in the future. 
> > > > > > > > > > > > > > > > > > Tests shows about ~24% improvement on TX PPS when using virtio-user + > > > > > > > > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): > > > > > > > > > > > > > > > > > > Before: ~5.0Mpps > > > > > > > > > After: ~6.1Mpps > > > > > > > > > > > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com> > > > > > > > > > --- > > > > > > > > > drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > > drivers/vhost/vhost.h | 11 +++ > > > > > > > > > 2 files changed, 189 insertions(+) > > > > > > > > > > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > > > > > > > > > index bafe39d2e637..1bd24203afb6 100644 > > > > > > > > > --- a/drivers/vhost/vhost.c > > > > > > > > > +++ b/drivers/vhost/vhost.c > > > > > > > > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, > > > > > > > > > vq->indirect = NULL; > > > > > > > > > vq->heads = NULL; > > > > > > > > > vq->dev = dev; > > > > > > > > > + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); > > > > > > > > > + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); > > > > > > > > > + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); > > > > > > > > > mutex_init(&vq->mutex); > > > > > > > > > vhost_vq_reset(dev, vq); > > > > > > > > > if (vq->handle_kick) > > > > > > > > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) > > > > > > > > > spin_unlock(&dev->iotlb_lock); > > > > > > > > > } > > > > > > > > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, > > > > > > > > > + size_t size, int write) > > > > > > > > > +{ > > > > > > > > > + struct page **pages; > > > > > > > > > + int npages = DIV_ROUND_UP(size, PAGE_SIZE); > > > > > > > > > + int npinned; > > > > > > > > > + void *vaddr; > > > > > > > > > + > > > > > > > > > + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); > > > > > > > > > + if (!pages) > > > > > > > > > + return 
-ENOMEM; > > > > > > > > > + > > > > > > > > > + npinned = get_user_pages_fast(uaddr, npages, write, pages); > > > > > > > > > + if (npinned != npages) > > > > > > > > > + goto err; > > > > > > > > > + > > > > > > > > As I said I have doubts about the whole approach, but this > > > > > > > > implementation in particular isn't a good idea > > > > > > > > as it keeps the page around forever. > > > > > The pages wil be released during set features. > > > > > > > > > > > > > > > > > > So no THP, no NUMA rebalancing, > > > > > For THP, we will probably miss 2 or 4 pages, but does this really matter > > > > > consider the gain we have? > > > > We as in vhost? networking isn't the only thing guest does. > > > > We don't even know if this guest does a lot of networking. > > > > You don't > > > > know what else is in this huge page. Can be something very important > > > > that guest touches all the time. > > > > > > Well, the probability should be very small consider we usually give several > > > gigabytes to guest. The rest of the pages that doesn't sit in the same > > > hugepage with metadata can still be merged by THP. Anyway, I can test the > > > differences. > > Thanks! > > > > > > > For NUMA rebalancing, I'm even not quite sure if > > > > > it can helps for the case of IPC (vhost). It looks to me the worst case it > > > > > may cause page to be thrash between nodes if vhost and userspace are running > > > > > in two nodes. > > > > So again it's a gain for vhost but has a completely unpredictable effect on > > > > other functionality of the guest. > > > > > > > > That's what bothers me with this approach. > > > > > > So: > > > > > > - The rest of the pages could still be balanced to other nodes, no? > > > > > > - try to balance metadata pages (belongs to co-operate processes) itself is > > > still questionable > > I am not sure why. It should be easy enough to force the VCPU and vhost > > to move (e.g. start them pinned to 1 cpu, then pin them to another one). 
> > Clearly sometimes this would be necessary for load balancing reasons. > > > Yes, but it looks to me the part of motivation of auto NUMA is to avoid > manual pinning. ... of memory. Yes. > > > With autonuma after a while (could take seconds but it will happen) the > > memory will migrate. > > > > Yes. As you mentioned during the discuss, I wonder we could do it similarly > through mmu notifier like APIC access page in commit c24ae0dcd3e ("kvm: x86: > Unpin and remove kvm_arch->apic_access_page") That would be a possible approach. > > > > > > > > > > > > > > > > > > > > > > > > This is the price of all GUP users not only vhost itself. > > > > > > Yes. GUP is just not a great interface for vhost to use. > > > > > Zerocopy codes (enabled by defualt) use them for years. > > > > But only for TX and temporarily. We pin, read, unpin. > > > > > > Probably not. For several reasons that the page will be not be released soon > > > or held for a very long period of time or even forever. > > > > With zero copy? Well it's pinned until transmit. Takes a while > > but could be enough for autocopy to work esp since > > its the packet memory so not reused immediately. > > > > > > Your patch is different > > > > > > > > - it writes into memory and GUP has known issues with file > > > > backed memory > > > > > > The ordinary user for vhost is anonymous pages I think? > > > > It's not the most common scenario and not the fastest one > > (e.g. THP does not work) but file backed is useful sometimes. > > It would not be nice at all to corrupt guest memory in that case. > > > Ok. > > > > > > > > - it keeps pages pinned forever > > > > > > > > > > > > > > > > > > > What's more > > > > > > > important, the goal is not to be left too much behind for other backends > > > > > > > like DPDK or AF_XDP (all of which are using GUP). > > > > > > So these guys assume userspace knows what it's doing. > > > > > > We can't assume that. > > > > > What kind of assumption do you they have? 
> > > > > > > > > > > > > > > > > > userspace-controlled > > > > > > > > amount of memory locked up and not accounted for. > > > > > > > It's pretty easy to add this since the slow path was still kept. If we > > > > > > > exceeds the limitation, we can switch back to slow path. > > > > > > > > > > > > > > > Don't get me wrong it's a great patch in an ideal world. > > > > > > > > But then in an ideal world no barriers smap etc are necessary at all. > > > > > > > Again, this is only for metadata accessing not the data which has been used > > > > > > > for years for real use cases. > > > > > > > > > > > > > > For SMAP, it makes senses for the address that kernel can not forcast. But > > > > > > > it's not the case for the vhost metadata since we know the address will be > > > > > > > accessed very frequently. For speculation barrier, it helps nothing for the > > > > > > > data path of vhost which is a kthread. > > > > > > I don't see how a kthread makes any difference. We do have a validation > > > > > > step which makes some difference. > > > > > The problem is not kthread but the address of userspace address. The > > > > > addresses of vq metadata tends to be consistent for a while, and vhost knows > > > > > they will be frequently. SMAP doesn't help too much in this case. > > > > > > > > > > Thanks. > > > > It's true for a real life applications but a malicious one > > > > can call the setup ioctls any number of times. And SMAP is > > > > all about malcious applications. > > > > > > We don't do this in the path of ioctl, there's no context switch between > > > userspace and kernel in the worker thread. SMAP is used to prevent kernel > > > from accessing userspace pages unexpectedly which is not the case for > > > metadata access. > > > > > > Thanks > > OK let's forget smap for now. > > > Some numbers I measured: > > On an old Sandy bridge machine without SMAP support. 
Remove speculation > barrier boost the performance from 4.6Mpps to 5.1Mpps > > On a newer Broadwell machine with SMAP support. Remove speculation barrier > only gives 2%-5% improvement, disable SMAP completely through Kconfig boost > 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps - 6.1Mpps, it > only bypass SMAP for metadata). > > So it looks like for recent machine, SMAP becomes pain point when the copy > is short (e.g 64B) for high PPS. > > Thanks Thanks a lot for looking into this! So first of all users can just boot with nosmap, right? What's wrong with that? Yes it's not fine-grained but OTOH it's easy to understand. And I guess this confirms that if we are going to worry about smap enabled, we need to look into packet copies too, not just meta-data. Vaguely could see a module option (off by default) where vhost basically does user_access_begin when it starts running, then uses unsafe accesses in vhost and tun and then user_access_end. > > > > > > > > > > Packet or AF_XDP benefit from > > > > > > > accessing metadata directly, we should do it as well. > > > > > > > > > > > > > > Thanks
On 2018/12/26 下午11:02, Michael S. Tsirkin wrote: > On Wed, Dec 26, 2018 at 11:57:32AM +0800, Jason Wang wrote: >> On 2018/12/25 下午8:50, Michael S. Tsirkin wrote: >>> On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote: >>>> On 2018/12/25 上午2:10, Michael S. Tsirkin wrote: >>>>> On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote: >>>>>> On 2018/12/14 下午8:36, Michael S. Tsirkin wrote: >>>>>>> On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: >>>>>>>> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: >>>>>>>>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: >>>>>>>>>> It was noticed that the copy_user() friends that was used to access >>>>>>>>>> virtqueue metdata tends to be very expensive for dataplane >>>>>>>>>> implementation like vhost since it involves lots of software check, >>>>>>>>>> speculation barrier, hardware feature toggling (e.g SMAP). The >>>>>>>>>> extra cost will be more obvious when transferring small packets. >>>>>>>>>> >>>>>>>>>> This patch tries to eliminate those overhead by pin vq metadata pages >>>>>>>>>> and access them through vmap(). During SET_VRING_ADDR, we will setup >>>>>>>>>> those mappings and memory accessors are modified to use pointers to >>>>>>>>>> access the metadata directly. >>>>>>>>>> >>>>>>>>>> Note, this was only done when device IOTLB is not enabled. We could >>>>>>>>>> use similar method to optimize it in the future. 
>>>>>>>>>> >>>>>>>>>> Tests shows about ~24% improvement on TX PPS when using virtio-user + >>>>>>>>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): >>>>>>>>>> >>>>>>>>>> Before: ~5.0Mpps >>>>>>>>>> After: ~6.1Mpps >>>>>>>>>> >>>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com> >>>>>>>>>> --- >>>>>>>>>> drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ >>>>>>>>>> drivers/vhost/vhost.h | 11 +++ >>>>>>>>>> 2 files changed, 189 insertions(+) >>>>>>>>>> >>>>>>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c >>>>>>>>>> index bafe39d2e637..1bd24203afb6 100644 >>>>>>>>>> --- a/drivers/vhost/vhost.c >>>>>>>>>> +++ b/drivers/vhost/vhost.c >>>>>>>>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, >>>>>>>>>> vq->indirect = NULL; >>>>>>>>>> vq->heads = NULL; >>>>>>>>>> vq->dev = dev; >>>>>>>>>> + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); >>>>>>>>>> + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); >>>>>>>>>> + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); >>>>>>>>>> mutex_init(&vq->mutex); >>>>>>>>>> vhost_vq_reset(dev, vq); >>>>>>>>>> if (vq->handle_kick) >>>>>>>>>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) >>>>>>>>>> spin_unlock(&dev->iotlb_lock); >>>>>>>>>> } >>>>>>>>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, >>>>>>>>>> + size_t size, int write) >>>>>>>>>> +{ >>>>>>>>>> + struct page **pages; >>>>>>>>>> + int npages = DIV_ROUND_UP(size, PAGE_SIZE); >>>>>>>>>> + int npinned; >>>>>>>>>> + void *vaddr; >>>>>>>>>> + >>>>>>>>>> + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); >>>>>>>>>> + if (!pages) >>>>>>>>>> + return -ENOMEM; >>>>>>>>>> + >>>>>>>>>> + npinned = get_user_pages_fast(uaddr, npages, write, pages); >>>>>>>>>> + if (npinned != npages) >>>>>>>>>> + goto err; >>>>>>>>>> + >>>>>>>>> As I said I have doubts about the whole approach, but this >>>>>>>>> implementation in particular isn't a good 
idea >>>>>>>>> as it keeps the page around forever. >>>>>> The pages wil be released during set features. >>>>>> >>>>>> >>>>>>>>> So no THP, no NUMA rebalancing, >>>>>> For THP, we will probably miss 2 or 4 pages, but does this really matter >>>>>> consider the gain we have? >>>>> We as in vhost? networking isn't the only thing guest does. >>>>> We don't even know if this guest does a lot of networking. >>>>> You don't >>>>> know what else is in this huge page. Can be something very important >>>>> that guest touches all the time. >>>> Well, the probability should be very small consider we usually give several >>>> gigabytes to guest. The rest of the pages that doesn't sit in the same >>>> hugepage with metadata can still be merged by THP. Anyway, I can test the >>>> differences. >>> Thanks! >>> >>>>>> For NUMA rebalancing, I'm even not quite sure if >>>>>> it can helps for the case of IPC (vhost). It looks to me the worst case it >>>>>> may cause page to be thrash between nodes if vhost and userspace are running >>>>>> in two nodes. >>>>> So again it's a gain for vhost but has a completely unpredictable effect on >>>>> other functionality of the guest. >>>>> >>>>> That's what bothers me with this approach. >>>> So: >>>> >>>> - The rest of the pages could still be balanced to other nodes, no? >>>> >>>> - try to balance metadata pages (belongs to co-operate processes) itself is >>>> still questionable >>> I am not sure why. It should be easy enough to force the VCPU and vhost >>> to move (e.g. start them pinned to 1 cpu, then pin them to another one). >>> Clearly sometimes this would be necessary for load balancing reasons. >> >> Yes, but it looks to me the part of motivation of auto NUMA is to avoid >> manual pinning. > ... of memory. Yes. > > >>> With autonuma after a while (could take seconds but it will happen) the >>> memory will migrate. >>> >> Yes. 
As you mentioned during the discuss, I wonder we could do it similarly >> through mmu notifier like APIC access page in commit c24ae0dcd3e ("kvm: x86: >> Unpin and remove kvm_arch->apic_access_page") > That would be a possible approach. Yes, this looks possible, and the conversion seems not hard. Let me have a try with this. [...] >>>>>>> I don't see how a kthread makes any difference. We do have a validation >>>>>>> step which makes some difference. >>>>>> The problem is not kthread but the address of userspace address. The >>>>>> addresses of vq metadata tends to be consistent for a while, and vhost knows >>>>>> they will be frequently. SMAP doesn't help too much in this case. >>>>>> >>>>>> Thanks. >>>>> It's true for a real life applications but a malicious one >>>>> can call the setup ioctls any number of times. And SMAP is >>>>> all about malcious applications. >>>> We don't do this in the path of ioctl, there's no context switch between >>>> userspace and kernel in the worker thread. SMAP is used to prevent kernel >>>> from accessing userspace pages unexpectedly which is not the case for >>>> metadata access. >>>> >>>> Thanks >>> OK let's forget smap for now. >> >> Some numbers I measured: >> >> On an old Sandy bridge machine without SMAP support. Remove speculation >> barrier boost the performance from 4.6Mpps to 5.1Mpps >> >> On a newer Broadwell machine with SMAP support. Remove speculation barrier >> only gives 2%-5% improvement, disable SMAP completely through Kconfig boost >> 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps - 6.1Mpps, it >> only bypass SMAP for metadata). >> >> So it looks like for recent machine, SMAP becomes pain point when the copy >> is short (e.g 64B) for high PPS. >> >> Thanks > Thanks a lot for looking into this! > > So first of all users can just boot with nosmap, right? > What's wrong with that? Nothing wrong, just realize we had this kernel parameter. > Yes it's not fine-grained but OTOH > it's easy to understand. 
> > And I guess this confirms that if we are going to worry > about smap enabled, we need to look into packet copies > too, not just meta-data. For packet copies, we can do batch copy which is pretty simple for the case of XDP. I already have patches for this. > > Vaguely could see a module option (off by default) > where vhost basically does user_access_begin > when it starts running, then uses unsafe accesses > in vhost and tun and then user_access_end. Using user_access_begin() is more tricky than imagined. E.g. it requires: - userspace address to be validated beforehand through access_ok() [1] - It doesn't support calling a function that does explicit schedule since SMAP/PAN state is not maintained through schedule() [2] [1] https://lwn.net/Articles/736348/ [2] https://lkml.org/lkml/2018/11/23/430 So calling user_access_begin() all the time when vhost is running seems pretty dangerous. For a better batched datacopy, I tend to build not only XDP but also skb in vhost in the future. Thanks > > >>>>>>>> Packet or AF_XDP benefit from >>>>>>>> accessing metadata directly, we should do it as well. >>>>>>>> >>>>>>>> Thanks
On Thu, Dec 27, 2018 at 05:39:21PM +0800, Jason Wang wrote: > > On 2018/12/26 下午11:02, Michael S. Tsirkin wrote: > > On Wed, Dec 26, 2018 at 11:57:32AM +0800, Jason Wang wrote: > > > On 2018/12/25 下午8:50, Michael S. Tsirkin wrote: > > > > On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote: > > > > > On 2018/12/25 上午2:10, Michael S. Tsirkin wrote: > > > > > > On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote: > > > > > > > On 2018/12/14 下午8:36, Michael S. Tsirkin wrote: > > > > > > > > On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: > > > > > > > > > On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: > > > > > > > > > > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: > > > > > > > > > > > It was noticed that the copy_user() friends that was used to access > > > > > > > > > > > virtqueue metdata tends to be very expensive for dataplane > > > > > > > > > > > implementation like vhost since it involves lots of software check, > > > > > > > > > > > speculation barrier, hardware feature toggling (e.g SMAP). The > > > > > > > > > > > extra cost will be more obvious when transferring small packets. > > > > > > > > > > > > > > > > > > > > > > This patch tries to eliminate those overhead by pin vq metadata pages > > > > > > > > > > > and access them through vmap(). During SET_VRING_ADDR, we will setup > > > > > > > > > > > those mappings and memory accessors are modified to use pointers to > > > > > > > > > > > access the metadata directly. > > > > > > > > > > > > > > > > > > > > > > Note, this was only done when device IOTLB is not enabled. We could > > > > > > > > > > > use similar method to optimize it in the future. 
> > > > > > > > > > > > > > > > > > > > > > Tests shows about ~24% improvement on TX PPS when using virtio-user + > > > > > > > > > > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): > > > > > > > > > > > > > > > > > > > > > > Before: ~5.0Mpps > > > > > > > > > > > After: ~6.1Mpps > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com> > > > > > > > > > > > --- > > > > > > > > > > > drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > > > > drivers/vhost/vhost.h | 11 +++ > > > > > > > > > > > 2 files changed, 189 insertions(+) > > > > > > > > > > > > > > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > > > > > > > > > > > index bafe39d2e637..1bd24203afb6 100644 > > > > > > > > > > > --- a/drivers/vhost/vhost.c > > > > > > > > > > > +++ b/drivers/vhost/vhost.c > > > > > > > > > > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, > > > > > > > > > > > vq->indirect = NULL; > > > > > > > > > > > vq->heads = NULL; > > > > > > > > > > > vq->dev = dev; > > > > > > > > > > > + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); > > > > > > > > > > > + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); > > > > > > > > > > > + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); > > > > > > > > > > > mutex_init(&vq->mutex); > > > > > > > > > > > vhost_vq_reset(dev, vq); > > > > > > > > > > > if (vq->handle_kick) > > > > > > > > > > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) > > > > > > > > > > > spin_unlock(&dev->iotlb_lock); > > > > > > > > > > > } > > > > > > > > > > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, > > > > > > > > > > > + size_t size, int write) > > > > > > > > > > > +{ > > > > > > > > > > > + struct page **pages; > > > > > > > > > > > + int npages = DIV_ROUND_UP(size, PAGE_SIZE); > > > > > > > > > > > + int npinned; > > > > > > > > > > > + void *vaddr; > > > > > > > > > > 
> + > > > > > > > > > > > + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); > > > > > > > > > > > + if (!pages) > > > > > > > > > > > + return -ENOMEM; > > > > > > > > > > > + > > > > > > > > > > > + npinned = get_user_pages_fast(uaddr, npages, write, pages); > > > > > > > > > > > + if (npinned != npages) > > > > > > > > > > > + goto err; > > > > > > > > > > > + > > > > > > > > > > As I said I have doubts about the whole approach, but this > > > > > > > > > > implementation in particular isn't a good idea > > > > > > > > > > as it keeps the page around forever. > > > > > > > The pages wil be released during set features. > > > > > > > > > > > > > > > > > > > > > > > > So no THP, no NUMA rebalancing, > > > > > > > For THP, we will probably miss 2 or 4 pages, but does this really matter > > > > > > > consider the gain we have? > > > > > > We as in vhost? networking isn't the only thing guest does. > > > > > > We don't even know if this guest does a lot of networking. > > > > > > You don't > > > > > > know what else is in this huge page. Can be something very important > > > > > > that guest touches all the time. > > > > > Well, the probability should be very small consider we usually give several > > > > > gigabytes to guest. The rest of the pages that doesn't sit in the same > > > > > hugepage with metadata can still be merged by THP. Anyway, I can test the > > > > > differences. > > > > Thanks! > > > > > > > > > > > For NUMA rebalancing, I'm even not quite sure if > > > > > > > it can helps for the case of IPC (vhost). It looks to me the worst case it > > > > > > > may cause page to be thrash between nodes if vhost and userspace are running > > > > > > > in two nodes. > > > > > > So again it's a gain for vhost but has a completely unpredictable effect on > > > > > > other functionality of the guest. > > > > > > > > > > > > That's what bothers me with this approach. 
> > > > > So: > > > > > > > > > > - The rest of the pages could still be balanced to other nodes, no? > > > > > > > > > > - try to balance metadata pages (belongs to co-operate processes) itself is > > > > > still questionable > > > > I am not sure why. It should be easy enough to force the VCPU and vhost > > > > to move (e.g. start them pinned to 1 cpu, then pin them to another one). > > > > Clearly sometimes this would be necessary for load balancing reasons. > > > > > > Yes, but it looks to me the part of motivation of auto NUMA is to avoid > > > manual pinning. > > ... of memory. Yes. > > > > > > > > With autonuma after a while (could take seconds but it will happen) the > > > > memory will migrate. > > > > > > > Yes. As you mentioned during the discuss, I wonder we could do it similarly > > > through mmu notifier like APIC access page in commit c24ae0dcd3e ("kvm: x86: > > > Unpin and remove kvm_arch->apic_access_page") > > That would be a possible approach. > > > Yes, this looks possible, and the conversion seems not hard. Let me have a > try with this. > > > [...] > > > > > > > > > > I don't see how a kthread makes any difference. We do have a validation > > > > > > > > step which makes some difference. > > > > > > > The problem is not kthread but the address of userspace address. The > > > > > > > addresses of vq metadata tends to be consistent for a while, and vhost knows > > > > > > > they will be frequently. SMAP doesn't help too much in this case. > > > > > > > > > > > > > > Thanks. > > > > > > It's true for a real life applications but a malicious one > > > > > > can call the setup ioctls any number of times. And SMAP is > > > > > > all about malcious applications. > > > > > We don't do this in the path of ioctl, there's no context switch between > > > > > userspace and kernel in the worker thread. SMAP is used to prevent kernel > > > > > from accessing userspace pages unexpectedly which is not the case for > > > > > metadata access. 
> > > > > > > > > > Thanks > > > > OK let's forget smap for now. > > > > > > Some numbers I measured: > > > > > > On an old Sandy bridge machine without SMAP support. Remove speculation > > > barrier boost the performance from 4.6Mpps to 5.1Mpps > > > > > > On a newer Broadwell machine with SMAP support. Remove speculation barrier > > > only gives 2%-5% improvement, disable SMAP completely through Kconfig boost > > > 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps - 6.1Mpps, it > > > only bypass SMAP for metadata). > > > > > > So it looks like for recent machine, SMAP becomes pain point when the copy > > > is short (e.g 64B) for high PPS. > > > > > > Thanks > > Thanks a lot for looking into this! > > > > So first of all users can just boot with nosmap, right? > > What's wrong with that? > > > Nothing wrong, just realize we had this kernel parameter. > > > > Yes it's not fine-grained but OTOH > > it's easy to understand. > > > > And I guess this confirms that if we are going to worry > > about smap enabled, we need to look into packet copies > > too, not just meta-data. > > > For packet copies, we can do batch copy which is pretty simple for the case > of XDP. I've already had patches for this. > > > > > > Vaguely could see a module option (off by default) > > where vhost basically does user_access_begin > > when it starts running, then uses unsafe accesses > > in vhost and tun and then user_access_end. > > > Using user_access_begin() is more tricky than imaged. E.g it requires: > > - userspace address to be validated before through access_ok() [1] This part is fine I think - addresses come from the memory map and when userspace supplies the memory map we validate everything with access_ok. Well do we validate with the iotlb too? Don't see it right now so maybe not but it's easy to add. 
> - It doesn't support calling a function that does explicit schedule since > SMAP/PAN state is not maintained through schedule() [2] > > [1] https://lwn.net/Articles/736348/ > > [2] https://lkml.org/lkml/2018/11/23/430 > > So calling user_access_begin() all the time when vhost is running seems > pretty dangerous. Yes it requires some rework e.g. to try getting memory with GFP_ATOMIC. We could then do a slow path with GFP_KERNEL if that fails. > For a better batched datacopy, I tend to build not only XDP but also skb in > vhost in the future. > > Thanks Sure, why not. > > > > > > > > > > > > > > Packet or AF_XDP benefit from > > > > > > > > > accessing metadata directly, we should do it as well. > > > > > > > > > > > > > > > > > > Thanks
On 2018/12/31 上午2:30, Michael S. Tsirkin wrote: > On Thu, Dec 27, 2018 at 05:39:21PM +0800, Jason Wang wrote: >> On 2018/12/26 下午11:02, Michael S. Tsirkin wrote: >>> On Wed, Dec 26, 2018 at 11:57:32AM +0800, Jason Wang wrote: >>>> On 2018/12/25 下午8:50, Michael S. Tsirkin wrote: >>>>> On Tue, Dec 25, 2018 at 06:05:25PM +0800, Jason Wang wrote: >>>>>> On 2018/12/25 上午2:10, Michael S. Tsirkin wrote: >>>>>>> On Mon, Dec 24, 2018 at 03:53:16PM +0800, Jason Wang wrote: >>>>>>>> On 2018/12/14 下午8:36, Michael S. Tsirkin wrote: >>>>>>>>> On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote: >>>>>>>>>> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote: >>>>>>>>>>> On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote: >>>>>>>>>>>> It was noticed that the copy_user() friends that was used to access >>>>>>>>>>>> virtqueue metdata tends to be very expensive for dataplane >>>>>>>>>>>> implementation like vhost since it involves lots of software check, >>>>>>>>>>>> speculation barrier, hardware feature toggling (e.g SMAP). The >>>>>>>>>>>> extra cost will be more obvious when transferring small packets. >>>>>>>>>>>> >>>>>>>>>>>> This patch tries to eliminate those overhead by pin vq metadata pages >>>>>>>>>>>> and access them through vmap(). During SET_VRING_ADDR, we will setup >>>>>>>>>>>> those mappings and memory accessors are modified to use pointers to >>>>>>>>>>>> access the metadata directly. >>>>>>>>>>>> >>>>>>>>>>>> Note, this was only done when device IOTLB is not enabled. We could >>>>>>>>>>>> use similar method to optimize it in the future. 
>>>>>>>>>>>> >>>>>>>>>>>> Tests shows about ~24% improvement on TX PPS when using virtio-user + >>>>>>>>>>>> vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): >>>>>>>>>>>> >>>>>>>>>>>> Before: ~5.0Mpps >>>>>>>>>>>> After: ~6.1Mpps >>>>>>>>>>>> >>>>>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com> >>>>>>>>>>>> --- >>>>>>>>>>>> drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ >>>>>>>>>>>> drivers/vhost/vhost.h | 11 +++ >>>>>>>>>>>> 2 files changed, 189 insertions(+) >>>>>>>>>>>> >>>>>>>>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c >>>>>>>>>>>> index bafe39d2e637..1bd24203afb6 100644 >>>>>>>>>>>> --- a/drivers/vhost/vhost.c >>>>>>>>>>>> +++ b/drivers/vhost/vhost.c >>>>>>>>>>>> @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, >>>>>>>>>>>> vq->indirect = NULL; >>>>>>>>>>>> vq->heads = NULL; >>>>>>>>>>>> vq->dev = dev; >>>>>>>>>>>> + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); >>>>>>>>>>>> + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); >>>>>>>>>>>> + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); >>>>>>>>>>>> mutex_init(&vq->mutex); >>>>>>>>>>>> vhost_vq_reset(dev, vq); >>>>>>>>>>>> if (vq->handle_kick) >>>>>>>>>>>> @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) >>>>>>>>>>>> spin_unlock(&dev->iotlb_lock); >>>>>>>>>>>> } >>>>>>>>>>>> +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, >>>>>>>>>>>> + size_t size, int write) >>>>>>>>>>>> +{ >>>>>>>>>>>> + struct page **pages; >>>>>>>>>>>> + int npages = DIV_ROUND_UP(size, PAGE_SIZE); >>>>>>>>>>>> + int npinned; >>>>>>>>>>>> + void *vaddr; >>>>>>>>>>>> + >>>>>>>>>>>> + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); >>>>>>>>>>>> + if (!pages) >>>>>>>>>>>> + return -ENOMEM; >>>>>>>>>>>> + >>>>>>>>>>>> + npinned = get_user_pages_fast(uaddr, npages, write, pages); >>>>>>>>>>>> + if (npinned != npages) >>>>>>>>>>>> + goto err; >>>>>>>>>>>> + >>>>>>>>>>> As I said I have 
doubts about the whole approach, but this >>>>>>>>>>> implementation in particular isn't a good idea >>>>>>>>>>> as it keeps the page around forever. >>>>>>>> The pages wil be released during set features. >>>>>>>> >>>>>>>> >>>>>>>>>>> So no THP, no NUMA rebalancing, >>>>>>>> For THP, we will probably miss 2 or 4 pages, but does this really matter >>>>>>>> consider the gain we have? >>>>>>> We as in vhost? networking isn't the only thing guest does. >>>>>>> We don't even know if this guest does a lot of networking. >>>>>>> You don't >>>>>>> know what else is in this huge page. Can be something very important >>>>>>> that guest touches all the time. >>>>>> Well, the probability should be very small consider we usually give several >>>>>> gigabytes to guest. The rest of the pages that doesn't sit in the same >>>>>> hugepage with metadata can still be merged by THP. Anyway, I can test the >>>>>> differences. >>>>> Thanks! >>>>> >>>>>>>> For NUMA rebalancing, I'm even not quite sure if >>>>>>>> it can helps for the case of IPC (vhost). It looks to me the worst case it >>>>>>>> may cause page to be thrash between nodes if vhost and userspace are running >>>>>>>> in two nodes. >>>>>>> So again it's a gain for vhost but has a completely unpredictable effect on >>>>>>> other functionality of the guest. >>>>>>> >>>>>>> That's what bothers me with this approach. >>>>>> So: >>>>>> >>>>>> - The rest of the pages could still be balanced to other nodes, no? >>>>>> >>>>>> - try to balance metadata pages (belongs to co-operate processes) itself is >>>>>> still questionable >>>>> I am not sure why. It should be easy enough to force the VCPU and vhost >>>>> to move (e.g. start them pinned to 1 cpu, then pin them to another one). >>>>> Clearly sometimes this would be necessary for load balancing reasons. >>>> Yes, but it looks to me the part of motivation of auto NUMA is to avoid >>>> manual pinning. >>> ... of memory. Yes. 
>>> >>> >>>>> With autonuma after a while (could take seconds but it will happen) the >>>>> memory will migrate. >>>>> >>>> Yes. As you mentioned during the discuss, I wonder we could do it similarly >>>> through mmu notifier like APIC access page in commit c24ae0dcd3e ("kvm: x86: >>>> Unpin and remove kvm_arch->apic_access_page") >>> That would be a possible approach. >> >> Yes, this looks possible, and the conversion seems not hard. Let me have a >> try with this. >> >> >> [...] >> >> >>>>>>>>> I don't see how a kthread makes any difference. We do have a validation >>>>>>>>> step which makes some difference. >>>>>>>> The problem is not kthread but the address of userspace address. The >>>>>>>> addresses of vq metadata tends to be consistent for a while, and vhost knows >>>>>>>> they will be frequently. SMAP doesn't help too much in this case. >>>>>>>> >>>>>>>> Thanks. >>>>>>> It's true for a real life applications but a malicious one >>>>>>> can call the setup ioctls any number of times. And SMAP is >>>>>>> all about malcious applications. >>>>>> We don't do this in the path of ioctl, there's no context switch between >>>>>> userspace and kernel in the worker thread. SMAP is used to prevent kernel >>>>>> from accessing userspace pages unexpectedly which is not the case for >>>>>> metadata access. >>>>>> >>>>>> Thanks >>>>> OK let's forget smap for now. >>>> Some numbers I measured: >>>> >>>> On an old Sandy bridge machine without SMAP support. Remove speculation >>>> barrier boost the performance from 4.6Mpps to 5.1Mpps >>>> >>>> On a newer Broadwell machine with SMAP support. Remove speculation barrier >>>> only gives 2%-5% improvement, disable SMAP completely through Kconfig boost >>>> 57% performance from 4.8Mpps to 7.5Mpps. (Vmap gives 6Mpps - 6.1Mpps, it >>>> only bypass SMAP for metadata). >>>> >>>> So it looks like for recent machine, SMAP becomes pain point when the copy >>>> is short (e.g 64B) for high PPS. 
>>>> >>>> Thanks >>> Thanks a lot for looking into this! >>> >>> So first of all users can just boot with nosmap, right? >>> What's wrong with that? >> >> Nothing wrong, just realize we had this kernel parameter. >> >> >>> Yes it's not fine-grained but OTOH >>> it's easy to understand. >>> >>> And I guess this confirms that if we are going to worry >>> about smap enabled, we need to look into packet copies >>> too, not just meta-data. >> >> For packet copies, we can do batch copy which is pretty simple for the case >> of XDP. I've already had patches for this. >> >> >>> Vaguely could see a module option (off by default) >>> where vhost basically does user_access_begin >>> when it starts running, then uses unsafe accesses >>> in vhost and tun and then user_access_end. >> >> Using user_access_begin() is more tricky than imaged. E.g it requires: >> >> - userspace address to be validated before through access_ok() [1] > This part is fine I think - addresses come from the memory > map and when userspace supplies the memory map > we validate everything with access_ok. > Well do we validate with the iotlb too? Don't see it right now > so maybe not but it's easy to add. Yes, it's not hard. > >> - It doesn't support calling a function that does explicit schedule since >> SMAP/PAN state is not maintained through schedule() [2] >> >> [1] https://lwn.net/Articles/736348/ >> >> [2] https://lkml.org/lkml/2018/11/23/430 >> >> So calling user_access_begin() all the time when vhost is running seems >> pretty dangerous. > Yes it requires some rework e.g. to try getting memory with > GFP_ATOMIC. We could then do a slow path with GFP_KERNEL > if that fails. I'm not sure this is the only part that needs care. Considering that all the lower-layer network and block code assumes a process context, it's not easy to figure it all out, I'm afraid. And even if we could, it's hard to prevent it from being added in the future. 
Thanks > >> For a better batched datacopy, I tend to build not only XDP but also skb in >> vhost in the future. >> >> Thanks > Sure, why not. > >>> >>>>>>>>>> Packet or AF_XDP benefit from >>>>>>>>>> accessing metadata directly, we should do it as well. >>>>>>>>>> >>>>>>>>>> Thanks
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index bafe39d2e637..1bd24203afb6 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, vq->indirect = NULL; vq->heads = NULL; vq->dev = dev; + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); mutex_init(&vq->mutex); vhost_vq_reset(dev, vq); if (vq->handle_kick) @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) spin_unlock(&dev->iotlb_lock); } +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, + size_t size, int write) +{ + struct page **pages; + int npages = DIV_ROUND_UP(size, PAGE_SIZE); + int npinned; + void *vaddr; + + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + npinned = get_user_pages_fast(uaddr, npages, write, pages); + if (npinned != npages) + goto err; + + vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL); + if (!vaddr) + goto err; + + map->pages = pages; + map->addr = vaddr + (uaddr & (PAGE_SIZE - 1)); + map->npages = npages; + + return 0; + +err: + if (npinned > 0) + release_pages(pages, npinned); + kfree(pages); + return -EFAULT; +} + +static void vhost_uninit_vmap(struct vhost_vmap *map) +{ + if (!map->addr) + return; + + vunmap(map->addr); + release_pages(map->pages, map->npages); + kfree(map->pages); + + map->addr = NULL; + map->pages = NULL; + map->npages = 0; +} + +static void vhost_clean_vmaps(struct vhost_virtqueue *vq) +{ + vhost_uninit_vmap(&vq->avail_ring); + vhost_uninit_vmap(&vq->desc_ring); + vhost_uninit_vmap(&vq->used_ring); +} + +static int vhost_setup_vmaps(struct vhost_virtqueue *vq, unsigned long avail, + unsigned long desc, unsigned long used) +{ + size_t event = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 
2 : 0; + size_t avail_size, desc_size, used_size; + int ret; + + vhost_clean_vmaps(vq); + + avail_size = sizeof(*vq->avail) + + sizeof(*vq->avail->ring) * vq->num + event; + ret = vhost_init_vmap(&vq->avail_ring, avail, avail_size, false); + if (ret) { + vq_err(vq, "Fail to setup vmap for avail ring!\n"); + goto err_avail; + } + + desc_size = sizeof(*vq->desc) * vq->num; + ret = vhost_init_vmap(&vq->desc_ring, desc, desc_size, false); + if (ret) { + vq_err(vq, "Fail to setup vmap for desc ring!\n"); + goto err_desc; + } + + used_size = sizeof(*vq->used) + + sizeof(*vq->used->ring) * vq->num + event; + ret = vhost_init_vmap(&vq->used_ring, used, used_size, true); + if (ret) { + vq_err(vq, "Fail to setup vmap for used ring!\n"); + goto err_used; + } + + return 0; + +err_used: + vhost_uninit_vmap(&vq->used_ring); +err_desc: + vhost_uninit_vmap(&vq->avail_ring); +err_avail: + return -EFAULT; +} + void vhost_dev_cleanup(struct vhost_dev *dev) { int i; @@ -626,6 +725,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev) if (dev->vqs[i]->call_ctx) eventfd_ctx_put(dev->vqs[i]->call_ctx); vhost_vq_reset(dev, dev->vqs[i]); + vhost_clean_vmaps(dev->vqs[i]); } vhost_dev_free_iovecs(dev); if (dev->log_ctx) @@ -873,6 +973,14 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) { + if (!vq->iotlb) { + struct vring_used *used = vq->used_ring.addr; + + *((__virtio16 *)&used->ring[vq->num]) = + cpu_to_vhost16(vq, vq->avail_idx); + return 0; + } + return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx), vhost_avail_event(vq)); } @@ -881,6 +989,13 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq, struct vring_used_elem *head, int idx, int count) { + if (!vq->iotlb) { + struct vring_used *used = vq->used_ring.addr; + + memcpy(used->ring + idx, head, count * sizeof(*head)); + return 0; + } + return vhost_copy_to_user(vq, vq->used->ring + idx, head, count * sizeof(*head)); } @@ -888,6 
+1003,13 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq, static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) { + if (!vq->iotlb) { + struct vring_used *used = vq->used_ring.addr; + + used->flags = cpu_to_vhost16(vq, vq->used_flags); + return 0; + } + return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags), &vq->used->flags); } @@ -895,6 +1017,13 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) { + if (!vq->iotlb) { + struct vring_used *used = vq->used_ring.addr; + + used->idx = cpu_to_vhost16(vq, vq->last_used_idx); + return 0; + } + return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx); } @@ -926,12 +1055,26 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, __virtio16 *idx) { + if (!vq->iotlb) { + struct vring_avail *avail = vq->avail_ring.addr; + + *idx = avail->idx; + return 0; + } + return vhost_get_avail(vq, *idx, &vq->avail->idx); } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, __virtio16 *head, int idx) { + if (!vq->iotlb) { + struct vring_avail *avail = vq->avail_ring.addr; + + *head = avail->ring[idx & (vq->num - 1)]; + return 0; + } + return vhost_get_avail(vq, *head, &vq->avail->ring[idx & (vq->num - 1)]); } @@ -939,24 +1082,52 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, __virtio16 *flags) { + if (!vq->iotlb) { + struct vring_avail *avail = vq->avail_ring.addr; + + *flags = avail->flags; + return 0; + } + return vhost_get_avail(vq, *flags, &vq->avail->flags); } static inline int vhost_get_used_event(struct vhost_virtqueue *vq, __virtio16 *event) { + if (!vq->iotlb) { + struct vring_avail *avail = vq->avail_ring.addr; + + *event = (__virtio16)avail->ring[vq->num]; + return 0; + } + return vhost_get_avail(vq, 
*event, vhost_used_event(vq)); } static inline int vhost_get_used_idx(struct vhost_virtqueue *vq, __virtio16 *idx) { + if (!vq->iotlb) { + struct vring_used *used = vq->used_ring.addr; + + *idx = used->idx; + return 0; + } + return vhost_get_used(vq, *idx, &vq->used->idx); } static inline int vhost_get_desc(struct vhost_virtqueue *vq, struct vring_desc *desc, int idx) { + if (!vq->iotlb) { + struct vring_desc *d = vq->desc_ring.addr; + + *desc = *(d + idx); + return 0; + } + return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc)); } @@ -1551,6 +1722,13 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg } } + if (!vq->iotlb && vhost_setup_vmaps(vq, a.avail_user_addr, + a.desc_user_addr, + a.used_user_addr)) { + r = -EINVAL; + break; + } + vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); vq->desc = (void __user *)(unsigned long)a.desc_user_addr; vq->avail = (void __user *)(unsigned long)a.avail_user_addr; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 466ef7542291..89dc0ad3d055 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -80,6 +80,12 @@ enum vhost_uaddr_type { VHOST_NUM_ADDRS = 3, }; +struct vhost_vmap { + struct page **pages; + void *addr; + int npages; +}; + /* The virtqueue structure describes a queue attached to a device. */ struct vhost_virtqueue { struct vhost_dev *dev; @@ -90,6 +96,11 @@ struct vhost_virtqueue { struct vring_desc __user *desc; struct vring_avail __user *avail; struct vring_used __user *used; + + struct vhost_vmap avail_ring; + struct vhost_vmap desc_ring; + struct vhost_vmap used_ring; + const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS]; struct file *kick; struct eventfd_ctx *call_ctx;
It was noticed that the copy_user() friends that were used to access virtqueue metadata tend to be very expensive for dataplane implementations like vhost, since they involve lots of software checks, speculation barriers, and hardware feature toggling (e.g. SMAP). The extra cost is more obvious when transferring small packets. This patch tries to eliminate this overhead by pinning the vq metadata pages and accessing them through vmap(). During SET_VRING_ADDR, we set up those mappings, and the memory accessors are modified to use pointers to access the metadata directly. Note, this is only done when the device IOTLB is not enabled. We could use a similar method to optimize that case in the future. Tests show about a ~24% improvement on TX PPS when using virtio-user + vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): Before: ~5.0Mpps After: ~6.1Mpps Signed-off-by: Jason Wang <jasowang@redhat.com> --- drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ drivers/vhost/vhost.h | 11 +++ 2 files changed, 189 insertions(+)