| Message ID | 20201222145221.711-10-xieyongji@bytedance.com |
|---|---|
| State | RFC |
| Series | Introduce VDUSE - vDPA Device in Userspace |
| Context | Check | Description |
|---|---|---|
| netdev/tree_selection | success | Not a local patch |
On 2020/12/22 10:52 PM, Xie Yongji wrote:
> To support the vhost-vdpa bus driver, we need a way to share the
> vhost-vdpa backend process's memory with the userspace VDUSE process.
>
> This patch tries to make use of the vhost iotlb message to achieve
> that. We will get the shm file from the iotlb message and pass it
> to the userspace VDUSE process.
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
>  Documentation/driver-api/vduse.rst |  15 +++-
>  drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/vduse.h         |  11 +++
>  3 files changed, 171 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
> index 623f7b040ccf..48e4b1ba353f 100644
> --- a/Documentation/driver-api/vduse.rst
> +++ b/Documentation/driver-api/vduse.rst
> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
>
>  - VDUSE_GET_CONFIG: Read from device specific configuration space
>
> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
> +
> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
> +
>  Please see include/linux/vdpa.h for details.
>
> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
> +The data path of a userspace vDPA device is implemented in different ways
> +depending on the vdpa bus to which it is attached.
> +
> +In the virtio-vdpa case, the VDUSE framework implements a MMU-based on-chip IOMMU
>  driver which supports mapping the kernel dma buffer to a userspace iova
>  region dynamically. The userspace iova region can be created by passing
>  the userspace vDPA device fd to mmap(2).
>
> +In the vhost-vdpa case, the dma buffer resides in a userspace memory region
> +which will be shared with the userspace VDUSE process via the file
> +descriptor in the VDUSE_UPDATE_IOTLB message. The corresponding address
> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
> +in this message.
> +
>  Besides, the eventfd mechanism is used to trigger interrupt callbacks and
>  receive virtqueue kicks in userspace. The following ioctls on the userspace
>  vDPA device fd are provided to support that:
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index b974333ed4e9..d24aaacb6008 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -34,6 +34,7 @@
>
>  struct vduse_dev_msg {
>          struct vduse_dev_request req;
> +        struct file *iotlb_file;
>          struct vduse_dev_response resp;
>          struct list_head list;
>          wait_queue_head_t waitq;
> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
>          return ret;
>  }
>
> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
> +                                  u64 offset, u64 iova, u64 size, u8 perm)
> +{
> +        struct vduse_dev_msg *msg;
> +        int ret;
> +
> +        if (!size)
> +                return -EINVAL;
> +
> +        msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
> +        msg->req.size = sizeof(struct vduse_iotlb);
> +        msg->req.iotlb.offset = offset;
> +        msg->req.iotlb.iova = iova;
> +        msg->req.iotlb.size = size;
> +        msg->req.iotlb.perm = perm;
> +        msg->req.iotlb.fd = -1;
> +        msg->iotlb_file = get_file(file);
> +
> +        ret = vduse_dev_msg_sync(dev, msg);

My feeling is that we should provide a consistent API for the userspace
device to use.

E.g. we'd better carry the IOTLB message for both virtio/vhost drivers.

It looks to me that for virtio drivers we can still use the UPDATE_IOTLB
message, by using the VDUSE file as msg->iotlb_file here.

> +        vduse_dev_msg_put(msg);
> +        fput(file);
> +
> +        return ret;
> +}
> +
> +static int vduse_dev_invalidate_iotlb(struct vduse_dev *dev,
> +                                      u64 iova, u64 size)
> +{
> +        struct vduse_dev_msg *msg;
> +        int ret;
> +
> +        if (!size)
> +                return -EINVAL;
> +
> +        msg = vduse_dev_new_msg(dev, VDUSE_INVALIDATE_IOTLB);
> +        msg->req.size = sizeof(struct vduse_iotlb);
> +        msg->req.iotlb.iova = iova;
> +        msg->req.iotlb.size = size;
> +
> +        ret = vduse_dev_msg_sync(dev, msg);
> +        vduse_dev_msg_put(msg);
> +
> +        return ret;
> +}
> +
> +static unsigned int perm_to_file_flags(u8 perm)
> +{
> +        unsigned int flags = 0;
> +
> +        switch (perm) {
> +        case VHOST_ACCESS_WO:
> +                flags |= O_WRONLY;
> +                break;
> +        case VHOST_ACCESS_RO:
> +                flags |= O_RDONLY;
> +                break;
> +        case VHOST_ACCESS_RW:
> +                flags |= O_RDWR;
> +                break;
> +        default:
> +                WARN(1, "invalid vhost IOTLB permission\n");
> +                break;
> +        }
> +
> +        return flags;
> +}
> +
>  static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  {
>          struct file *file = iocb->ki_filp;
>          struct vduse_dev *dev = file->private_data;
>          struct vduse_dev_msg *msg;
> -        int size = sizeof(struct vduse_dev_request);
> +        unsigned int flags;
> +        int fd, size = sizeof(struct vduse_dev_request);
>          ssize_t ret = 0;
>
>          if (iov_iter_count(to) < size)
> @@ -349,6 +418,18 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>                  if (ret)
>                          return ret;
>          }
> +
> +        if (msg->req.type == VDUSE_UPDATE_IOTLB && msg->req.iotlb.fd == -1) {
> +                flags = perm_to_file_flags(msg->req.iotlb.perm);
> +                fd = get_unused_fd_flags(flags);
> +                if (fd < 0) {
> +                        vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
> +                        return fd;
> +                }
> +                fd_install(fd, get_file(msg->iotlb_file));
> +                msg->req.iotlb.fd = fd;
> +        }
> +
>          ret = copy_to_iter(&msg->req, size, to);
>          if (ret != size) {
>                  vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
> @@ -565,6 +646,69 @@ static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
>          vduse_dev_set_config(dev, offset, buf, len);
>  }
>
> +static void vduse_vdpa_invalidate_iotlb(struct vduse_dev *dev,
> +                                        struct vhost_iotlb_msg *msg)
> +{
> +        vduse_dev_invalidate_iotlb(dev, msg->iova, msg->size);
> +}
> +
> +static int vduse_vdpa_update_iotlb(struct vduse_dev *dev,
> +                                   struct vhost_iotlb_msg *msg)
> +{
> +        u64 uaddr = msg->uaddr;
> +        u64 iova = msg->iova;
> +        u64 size = msg->size;
> +        u64 offset;
> +        struct vm_area_struct *vma;
> +        int ret;
> +
> +        while (uaddr < msg->uaddr + msg->size) {
> +                vma = find_vma(current->mm, uaddr);
> +                ret = -EINVAL;
> +                if (!vma)
> +                        goto err;
> +
> +                size = min(msg->size, vma->vm_end - uaddr);
> +                offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
> +                if (vma->vm_file && (vma->vm_flags & VM_SHARED)) {
> +                        ret = vduse_dev_update_iotlb(dev, vma->vm_file, offset,
> +                                                     iova, size, msg->perm);
> +                        if (ret)
> +                                goto err;

My understanding is that the vma is something that should not be known by a
device. So I suggest moving the above processing to vhost-vdpa.c.

Thanks

> +                }
> +                iova += size;
> +                uaddr += size;
> +        }
> +        return 0;
> +err:
> +        vduse_dev_invalidate_iotlb(dev, msg->iova, iova - msg->iova);
> +        return ret;
> +}
> +
> +static int vduse_vdpa_process_iotlb_msg(struct vdpa_device *vdpa,
> +                                        struct vhost_iotlb_msg *msg)
> +{
> +        struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> +        int ret = 0;
> +
> +        switch (msg->type) {
> +        case VHOST_IOTLB_UPDATE:
> +                ret = vduse_vdpa_update_iotlb(dev, msg);
> +                break;
> +        case VHOST_IOTLB_INVALIDATE:
> +                vduse_vdpa_invalidate_iotlb(dev, msg);
> +                break;
> +        case VHOST_IOTLB_BATCH_BEGIN:
> +        case VHOST_IOTLB_BATCH_END:
> +                break;
> +        default:
> +                ret = -EINVAL;
> +                break;
> +        }
> +
> +        return ret;
> +}
> +
>  static void vduse_vdpa_free(struct vdpa_device *vdpa)
>  {
>          struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> @@ -597,6 +741,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
>          .set_status = vduse_vdpa_set_status,
>          .get_config = vduse_vdpa_get_config,
>          .set_config = vduse_vdpa_set_config,
> +        .process_iotlb_msg = vduse_vdpa_process_iotlb_msg,
>          .free = vduse_vdpa_free,
>  };
>
> diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
> index 873305dfd93f..c5080851f140 100644
> --- a/include/uapi/linux/vduse.h
> +++ b/include/uapi/linux/vduse.h
> @@ -21,6 +21,8 @@ enum vduse_req_type {
>          VDUSE_GET_STATUS,
>          VDUSE_SET_CONFIG,
>          VDUSE_GET_CONFIG,
> +        VDUSE_UPDATE_IOTLB,
> +        VDUSE_INVALIDATE_IOTLB,
>  };
>
>  struct vduse_vq_num {
> @@ -51,6 +53,14 @@ struct vduse_dev_config_data {
>          __u8 data[VDUSE_CONFIG_DATA_LEN];
>  };
>
> +struct vduse_iotlb {
> +        __u32 fd;
> +        __u64 offset;
> +        __u64 iova;
> +        __u64 size;
> +        __u8 perm;
> +};
> +
>  struct vduse_dev_request {
>          __u32 type; /* request type */
>          __u32 unique; /* request id */
> @@ -62,6 +72,7 @@ struct vduse_dev_request {
>                  struct vduse_vq_ready vq_ready; /* virtqueue ready status */
>                  struct vduse_vq_state vq_state; /* virtqueue state */
>                  struct vduse_dev_config_data config; /* virtio device config space */
> +                struct vduse_iotlb iotlb; /* iotlb message */
>                  __u64 features; /* virtio features */
>                  __u8 status; /* device status */
>          };
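To make the message flow under review concrete: the daemon read()s a
struct vduse_dev_request from the device fd, and for VDUSE_UPDATE_IOTLB the
kernel has already installed an fd for the backing shm file, so the daemon
is expected to mmap() it at the given offset. A minimal userspace sketch of
such a handler, not part of the patch: the VHOST_ACCESS_* values follow
linux/vhost_types.h, the struct layout is a trimmed copy of the uapi above,
and record_mapping() is a hypothetical helper.

#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>

/* From linux/vhost_types.h: RW is RO|WO, so bit tests work below. */
#define VHOST_ACCESS_RO 0x1
#define VHOST_ACCESS_WO 0x2
#define VHOST_ACCESS_RW 0x3

/* Trimmed copy of the uapi structure above. */
struct vduse_iotlb {
        uint32_t fd;
        uint64_t offset;
        uint64_t iova;
        uint64_t size;
        uint8_t  perm;
};

/* Hypothetical helper: remember iova -> va for the datapath. */
void record_mapping(uint64_t iova, void *va, uint64_t size);

static int handle_update_iotlb(const struct vduse_iotlb *tlb)
{
        int prot = 0;
        void *va;

        if (tlb->perm & VHOST_ACCESS_RO)
                prot |= PROT_READ;
        if (tlb->perm & VHOST_ACCESS_WO)
                prot |= PROT_WRITE;

        /* Map the shm file the kernel installed for us. Note that
         * mmap() requires a page-aligned offset; real code would
         * round down and compensate. */
        va = mmap(NULL, tlb->size, prot, MAP_SHARED,
                  (int)tlb->fd, (off_t)tlb->offset);
        if (va == MAP_FAILED)
                return -1;

        close(tlb->fd);        /* the mapping keeps the file alive */
        record_mapping(tlb->iova, va, tlb->size);
        return 0;
}

When the daemon should acknowledge the request relative to the mmap() is
exactly the synchronization question that comes up later in this thread.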
On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On 2020/12/22 10:52 PM, Xie Yongji wrote:
> > To support the vhost-vdpa bus driver, we need a way to share the
> > vhost-vdpa backend process's memory with the userspace VDUSE process.
> >
> > This patch tries to make use of the vhost iotlb message to achieve
> > that. We will get the shm file from the iotlb message and pass it
> > to the userspace VDUSE process.
> >
> > [...]
> >
> > +        msg->iotlb_file = get_file(file);
> > +
> > +        ret = vduse_dev_msg_sync(dev, msg);
>
> My feeling is that we should provide a consistent API for the userspace
> device to use.
>
> E.g. we'd better carry the IOTLB message for both virtio/vhost drivers.
>
> It looks to me that for virtio drivers we can still use the UPDATE_IOTLB
> message, by using the VDUSE file as msg->iotlb_file here.
>

It's OK for me. One problem is when to transfer the UPDATE_IOTLB
message in the virtio case.

> > [...]
> > +                if (vma->vm_file && (vma->vm_flags & VM_SHARED)) {
> > +                        ret = vduse_dev_update_iotlb(dev, vma->vm_file, offset,
> > +                                                     iova, size, msg->perm);
> > +                        if (ret)
> > +                                goto err;
>
> My understanding is that the vma is something that should not be known by a
> device. So I suggest moving the above processing to vhost-vdpa.c.
>

Will do it.

Thanks,
Yongji
On 2020/12/23 8:14 PM, Yongji Xie wrote:
> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> [...]
>>
>> My feeling is that we should provide a consistent API for the userspace
>> device to use.
>>
>> E.g. we'd better carry the IOTLB message for both virtio/vhost drivers.
>>
>> It looks to me that for virtio drivers we can still use the UPDATE_IOTLB
>> message, by using the VDUSE file as msg->iotlb_file here.
>>
> It's OK for me. One problem is when to transfer the UPDATE_IOTLB
> message in the virtio case.


Instead of generating IOTLB messages for userspace, how about recording
the mappings (which is a common case for devices that have an on-chip
IOMMU, e.g. mlx5e and the vdpa simulator)? Then we can introduce an
ioctl for userspace to query.

Thanks
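A sketch of what such a pull-style query interface could look like from the
daemon's side. Everything here is invented for illustration; the patch
defines no such ioctl, and both VDUSE_IOTLB_GET_ENTRY and struct
vduse_iotlb_entry are hypothetical names.

#include <stdint.h>
#include <sys/ioctl.h>

/* Hypothetical uapi: look up the recorded mapping covering an iova.
 * The kernel fills the out fields and returns an installed fd. */
struct vduse_iotlb_entry {
        uint64_t iova;         /* in: address to translate; out: entry start */
        uint64_t size;         /* out: length of the mapping */
        uint64_t offset;       /* out: offset inside the returned fd */
        uint8_t  perm;         /* out: VHOST_ACCESS_* */
};

/* Invented ioctl number, for illustration only. */
#define VDUSE_IOTLB_GET_ENTRY _IOWR(0x81, 0x10, struct vduse_iotlb_entry)

static int query_iotlb(int dev_fd, uint64_t iova,
                       struct vduse_iotlb_entry *entry)
{
        entry->iova = iova;
        return ioctl(dev_fd, VDUSE_IOTLB_GET_ENTRY, entry); /* >= 0: an fd */
}

One property of a query interface like this is that a freshly (re)started
daemon can rebuild its entire view of the IOTLB by iterating over it, a
point that comes up again later in the thread.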
On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On 2020/12/23 8:14 PM, Yongji Xie wrote:
> > [...]
> > It's OK for me. One problem is when to transfer the UPDATE_IOTLB
> > message in the virtio case.
>
> Instead of generating IOTLB messages for userspace, how about recording
> the mappings (which is a common case for devices that have an on-chip
> IOMMU, e.g. mlx5e and the vdpa simulator)? Then we can introduce an
> ioctl for userspace to query.
>

If so, the IOTLB UPDATE is actually triggered by the ioctl, but
IOTLB_INVALIDATE is triggered by the message. Isn't that a little odd?
Or how about triggering it when userspace calls mmap() on the device fd?

Thanks,
Yongji
On Thu, Dec 24, 2020 at 3:37 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > [...]
> >
> > Instead of generating IOTLB messages for userspace, how about recording
> > the mappings (which is a common case for devices that have an on-chip
> > IOMMU, e.g. mlx5e and the vdpa simulator)? Then we can introduce an
> > ioctl for userspace to query.
> >
> If so, the IOTLB UPDATE is actually triggered by the ioctl, but
> IOTLB_INVALIDATE is triggered by the message. Isn't that a little odd?
> Or how about triggering it when userspace calls mmap() on the device fd?
>

Oh sorry, it looks like mmap() needs to be called in the IOTLB UPDATE
message handler. Is it possible for the vdpa device to know which vdpa
bus it is attached to? Then we could generate this message during
probing. Otherwise, we don't know whether the iova domain of the
MMU-based IOMMU is needed.

Thanks,
Yongji
On 2020/12/24 3:37 PM, Yongji Xie wrote:
> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> [...]
>>
>> Instead of generating IOTLB messages for userspace, how about recording
>> the mappings (which is a common case for devices that have an on-chip
>> IOMMU, e.g. mlx5e and the vdpa simulator)? Then we can introduce an
>> ioctl for userspace to query.
>>
> If so, the IOTLB UPDATE is actually triggered by the ioctl, but
> IOTLB_INVALIDATE is triggered by the message. Isn't that a little odd?


Good point. Some questions here:

1) Userspace assumes the IOTLB has been flushed once the
IOTLB_INVALIDATE syscall returns, but this patch doesn't provide that
guarantee. Can this lead to issues?

2) I think that after the VDUSE userspace receives IOTLB_INVALIDATE, it
needs to issue a munmap(). What if it doesn't do that?


> Or how about triggering it when userspace calls mmap() on the device fd?


One possible issue is that the IOTLB_UPDATE/INVALIDATE might come after
mmap():

1) when vIOMMU is enabled;
2) when the guest memory topology has been changed (memory hotplug).

Thanks
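Both questions boil down to ordering: vduse_dev_msg_sync() blocks the
invalidating syscall until the daemon's response arrives, so the flush is
only as synchronous as the daemon makes it. A sketch of the ordering a
well-behaved daemon would need, where lookup_mapping(), forget_mapping()
and vduse_reply() are hypothetical helpers over the daemon's mapping table
and device fd:

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

struct mapping {
        void *va;
        size_t size;
};

/* Hypothetical helpers. */
struct mapping *lookup_mapping(uint64_t iova, uint64_t size);
void forget_mapping(struct mapping *m);
int vduse_reply(int dev_fd, uint32_t request_id, int result);

static int handle_invalidate_iotlb(int dev_fd, uint32_t request_id,
                                   uint64_t iova, uint64_t size)
{
        struct mapping *m = lookup_mapping(iova, size);

        if (m) {
                /* Drop the guest memory *before* acking: once the reply
                 * is written, the kernel lets the invalidating syscall
                 * return and the guest may reuse the buffer. */
                munmap(m->va, m->size);
                forget_mapping(m);
        }
        return vduse_reply(dev_fd, request_id, 0);
}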
On 2020/12/25 10:37 AM, Yongji Xie wrote:
> On Thu, Dec 24, 2020 at 3:37 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>> [...]
>> If so, the IOTLB UPDATE is actually triggered by the ioctl, but
>> IOTLB_INVALIDATE is triggered by the message. Isn't that a little odd?
>> Or how about triggering it when userspace calls mmap() on the device fd?
>>
> Oh sorry, it looks like mmap() needs to be called in the IOTLB UPDATE
> message handler. Is it possible for the vdpa device to know which vdpa
> bus it is attached to?


We'd better not. That would be a kind of layering violation.

Thanks


> Then we could generate this message during
> probing. Otherwise, we don't know whether the iova domain of the
> MMU-based IOMMU is needed.
>
> Thanks,
> Yongji
On Fri, Dec 25, 2020 at 2:58 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On 2020/12/24 3:37 PM, Yongji Xie wrote:
> > [...]
> > If so, the IOTLB UPDATE is actually triggered by the ioctl, but
> > IOTLB_INVALIDATE is triggered by the message. Isn't that a little odd?
>
> Good point. Some questions here:
>
> 1) Userspace assumes the IOTLB has been flushed once the
> IOTLB_INVALIDATE syscall returns, but this patch doesn't provide that
> guarantee. Can this lead to issues?

I'm not sure. But should it be guaranteed in VDUSE userspace, just like
what a vhost-user backend process does?

> 2) I think that after the VDUSE userspace receives IOTLB_INVALIDATE, it
> needs to issue a munmap(). What if it doesn't do that?
>

Yes, the munmap() is needed. If it doesn't do that, VDUSE userspace
could still access the corresponding guest memory region.

Thanks,
Yongji
On Fri, Dec 25, 2020 at 3:02 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On 2020/12/25 10:37 AM, Yongji Xie wrote:
> > [...]
> > Oh sorry, it looks like mmap() needs to be called in the IOTLB UPDATE
> > message handler. Is it possible for the vdpa device to know which vdpa
> > bus it is attached to?
>
> We'd better not. That would be a kind of layering violation.
>

OK. Now I think both the ioctl and the message are needed. The ioctl is
useful when the VDUSE userspace daemon restarts. And the IOTLB_UPDATE
message could be generated during the first DMA mapping in the
virtio-vdpa case.

Thanks,
Yongji
On 2020/12/25 下午6:31, Yongji Xie wrote: > On Fri, Dec 25, 2020 at 2:58 PM Jason Wang <jasowang@redhat.com> wrote: >> >> On 2020/12/24 下午3:37, Yongji Xie wrote: >>> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote: >>>> On 2020/12/23 下午8:14, Yongji Xie wrote: >>>>> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>> On 2020/12/22 下午10:52, Xie Yongji wrote: >>>>>>> To support vhost-vdpa bus driver, we need a way to share the >>>>>>> vhost-vdpa backend process's memory with the userspace VDUSE process. >>>>>>> >>>>>>> This patch tries to make use of the vhost iotlb message to achieve >>>>>>> that. We will get the shm file from the iotlb message and pass it >>>>>>> to the userspace VDUSE process. >>>>>>> >>>>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com> >>>>>>> --- >>>>>>> Documentation/driver-api/vduse.rst | 15 +++- >>>>>>> drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++- >>>>>>> include/uapi/linux/vduse.h | 11 +++ >>>>>>> 3 files changed, 171 insertions(+), 2 deletions(-) >>>>>>> >>>>>>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst >>>>>>> index 623f7b040ccf..48e4b1ba353f 100644 >>>>>>> --- a/Documentation/driver-api/vduse.rst >>>>>>> +++ b/Documentation/driver-api/vduse.rst >>>>>>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now: >>>>>>> >>>>>>> - VDUSE_GET_CONFIG: Read from device specific configuration space >>>>>>> >>>>>>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB >>>>>>> + >>>>>>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB >>>>>>> + >>>>>>> Please see include/linux/vdpa.h for details. >>>>>>> >>>>>>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU >>>>>>> +The data path of userspace vDPA device is implemented in different ways >>>>>>> +depending on the vdpa bus to which it is attached. >>>>>>> + >>>>>>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU >>>>>>> driver which supports mapping the kernel dma buffer to a userspace iova >>>>>>> region dynamically. The userspace iova region can be created by passing >>>>>>> the userspace vDPA device fd to mmap(2). >>>>>>> >>>>>>> +In vhost-vdpa case, the dma buffer is reside in a userspace memory region >>>>>>> +which will be shared to the VDUSE userspace processs via the file >>>>>>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address >>>>>>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included >>>>>>> +in this message. >>>>>>> + >>>>>>> Besides, the eventfd mechanism is used to trigger interrupt callbacks and >>>>>>> receive virtqueue kicks in userspace. 
The following ioctls on the userspace >>>>>>> vDPA device fd are provided to support that: >>>>>>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c >>>>>>> index b974333ed4e9..d24aaacb6008 100644 >>>>>>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c >>>>>>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c >>>>>>> @@ -34,6 +34,7 @@ >>>>>>> >>>>>>> struct vduse_dev_msg { >>>>>>> struct vduse_dev_request req; >>>>>>> + struct file *iotlb_file; >>>>>>> struct vduse_dev_response resp; >>>>>>> struct list_head list; >>>>>>> wait_queue_head_t waitq; >>>>>>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev, >>>>>>> return ret; >>>>>>> } >>>>>>> >>>>>>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file, >>>>>>> + u64 offset, u64 iova, u64 size, u8 perm) >>>>>>> +{ >>>>>>> + struct vduse_dev_msg *msg; >>>>>>> + int ret; >>>>>>> + >>>>>>> + if (!size) >>>>>>> + return -EINVAL; >>>>>>> + >>>>>>> + msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB); >>>>>>> + msg->req.size = sizeof(struct vduse_iotlb); >>>>>>> + msg->req.iotlb.offset = offset; >>>>>>> + msg->req.iotlb.iova = iova; >>>>>>> + msg->req.iotlb.size = size; >>>>>>> + msg->req.iotlb.perm = perm; >>>>>>> + msg->req.iotlb.fd = -1; >>>>>>> + msg->iotlb_file = get_file(file); >>>>>>> + >>>>>>> + ret = vduse_dev_msg_sync(dev, msg); >>>>>> My feeling is that we should provide consistent API for the userspace >>>>>> device to use. >>>>>> >>>>>> E.g we'd better carry the IOTLB message for both virtio/vhost drivers. >>>>>> >>>>>> It looks to me for virtio drivers we can still use UPDAT_IOTLB message >>>>>> by using VDUSE file as msg->iotlb_file here. >>>>>> >>>>> It's OK for me. One problem is when to transfer the UPDATE_IOTLB >>>>> message in virtio cases. >>>> Instead of generating IOTLB messages for userspace. >>>> >>>> How about record the mappings (which is a common case for device have >>>> on-chip IOMMU e.g mlx5e and vdpa simlator), then we can introduce ioctl >>>> for userspace to query? >>>> >>> If so, the IOTLB UPDATE is actually triggered by ioctl, but >>> IOTLB_INVALIDATE is triggered by the message. Is it a little odd? >> >> Good point. >> >> Some questions here: >> >> 1) Userspace think the IOTLB was flushed after IOTLB_INVALIDATE syscall >> is returned. But this patch doesn't have this guarantee. Can this lead >> some issues? > I'm not sure. But should it be guaranteed in VDUSE userspace? Just > like what vhost-user backend process does. I think so. This is because the userspace device needs a way to synchronize invalidation with its datapath. Otherwise, the guest may think the buffer is freed to be used by other parts, but it can actually still be used by the VDUSE device. This may cause security issues. > >> 2) I think after VDUSE userspace receives IOTLB_INVALIDATE, it needs to >> issue a munmap(). What if it doesn't do that? >> > Yes, the munmap() is needed. If it doesn't do that, VDUSE userspace > could still access the corresponding guest memory region. I see. So both of the above questions are because VHOST_IOTLB_INVALIDATE is expected to be synchronous. This needs to be solved by tweaking the current VDUSE API, or we can re-visit going with descriptors relaying first. Thanks > > Thanks, > Yongji >
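To make the synchronous contract under discussion concrete, here is a minimal sketch of the daemon side, assuming the uapi of this series; the response field names and the datapath_quiesce()/lookup_mapping() helpers are illustrative assumptions, not part of the patch:

/* Hypothetical VDUSE daemon handler for VDUSE_INVALIDATE_IOTLB: the kernel
 * side blocks in vduse_dev_msg_sync() until the response is written back,
 * so the range is guaranteed to be unusable by the datapath once the vhost
 * VHOST_IOTLB_INVALIDATE call returns. */
#include <linux/vduse.h>   /* uapi header added by this series */
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

struct mapping { uint64_t iova, size; void *va; };

extern void datapath_quiesce(void);                                  /* assumed helper */
extern struct mapping *lookup_mapping(uint64_t iova, uint64_t size); /* assumed helper */

static void handle_invalidate_iotlb(int dev_fd, struct vduse_dev_request *req)
{
        struct vduse_dev_response resp = { .request_id = req->unique }; /* field name assumed */
        struct mapping *m = lookup_mapping(req->iotlb.iova, req->iotlb.size);

        datapath_quiesce();             /* no in-flight descriptor may touch the range */
        if (m)
                munmap(m->va, m->size); /* drop our view of the shared fd */

        write(dev_fd, &resp, sizeof(resp)); /* unblocks the kernel's msg_sync wait */
}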
On Mon, Dec 28, 2020 at 3:43 PM Jason Wang <jasowang@redhat.com> wrote: > > > On 2020/12/25 下午6:31, Yongji Xie wrote: > > On Fri, Dec 25, 2020 at 2:58 PM Jason Wang <jasowang@redhat.com> wrote: > >> > >> On 2020/12/24 下午3:37, Yongji Xie wrote: > >>> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote: > >>>> On 2020/12/23 下午8:14, Yongji Xie wrote: > >>>>> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>> On 2020/12/22 下午10:52, Xie Yongji wrote: > >>>>>>> To support vhost-vdpa bus driver, we need a way to share the > >>>>>>> vhost-vdpa backend process's memory with the userspace VDUSE process. > >>>>>>> > >>>>>>> This patch tries to make use of the vhost iotlb message to achieve > >>>>>>> that. We will get the shm file from the iotlb message and pass it > >>>>>>> to the userspace VDUSE process. > >>>>>>> > >>>>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com> > >>>>>>> --- > >>>>>>> Documentation/driver-api/vduse.rst | 15 +++- > >>>>>>> drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++- > >>>>>>> include/uapi/linux/vduse.h | 11 +++ > >>>>>>> 3 files changed, 171 insertions(+), 2 deletions(-) > >>>>>>> > >>>>>>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst > >>>>>>> index 623f7b040ccf..48e4b1ba353f 100644 > >>>>>>> --- a/Documentation/driver-api/vduse.rst > >>>>>>> +++ b/Documentation/driver-api/vduse.rst > >>>>>>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now: > >>>>>>> > >>>>>>> - VDUSE_GET_CONFIG: Read from device specific configuration space > >>>>>>> > >>>>>>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB > >>>>>>> + > >>>>>>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB > >>>>>>> + > >>>>>>> Please see include/linux/vdpa.h for details. > >>>>>>> > >>>>>>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU > >>>>>>> +The data path of userspace vDPA device is implemented in different ways > >>>>>>> +depending on the vdpa bus to which it is attached. > >>>>>>> + > >>>>>>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU > >>>>>>> driver which supports mapping the kernel dma buffer to a userspace iova > >>>>>>> region dynamically. The userspace iova region can be created by passing > >>>>>>> the userspace vDPA device fd to mmap(2). > >>>>>>> > >>>>>>> +In vhost-vdpa case, the dma buffer is reside in a userspace memory region > >>>>>>> +which will be shared to the VDUSE userspace processs via the file > >>>>>>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address > >>>>>>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included > >>>>>>> +in this message. > >>>>>>> + > >>>>>>> Besides, the eventfd mechanism is used to trigger interrupt callbacks and > >>>>>>> receive virtqueue kicks in userspace. 
The following ioctls on the userspace > >>>>>>> vDPA device fd are provided to support that: > >>>>>>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c > >>>>>>> index b974333ed4e9..d24aaacb6008 100644 > >>>>>>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c > >>>>>>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > >>>>>>> @@ -34,6 +34,7 @@ > >>>>>>> > >>>>>>> struct vduse_dev_msg { > >>>>>>> struct vduse_dev_request req; > >>>>>>> + struct file *iotlb_file; > >>>>>>> struct vduse_dev_response resp; > >>>>>>> struct list_head list; > >>>>>>> wait_queue_head_t waitq; > >>>>>>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev, > >>>>>>> return ret; > >>>>>>> } > >>>>>>> > >>>>>>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file, > >>>>>>> + u64 offset, u64 iova, u64 size, u8 perm) > >>>>>>> +{ > >>>>>>> + struct vduse_dev_msg *msg; > >>>>>>> + int ret; > >>>>>>> + > >>>>>>> + if (!size) > >>>>>>> + return -EINVAL; > >>>>>>> + > >>>>>>> + msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB); > >>>>>>> + msg->req.size = sizeof(struct vduse_iotlb); > >>>>>>> + msg->req.iotlb.offset = offset; > >>>>>>> + msg->req.iotlb.iova = iova; > >>>>>>> + msg->req.iotlb.size = size; > >>>>>>> + msg->req.iotlb.perm = perm; > >>>>>>> + msg->req.iotlb.fd = -1; > >>>>>>> + msg->iotlb_file = get_file(file); > >>>>>>> + > >>>>>>> + ret = vduse_dev_msg_sync(dev, msg); > >>>>>> My feeling is that we should provide consistent API for the userspace > >>>>>> device to use. > >>>>>> > >>>>>> E.g we'd better carry the IOTLB message for both virtio/vhost drivers. > >>>>>> > >>>>>> It looks to me for virtio drivers we can still use UPDAT_IOTLB message > >>>>>> by using VDUSE file as msg->iotlb_file here. > >>>>>> > >>>>> It's OK for me. One problem is when to transfer the UPDATE_IOTLB > >>>>> message in virtio cases. > >>>> Instead of generating IOTLB messages for userspace. > >>>> > >>>> How about record the mappings (which is a common case for device have > >>>> on-chip IOMMU e.g mlx5e and vdpa simlator), then we can introduce ioctl > >>>> for userspace to query? > >>>> > >>> If so, the IOTLB UPDATE is actually triggered by ioctl, but > >>> IOTLB_INVALIDATE is triggered by the message. Is it a little odd? > >> > >> Good point. > >> > >> Some questions here: > >> > >> 1) Userspace think the IOTLB was flushed after IOTLB_INVALIDATE syscall > >> is returned. But this patch doesn't have this guarantee. Can this lead > >> some issues? > > I'm not sure. But should it be guaranteed in VDUSE userspace? Just > > like what vhost-user backend process does. > > > I think so. This is because the userspace device needs a way to > synchronize invalidation with its datapath. Otherwise, guest may thing > the buffer is freed to be used by other parts but the it actually can be > used by the VDUSE device. The may cause security issues. > > > > > >> 2) I think after VDUSE userspace receives IOTLB_INVALIDATE, it needs to > >> issue a munmap(). What if it doesn't do that? > >> > > Yes, the munmap() is needed. If it doesn't do that, VDUSE userspace > > could still access the corresponding guest memory region. > > > I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE > is expected to be synchronous. This need to be solved by tweaking the > current VDUSE API or we can re-visit to go with descriptors relaying first. > Actually all vdpa related operations are synchronous in current implementation. 
The ops.set_map/dma_map/dma_unmap should not return until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message has been replied to by userspace. Could that solve this problem? Thanks, Yongji
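For context, the blocking behaviour Yongji describes could look like the sketch below on the kernel side. vduse_dev_msg_sync() is real in this series, but its body is not quoted in this thread, so the 'completed' flag and 'resp.result' member here are assumptions:

/* Sketch only: park the vdpa op until the daemon has read the request via
 * the device fd and written a response back. */
static int vduse_dev_msg_sync(struct vduse_dev *dev, struct vduse_dev_msg *msg)
{
        vduse_dev_enqueue_msg(dev, msg, &dev->send_list); /* picked up by vduse_dev_read_iter() */
        wait_event(msg->waitq, msg->completed);           /* set by the response path; flag assumed */
        return msg->resp.result;                          /* member name assumed */
}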
On 2020/12/28 下午4:14, Yongji Xie wrote: >> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE >> is expected to be synchronous. This need to be solved by tweaking the >> current VDUSE API or we can re-visit to go with descriptors relaying first. >> > Actually all vdpa related operations are synchronous in current > implementation. The ops.set_map/dma_map/dma_unmap should not return > until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied > by userspace. Could it solve this problem? I was thinking about whether or not we need to generate an IOTLB_INVALIDATE message to VDUSE during dma_unmap (vduse_dev_unmap_page). If we don't, we're probably fine. Thanks > > Thanks, > Yongji >
On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: > > > On 2020/12/28 下午4:14, Yongji Xie wrote: > >> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE > >> is expected to be synchronous. This need to be solved by tweaking the > >> current VDUSE API or we can re-visit to go with descriptors relaying first. > >> > > Actually all vdpa related operations are synchronous in current > > implementation. The ops.set_map/dma_map/dma_unmap should not return > > until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied > > by userspace. Could it solve this problem? > > > I was thinking whether or not we need to generate IOTLB_INVALIDATE > message to VDUSE during dma_unmap (vduse_dev_unmap_page). > > If we don't, we're probably fine. > It doesn't seem feasible. This message will also be used in the virtio-vdpa case to notify userspace to unmap some pages during consistent dma unmapping. Maybe we can document it to make sure users can handle the message correctly. Thanks, Yongji
----- Original Message ----- > On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > On 2020/12/28 下午4:14, Yongji Xie wrote: > > >> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE > > >> is expected to be synchronous. This need to be solved by tweaking the > > >> current VDUSE API or we can re-visit to go with descriptors relaying > > >> first. > > >> > > > Actually all vdpa related operations are synchronous in current > > > implementation. The ops.set_map/dma_map/dma_unmap should not return > > > until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied > > > by userspace. Could it solve this problem? > > > > > > I was thinking whether or not we need to generate IOTLB_INVALIDATE > > message to VDUSE during dma_unmap (vduse_dev_unmap_page). > > > > If we don't, we're probably fine. > > > > It seems not feasible. This message will be also used in the > virtio-vdpa case to notify userspace to unmap some pages during > consistent dma unmapping. Maybe we can document it to make sure the > users can handle the message correctly. Just to make sure I understand your point. Do you mean you plan to notify the unmap of 1) streaming DMA or 2) coherent DMA? For 1) you probably need a workqueue to do that since dma unmap can be done in irq or bh context. And if userspace doesn't do the unmap, it can still access the bounce buffer (if you don't zap the pte)? Thanks > > Thanks, > Yongji > >
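A sketch of the workqueue deferral Jason suggests for the streaming-DMA case, assuming the vduse_dev_invalidate_iotlb() helper from this patch; the work item type and function names are illustrative:

#include <linux/slab.h>
#include <linux/workqueue.h>

/* dma_unmap may run in irq/bh context, where we cannot block waiting for a
 * userspace reply, so hand the IOTLB invalidation off to process context. */
struct vduse_unmap_work {
        struct work_struct work;
        struct vduse_dev *dev;
        u64 iova;
        u64 size;
};

static void vduse_unmap_workfn(struct work_struct *work)
{
        struct vduse_unmap_work *w =
                container_of(work, struct vduse_unmap_work, work);

        vduse_dev_invalidate_iotlb(w->dev, w->iova, w->size); /* may sleep */
        kfree(w);
}

static void vduse_queue_invalidate(struct vduse_dev *dev, u64 iova, u64 size)
{
        struct vduse_unmap_work *w = kzalloc(sizeof(*w), GFP_ATOMIC);

        if (!w)
                return; /* a real implementation would need a fallback here */
        w->dev = dev;
        w->iova = iova;
        w->size = size;
        INIT_WORK(&w->work, vduse_unmap_workfn);
        schedule_work(&w->work);
}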
On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: > > > > ----- Original Message ----- > > On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > On 2020/12/28 下午4:14, Yongji Xie wrote: > > > >> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE > > > >> is expected to be synchronous. This need to be solved by tweaking the > > > >> current VDUSE API or we can re-visit to go with descriptors relaying > > > >> first. > > > >> > > > > Actually all vdpa related operations are synchronous in current > > > > implementation. The ops.set_map/dma_map/dma_unmap should not return > > > > until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied > > > > by userspace. Could it solve this problem? > > > > > > > > > I was thinking whether or not we need to generate IOTLB_INVALIDATE > > > message to VDUSE during dma_unmap (vduse_dev_unmap_page). > > > > > > If we don't, we're probably fine. > > > > > > > It seems not feasible. This message will be also used in the > > virtio-vdpa case to notify userspace to unmap some pages during > > consistent dma unmapping. Maybe we can document it to make sure the > > users can handle the message correctly. > > Just to make sure I understand your point. > > Do you mean you plan to notify the unmap of 1) streaming DMA or 2) > coherent DMA? > > For 1) you probably need a workqueue to do that since dma unmap can > be done in irq or bh context. And if usrspace does't do the unmap, it > can still access the bounce buffer (if you don't zap pte)? > I plan to do it in the coherent DMA case. It's true that userspace can access the dma buffer if userspace doesn't do the unmap. But the dma pages would not be freed and reused unless userspace calls munmap() on them. Thanks, Yongji
On 2020/12/29 下午6:26, Yongji Xie wrote: > On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: >> >> >> ----- Original Message ----- >>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: >>>> >>>> On 2020/12/28 下午4:14, Yongji Xie wrote: >>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE >>>>>> is expected to be synchronous. This need to be solved by tweaking the >>>>>> current VDUSE API or we can re-visit to go with descriptors relaying >>>>>> first. >>>>>> >>>>> Actually all vdpa related operations are synchronous in current >>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return >>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied >>>>> by userspace. Could it solve this problem? >>>> >>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE >>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). >>>> >>>> If we don't, we're probably fine. >>>> >>> It seems not feasible. This message will be also used in the >>> virtio-vdpa case to notify userspace to unmap some pages during >>> consistent dma unmapping. Maybe we can document it to make sure the >>> users can handle the message correctly. >> Just to make sure I understand your point. >> >> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) >> coherent DMA? >> >> For 1) you probably need a workqueue to do that since dma unmap can >> be done in irq or bh context. And if usrspace does't do the unmap, it >> can still access the bounce buffer (if you don't zap pte)? >> > I plan to do it in the coherent DMA case. Any reason for treating coherent DMA differently? > It's true that userspace can > access the dma buffer if userspace doesn't do the unmap. But the dma > pages would not be freed and reused unless user space called munmap() > for them. I wonder whether or not we could recycle the IOVA in this case to avoid the IOTLB_UNMAP message. Thanks > > Thanks, > Yongji >
On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote: > > > On 2020/12/29 下午6:26, Yongji Xie wrote: > > On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: > >> > >> > >> ----- Original Message ----- > >>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: > >>>> > >>>> On 2020/12/28 下午4:14, Yongji Xie wrote: > >>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE > >>>>>> is expected to be synchronous. This need to be solved by tweaking the > >>>>>> current VDUSE API or we can re-visit to go with descriptors relaying > >>>>>> first. > >>>>>> > >>>>> Actually all vdpa related operations are synchronous in current > >>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return > >>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied > >>>>> by userspace. Could it solve this problem? > >>>> > >>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE > >>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). > >>>> > >>>> If we don't, we're probably fine. > >>>> > >>> It seems not feasible. This message will be also used in the > >>> virtio-vdpa case to notify userspace to unmap some pages during > >>> consistent dma unmapping. Maybe we can document it to make sure the > >>> users can handle the message correctly. > >> Just to make sure I understand your point. > >> > >> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) > >> coherent DMA? > >> > >> For 1) you probably need a workqueue to do that since dma unmap can > >> be done in irq or bh context. And if usrspace does't do the unmap, it > >> can still access the bounce buffer (if you don't zap pte)? > >> > > I plan to do it in the coherent DMA case. > > > Any reason for treating coherent DMA differently? > Now the memory of the bounce buffer is allocated page by page in the page fault handler. So it can't be used in the coherent DMA mapping case, which needs some memory with contiguous virtual addresses. I can use vmalloc() to do the allocation for the bounce buffer instead. But it might cause some memory waste. Any suggestion? > > > It's true that userspace can > > access the dma buffer if userspace doesn't do the unmap. But the dma > > pages would not be freed and reused unless user space called munmap() > > for them. > > > I wonder whether or not we could recycle IOVA in this case to avoid the > IOTLB_UMAP message. > We can achieve that if we use vmalloc() to do the allocation for the bounce buffer, which can then be used in the coherent DMA mapping case. But it looks like we still have no way to avoid the IOTLB_UNMAP message in the vhost-vdpa case. Thanks, Yongji
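For reference, the vmalloc() variant Yongji floats here could look roughly like the sketch below; the bounce_va field and the mmap handler shown are assumptions for illustration, not the patch's actual code:

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* One virtually contiguous kernel buffer can back both coherent DMA
 * allocations and a single userspace mmap() of the whole bounce region. */
static int vduse_alloc_bounce(struct vduse_dev *dev, size_t bounce_size)
{
        dev->bounce_va = vmalloc_user(bounce_size); /* page-aligned, zeroed */
        return dev->bounce_va ? 0 : -ENOMEM;
}

static int vduse_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct vduse_dev *dev = file->private_data;

        /* expose the whole vmalloc'ed region to the VDUSE daemon at once */
        return remap_vmalloc_range(vma, dev->bounce_va, vma->vm_pgoff);
}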
On 2020/12/30 下午3:09, Yongji Xie wrote: > On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote: >> >> On 2020/12/29 下午6:26, Yongji Xie wrote: >>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: >>>> >>>> ----- Original Message ----- >>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote: >>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE >>>>>>>> is expected to be synchronous. This need to be solved by tweaking the >>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying >>>>>>>> first. >>>>>>>> >>>>>>> Actually all vdpa related operations are synchronous in current >>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return >>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied >>>>>>> by userspace. Could it solve this problem? >>>>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE >>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). >>>>>> >>>>>> If we don't, we're probably fine. >>>>>> >>>>> It seems not feasible. This message will be also used in the >>>>> virtio-vdpa case to notify userspace to unmap some pages during >>>>> consistent dma unmapping. Maybe we can document it to make sure the >>>>> users can handle the message correctly. >>>> Just to make sure I understand your point. >>>> >>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) >>>> coherent DMA? >>>> >>>> For 1) you probably need a workqueue to do that since dma unmap can >>>> be done in irq or bh context. And if usrspace does't do the unmap, it >>>> can still access the bounce buffer (if you don't zap pte)? >>>> >>> I plan to do it in the coherent DMA case. >> >> Any reason for treating coherent DMA differently? >> > Now the memory of the bounce buffer is allocated page by page in the > page fault handler. So it can't be used in coherent DMA mapping case > which needs some memory with contiguous virtual addresses. I can use > vmalloc() to do allocation for the bounce buffer instead. But it might > cause some memory waste. Any suggestion? I may miss something. But I don't see a relationship between the IOTLB_UNMAP and vmalloc(). > >>> It's true that userspace can >>> access the dma buffer if userspace doesn't do the unmap. But the dma >>> pages would not be freed and reused unless user space called munmap() >>> for them. >> >> I wonder whether or not we could recycle IOVA in this case to avoid the >> IOTLB_UMAP message. >> > We can achieve that if we use vmalloc() to do allocation for the > bounce buffer which can be used in coherent DMA mapping case. But > looks like we still have no way to avoid the IOTLB_UMAP message in > vhost-vdpa case. I think that's fine. For virtio-vdpa, from VDUSE userspace perspective, it works like a driver that is using SWIOTLB in this case. Thanks > > Thanks, > Yongji >
On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote: > > > On 2020/12/30 下午3:09, Yongji Xie wrote: > > On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote: > >> > >> On 2020/12/29 下午6:26, Yongji Xie wrote: > >>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: > >>>> > >>>> ----- Original Message ----- > >>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote: > >>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE > >>>>>>>> is expected to be synchronous. This need to be solved by tweaking the > >>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying > >>>>>>>> first. > >>>>>>>> > >>>>>>> Actually all vdpa related operations are synchronous in current > >>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return > >>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied > >>>>>>> by userspace. Could it solve this problem? > >>>>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE > >>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). > >>>>>> > >>>>>> If we don't, we're probably fine. > >>>>>> > >>>>> It seems not feasible. This message will be also used in the > >>>>> virtio-vdpa case to notify userspace to unmap some pages during > >>>>> consistent dma unmapping. Maybe we can document it to make sure the > >>>>> users can handle the message correctly. > >>>> Just to make sure I understand your point. > >>>> > >>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) > >>>> coherent DMA? > >>>> > >>>> For 1) you probably need a workqueue to do that since dma unmap can > >>>> be done in irq or bh context. And if usrspace does't do the unmap, it > >>>> can still access the bounce buffer (if you don't zap pte)? > >>>> > >>> I plan to do it in the coherent DMA case. > >> > >> Any reason for treating coherent DMA differently? > >> > > Now the memory of the bounce buffer is allocated page by page in the > > page fault handler. So it can't be used in coherent DMA mapping case > > which needs some memory with contiguous virtual addresses. I can use > > vmalloc() to do allocation for the bounce buffer instead. But it might > > cause some memory waste. Any suggestion? > > > I may miss something. But I don't see a relationship between the > IOTLB_UNMAP and vmalloc(). > In the vmalloc() case, the coherent DMA page will be taken from the memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore during coherent DMA unmapping because the vmalloc'ed memory, which has been mapped into the userspace address space during initialization, can be reused. And userspace should not unmap the region until we destroy the device. > > > > >>> It's true that userspace can > >>> access the dma buffer if userspace doesn't do the unmap. But the dma > >>> pages would not be freed and reused unless user space called munmap() > >>> for them. > >> > >> I wonder whether or not we could recycle IOVA in this case to avoid the > >> IOTLB_UMAP message. > >> > > We can achieve that if we use vmalloc() to do allocation for the > > bounce buffer which can be used in coherent DMA mapping case. But > > looks like we still have no way to avoid the IOTLB_UMAP message in > > vhost-vdpa case. > > > I think that's fine. For virtio-vdpa, from VDUSE userspace perspective, > it works like a driver that is using SWIOTLB in this case. > OK, will do it in v3. Thanks, Yongji
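What Yongji describes, carving coherent allocations out of a region that userspace has already mapped once at setup, might look like this sketch; the carve allocator and the bounce_* fields are assumptions:

/* Coherent allocations come from the premapped bounce region, so the
 * IOVA->page mapping never changes at unmap time and no IOTLB_UNMAP
 * message is needed for them. */
static void *vduse_alloc_coherent(struct vduse_dev *dev, size_t size,
                                  dma_addr_t *dma_addr)
{
        unsigned long off = vduse_bounce_carve(dev, size); /* assumed allocator */

        *dma_addr = dev->bounce_iova_base + off; /* assumed field */
        return dev->bounce_va + off;             /* contiguous KVA inside the vmalloc area */
}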
On 2020/12/30 下午6:12, Yongji Xie wrote: > On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote: >> >> On 2020/12/30 下午3:09, Yongji Xie wrote: >>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote: >>>> On 2020/12/29 下午6:26, Yongji Xie wrote: >>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>> ----- Original Message ----- >>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote: >>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE >>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the >>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying >>>>>>>>>> first. >>>>>>>>>> >>>>>>>>> Actually all vdpa related operations are synchronous in current >>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return >>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied >>>>>>>>> by userspace. Could it solve this problem? >>>>>>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE >>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). >>>>>>>> >>>>>>>> If we don't, we're probably fine. >>>>>>>> >>>>>>> It seems not feasible. This message will be also used in the >>>>>>> virtio-vdpa case to notify userspace to unmap some pages during >>>>>>> consistent dma unmapping. Maybe we can document it to make sure the >>>>>>> users can handle the message correctly. >>>>>> Just to make sure I understand your point. >>>>>> >>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) >>>>>> coherent DMA? >>>>>> >>>>>> For 1) you probably need a workqueue to do that since dma unmap can >>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it >>>>>> can still access the bounce buffer (if you don't zap pte)? >>>>>> >>>>> I plan to do it in the coherent DMA case. >>>> Any reason for treating coherent DMA differently? >>>> >>> Now the memory of the bounce buffer is allocated page by page in the >>> page fault handler. So it can't be used in coherent DMA mapping case >>> which needs some memory with contiguous virtual addresses. I can use >>> vmalloc() to do allocation for the bounce buffer instead. But it might >>> cause some memory waste. Any suggestion? >> >> I may miss something. But I don't see a relationship between the >> IOTLB_UNMAP and vmalloc(). >> > In the vmalloc() case, the coherent DMA page will be taken from the > memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore > during coherent DMA unmapping because those vmalloc'ed memory which > has been mapped into userspace address space during initialization can > be reused. And userspace should not unmap the region until we destroy > the device. Just to make sure I understand. My understanding is that IOTLB_UNMAP is only needed when there's a change in the mapping from IOVA to page. So if we stick to the mapping, e.g. during dma_unmap, we just put the IOVA on the free list to be used by the next IOVA allocation. IOTLB_UNMAP could be avoided. So we are not limited by how the pages are actually allocated? Thanks > >>> It's true that userspace can >>> access the dma buffer if userspace doesn't do the unmap. But the dma >>> pages would not be freed and reused unless user space called munmap() >>> for them. >>>> I wonder whether or not we could recycle IOVA in this case to avoid the >>>> IOTLB_UMAP message.
>>>> >>> We can achieve that if we use vmalloc() to do allocation for the >>> bounce buffer which can be used in coherent DMA mapping case. But >>> looks like we still have no way to avoid the IOTLB_UMAP message in >>> vhost-vdpa case. >> >> I think that's fine. For virtio-vdpa, from VDUSE userspace perspective, >> it works like a driver that is using SWIOTLB in this case. >> > OK, will do it in v3. > > Thanks, > Yongji >
On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote: > > > On 2020/12/30 下午6:12, Yongji Xie wrote: > > On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote: > >> > >> On 2020/12/30 下午3:09, Yongji Xie wrote: > >>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote: > >>>> On 2020/12/29 下午6:26, Yongji Xie wrote: > >>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>> ----- Original Message ----- > >>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote: > >>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE > >>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the > >>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying > >>>>>>>>>> first. > >>>>>>>>>> > >>>>>>>>> Actually all vdpa related operations are synchronous in current > >>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return > >>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied > >>>>>>>>> by userspace. Could it solve this problem? > >>>>>>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE > >>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). > >>>>>>>> > >>>>>>>> If we don't, we're probably fine. > >>>>>>>> > >>>>>>> It seems not feasible. This message will be also used in the > >>>>>>> virtio-vdpa case to notify userspace to unmap some pages during > >>>>>>> consistent dma unmapping. Maybe we can document it to make sure the > >>>>>>> users can handle the message correctly. > >>>>>> Just to make sure I understand your point. > >>>>>> > >>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) > >>>>>> coherent DMA? > >>>>>> > >>>>>> For 1) you probably need a workqueue to do that since dma unmap can > >>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it > >>>>>> can still access the bounce buffer (if you don't zap pte)? > >>>>>> > >>>>> I plan to do it in the coherent DMA case. > >>>> Any reason for treating coherent DMA differently? > >>>> > >>> Now the memory of the bounce buffer is allocated page by page in the > >>> page fault handler. So it can't be used in coherent DMA mapping case > >>> which needs some memory with contiguous virtual addresses. I can use > >>> vmalloc() to do allocation for the bounce buffer instead. But it might > >>> cause some memory waste. Any suggestion? > >> > >> I may miss something. But I don't see a relationship between the > >> IOTLB_UNMAP and vmalloc(). > >> > > In the vmalloc() case, the coherent DMA page will be taken from the > > memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore > > during coherent DMA unmapping because those vmalloc'ed memory which > > has been mapped into userspace address space during initialization can > > be reused. And userspace should not unmap the region until we destroy > > the device. > > > Just to make sure I understand. My understanding is that IOTLB_UNMAP is > only needed when there's a change the mapping from IOVA to page. > Yes, that's true. > So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to > free list to be used by the next IOVA allocating. IOTLB_UNMAP could be > avoided. > > So we are not limited by how the pages are actually allocated? > In coherent DMA cases, we need to return some memory with contiguous kernel virtual addresses. 
That is the reason why we need vmalloc() here. If we allocate the memory page by page, the corresponding kernel virtual addresses in a contiguous IOVA range might not be contiguous. And in streaming DMA cases, there is no such limit. So another choice is using vmalloc'ed memory only for coherent DMA cases. Not sure if this is clear to you. Thanks, Yongji
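As an aside on the contiguity constraint: pages that were allocated one at a time can still be given one contiguous kernel virtual address after the fact with vmap(); a sketch, assuming the bounce pages are tracked in an array:

#include <linux/vmalloc.h>

/* Build a contiguous kernel VA over already-allocated, scattered pages;
 * pair with vunmap() when the coherent area is released. */
static void *bounce_make_contiguous(struct page **pages, unsigned int npages)
{
        return vmap(pages, npages, VM_MAP, PAGE_KERNEL);
}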
On 2020/12/31 下午1:15, Yongji Xie wrote: > On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote: >> >> On 2020/12/30 下午6:12, Yongji Xie wrote: >>> On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote: >>>> On 2020/12/30 下午3:09, Yongji Xie wrote: >>>>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>> On 2020/12/29 下午6:26, Yongji Xie wrote: >>>>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>>>> ----- Original Message ----- >>>>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote: >>>>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE >>>>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the >>>>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying >>>>>>>>>>>> first. >>>>>>>>>>>> >>>>>>>>>>> Actually all vdpa related operations are synchronous in current >>>>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return >>>>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied >>>>>>>>>>> by userspace. Could it solve this problem? >>>>>>>>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE >>>>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). >>>>>>>>>> >>>>>>>>>> If we don't, we're probably fine. >>>>>>>>>> >>>>>>>>> It seems not feasible. This message will be also used in the >>>>>>>>> virtio-vdpa case to notify userspace to unmap some pages during >>>>>>>>> consistent dma unmapping. Maybe we can document it to make sure the >>>>>>>>> users can handle the message correctly. >>>>>>>> Just to make sure I understand your point. >>>>>>>> >>>>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) >>>>>>>> coherent DMA? >>>>>>>> >>>>>>>> For 1) you probably need a workqueue to do that since dma unmap can >>>>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it >>>>>>>> can still access the bounce buffer (if you don't zap pte)? >>>>>>>> >>>>>>> I plan to do it in the coherent DMA case. >>>>>> Any reason for treating coherent DMA differently? >>>>>> >>>>> Now the memory of the bounce buffer is allocated page by page in the >>>>> page fault handler. So it can't be used in coherent DMA mapping case >>>>> which needs some memory with contiguous virtual addresses. I can use >>>>> vmalloc() to do allocation for the bounce buffer instead. But it might >>>>> cause some memory waste. Any suggestion? >>>> I may miss something. But I don't see a relationship between the >>>> IOTLB_UNMAP and vmalloc(). >>>> >>> In the vmalloc() case, the coherent DMA page will be taken from the >>> memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore >>> during coherent DMA unmapping because those vmalloc'ed memory which >>> has been mapped into userspace address space during initialization can >>> be reused. And userspace should not unmap the region until we destroy >>> the device. >> >> Just to make sure I understand. My understanding is that IOTLB_UNMAP is >> only needed when there's a change the mapping from IOVA to page. >> > Yes, that's true. > >> So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to >> free list to be used by the next IOVA allocating. IOTLB_UNMAP could be >> avoided. >> >> So we are not limited by how the pages are actually allocated? 
>> > In coherent DMA cases, we need to return some memory with contiguous > kernel virtual addresses. That is the reason why we need vmalloc() > here. If we allocate the memory page by page, the corresponding kernel > virtual addresses in a contiguous IOVA range might not be contiguous. Yes, but we can do that as has been done in the series (alloc_pages_exact()). Or do you mean it would be a little bit hard to recycle IOVA/pages here? Thanks > And in streaming DMA cases, there is no limit. So another choice is > using vmalloc'ed memory only for coherent DMA cases. > > Not sure if this is clear for you. > > Thanks, > Yongji >
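The alloc_pages_exact() route Jason points at gives physically contiguous pages, whose kernel addresses are contiguous via the linear mapping; a minimal sketch with illustrative names:

#include <linux/gfp.h>

static void *bounce_alloc_coherent_area(size_t size)
{
        /* physically contiguous, so also virtually contiguous in the linear map */
        return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
}

static void bounce_free_coherent_area(void *va, size_t size)
{
        free_pages_exact(va, size);
}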
On Thu, Dec 31, 2020 at 1:50 PM Jason Wang <jasowang@redhat.com> wrote: > > > On 2020/12/31 下午1:15, Yongji Xie wrote: > > On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote: > >> > >> On 2020/12/30 下午6:12, Yongji Xie wrote: > >>> On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote: > >>>> On 2020/12/30 下午3:09, Yongji Xie wrote: > >>>>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>> On 2020/12/29 下午6:26, Yongji Xie wrote: > >>>>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>>>> ----- Original Message ----- > >>>>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote: > >>>>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE > >>>>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the > >>>>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying > >>>>>>>>>>>> first. > >>>>>>>>>>>> > >>>>>>>>>>> Actually all vdpa related operations are synchronous in current > >>>>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return > >>>>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied > >>>>>>>>>>> by userspace. Could it solve this problem? > >>>>>>>>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE > >>>>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). > >>>>>>>>>> > >>>>>>>>>> If we don't, we're probably fine. > >>>>>>>>>> > >>>>>>>>> It seems not feasible. This message will be also used in the > >>>>>>>>> virtio-vdpa case to notify userspace to unmap some pages during > >>>>>>>>> consistent dma unmapping. Maybe we can document it to make sure the > >>>>>>>>> users can handle the message correctly. > >>>>>>>> Just to make sure I understand your point. > >>>>>>>> > >>>>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) > >>>>>>>> coherent DMA? > >>>>>>>> > >>>>>>>> For 1) you probably need a workqueue to do that since dma unmap can > >>>>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it > >>>>>>>> can still access the bounce buffer (if you don't zap pte)? > >>>>>>>> > >>>>>>> I plan to do it in the coherent DMA case. > >>>>>> Any reason for treating coherent DMA differently? > >>>>>> > >>>>> Now the memory of the bounce buffer is allocated page by page in the > >>>>> page fault handler. So it can't be used in coherent DMA mapping case > >>>>> which needs some memory with contiguous virtual addresses. I can use > >>>>> vmalloc() to do allocation for the bounce buffer instead. But it might > >>>>> cause some memory waste. Any suggestion? > >>>> I may miss something. But I don't see a relationship between the > >>>> IOTLB_UNMAP and vmalloc(). > >>>> > >>> In the vmalloc() case, the coherent DMA page will be taken from the > >>> memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore > >>> during coherent DMA unmapping because those vmalloc'ed memory which > >>> has been mapped into userspace address space during initialization can > >>> be reused. And userspace should not unmap the region until we destroy > >>> the device. > >> > >> Just to make sure I understand. My understanding is that IOTLB_UNMAP is > >> only needed when there's a change the mapping from IOVA to page. > >> > > Yes, that's true. 
> > > >> So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to > >> free list to be used by the next IOVA allocating. IOTLB_UNMAP could be > >> avoided. > >> > >> So we are not limited by how the pages are actually allocated? > >> > > In coherent DMA cases, we need to return some memory with contiguous > > kernel virtual addresses. That is the reason why we need vmalloc() > > here. If we allocate the memory page by page, the corresponding kernel > > virtual addresses in a contiguous IOVA range might not be contiguous. > > > Yes, but we can do that as what has been done in the series > (alloc_pages_exact()). Or do you mean it would be a little bit hard to > recycle IOVA/pages here? > Yes, it might be hard to reuse the memory. For example, we first allocate one IOVA/page during dma_map, then the IOVA is freed during dma_unmap. We can't reuse this single page if we need a two-page area in the next IOVA allocation. So the best way is to use IOTLB_UNMAP to free this single page during dma_unmap too. Thanks, Yongji
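The fragmentation problem is easy to see in a recycle-only allocator; the sketch below (not from the patch) shows why a freed one-page hole cannot serve a later two-page request, which is what makes IOTLB_UNMAP attractive:

#include <linux/list.h>
#include <linux/types.h>

struct iova_hole {
        u64 start;
        u64 len;
        struct list_head node;
};

/* First-fit over recycled holes; returns 0 when no hole is large enough
 * (0 stands in for failure in this sketch). A one-page hole freed by
 * dma_unmap cannot satisfy a subsequent two-page allocation, so without
 * IOTLB_UNMAP the hole is simply wasted. */
static u64 iova_recycle_alloc(struct list_head *free_list, u64 len)
{
        struct iova_hole *h;

        list_for_each_entry(h, free_list, node) {
                if (h->len >= len) {
                        u64 start = h->start;

                        h->start += len;
                        h->len -= len;
                        return start;
                }
        }
        return 0;
}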
On 2020/12/31 下午2:52, Yongji Xie wrote: > On Thu, Dec 31, 2020 at 1:50 PM Jason Wang <jasowang@redhat.com> wrote: >> >> On 2020/12/31 下午1:15, Yongji Xie wrote: >>> On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote: >>>> On 2020/12/30 下午6:12, Yongji Xie wrote: >>>>> On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>> On 2020/12/30 下午3:09, Yongji Xie wrote: >>>>>>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>>>> On 2020/12/29 下午6:26, Yongji Xie wrote: >>>>>>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>>>>>> ----- Original Message ----- >>>>>>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: >>>>>>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote: >>>>>>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE >>>>>>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the >>>>>>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying >>>>>>>>>>>>>> first. >>>>>>>>>>>>>> >>>>>>>>>>>>> Actually all vdpa related operations are synchronous in current >>>>>>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return >>>>>>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied >>>>>>>>>>>>> by userspace. Could it solve this problem? >>>>>>>>>>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE >>>>>>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). >>>>>>>>>>>> >>>>>>>>>>>> If we don't, we're probably fine. >>>>>>>>>>>> >>>>>>>>>>> It seems not feasible. This message will be also used in the >>>>>>>>>>> virtio-vdpa case to notify userspace to unmap some pages during >>>>>>>>>>> consistent dma unmapping. Maybe we can document it to make sure the >>>>>>>>>>> users can handle the message correctly. >>>>>>>>>> Just to make sure I understand your point. >>>>>>>>>> >>>>>>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) >>>>>>>>>> coherent DMA? >>>>>>>>>> >>>>>>>>>> For 1) you probably need a workqueue to do that since dma unmap can >>>>>>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it >>>>>>>>>> can still access the bounce buffer (if you don't zap pte)? >>>>>>>>>> >>>>>>>>> I plan to do it in the coherent DMA case. >>>>>>>> Any reason for treating coherent DMA differently? >>>>>>>> >>>>>>> Now the memory of the bounce buffer is allocated page by page in the >>>>>>> page fault handler. So it can't be used in coherent DMA mapping case >>>>>>> which needs some memory with contiguous virtual addresses. I can use >>>>>>> vmalloc() to do allocation for the bounce buffer instead. But it might >>>>>>> cause some memory waste. Any suggestion? >>>>>> I may miss something. But I don't see a relationship between the >>>>>> IOTLB_UNMAP and vmalloc(). >>>>>> >>>>> In the vmalloc() case, the coherent DMA page will be taken from the >>>>> memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore >>>>> during coherent DMA unmapping because those vmalloc'ed memory which >>>>> has been mapped into userspace address space during initialization can >>>>> be reused. And userspace should not unmap the region until we destroy >>>>> the device. >>>> Just to make sure I understand. My understanding is that IOTLB_UNMAP is >>>> only needed when there's a change the mapping from IOVA to page. >>>> >>> Yes, that's true. 
>>> >>>> So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to >>>> free list to be used by the next IOVA allocating. IOTLB_UNMAP could be >>>> avoided. >>>> >>>> So we are not limited by how the pages are actually allocated? >>>> >>> In coherent DMA cases, we need to return some memory with contiguous >>> kernel virtual addresses. That is the reason why we need vmalloc() >>> here. If we allocate the memory page by page, the corresponding kernel >>> virtual addresses in a contiguous IOVA range might not be contiguous. >> >> Yes, but we can do that as what has been done in the series >> (alloc_pages_exact()). Or do you mean it would be a little bit hard to >> recycle IOVA/pages here? >> > Yes, it might be hard to reuse the memory. For example, we firstly > allocate 1 IOVA/page during dma_map, then the IOVA is freed during > dma_unmap. Actually we can't reuse this single page if we need a > two-pages area in the next IOVA allocating. So the best way is using > IOTLB_UNMAP to free this single page during dma_unmap too. > > Thanks, > Yongji I get you now. Then I agree: let's go with IOTLB_UNMAP. Thanks >
On Thu, Dec 31, 2020 at 3:12 PM Jason Wang <jasowang@redhat.com> wrote: > > > On 2020/12/31 下午2:52, Yongji Xie wrote: > > On Thu, Dec 31, 2020 at 1:50 PM Jason Wang <jasowang@redhat.com> wrote: > >> > >> On 2020/12/31 下午1:15, Yongji Xie wrote: > >>> On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote: > >>>> On 2020/12/30 下午6:12, Yongji Xie wrote: > >>>>> On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>> On 2020/12/30 下午3:09, Yongji Xie wrote: > >>>>>>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>>>> On 2020/12/29 下午6:26, Yongji Xie wrote: > >>>>>>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>>>>>> ----- Original Message ----- > >>>>>>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote: > >>>>>>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote: > >>>>>>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE > >>>>>>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the > >>>>>>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying > >>>>>>>>>>>>>> first. > >>>>>>>>>>>>>> > >>>>>>>>>>>>> Actually all vdpa related operations are synchronous in current > >>>>>>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return > >>>>>>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied > >>>>>>>>>>>>> by userspace. Could it solve this problem? > >>>>>>>>>>>> I was thinking whether or not we need to generate IOTLB_INVALIDATE > >>>>>>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page). > >>>>>>>>>>>> > >>>>>>>>>>>> If we don't, we're probably fine. > >>>>>>>>>>>> > >>>>>>>>>>> It seems not feasible. This message will be also used in the > >>>>>>>>>>> virtio-vdpa case to notify userspace to unmap some pages during > >>>>>>>>>>> consistent dma unmapping. Maybe we can document it to make sure the > >>>>>>>>>>> users can handle the message correctly. > >>>>>>>>>> Just to make sure I understand your point. > >>>>>>>>>> > >>>>>>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2) > >>>>>>>>>> coherent DMA? > >>>>>>>>>> > >>>>>>>>>> For 1) you probably need a workqueue to do that since dma unmap can > >>>>>>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it > >>>>>>>>>> can still access the bounce buffer (if you don't zap pte)? > >>>>>>>>>> > >>>>>>>>> I plan to do it in the coherent DMA case. > >>>>>>>> Any reason for treating coherent DMA differently? > >>>>>>>> > >>>>>>> Now the memory of the bounce buffer is allocated page by page in the > >>>>>>> page fault handler. So it can't be used in coherent DMA mapping case > >>>>>>> which needs some memory with contiguous virtual addresses. I can use > >>>>>>> vmalloc() to do allocation for the bounce buffer instead. But it might > >>>>>>> cause some memory waste. Any suggestion? > >>>>>> I may miss something. But I don't see a relationship between the > >>>>>> IOTLB_UNMAP and vmalloc(). > >>>>>> > >>>>> In the vmalloc() case, the coherent DMA page will be taken from the > >>>>> memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore > >>>>> during coherent DMA unmapping because those vmalloc'ed memory which > >>>>> has been mapped into userspace address space during initialization can > >>>>> be reused. And userspace should not unmap the region until we destroy > >>>>> the device. > >>>> Just to make sure I understand. 
My understanding is that IOTLB_UNMAP is > >>>> only needed when there's a change the mapping from IOVA to page. > >>>> > >>> Yes, that's true. > >>> > >>>> So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to > >>>> free list to be used by the next IOVA allocating. IOTLB_UNMAP could be > >>>> avoided. > >>>> > >>>> So we are not limited by how the pages are actually allocated? > >>>> > >>> In coherent DMA cases, we need to return some memory with contiguous > >>> kernel virtual addresses. That is the reason why we need vmalloc() > >>> here. If we allocate the memory page by page, the corresponding kernel > >>> virtual addresses in a contiguous IOVA range might not be contiguous. > >> > >> Yes, but we can do that as what has been done in the series > >> (alloc_pages_exact()). Or do you mean it would be a little bit hard to > >> recycle IOVA/pages here? > >> > > Yes, it might be hard to reuse the memory. For example, we firstly > > allocate 1 IOVA/page during dma_map, then the IOVA is freed during > > dma_unmap. Actually we can't reuse this single page if we need a > > two-pages area in the next IOVA allocating. So the best way is using > > IOTLB_UNMAP to free this single page during dma_unmap too. > > > > Thanks, > > Yongji > > > I get you now. Then I agree that let's go with IOTLB_UNMAP. > Fine, will do it. Thanks, Yongji
diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
index 623f7b040ccf..48e4b1ba353f 100644
--- a/Documentation/driver-api/vduse.rst
+++ b/Documentation/driver-api/vduse.rst
@@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
 
 - VDUSE_GET_CONFIG: Read from device specific configuration space
 
+- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
+
+- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
+
 Please see include/linux/vdpa.h for details.
 
-In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
+The data path of userspace vDPA device is implemented in different ways
+depending on the vdpa bus to which it is attached.
+
+In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
 driver which supports mapping the kernel dma buffer to a userspace iova
 region dynamically. The userspace iova region can be created by passing
 the userspace vDPA device fd to mmap(2).
 
+In vhost-vdpa case, the dma buffer is reside in a userspace memory region
+which will be shared to the VDUSE userspace processs via the file
+descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
+mapping (IOVA of dma buffer <-> VA of the memory region) is also included
+in this message.
+
 Besides, the eventfd mechanism is used to trigger interrupt callbacks and
 receive virtqueue kicks in userspace. The following ioctls on the userspace
 vDPA device fd are provided to support that:
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index b974333ed4e9..d24aaacb6008 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -34,6 +34,7 @@
 
 struct vduse_dev_msg {
         struct vduse_dev_request req;
+        struct file *iotlb_file;
         struct vduse_dev_response resp;
         struct list_head list;
         wait_queue_head_t waitq;
@@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
         return ret;
 }
 
+static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
+                                  u64 offset, u64 iova, u64 size, u8 perm)
+{
+        struct vduse_dev_msg *msg;
+        int ret;
+
+        if (!size)
+                return -EINVAL;
+
+        msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
+        msg->req.size = sizeof(struct vduse_iotlb);
+        msg->req.iotlb.offset = offset;
+        msg->req.iotlb.iova = iova;
+        msg->req.iotlb.size = size;
+        msg->req.iotlb.perm = perm;
+        msg->req.iotlb.fd = -1;
+        msg->iotlb_file = get_file(file);
+
+        ret = vduse_dev_msg_sync(dev, msg);
+        vduse_dev_msg_put(msg);
+        fput(file);
+
+        return ret;
+}
+
+static int vduse_dev_invalidate_iotlb(struct vduse_dev *dev,
+                                      u64 iova, u64 size)
+{
+        struct vduse_dev_msg *msg;
+        int ret;
+
+        if (!size)
+                return -EINVAL;
+
+        msg = vduse_dev_new_msg(dev, VDUSE_INVALIDATE_IOTLB);
+        msg->req.size = sizeof(struct vduse_iotlb);
+        msg->req.iotlb.iova = iova;
+        msg->req.iotlb.size = size;
+
+        ret = vduse_dev_msg_sync(dev, msg);
+        vduse_dev_msg_put(msg);
+
+        return ret;
+}
+
+static unsigned int perm_to_file_flags(u8 perm)
+{
+        unsigned int flags = 0;
+
+        switch (perm) {
+        case VHOST_ACCESS_WO:
+                flags |= O_WRONLY;
+                break;
+        case VHOST_ACCESS_RO:
+                flags |= O_RDONLY;
+                break;
+        case VHOST_ACCESS_RW:
+                flags |= O_RDWR;
+                break;
+        default:
+                WARN(1, "invalidate vhost IOTLB permission\n");
+                break;
+        }
+
+        return flags;
+}
+
 static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
         struct file *file = iocb->ki_filp;
         struct vduse_dev *dev = file->private_data;
         struct vduse_dev_msg *msg;
-        int size = sizeof(struct vduse_dev_request);
+        unsigned int flags;
+        int fd, size = sizeof(struct vduse_dev_request);
         ssize_t ret = 0;
 
         if (iov_iter_count(to) < size)
@@ -349,6 +418,18 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
                 if (ret)
                         return ret;
         }
+
+        if (msg->req.type == VDUSE_UPDATE_IOTLB && msg->req.iotlb.fd == -1) {
+                flags = perm_to_file_flags(msg->req.iotlb.perm);
+                fd = get_unused_fd_flags(flags);
+                if (fd < 0) {
+                        vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
+                        return fd;
+                }
+                fd_install(fd, get_file(msg->iotlb_file));
+                msg->req.iotlb.fd = fd;
+        }
+
         ret = copy_to_iter(&msg->req, size, to);
         if (ret != size) {
                 vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
@@ -565,6 +646,69 @@ static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
         vduse_dev_set_config(dev, offset, buf, len);
 }
 
+static void vduse_vdpa_invalidate_iotlb(struct vduse_dev *dev,
+                                        struct vhost_iotlb_msg *msg)
+{
+        vduse_dev_invalidate_iotlb(dev, msg->iova, msg->size);
+}
+
+static int vduse_vdpa_update_iotlb(struct vduse_dev *dev,
+                                   struct vhost_iotlb_msg *msg)
+{
+        u64 uaddr = msg->uaddr;
+        u64 iova = msg->iova;
+        u64 size = msg->size;
+        u64 offset;
+        struct vm_area_struct *vma;
+        int ret;
+
+        while (uaddr < msg->uaddr + msg->size) {
+                vma = find_vma(current->mm, uaddr);
+                ret = -EINVAL;
+                if (!vma)
+                        goto err;
+
+                size = min(msg->size, vma->vm_end - uaddr);
+                offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
+                if (vma->vm_file && (vma->vm_flags & VM_SHARED)) {
+                        ret = vduse_dev_update_iotlb(dev, vma->vm_file, offset,
+                                                     iova, size, msg->perm);
+                        if (ret)
+                                goto err;
+                }
+                iova += size;
+                uaddr += size;
+        }
+        return 0;
+err:
+        vduse_dev_invalidate_iotlb(dev, msg->iova, iova - msg->iova);
+        return ret;
+}
+
+static int vduse_vdpa_process_iotlb_msg(struct vdpa_device *vdpa,
+                                        struct vhost_iotlb_msg *msg)
+{
+        struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+        int ret = 0;
+
+        switch (msg->type) {
+        case VHOST_IOTLB_UPDATE:
+                ret = vduse_vdpa_update_iotlb(dev, msg);
+                break;
+        case VHOST_IOTLB_INVALIDATE:
+                vduse_vdpa_invalidate_iotlb(dev, msg);
+                break;
+        case VHOST_IOTLB_BATCH_BEGIN:
+        case VHOST_IOTLB_BATCH_END:
+                break;
+        default:
+                ret = -EINVAL;
+                break;
+        }
+
+        return ret;
+}
+
 static void vduse_vdpa_free(struct vdpa_device *vdpa)
 {
         struct vduse_dev *dev = vdpa_to_vduse(vdpa);
@@ -597,6 +741,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
         .set_status = vduse_vdpa_set_status,
         .get_config = vduse_vdpa_get_config,
         .set_config = vduse_vdpa_set_config,
+        .process_iotlb_msg = vduse_vdpa_process_iotlb_msg,
         .free = vduse_vdpa_free,
 };
 
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index 873305dfd93f..c5080851f140 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -21,6 +21,8 @@ enum vduse_req_type {
         VDUSE_GET_STATUS,
         VDUSE_SET_CONFIG,
         VDUSE_GET_CONFIG,
+        VDUSE_UPDATE_IOTLB,
+        VDUSE_INVALIDATE_IOTLB,
 };
 
 struct vduse_vq_num {
@@ -51,6 +53,14 @@ struct vduse_dev_config_data {
         __u8 data[VDUSE_CONFIG_DATA_LEN];
 };
 
+struct vduse_iotlb {
+        __u32 fd;
+        __u64 offset;
+        __u64 iova;
+        __u64 size;
+        __u8 perm;
+};
+
 struct vduse_dev_request {
         __u32 type; /* request type */
         __u32 unique; /* request id */
@@ -62,6 +72,7 @@ struct vduse_dev_request {
         struct vduse_vq_ready vq_ready; /* virtqueue ready status */
         struct vduse_vq_state vq_state; /* virtqueue state */
         struct vduse_dev_config_data config; /* virtio device config space */
+        struct vduse_iotlb iotlb; /* iotlb message */
         __u64 features; /* virtio features */
         __u8 status; /* device status */
 };
To support vhost-vdpa bus driver, we need a way to share the vhost-vdpa backend process's memory with the userspace VDUSE process.

This patch tries to make use of the vhost iotlb message to achieve that. We will get the shm file from the iotlb message and pass it to the userspace VDUSE process.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
 Documentation/driver-api/vduse.rst |  15 +++-
 drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/vduse.h         |  11 +++
 3 files changed, 171 insertions(+), 2 deletions(-)
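As a closing illustration of the uapi above, a sketch of how a VDUSE daemon might consume a VDUSE_UPDATE_IOTLB request: map the received fd and record the IOVA-to-VA translation. Error handling and the page-alignment fixup that mmap() requires for the offset are omitted; the VHOST_ACCESS_* values are assumed to come from the vhost uapi headers:

#include <linux/vduse.h>        /* uapi header added by this series */
#include <linux/vhost_types.h>  /* VHOST_ACCESS_* (assumed location) */
#include <sys/mman.h>

/* Translate the request's permission into mmap protection bits and map the
 * shared region; the caller records req->iotlb.iova -> returned VA and then
 * writes the response back to the device fd. */
static void *map_iotlb_update(const struct vduse_dev_request *req)
{
        int prot = (req->iotlb.perm == VHOST_ACCESS_RO) ? PROT_READ :
                   (req->iotlb.perm == VHOST_ACCESS_WO) ? PROT_WRITE :
                                                          PROT_READ | PROT_WRITE;

        return mmap(NULL, req->iotlb.size, prot, MAP_SHARED,
                    req->iotlb.fd, req->iotlb.offset);
}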