diff mbox series

[RFC,v2,09/13] vduse: Add support for processing vhost iotlb message

Message ID 20201222145221.711-10-xieyongji@bytedance.com (mailing list archive)
State New, archived
Headers show
Series Introduce VDUSE - vDPA Device in Userspace | expand

Commit Message

Yongji Xie Dec. 22, 2020, 2:52 p.m. UTC
To support vhost-vdpa bus driver, we need a way to share the
vhost-vdpa backend process's memory with the userspace VDUSE process.

This patch tries to make use of the vhost iotlb message to achieve
that. We will get the shm file from the iotlb message and pass it
to the userspace VDUSE process.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
 Documentation/driver-api/vduse.rst |  15 +++-
 drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/vduse.h         |  11 +++
 3 files changed, 171 insertions(+), 2 deletions(-)

Comments

Jason Wang Dec. 23, 2020, 9:05 a.m. UTC | #1
On 2020/12/22 下午10:52, Xie Yongji wrote:
> To support vhost-vdpa bus driver, we need a way to share the
> vhost-vdpa backend process's memory with the userspace VDUSE process.
>
> This patch tries to make use of the vhost iotlb message to achieve
> that. We will get the shm file from the iotlb message and pass it
> to the userspace VDUSE process.
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
>   Documentation/driver-api/vduse.rst |  15 +++-
>   drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
>   include/uapi/linux/vduse.h         |  11 +++
>   3 files changed, 171 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
> index 623f7b040ccf..48e4b1ba353f 100644
> --- a/Documentation/driver-api/vduse.rst
> +++ b/Documentation/driver-api/vduse.rst
> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
>   
>   - VDUSE_GET_CONFIG: Read from device specific configuration space
>   
> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
> +
> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
> +
>   Please see include/linux/vdpa.h for details.
>   
> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
> +The data path of userspace vDPA device is implemented in different ways
> +depending on the vdpa bus to which it is attached.
> +
> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
>   driver which supports mapping the kernel dma buffer to a userspace iova
>   region dynamically. The userspace iova region can be created by passing
>   the userspace vDPA device fd to mmap(2).
>   
> +In vhost-vdpa case, the dma buffer resides in a userspace memory region
> +which will be shared to the VDUSE userspace process via the file
> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
> +in this message.
> +
>   Besides, the eventfd mechanism is used to trigger interrupt callbacks and
>   receive virtqueue kicks in userspace. The following ioctls on the userspace
>   vDPA device fd are provided to support that:
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index b974333ed4e9..d24aaacb6008 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -34,6 +34,7 @@
>   
>   struct vduse_dev_msg {
>   	struct vduse_dev_request req;
> +	struct file *iotlb_file;
>   	struct vduse_dev_response resp;
>   	struct list_head list;
>   	wait_queue_head_t waitq;
> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
>   	return ret;
>   }
>   
> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
> +				u64 offset, u64 iova, u64 size, u8 perm)
> +{
> +	struct vduse_dev_msg *msg;
> +	int ret;
> +
> +	if (!size)
> +		return -EINVAL;
> +
> +	msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
> +	msg->req.size = sizeof(struct vduse_iotlb);
> +	msg->req.iotlb.offset = offset;
> +	msg->req.iotlb.iova = iova;
> +	msg->req.iotlb.size = size;
> +	msg->req.iotlb.perm = perm;
> +	msg->req.iotlb.fd = -1;
> +	msg->iotlb_file = get_file(file);
> +
> +	ret = vduse_dev_msg_sync(dev, msg);


My feeling is that we should provide consistent API for the userspace 
device to use.

E.g we'd better carry the IOTLB message for both virtio/vhost drivers.

It looks to me that for virtio drivers we can still use the UPDATE_IOTLB message 
by using the VDUSE file as msg->iotlb_file here.


> +	vduse_dev_msg_put(msg);
> +	fput(file);
> +
> +	return ret;
> +}
> +
> +static int vduse_dev_invalidate_iotlb(struct vduse_dev *dev,
> +					u64 iova, u64 size)
> +{
> +	struct vduse_dev_msg *msg;
> +	int ret;
> +
> +	if (!size)
> +		return -EINVAL;
> +
> +	msg = vduse_dev_new_msg(dev, VDUSE_INVALIDATE_IOTLB);
> +	msg->req.size = sizeof(struct vduse_iotlb);
> +	msg->req.iotlb.iova = iova;
> +	msg->req.iotlb.size = size;
> +
> +	ret = vduse_dev_msg_sync(dev, msg);
> +	vduse_dev_msg_put(msg);
> +
> +	return ret;
> +}
> +
> +static unsigned int perm_to_file_flags(u8 perm)
> +{
> +	unsigned int flags = 0;
> +
> +	switch (perm) {
> +	case VHOST_ACCESS_WO:
> +		flags |= O_WRONLY;
> +		break;
> +	case VHOST_ACCESS_RO:
> +		flags |= O_RDONLY;
> +		break;
> +	case VHOST_ACCESS_RW:
> +		flags |= O_RDWR;
> +		break;
> +	default:
> +		WARN(1, "invalidate vhost IOTLB permission\n");
> +		break;
> +	}
> +
> +	return flags;
> +}
> +
>   static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>   {
>   	struct file *file = iocb->ki_filp;
>   	struct vduse_dev *dev = file->private_data;
>   	struct vduse_dev_msg *msg;
> -	int size = sizeof(struct vduse_dev_request);
> +	unsigned int flags;
> +	int fd, size = sizeof(struct vduse_dev_request);
>   	ssize_t ret = 0;
>   
>   	if (iov_iter_count(to) < size)
> @@ -349,6 +418,18 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>   		if (ret)
>   			return ret;
>   	}
> +
> +	if (msg->req.type == VDUSE_UPDATE_IOTLB && msg->req.iotlb.fd == -1) {
> +		flags = perm_to_file_flags(msg->req.iotlb.perm);
> +		fd = get_unused_fd_flags(flags);
> +		if (fd < 0) {
> +			vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
> +			return fd;
> +		}
> +		fd_install(fd, get_file(msg->iotlb_file));
> +		msg->req.iotlb.fd = fd;
> +	}
> +
>   	ret = copy_to_iter(&msg->req, size, to);
>   	if (ret != size) {
>   		vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
> @@ -565,6 +646,69 @@ static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
>   	vduse_dev_set_config(dev, offset, buf, len);
>   }
>   
> +static void vduse_vdpa_invalidate_iotlb(struct vduse_dev *dev,
> +					struct vhost_iotlb_msg *msg)
> +{
> +	vduse_dev_invalidate_iotlb(dev, msg->iova, msg->size);
> +}
> +
> +static int vduse_vdpa_update_iotlb(struct vduse_dev *dev,
> +					struct vhost_iotlb_msg *msg)
> +{
> +	u64 uaddr = msg->uaddr;
> +	u64 iova = msg->iova;
> +	u64 size = msg->size;
> +	u64 offset;
> +	struct vm_area_struct *vma;
> +	int ret;
> +
> +	while (uaddr < msg->uaddr + msg->size) {
> +		vma = find_vma(current->mm, uaddr);
> +		ret = -EINVAL;
> +		if (!vma)
> +			goto err;
> +
> +		size = min(msg->size, vma->vm_end - uaddr);
> +		offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
> +		if (vma->vm_file && (vma->vm_flags & VM_SHARED)) {
> +			ret = vduse_dev_update_iotlb(dev, vma->vm_file, offset,
> +							iova, size, msg->perm);
> +			if (ret)
> +				goto err;


My understanding is that vma is something that should not be known by a 
device. So I suggest to move the above processing to vhost-vdpa.c.

Thanks


> +		}
> +		iova += size;
> +		uaddr += size;
> +	}
> +	return 0;
> +err:
> +	vduse_dev_invalidate_iotlb(dev, msg->iova, iova - msg->iova);
> +	return ret;
> +}
> +
> +static int vduse_vdpa_process_iotlb_msg(struct vdpa_device *vdpa,
> +					struct vhost_iotlb_msg *msg)
> +{
> +	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> +	int ret = 0;
> +
> +	switch (msg->type) {
> +	case VHOST_IOTLB_UPDATE:
> +		ret = vduse_vdpa_update_iotlb(dev, msg);
> +		break;
> +	case VHOST_IOTLB_INVALIDATE:
> +		vduse_vdpa_invalidate_iotlb(dev, msg);
> +		break;
> +	case VHOST_IOTLB_BATCH_BEGIN:
> +	case VHOST_IOTLB_BATCH_END:
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
>   static void vduse_vdpa_free(struct vdpa_device *vdpa)
>   {
>   	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> @@ -597,6 +741,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
>   	.set_status		= vduse_vdpa_set_status,
>   	.get_config		= vduse_vdpa_get_config,
>   	.set_config		= vduse_vdpa_set_config,
> +	.process_iotlb_msg	= vduse_vdpa_process_iotlb_msg,
>   	.free			= vduse_vdpa_free,
>   };
>   
> diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
> index 873305dfd93f..c5080851f140 100644
> --- a/include/uapi/linux/vduse.h
> +++ b/include/uapi/linux/vduse.h
> @@ -21,6 +21,8 @@ enum vduse_req_type {
>   	VDUSE_GET_STATUS,
>   	VDUSE_SET_CONFIG,
>   	VDUSE_GET_CONFIG,
> +	VDUSE_UPDATE_IOTLB,
> +	VDUSE_INVALIDATE_IOTLB,
>   };
>   
>   struct vduse_vq_num {
> @@ -51,6 +53,14 @@ struct vduse_dev_config_data {
>   	__u8 data[VDUSE_CONFIG_DATA_LEN];
>   };
>   
> +struct vduse_iotlb {
> +	__u32 fd;
> +	__u64 offset;
> +	__u64 iova;
> +	__u64 size;
> +	__u8 perm;
> +};
> +
>   struct vduse_dev_request {
>   	__u32 type; /* request type */
>   	__u32 unique; /* request id */
> @@ -62,6 +72,7 @@ struct vduse_dev_request {
>   		struct vduse_vq_ready vq_ready; /* virtqueue ready status */
>   		struct vduse_vq_state vq_state; /* virtqueue state */
>   		struct vduse_dev_config_data config; /* virtio device config space */
> +		struct vduse_iotlb iotlb; /* iotlb message */
>   		__u64 features; /* virtio features */
>   		__u8 status; /* device status */
>   	};
Yongji Xie Dec. 23, 2020, 12:14 p.m. UTC | #2
On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/22 下午10:52, Xie Yongji wrote:
> > To support vhost-vdpa bus driver, we need a way to share the
> > vhost-vdpa backend process's memory with the userspace VDUSE process.
> >
> > This patch tries to make use of the vhost iotlb message to achieve
> > that. We will get the shm file from the iotlb message and pass it
> > to the userspace VDUSE process.
> >
> > Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> > ---
> >   Documentation/driver-api/vduse.rst |  15 +++-
> >   drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
> >   include/uapi/linux/vduse.h         |  11 +++
> >   3 files changed, 171 insertions(+), 2 deletions(-)
> >
> > diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
> > index 623f7b040ccf..48e4b1ba353f 100644
> > --- a/Documentation/driver-api/vduse.rst
> > +++ b/Documentation/driver-api/vduse.rst
> > @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
> >
> >   - VDUSE_GET_CONFIG: Read from device specific configuration space
> >
> > +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
> > +
> > +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
> > +
> >   Please see include/linux/vdpa.h for details.
> >
> > -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
> > +The data path of userspace vDPA device is implemented in different ways
> > +depending on the vdpa bus to which it is attached.
> > +
> > +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
> >   driver which supports mapping the kernel dma buffer to a userspace iova
> >   region dynamically. The userspace iova region can be created by passing
> >   the userspace vDPA device fd to mmap(2).
> >
> > +In vhost-vdpa case, the dma buffer resides in a userspace memory region
> > +which will be shared to the VDUSE userspace process via the file
> > +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
> > +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
> > +in this message.
> > +
> >   Besides, the eventfd mechanism is used to trigger interrupt callbacks and
> >   receive virtqueue kicks in userspace. The following ioctls on the userspace
> >   vDPA device fd are provided to support that:
> > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > index b974333ed4e9..d24aaacb6008 100644
> > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > @@ -34,6 +34,7 @@
> >
> >   struct vduse_dev_msg {
> >       struct vduse_dev_request req;
> > +     struct file *iotlb_file;
> >       struct vduse_dev_response resp;
> >       struct list_head list;
> >       wait_queue_head_t waitq;
> > @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
> >       return ret;
> >   }
> >
> > +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
> > +                             u64 offset, u64 iova, u64 size, u8 perm)
> > +{
> > +     struct vduse_dev_msg *msg;
> > +     int ret;
> > +
> > +     if (!size)
> > +             return -EINVAL;
> > +
> > +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
> > +     msg->req.size = sizeof(struct vduse_iotlb);
> > +     msg->req.iotlb.offset = offset;
> > +     msg->req.iotlb.iova = iova;
> > +     msg->req.iotlb.size = size;
> > +     msg->req.iotlb.perm = perm;
> > +     msg->req.iotlb.fd = -1;
> > +     msg->iotlb_file = get_file(file);
> > +
> > +     ret = vduse_dev_msg_sync(dev, msg);
>
>
> My feeling is that we should provide consistent API for the userspace
> device to use.
>
> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
>
> It looks to me that for virtio drivers we can still use the UPDATE_IOTLB message
> by using the VDUSE file as msg->iotlb_file here.
>

It's OK for me. One problem is when to transfer the UPDATE_IOTLB
message in virtio cases.

>
> > +     vduse_dev_msg_put(msg);
> > +     fput(file);
> > +
> > +     return ret;
> > +}
> > +
> > +static int vduse_dev_invalidate_iotlb(struct vduse_dev *dev,
> > +                                     u64 iova, u64 size)
> > +{
> > +     struct vduse_dev_msg *msg;
> > +     int ret;
> > +
> > +     if (!size)
> > +             return -EINVAL;
> > +
> > +     msg = vduse_dev_new_msg(dev, VDUSE_INVALIDATE_IOTLB);
> > +     msg->req.size = sizeof(struct vduse_iotlb);
> > +     msg->req.iotlb.iova = iova;
> > +     msg->req.iotlb.size = size;
> > +
> > +     ret = vduse_dev_msg_sync(dev, msg);
> > +     vduse_dev_msg_put(msg);
> > +
> > +     return ret;
> > +}
> > +
> > +static unsigned int perm_to_file_flags(u8 perm)
> > +{
> > +     unsigned int flags = 0;
> > +
> > +     switch (perm) {
> > +     case VHOST_ACCESS_WO:
> > +             flags |= O_WRONLY;
> > +             break;
> > +     case VHOST_ACCESS_RO:
> > +             flags |= O_RDONLY;
> > +             break;
> > +     case VHOST_ACCESS_RW:
> > +             flags |= O_RDWR;
> > +             break;
> > +     default:
> > +             WARN(1, "invalidate vhost IOTLB permission\n");
> > +             break;
> > +     }
> > +
> > +     return flags;
> > +}
> > +
> >   static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> >   {
> >       struct file *file = iocb->ki_filp;
> >       struct vduse_dev *dev = file->private_data;
> >       struct vduse_dev_msg *msg;
> > -     int size = sizeof(struct vduse_dev_request);
> > +     unsigned int flags;
> > +     int fd, size = sizeof(struct vduse_dev_request);
> >       ssize_t ret = 0;
> >
> >       if (iov_iter_count(to) < size)
> > @@ -349,6 +418,18 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> >               if (ret)
> >                       return ret;
> >       }
> > +
> > +     if (msg->req.type == VDUSE_UPDATE_IOTLB && msg->req.iotlb.fd == -1) {
> > +             flags = perm_to_file_flags(msg->req.iotlb.perm);
> > +             fd = get_unused_fd_flags(flags);
> > +             if (fd < 0) {
> > +                     vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
> > +                     return fd;
> > +             }
> > +             fd_install(fd, get_file(msg->iotlb_file));
> > +             msg->req.iotlb.fd = fd;
> > +     }
> > +
> >       ret = copy_to_iter(&msg->req, size, to);
> >       if (ret != size) {
> >               vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
> > @@ -565,6 +646,69 @@ static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
> >       vduse_dev_set_config(dev, offset, buf, len);
> >   }
> >
> > +static void vduse_vdpa_invalidate_iotlb(struct vduse_dev *dev,
> > +                                     struct vhost_iotlb_msg *msg)
> > +{
> > +     vduse_dev_invalidate_iotlb(dev, msg->iova, msg->size);
> > +}
> > +
> > +static int vduse_vdpa_update_iotlb(struct vduse_dev *dev,
> > +                                     struct vhost_iotlb_msg *msg)
> > +{
> > +     u64 uaddr = msg->uaddr;
> > +     u64 iova = msg->iova;
> > +     u64 size = msg->size;
> > +     u64 offset;
> > +     struct vm_area_struct *vma;
> > +     int ret;
> > +
> > +     while (uaddr < msg->uaddr + msg->size) {
> > +             vma = find_vma(current->mm, uaddr);
> > +             ret = -EINVAL;
> > +             if (!vma)
> > +                     goto err;
> > +
> > +             size = min(msg->size, vma->vm_end - uaddr);
> > +             offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
> > +             if (vma->vm_file && (vma->vm_flags & VM_SHARED)) {
> > +                     ret = vduse_dev_update_iotlb(dev, vma->vm_file, offset,
> > +                                                     iova, size, msg->perm);
> > +                     if (ret)
> > +                             goto err;
>
>
> My understanding is that vma is something that should not be known by a
> device. So I suggest to move the above processing to vhost-vdpa.c.
>

Will do it.

Thanks,
Yongji
Jason Wang Dec. 24, 2020, 2:41 a.m. UTC | #3
On 2020/12/23 下午8:14, Yongji Xie wrote:
> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2020/12/22 下午10:52, Xie Yongji wrote:
>>> To support vhost-vdpa bus driver, we need a way to share the
>>> vhost-vdpa backend process's memory with the userspace VDUSE process.
>>>
>>> This patch tries to make use of the vhost iotlb message to achieve
>>> that. We will get the shm file from the iotlb message and pass it
>>> to the userspace VDUSE process.
>>>
>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
>>> ---
>>>    Documentation/driver-api/vduse.rst |  15 +++-
>>>    drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
>>>    include/uapi/linux/vduse.h         |  11 +++
>>>    3 files changed, 171 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
>>> index 623f7b040ccf..48e4b1ba353f 100644
>>> --- a/Documentation/driver-api/vduse.rst
>>> +++ b/Documentation/driver-api/vduse.rst
>>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
>>>
>>>    - VDUSE_GET_CONFIG: Read from device specific configuration space
>>>
>>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
>>> +
>>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
>>> +
>>>    Please see include/linux/vdpa.h for details.
>>>
>>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
>>> +The data path of userspace vDPA device is implemented in different ways
>>> +depending on the vdpa bus to which it is attached.
>>> +
>>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
>>>    driver which supports mapping the kernel dma buffer to a userspace iova
>>>    region dynamically. The userspace iova region can be created by passing
>>>    the userspace vDPA device fd to mmap(2).
>>>
>>> +In vhost-vdpa case, the dma buffer resides in a userspace memory region
>>> +which will be shared to the VDUSE userspace process via the file
>>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
>>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
>>> +in this message.
>>> +
>>>    Besides, the eventfd mechanism is used to trigger interrupt callbacks and
>>>    receive virtqueue kicks in userspace. The following ioctls on the userspace
>>>    vDPA device fd are provided to support that:
>>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
>>> index b974333ed4e9..d24aaacb6008 100644
>>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
>>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
>>> @@ -34,6 +34,7 @@
>>>
>>>    struct vduse_dev_msg {
>>>        struct vduse_dev_request req;
>>> +     struct file *iotlb_file;
>>>        struct vduse_dev_response resp;
>>>        struct list_head list;
>>>        wait_queue_head_t waitq;
>>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
>>>        return ret;
>>>    }
>>>
>>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
>>> +                             u64 offset, u64 iova, u64 size, u8 perm)
>>> +{
>>> +     struct vduse_dev_msg *msg;
>>> +     int ret;
>>> +
>>> +     if (!size)
>>> +             return -EINVAL;
>>> +
>>> +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
>>> +     msg->req.size = sizeof(struct vduse_iotlb);
>>> +     msg->req.iotlb.offset = offset;
>>> +     msg->req.iotlb.iova = iova;
>>> +     msg->req.iotlb.size = size;
>>> +     msg->req.iotlb.perm = perm;
>>> +     msg->req.iotlb.fd = -1;
>>> +     msg->iotlb_file = get_file(file);
>>> +
>>> +     ret = vduse_dev_msg_sync(dev, msg);
>>
>> My feeling is that we should provide consistent API for the userspace
>> device to use.
>>
>> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
>>
>> It looks to me that for virtio drivers we can still use the UPDATE_IOTLB message
>> by using the VDUSE file as msg->iotlb_file here.
>>
> It's OK for me. One problem is when to transfer the UPDATE_IOTLB
> message in virtio cases.


Instead of generating IOTLB messages for userspace, how about recording
the mappings (which is a common case for devices that have an on-chip
IOMMU, e.g. mlx5e and the vdpa simulator)? Then we can introduce an
ioctl for userspace to query them.

Thanks


>
>>> +     vduse_dev_msg_put(msg);
>>> +     fput(file);
>>> +
>>> +     return ret;
>>> +}
>>> +
>>> +static int vduse_dev_invalidate_iotlb(struct vduse_dev *dev,
>>> +                                     u64 iova, u64 size)
>>> +{
>>> +     struct vduse_dev_msg *msg;
>>> +     int ret;
>>> +
>>> +     if (!size)
>>> +             return -EINVAL;
>>> +
>>> +     msg = vduse_dev_new_msg(dev, VDUSE_INVALIDATE_IOTLB);
>>> +     msg->req.size = sizeof(struct vduse_iotlb);
>>> +     msg->req.iotlb.iova = iova;
>>> +     msg->req.iotlb.size = size;
>>> +
>>> +     ret = vduse_dev_msg_sync(dev, msg);
>>> +     vduse_dev_msg_put(msg);
>>> +
>>> +     return ret;
>>> +}
>>> +
>>> +static unsigned int perm_to_file_flags(u8 perm)
>>> +{
>>> +     unsigned int flags = 0;
>>> +
>>> +     switch (perm) {
>>> +     case VHOST_ACCESS_WO:
>>> +             flags |= O_WRONLY;
>>> +             break;
>>> +     case VHOST_ACCESS_RO:
>>> +             flags |= O_RDONLY;
>>> +             break;
>>> +     case VHOST_ACCESS_RW:
>>> +             flags |= O_RDWR;
>>> +             break;
>>> +     default:
>>> +             WARN(1, "invalidate vhost IOTLB permission\n");
>>> +             break;
>>> +     }
>>> +
>>> +     return flags;
>>> +}
>>> +
>>>    static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>    {
>>>        struct file *file = iocb->ki_filp;
>>>        struct vduse_dev *dev = file->private_data;
>>>        struct vduse_dev_msg *msg;
>>> -     int size = sizeof(struct vduse_dev_request);
>>> +     unsigned int flags;
>>> +     int fd, size = sizeof(struct vduse_dev_request);
>>>        ssize_t ret = 0;
>>>
>>>        if (iov_iter_count(to) < size)
>>> @@ -349,6 +418,18 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>                if (ret)
>>>                        return ret;
>>>        }
>>> +
>>> +     if (msg->req.type == VDUSE_UPDATE_IOTLB && msg->req.iotlb.fd == -1) {
>>> +             flags = perm_to_file_flags(msg->req.iotlb.perm);
>>> +             fd = get_unused_fd_flags(flags);
>>> +             if (fd < 0) {
>>> +                     vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
>>> +                     return fd;
>>> +             }
>>> +             fd_install(fd, get_file(msg->iotlb_file));
>>> +             msg->req.iotlb.fd = fd;
>>> +     }
>>> +
>>>        ret = copy_to_iter(&msg->req, size, to);
>>>        if (ret != size) {
>>>                vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
>>> @@ -565,6 +646,69 @@ static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
>>>        vduse_dev_set_config(dev, offset, buf, len);
>>>    }
>>>
>>> +static void vduse_vdpa_invalidate_iotlb(struct vduse_dev *dev,
>>> +                                     struct vhost_iotlb_msg *msg)
>>> +{
>>> +     vduse_dev_invalidate_iotlb(dev, msg->iova, msg->size);
>>> +}
>>> +
>>> +static int vduse_vdpa_update_iotlb(struct vduse_dev *dev,
>>> +                                     struct vhost_iotlb_msg *msg)
>>> +{
>>> +     u64 uaddr = msg->uaddr;
>>> +     u64 iova = msg->iova;
>>> +     u64 size = msg->size;
>>> +     u64 offset;
>>> +     struct vm_area_struct *vma;
>>> +     int ret;
>>> +
>>> +     while (uaddr < msg->uaddr + msg->size) {
>>> +             vma = find_vma(current->mm, uaddr);
>>> +             ret = -EINVAL;
>>> +             if (!vma)
>>> +                     goto err;
>>> +
>>> +             size = min(msg->size, vma->vm_end - uaddr);
>>> +             offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
>>> +             if (vma->vm_file && (vma->vm_flags & VM_SHARED)) {
>>> +                     ret = vduse_dev_update_iotlb(dev, vma->vm_file, offset,
>>> +                                                     iova, size, msg->perm);
>>> +                     if (ret)
>>> +                             goto err;
>>
>> My understanding is that vma is something that should not be known by a
>> device. So I suggest to move the above processing to vhost-vdpa.c.
>>
> Will do it.
>
> Thanks,
> Yongji
>
Yongji Xie Dec. 24, 2020, 7:37 a.m. UTC | #4
On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/23 下午8:14, Yongji Xie wrote:
> > On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2020/12/22 下午10:52, Xie Yongji wrote:
> >>> To support vhost-vdpa bus driver, we need a way to share the
> >>> vhost-vdpa backend process's memory with the userspace VDUSE process.
> >>>
> >>> This patch tries to make use of the vhost iotlb message to achieve
> >>> that. We will get the shm file from the iotlb message and pass it
> >>> to the userspace VDUSE process.
> >>>
> >>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> >>> ---
> >>>    Documentation/driver-api/vduse.rst |  15 +++-
> >>>    drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
> >>>    include/uapi/linux/vduse.h         |  11 +++
> >>>    3 files changed, 171 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
> >>> index 623f7b040ccf..48e4b1ba353f 100644
> >>> --- a/Documentation/driver-api/vduse.rst
> >>> +++ b/Documentation/driver-api/vduse.rst
> >>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
> >>>
> >>>    - VDUSE_GET_CONFIG: Read from device specific configuration space
> >>>
> >>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
> >>> +
> >>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
> >>> +
> >>>    Please see include/linux/vdpa.h for details.
> >>>
> >>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
> >>> +The data path of userspace vDPA device is implemented in different ways
> >>> +depending on the vdpa bus to which it is attached.
> >>> +
> >>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
> >>>    driver which supports mapping the kernel dma buffer to a userspace iova
> >>>    region dynamically. The userspace iova region can be created by passing
> >>>    the userspace vDPA device fd to mmap(2).
> >>>
> >>> +In vhost-vdpa case, the dma buffer resides in a userspace memory region
> >>> +which will be shared to the VDUSE userspace process via the file
> >>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
> >>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
> >>> +in this message.
> >>> +
> >>>    Besides, the eventfd mechanism is used to trigger interrupt callbacks and
> >>>    receive virtqueue kicks in userspace. The following ioctls on the userspace
> >>>    vDPA device fd are provided to support that:
> >>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>> index b974333ed4e9..d24aaacb6008 100644
> >>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> >>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>> @@ -34,6 +34,7 @@
> >>>
> >>>    struct vduse_dev_msg {
> >>>        struct vduse_dev_request req;
> >>> +     struct file *iotlb_file;
> >>>        struct vduse_dev_response resp;
> >>>        struct list_head list;
> >>>        wait_queue_head_t waitq;
> >>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
> >>>        return ret;
> >>>    }
> >>>
> >>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
> >>> +                             u64 offset, u64 iova, u64 size, u8 perm)
> >>> +{
> >>> +     struct vduse_dev_msg *msg;
> >>> +     int ret;
> >>> +
> >>> +     if (!size)
> >>> +             return -EINVAL;
> >>> +
> >>> +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
> >>> +     msg->req.size = sizeof(struct vduse_iotlb);
> >>> +     msg->req.iotlb.offset = offset;
> >>> +     msg->req.iotlb.iova = iova;
> >>> +     msg->req.iotlb.size = size;
> >>> +     msg->req.iotlb.perm = perm;
> >>> +     msg->req.iotlb.fd = -1;
> >>> +     msg->iotlb_file = get_file(file);
> >>> +
> >>> +     ret = vduse_dev_msg_sync(dev, msg);
> >>
> >> My feeling is that we should provide consistent API for the userspace
> >> device to use.
> >>
> >> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
> >>
> >> It looks to me that for virtio drivers we can still use the UPDATE_IOTLB message
> >> by using the VDUSE file as msg->iotlb_file here.
> >>
> > It's OK for me. One problem is when to transfer the UPDATE_IOTLB
> > message in virtio cases.
>
>
> Instead of generating IOTLB messages for userspace.
>
> How about recording the mappings (which is a common case for devices that have
> an on-chip IOMMU, e.g. mlx5e and the vdpa simulator)? Then we can introduce an
> ioctl for userspace to query them.
>

If so, the IOTLB UPDATE is actually triggered by ioctl, but
IOTLB_INVALIDATE is triggered by the message. Is it a little odd? Or
how about triggering it when userspace calls mmap() on the device fd?

Thanks,
Yongji
Yongji Xie Dec. 25, 2020, 2:37 a.m. UTC | #5
On Thu, Dec 24, 2020 at 3:37 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > On 2020/12/23 下午8:14, Yongji Xie wrote:
> > > On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
> > >>
> > >> On 2020/12/22 下午10:52, Xie Yongji wrote:
> > >>> To support vhost-vdpa bus driver, we need a way to share the
> > >>> vhost-vdpa backend process's memory with the userspace VDUSE process.
> > >>>
> > >>> This patch tries to make use of the vhost iotlb message to achieve
> > >>> that. We will get the shm file from the iotlb message and pass it
> > >>> to the userspace VDUSE process.
> > >>>
> > >>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> > >>> ---
> > >>>    Documentation/driver-api/vduse.rst |  15 +++-
> > >>>    drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
> > >>>    include/uapi/linux/vduse.h         |  11 +++
> > >>>    3 files changed, 171 insertions(+), 2 deletions(-)
> > >>>
> > >>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
> > >>> index 623f7b040ccf..48e4b1ba353f 100644
> > >>> --- a/Documentation/driver-api/vduse.rst
> > >>> +++ b/Documentation/driver-api/vduse.rst
> > >>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
> > >>>
> > >>>    - VDUSE_GET_CONFIG: Read from device specific configuration space
> > >>>
> > >>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
> > >>> +
> > >>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
> > >>> +
> > >>>    Please see include/linux/vdpa.h for details.
> > >>>
> > >>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
> > >>> +The data path of userspace vDPA device is implemented in different ways
> > >>> +depending on the vdpa bus to which it is attached.
> > >>> +
> > >>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
> > >>>    driver which supports mapping the kernel dma buffer to a userspace iova
> > >>>    region dynamically. The userspace iova region can be created by passing
> > >>>    the userspace vDPA device fd to mmap(2).
> > >>>
> > >>> +In vhost-vdpa case, the dma buffer is reside in a userspace memory region
> > >>> +which will be shared to the VDUSE userspace processs via the file
> > >>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
> > >>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
> > >>> +in this message.
> > >>> +
> > >>>    Besides, the eventfd mechanism is used to trigger interrupt callbacks and
> > >>>    receive virtqueue kicks in userspace. The following ioctls on the userspace
> > >>>    vDPA device fd are provided to support that:
> > >>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > >>> index b974333ed4e9..d24aaacb6008 100644
> > >>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > >>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > >>> @@ -34,6 +34,7 @@
> > >>>
> > >>>    struct vduse_dev_msg {
> > >>>        struct vduse_dev_request req;
> > >>> +     struct file *iotlb_file;
> > >>>        struct vduse_dev_response resp;
> > >>>        struct list_head list;
> > >>>        wait_queue_head_t waitq;
> > >>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
> > >>>        return ret;
> > >>>    }
> > >>>
> > >>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
> > >>> +                             u64 offset, u64 iova, u64 size, u8 perm)
> > >>> +{
> > >>> +     struct vduse_dev_msg *msg;
> > >>> +     int ret;
> > >>> +
> > >>> +     if (!size)
> > >>> +             return -EINVAL;
> > >>> +
> > >>> +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
> > >>> +     msg->req.size = sizeof(struct vduse_iotlb);
> > >>> +     msg->req.iotlb.offset = offset;
> > >>> +     msg->req.iotlb.iova = iova;
> > >>> +     msg->req.iotlb.size = size;
> > >>> +     msg->req.iotlb.perm = perm;
> > >>> +     msg->req.iotlb.fd = -1;
> > >>> +     msg->iotlb_file = get_file(file);
> > >>> +
> > >>> +     ret = vduse_dev_msg_sync(dev, msg);
> > >>
> > >> My feeling is that we should provide consistent API for the userspace
> > >> device to use.
> > >>
> > >> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
> > >>
> > >> It looks to me for virtio drivers we can still use UPDAT_IOTLB message
> > >> by using VDUSE file as msg->iotlb_file here.
> > >>
> > > It's OK for me. One problem is when to transfer the UPDATE_IOTLB
> > > message in virtio cases.
> >
> >
> > Instead of generating IOTLB messages for userspace.
> >
> > How about record the mappings (which is a common case for device have
> > on-chip IOMMU e.g mlx5e and vdpa simlator), then we can introduce ioctl
> > for userspace to query?
> >
>
> If so, the IOTLB UPDATE is actually triggered by ioctl, but
> IOTLB_INVALIDATE is triggered by the message. Is it a little odd? Or
> how about trigger it when userspace call mmap() on the device fd?
>

Oh sorry, looks like mmap() needs to be called in the IOTLB UPDATE message
handler. Is it possible for the vdpa device to know which vdpa bus it
is attached to, so that we can generate this message during probing?
Otherwise, we don't know whether the iova domain of the MMU-based IOMMU is
needed.

Thanks,
Yongji
Jason Wang Dec. 25, 2020, 6:57 a.m. UTC | #6
On 2020/12/24 下午3:37, Yongji Xie wrote:
> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2020/12/23 下午8:14, Yongji Xie wrote:
>>> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2020/12/22 下午10:52, Xie Yongji wrote:
>>>>> To support vhost-vdpa bus driver, we need a way to share the
>>>>> vhost-vdpa backend process's memory with the userspace VDUSE process.
>>>>>
>>>>> This patch tries to make use of the vhost iotlb message to achieve
>>>>> that. We will get the shm file from the iotlb message and pass it
>>>>> to the userspace VDUSE process.
>>>>>
>>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
>>>>> ---
>>>>>     Documentation/driver-api/vduse.rst |  15 +++-
>>>>>     drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
>>>>>     include/uapi/linux/vduse.h         |  11 +++
>>>>>     3 files changed, 171 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
>>>>> index 623f7b040ccf..48e4b1ba353f 100644
>>>>> --- a/Documentation/driver-api/vduse.rst
>>>>> +++ b/Documentation/driver-api/vduse.rst
>>>>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
>>>>>
>>>>>     - VDUSE_GET_CONFIG: Read from device specific configuration space
>>>>>
>>>>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
>>>>> +
>>>>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
>>>>> +
>>>>>     Please see include/linux/vdpa.h for details.
>>>>>
>>>>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
>>>>> +The data path of userspace vDPA device is implemented in different ways
>>>>> +depending on the vdpa bus to which it is attached.
>>>>> +
>>>>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
>>>>>     driver which supports mapping the kernel dma buffer to a userspace iova
>>>>>     region dynamically. The userspace iova region can be created by passing
>>>>>     the userspace vDPA device fd to mmap(2).
>>>>>
>>>>> +In vhost-vdpa case, the dma buffer is reside in a userspace memory region
>>>>> +which will be shared to the VDUSE userspace processs via the file
>>>>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
>>>>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
>>>>> +in this message.
>>>>> +
>>>>>     Besides, the eventfd mechanism is used to trigger interrupt callbacks and
>>>>>     receive virtqueue kicks in userspace. The following ioctls on the userspace
>>>>>     vDPA device fd are provided to support that:
>>>>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
>>>>> index b974333ed4e9..d24aaacb6008 100644
>>>>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
>>>>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
>>>>> @@ -34,6 +34,7 @@
>>>>>
>>>>>     struct vduse_dev_msg {
>>>>>         struct vduse_dev_request req;
>>>>> +     struct file *iotlb_file;
>>>>>         struct vduse_dev_response resp;
>>>>>         struct list_head list;
>>>>>         wait_queue_head_t waitq;
>>>>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
>>>>>         return ret;
>>>>>     }
>>>>>
>>>>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
>>>>> +                             u64 offset, u64 iova, u64 size, u8 perm)
>>>>> +{
>>>>> +     struct vduse_dev_msg *msg;
>>>>> +     int ret;
>>>>> +
>>>>> +     if (!size)
>>>>> +             return -EINVAL;
>>>>> +
>>>>> +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
>>>>> +     msg->req.size = sizeof(struct vduse_iotlb);
>>>>> +     msg->req.iotlb.offset = offset;
>>>>> +     msg->req.iotlb.iova = iova;
>>>>> +     msg->req.iotlb.size = size;
>>>>> +     msg->req.iotlb.perm = perm;
>>>>> +     msg->req.iotlb.fd = -1;
>>>>> +     msg->iotlb_file = get_file(file);
>>>>> +
>>>>> +     ret = vduse_dev_msg_sync(dev, msg);
>>>> My feeling is that we should provide consistent API for the userspace
>>>> device to use.
>>>>
>>>> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
>>>>
>>>> It looks to me for virtio drivers we can still use UPDAT_IOTLB message
>>>> by using VDUSE file as msg->iotlb_file here.
>>>>
>>> It's OK for me. One problem is when to transfer the UPDATE_IOTLB
>>> message in virtio cases.
>>
>> Instead of generating IOTLB messages for userspace.
>>
>> How about record the mappings (which is a common case for device have
>> on-chip IOMMU e.g mlx5e and vdpa simlator), then we can introduce ioctl
>> for userspace to query?
>>
> If so, the IOTLB UPDATE is actually triggered by ioctl, but
> IOTLB_INVALIDATE is triggered by the message. Is it a little odd?


Good point.

Some questions here:

1) Userspace thinks the IOTLB was flushed after the IOTLB_INVALIDATE syscall 
is returned. But this patch doesn't have this guarantee. Can this lead 
to some issues?
2) I think after VDUSE userspace receives IOTLB_INVALIDATE, it needs to 
issue a munmap(). What if it doesn't do that?


>   Or
> how about trigger it when userspace call mmap() on the device fd?


One possible issue is that the IOTLB_UPDATE/INVALIDATE might come after 
mmap():

1) When vIOMMU is enabled
2) Guest memory topology has been changed (memory hotplug).

Thanks


>
> Thanks,
> Yongji
>
Jason Wang Dec. 25, 2020, 7:02 a.m. UTC | #7
On 2020/12/25 上午10:37, Yongji Xie wrote:
> On Thu, Dec 24, 2020 at 3:37 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>>>
>>> On 2020/12/23 下午8:14, Yongji Xie wrote:
>>>> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>> On 2020/12/22 下午10:52, Xie Yongji wrote:
>>>>>> To support vhost-vdpa bus driver, we need a way to share the
>>>>>> vhost-vdpa backend process's memory with the userspace VDUSE process.
>>>>>>
>>>>>> This patch tries to make use of the vhost iotlb message to achieve
>>>>>> that. We will get the shm file from the iotlb message and pass it
>>>>>> to the userspace VDUSE process.
>>>>>>
>>>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
>>>>>> ---
>>>>>>     Documentation/driver-api/vduse.rst |  15 +++-
>>>>>>     drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
>>>>>>     include/uapi/linux/vduse.h         |  11 +++
>>>>>>     3 files changed, 171 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
>>>>>> index 623f7b040ccf..48e4b1ba353f 100644
>>>>>> --- a/Documentation/driver-api/vduse.rst
>>>>>> +++ b/Documentation/driver-api/vduse.rst
>>>>>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
>>>>>>
>>>>>>     - VDUSE_GET_CONFIG: Read from device specific configuration space
>>>>>>
>>>>>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
>>>>>> +
>>>>>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
>>>>>> +
>>>>>>     Please see include/linux/vdpa.h for details.
>>>>>>
>>>>>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
>>>>>> +The data path of userspace vDPA device is implemented in different ways
>>>>>> +depending on the vdpa bus to which it is attached.
>>>>>> +
>>>>>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
>>>>>>     driver which supports mapping the kernel dma buffer to a userspace iova
>>>>>>     region dynamically. The userspace iova region can be created by passing
>>>>>>     the userspace vDPA device fd to mmap(2).
>>>>>>
>>>>>> +In vhost-vdpa case, the dma buffer is reside in a userspace memory region
>>>>>> +which will be shared to the VDUSE userspace processs via the file
>>>>>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
>>>>>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
>>>>>> +in this message.
>>>>>> +
>>>>>>     Besides, the eventfd mechanism is used to trigger interrupt callbacks and
>>>>>>     receive virtqueue kicks in userspace. The following ioctls on the userspace
>>>>>>     vDPA device fd are provided to support that:
>>>>>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
>>>>>> index b974333ed4e9..d24aaacb6008 100644
>>>>>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
>>>>>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
>>>>>> @@ -34,6 +34,7 @@
>>>>>>
>>>>>>     struct vduse_dev_msg {
>>>>>>         struct vduse_dev_request req;
>>>>>> +     struct file *iotlb_file;
>>>>>>         struct vduse_dev_response resp;
>>>>>>         struct list_head list;
>>>>>>         wait_queue_head_t waitq;
>>>>>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
>>>>>>         return ret;
>>>>>>     }
>>>>>>
>>>>>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
>>>>>> +                             u64 offset, u64 iova, u64 size, u8 perm)
>>>>>> +{
>>>>>> +     struct vduse_dev_msg *msg;
>>>>>> +     int ret;
>>>>>> +
>>>>>> +     if (!size)
>>>>>> +             return -EINVAL;
>>>>>> +
>>>>>> +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
>>>>>> +     msg->req.size = sizeof(struct vduse_iotlb);
>>>>>> +     msg->req.iotlb.offset = offset;
>>>>>> +     msg->req.iotlb.iova = iova;
>>>>>> +     msg->req.iotlb.size = size;
>>>>>> +     msg->req.iotlb.perm = perm;
>>>>>> +     msg->req.iotlb.fd = -1;
>>>>>> +     msg->iotlb_file = get_file(file);
>>>>>> +
>>>>>> +     ret = vduse_dev_msg_sync(dev, msg);
>>>>> My feeling is that we should provide consistent API for the userspace
>>>>> device to use.
>>>>>
>>>>> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
>>>>>
>>>>> It looks to me for virtio drivers we can still use UPDAT_IOTLB message
>>>>> by using VDUSE file as msg->iotlb_file here.
>>>>>
>>>> It's OK for me. One problem is when to transfer the UPDATE_IOTLB
>>>> message in virtio cases.
>>>
>>> Instead of generating IOTLB messages for userspace.
>>>
>>> How about record the mappings (which is a common case for device have
>>> on-chip IOMMU e.g mlx5e and vdpa simlator), then we can introduce ioctl
>>> for userspace to query?
>>>
>> If so, the IOTLB UPDATE is actually triggered by ioctl, but
>> IOTLB_INVALIDATE is triggered by the message. Is it a little odd? Or
>> how about trigger it when userspace call mmap() on the device fd?
>>
> Oh sorry, looks like mmap() needs to be called in IOTLB UPDATE message
> handler. Is it possible for the vdpa device to know which vdpa bus it
> is attached to?


We'd better not. It's kind of layer violation.

Thanks


>   So that we can generate this message during probing.
> Otherwise, we don't know whether the iova domain of MMU-based IOMMU is
> needed.
>
> Thanks,
> Yongji
>
Yongji Xie Dec. 25, 2020, 10:31 a.m. UTC | #8
On Fri, Dec 25, 2020 at 2:58 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/24 下午3:37, Yongji Xie wrote:
> > On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2020/12/23 下午8:14, Yongji Xie wrote:
> >>> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2020/12/22 下午10:52, Xie Yongji wrote:
> >>>>> To support vhost-vdpa bus driver, we need a way to share the
> >>>>> vhost-vdpa backend process's memory with the userspace VDUSE process.
> >>>>>
> >>>>> This patch tries to make use of the vhost iotlb message to achieve
> >>>>> that. We will get the shm file from the iotlb message and pass it
> >>>>> to the userspace VDUSE process.
> >>>>>
> >>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> >>>>> ---
> >>>>>     Documentation/driver-api/vduse.rst |  15 +++-
> >>>>>     drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
> >>>>>     include/uapi/linux/vduse.h         |  11 +++
> >>>>>     3 files changed, 171 insertions(+), 2 deletions(-)
> >>>>>
> >>>>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
> >>>>> index 623f7b040ccf..48e4b1ba353f 100644
> >>>>> --- a/Documentation/driver-api/vduse.rst
> >>>>> +++ b/Documentation/driver-api/vduse.rst
> >>>>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
> >>>>>
> >>>>>     - VDUSE_GET_CONFIG: Read from device specific configuration space
> >>>>>
> >>>>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
> >>>>> +
> >>>>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
> >>>>> +
> >>>>>     Please see include/linux/vdpa.h for details.
> >>>>>
> >>>>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
> >>>>> +The data path of userspace vDPA device is implemented in different ways
> >>>>> +depending on the vdpa bus to which it is attached.
> >>>>> +
> >>>>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
> >>>>>     driver which supports mapping the kernel dma buffer to a userspace iova
> >>>>>     region dynamically. The userspace iova region can be created by passing
> >>>>>     the userspace vDPA device fd to mmap(2).
> >>>>>
> >>>>> +In vhost-vdpa case, the dma buffer is reside in a userspace memory region
> >>>>> +which will be shared to the VDUSE userspace processs via the file
> >>>>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
> >>>>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
> >>>>> +in this message.
> >>>>> +
> >>>>>     Besides, the eventfd mechanism is used to trigger interrupt callbacks and
> >>>>>     receive virtqueue kicks in userspace. The following ioctls on the userspace
> >>>>>     vDPA device fd are provided to support that:
> >>>>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>>>> index b974333ed4e9..d24aaacb6008 100644
> >>>>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> >>>>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>>>> @@ -34,6 +34,7 @@
> >>>>>
> >>>>>     struct vduse_dev_msg {
> >>>>>         struct vduse_dev_request req;
> >>>>> +     struct file *iotlb_file;
> >>>>>         struct vduse_dev_response resp;
> >>>>>         struct list_head list;
> >>>>>         wait_queue_head_t waitq;
> >>>>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
> >>>>>         return ret;
> >>>>>     }
> >>>>>
> >>>>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
> >>>>> +                             u64 offset, u64 iova, u64 size, u8 perm)
> >>>>> +{
> >>>>> +     struct vduse_dev_msg *msg;
> >>>>> +     int ret;
> >>>>> +
> >>>>> +     if (!size)
> >>>>> +             return -EINVAL;
> >>>>> +
> >>>>> +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
> >>>>> +     msg->req.size = sizeof(struct vduse_iotlb);
> >>>>> +     msg->req.iotlb.offset = offset;
> >>>>> +     msg->req.iotlb.iova = iova;
> >>>>> +     msg->req.iotlb.size = size;
> >>>>> +     msg->req.iotlb.perm = perm;
> >>>>> +     msg->req.iotlb.fd = -1;
> >>>>> +     msg->iotlb_file = get_file(file);
> >>>>> +
> >>>>> +     ret = vduse_dev_msg_sync(dev, msg);
> >>>> My feeling is that we should provide consistent API for the userspace
> >>>> device to use.
> >>>>
> >>>> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
> >>>>
> >>>> It looks to me for virtio drivers we can still use UPDAT_IOTLB message
> >>>> by using VDUSE file as msg->iotlb_file here.
> >>>>
> >>> It's OK for me. One problem is when to transfer the UPDATE_IOTLB
> >>> message in virtio cases.
> >>
> >> Instead of generating IOTLB messages for userspace.
> >>
> >> How about record the mappings (which is a common case for device have
> >> on-chip IOMMU e.g mlx5e and vdpa simlator), then we can introduce ioctl
> >> for userspace to query?
> >>
> > If so, the IOTLB UPDATE is actually triggered by ioctl, but
> > IOTLB_INVALIDATE is triggered by the message. Is it a little odd?
>
>
> Good point.
>
> Some questions here:
>
> 1) Userspace think the IOTLB was flushed after IOTLB_INVALIDATE syscall
> is returned. But this patch doesn't have this guarantee. Can this lead
> some issues?

I'm not sure. But should it be guaranteed in VDUSE userspace? Just
like what vhost-user backend process does.

> 2) I think after VDUSE userspace receives IOTLB_INVALIDATE, it needs to
> issue a munmap(). What if it doesn't do that?
>

Yes, the munmap() is needed. If it doesn't do that, VDUSE userspace
could still access the corresponding guest memory region.

Thanks,
Yongji
Yongji Xie Dec. 25, 2020, 11:36 a.m. UTC | #9
On Fri, Dec 25, 2020 at 3:02 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/25 上午10:37, Yongji Xie wrote:
> > On Thu, Dec 24, 2020 at 3:37 PM Yongji Xie <xieyongji@bytedance.com> wrote:
> >> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>
> >>> On 2020/12/23 下午8:14, Yongji Xie wrote:
> >>>> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>> On 2020/12/22 下午10:52, Xie Yongji wrote:
> >>>>>> To support vhost-vdpa bus driver, we need a way to share the
> >>>>>> vhost-vdpa backend process's memory with the userspace VDUSE process.
> >>>>>>
> >>>>>> This patch tries to make use of the vhost iotlb message to achieve
> >>>>>> that. We will get the shm file from the iotlb message and pass it
> >>>>>> to the userspace VDUSE process.
> >>>>>>
> >>>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> >>>>>> ---
> >>>>>>     Documentation/driver-api/vduse.rst |  15 +++-
> >>>>>>     drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
> >>>>>>     include/uapi/linux/vduse.h         |  11 +++
> >>>>>>     3 files changed, 171 insertions(+), 2 deletions(-)
> >>>>>>
> >>>>>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
> >>>>>> index 623f7b040ccf..48e4b1ba353f 100644
> >>>>>> --- a/Documentation/driver-api/vduse.rst
> >>>>>> +++ b/Documentation/driver-api/vduse.rst
> >>>>>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
> >>>>>>
> >>>>>>     - VDUSE_GET_CONFIG: Read from device specific configuration space
> >>>>>>
> >>>>>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
> >>>>>> +
> >>>>>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
> >>>>>> +
> >>>>>>     Please see include/linux/vdpa.h for details.
> >>>>>>
> >>>>>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
> >>>>>> +The data path of userspace vDPA device is implemented in different ways
> >>>>>> +depending on the vdpa bus to which it is attached.
> >>>>>> +
> >>>>>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
> >>>>>>     driver which supports mapping the kernel dma buffer to a userspace iova
> >>>>>>     region dynamically. The userspace iova region can be created by passing
> >>>>>>     the userspace vDPA device fd to mmap(2).
> >>>>>>
> >>>>>> +In vhost-vdpa case, the dma buffer is reside in a userspace memory region
> >>>>>> +which will be shared to the VDUSE userspace processs via the file
> >>>>>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
> >>>>>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
> >>>>>> +in this message.
> >>>>>> +
> >>>>>>     Besides, the eventfd mechanism is used to trigger interrupt callbacks and
> >>>>>>     receive virtqueue kicks in userspace. The following ioctls on the userspace
> >>>>>>     vDPA device fd are provided to support that:
> >>>>>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>>>>> index b974333ed4e9..d24aaacb6008 100644
> >>>>>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> >>>>>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>>>>> @@ -34,6 +34,7 @@
> >>>>>>
> >>>>>>     struct vduse_dev_msg {
> >>>>>>         struct vduse_dev_request req;
> >>>>>> +     struct file *iotlb_file;
> >>>>>>         struct vduse_dev_response resp;
> >>>>>>         struct list_head list;
> >>>>>>         wait_queue_head_t waitq;
> >>>>>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
> >>>>>>         return ret;
> >>>>>>     }
> >>>>>>
> >>>>>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
> >>>>>> +                             u64 offset, u64 iova, u64 size, u8 perm)
> >>>>>> +{
> >>>>>> +     struct vduse_dev_msg *msg;
> >>>>>> +     int ret;
> >>>>>> +
> >>>>>> +     if (!size)
> >>>>>> +             return -EINVAL;
> >>>>>> +
> >>>>>> +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
> >>>>>> +     msg->req.size = sizeof(struct vduse_iotlb);
> >>>>>> +     msg->req.iotlb.offset = offset;
> >>>>>> +     msg->req.iotlb.iova = iova;
> >>>>>> +     msg->req.iotlb.size = size;
> >>>>>> +     msg->req.iotlb.perm = perm;
> >>>>>> +     msg->req.iotlb.fd = -1;
> >>>>>> +     msg->iotlb_file = get_file(file);
> >>>>>> +
> >>>>>> +     ret = vduse_dev_msg_sync(dev, msg);
> >>>>> My feeling is that we should provide consistent API for the userspace
> >>>>> device to use.
> >>>>>
> >>>>> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
> >>>>>
> >>>>> It looks to me for virtio drivers we can still use UPDAT_IOTLB message
> >>>>> by using VDUSE file as msg->iotlb_file here.
> >>>>>
> >>>> It's OK for me. One problem is when to transfer the UPDATE_IOTLB
> >>>> message in virtio cases.
> >>>
> >>> Instead of generating IOTLB messages for userspace.
> >>>
> >>> How about record the mappings (which is a common case for device have
> >>> on-chip IOMMU e.g mlx5e and vdpa simlator), then we can introduce ioctl
> >>> for userspace to query?
> >>>
> >> If so, the IOTLB UPDATE is actually triggered by ioctl, but
> >> IOTLB_INVALIDATE is triggered by the message. Is it a little odd? Or
> >> how about trigger it when userspace call mmap() on the device fd?
> >>
> > Oh sorry, looks like mmap() needs to be called in IOTLB UPDATE message
> > handler. Is it possible for the vdpa device to know which vdpa bus it
> > is attached to?
>
>
> We'd better not. It's kind of layer violation.
>

OK. Now I think both the ioctl and the message are needed. The ioctl is useful
when the VDUSE userspace daemon reboots. And the IOTLB_UPDATE message could
be generated during the first DMA mapping in the virtio-vdpa case.

Thanks,
Yongji
Jason Wang Dec. 28, 2020, 7:43 a.m. UTC | #10
On 2020/12/25 下午6:31, Yongji Xie wrote:
> On Fri, Dec 25, 2020 at 2:58 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2020/12/24 下午3:37, Yongji Xie wrote:
>>> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2020/12/23 下午8:14, Yongji Xie wrote:
>>>>> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> On 2020/12/22 下午10:52, Xie Yongji wrote:
>>>>>>> To support vhost-vdpa bus driver, we need a way to share the
>>>>>>> vhost-vdpa backend process's memory with the userspace VDUSE process.
>>>>>>>
>>>>>>> This patch tries to make use of the vhost iotlb message to achieve
>>>>>>> that. We will get the shm file from the iotlb message and pass it
>>>>>>> to the userspace VDUSE process.
>>>>>>>
>>>>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
>>>>>>> ---
>>>>>>>      Documentation/driver-api/vduse.rst |  15 +++-
>>>>>>>      drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
>>>>>>>      include/uapi/linux/vduse.h         |  11 +++
>>>>>>>      3 files changed, 171 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
>>>>>>> index 623f7b040ccf..48e4b1ba353f 100644
>>>>>>> --- a/Documentation/driver-api/vduse.rst
>>>>>>> +++ b/Documentation/driver-api/vduse.rst
>>>>>>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
>>>>>>>
>>>>>>>      - VDUSE_GET_CONFIG: Read from device specific configuration space
>>>>>>>
>>>>>>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
>>>>>>> +
>>>>>>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
>>>>>>> +
>>>>>>>      Please see include/linux/vdpa.h for details.
>>>>>>>
>>>>>>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
>>>>>>> +The data path of userspace vDPA device is implemented in different ways
>>>>>>> +depending on the vdpa bus to which it is attached.
>>>>>>> +
>>>>>>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
>>>>>>>      driver which supports mapping the kernel dma buffer to a userspace iova
>>>>>>>      region dynamically. The userspace iova region can be created by passing
>>>>>>>      the userspace vDPA device fd to mmap(2).
>>>>>>>
>>>>>>> +In vhost-vdpa case, the dma buffer is reside in a userspace memory region
>>>>>>> +which will be shared to the VDUSE userspace processs via the file
>>>>>>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
>>>>>>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
>>>>>>> +in this message.
>>>>>>> +
>>>>>>>      Besides, the eventfd mechanism is used to trigger interrupt callbacks and
>>>>>>>      receive virtqueue kicks in userspace. The following ioctls on the userspace
>>>>>>>      vDPA device fd are provided to support that:
>>>>>>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
>>>>>>> index b974333ed4e9..d24aaacb6008 100644
>>>>>>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
>>>>>>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
>>>>>>> @@ -34,6 +34,7 @@
>>>>>>>
>>>>>>>      struct vduse_dev_msg {
>>>>>>>          struct vduse_dev_request req;
>>>>>>> +     struct file *iotlb_file;
>>>>>>>          struct vduse_dev_response resp;
>>>>>>>          struct list_head list;
>>>>>>>          wait_queue_head_t waitq;
>>>>>>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
>>>>>>>          return ret;
>>>>>>>      }
>>>>>>>
>>>>>>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
>>>>>>> +                             u64 offset, u64 iova, u64 size, u8 perm)
>>>>>>> +{
>>>>>>> +     struct vduse_dev_msg *msg;
>>>>>>> +     int ret;
>>>>>>> +
>>>>>>> +     if (!size)
>>>>>>> +             return -EINVAL;
>>>>>>> +
>>>>>>> +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
>>>>>>> +     msg->req.size = sizeof(struct vduse_iotlb);
>>>>>>> +     msg->req.iotlb.offset = offset;
>>>>>>> +     msg->req.iotlb.iova = iova;
>>>>>>> +     msg->req.iotlb.size = size;
>>>>>>> +     msg->req.iotlb.perm = perm;
>>>>>>> +     msg->req.iotlb.fd = -1;
>>>>>>> +     msg->iotlb_file = get_file(file);
>>>>>>> +
>>>>>>> +     ret = vduse_dev_msg_sync(dev, msg);
>>>>>> My feeling is that we should provide consistent API for the userspace
>>>>>> device to use.
>>>>>>
>>>>>> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
>>>>>>
>>>>>> It looks to me for virtio drivers we can still use UPDAT_IOTLB message
>>>>>> by using VDUSE file as msg->iotlb_file here.
>>>>>>
>>>>> It's OK for me. One problem is when to transfer the UPDATE_IOTLB
>>>>> message in virtio cases.
>>>> Instead of generating IOTLB messages for userspace.
>>>>
>>>> How about record the mappings (which is a common case for device have
>>>> on-chip IOMMU e.g mlx5e and vdpa simlator), then we can introduce ioctl
>>>> for userspace to query?
>>>>
>>> If so, the IOTLB UPDATE is actually triggered by ioctl, but
>>> IOTLB_INVALIDATE is triggered by the message. Is it a little odd?
>>
>> Good point.
>>
>> Some questions here:
>>
>> 1) Userspace think the IOTLB was flushed after IOTLB_INVALIDATE syscall
>> is returned. But this patch doesn't have this guarantee. Can this lead
>> some issues?
> I'm not sure. But should it be guaranteed in VDUSE userspace? Just
> like what vhost-user backend process does.


I think so. This is because the userspace device needs a way to
synchronize invalidation with its datapath. Otherwise, the guest may think
the buffer is freed to be used by other parts but it actually can be
used by the VDUSE device. That may cause security issues.


>
>> 2) I think after VDUSE userspace receives IOTLB_INVALIDATE, it needs to
>> issue a munmap(). What if it doesn't do that?
>>
> Yes, the munmap() is needed. If it doesn't do that, VDUSE userspace
> could still access the corresponding guest memory region.


I see. So both of the above questions are because VHOST_IOTLB_INVALIDATE
is expected to be synchronous. This needs to be solved by tweaking the
current VDUSE API or we can re-visit to go with descriptors relaying first.

Thanks


>
> Thanks,
> Yongji
>
Yongji Xie Dec. 28, 2020, 8:14 a.m. UTC | #11
On Mon, Dec 28, 2020 at 3:43 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/25 下午6:31, Yongji Xie wrote:
> > On Fri, Dec 25, 2020 at 2:58 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2020/12/24 下午3:37, Yongji Xie wrote:
> >>> On Thu, Dec 24, 2020 at 10:41 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2020/12/23 下午8:14, Yongji Xie wrote:
> >>>>> On Wed, Dec 23, 2020 at 5:05 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> On 2020/12/22 下午10:52, Xie Yongji wrote:
> >>>>>>> To support vhost-vdpa bus driver, we need a way to share the
> >>>>>>> vhost-vdpa backend process's memory with the userspace VDUSE process.
> >>>>>>>
> >>>>>>> This patch tries to make use of the vhost iotlb message to achieve
> >>>>>>> that. We will get the shm file from the iotlb message and pass it
> >>>>>>> to the userspace VDUSE process.
> >>>>>>>
> >>>>>>> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> >>>>>>> ---
> >>>>>>>      Documentation/driver-api/vduse.rst |  15 +++-
> >>>>>>>      drivers/vdpa/vdpa_user/vduse_dev.c | 147 ++++++++++++++++++++++++++++++++++++-
> >>>>>>>      include/uapi/linux/vduse.h         |  11 +++
> >>>>>>>      3 files changed, 171 insertions(+), 2 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
> >>>>>>> index 623f7b040ccf..48e4b1ba353f 100644
> >>>>>>> --- a/Documentation/driver-api/vduse.rst
> >>>>>>> +++ b/Documentation/driver-api/vduse.rst
> >>>>>>> @@ -46,13 +46,26 @@ The following types of messages are provided by the VDUSE framework now:
> >>>>>>>
> >>>>>>>      - VDUSE_GET_CONFIG: Read from device specific configuration space
> >>>>>>>
> >>>>>>> +- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
> >>>>>>> +
> >>>>>>> +- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
> >>>>>>> +
> >>>>>>>      Please see include/linux/vdpa.h for details.
> >>>>>>>
> >>>>>>> -In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
> >>>>>>> +The data path of userspace vDPA device is implemented in different ways
> >>>>>>> +depending on the vdpa bus to which it is attached.
> >>>>>>> +
> >>>>>>> +In virtio-vdpa case, VDUSE framework implements a MMU-based on-chip IOMMU
> >>>>>>>      driver which supports mapping the kernel dma buffer to a userspace iova
> >>>>>>>      region dynamically. The userspace iova region can be created by passing
> >>>>>>>      the userspace vDPA device fd to mmap(2).
> >>>>>>>
> >>>>>>> +In vhost-vdpa case, the dma buffer is reside in a userspace memory region
> >>>>>>> +which will be shared to the VDUSE userspace processs via the file
> >>>>>>> +descriptor in VDUSE_UPDATE_IOTLB message. And the corresponding address
> >>>>>>> +mapping (IOVA of dma buffer <-> VA of the memory region) is also included
> >>>>>>> +in this message.
> >>>>>>> +
> >>>>>>>      Besides, the eventfd mechanism is used to trigger interrupt callbacks and
> >>>>>>>      receive virtqueue kicks in userspace. The following ioctls on the userspace
> >>>>>>>      vDPA device fd are provided to support that:
> >>>>>>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>>>>>> index b974333ed4e9..d24aaacb6008 100644
> >>>>>>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> >>>>>>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>>>>>> @@ -34,6 +34,7 @@
> >>>>>>>
> >>>>>>>      struct vduse_dev_msg {
> >>>>>>>          struct vduse_dev_request req;
> >>>>>>> +     struct file *iotlb_file;
> >>>>>>>          struct vduse_dev_response resp;
> >>>>>>>          struct list_head list;
> >>>>>>>          wait_queue_head_t waitq;
> >>>>>>> @@ -325,12 +326,80 @@ static int vduse_dev_set_vq_state(struct vduse_dev *dev,
> >>>>>>>          return ret;
> >>>>>>>      }
> >>>>>>>
> >>>>>>> +static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
> >>>>>>> +                             u64 offset, u64 iova, u64 size, u8 perm)
> >>>>>>> +{
> >>>>>>> +     struct vduse_dev_msg *msg;
> >>>>>>> +     int ret;
> >>>>>>> +
> >>>>>>> +     if (!size)
> >>>>>>> +             return -EINVAL;
> >>>>>>> +
> >>>>>>> +     msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
> >>>>>>> +     msg->req.size = sizeof(struct vduse_iotlb);
> >>>>>>> +     msg->req.iotlb.offset = offset;
> >>>>>>> +     msg->req.iotlb.iova = iova;
> >>>>>>> +     msg->req.iotlb.size = size;
> >>>>>>> +     msg->req.iotlb.perm = perm;
> >>>>>>> +     msg->req.iotlb.fd = -1;
> >>>>>>> +     msg->iotlb_file = get_file(file);
> >>>>>>> +
> >>>>>>> +     ret = vduse_dev_msg_sync(dev, msg);
> >>>>>> My feeling is that we should provide consistent API for the userspace
> >>>>>> device to use.
> >>>>>>
> >>>>>> E.g we'd better carry the IOTLB message for both virtio/vhost drivers.
> >>>>>>
> >>>>>> It looks to me for virtio drivers we can still use UPDAT_IOTLB message
> >>>>>> by using VDUSE file as msg->iotlb_file here.
> >>>>>>
> >>>>> It's OK for me. One problem is when to transfer the UPDATE_IOTLB
> >>>>> message in virtio cases.
> >>>> Instead of generating IOTLB messages for userspace.
> >>>>
> >>>> How about record the mappings (which is a common case for device have
> >>>> on-chip IOMMU e.g mlx5e and vdpa simlator), then we can introduce ioctl
> >>>> for userspace to query?
> >>>>
> >>> If so, the IOTLB UPDATE is actually triggered by ioctl, but
> >>> IOTLB_INVALIDATE is triggered by the message. Is it a little odd?
> >>
> >> Good point.
> >>
> >> Some questions here:
> >>
> >> 1) Userspace think the IOTLB was flushed after IOTLB_INVALIDATE syscall
> >> is returned. But this patch doesn't have this guarantee. Can this lead
> >> some issues?
> > I'm not sure. But should it be guaranteed in VDUSE userspace? Just
> > like what vhost-user backend process does.
>
>
> I think so. This is because the userspace device needs a way to
> synchronize invalidation with its datapath. Otherwise, guest may thing
> the buffer is freed to be used by other parts but the it actually can be
> used by the VDUSE device. The may cause security issues.
>
>
> >
> >> 2) I think after VDUSE userspace receives IOTLB_INVALIDATE, it needs to
> >> issue a munmap(). What if it doesn't do that?
> >>
> > Yes, the munmap() is needed. If it doesn't do that, VDUSE userspace
> > could still access the corresponding guest memory region.
>
>
> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
> is expected to be synchronous. This need to be solved by tweaking the
> current VDUSE API or we can re-visit to go with descriptors relaying first.
>

Actually all vdpa related operations are synchronous in current
implementation. The ops.set_map/dma_map/dma_unmap should not return
until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
by userspace. Could it solve this problem?

Thanks,
Yongji
Jason Wang Dec. 28, 2020, 8:43 a.m. UTC | #12
On 2020/12/28 下午4:14, Yongji Xie wrote:
>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
>> is expected to be synchronous. This need to be solved by tweaking the
>> current VDUSE API or we can re-visit to go with descriptors relaying first.
>>
> Actually all vdpa related operations are synchronous in current
> implementation. The ops.set_map/dma_map/dma_unmap should not return
> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
> by userspace. Could it solve this problem?


  I was thinking whether or not we need to generate IOTLB_INVALIDATE 
message to VDUSE during dma_unmap (vduse_dev_unmap_page).

If we don't, we're probably fine.

Thanks


>
> Thanks,
> Yongji
>
Yongji Xie Dec. 28, 2020, 9:12 a.m. UTC | #13
On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/28 下午4:14, Yongji Xie wrote:
> >> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
> >> is expected to be synchronous. This need to be solved by tweaking the
> >> current VDUSE API or we can re-visit to go with descriptors relaying first.
> >>
> > Actually all vdpa related operations are synchronous in current
> > implementation. The ops.set_map/dma_map/dma_unmap should not return
> > until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
> > by userspace. Could it solve this problem?
>
>
>   I was thinking whether or not we need to generate IOTLB_INVALIDATE
> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
>
> If we don't, we're probably fine.
>

It seems not feasible. This message will be also used in the
virtio-vdpa case to notify userspace to unmap some pages during
consistent dma unmapping. Maybe we can document it to make sure the
users can handle the message correctly.

Thanks,
Yongji
Jason Wang Dec. 29, 2020, 9:11 a.m. UTC | #14
----- Original Message -----
> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > On 2020/12/28 下午4:14, Yongji Xie wrote:
> > >> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
> > >> is expected to be synchronous. This need to be solved by tweaking the
> > >> current VDUSE API or we can re-visit to go with descriptors relaying
> > >> first.
> > >>
> > > Actually all vdpa related operations are synchronous in current
> > > implementation. The ops.set_map/dma_map/dma_unmap should not return
> > > until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
> > > by userspace. Could it solve this problem?
> >
> >
> >   I was thinking whether or not we need to generate IOTLB_INVALIDATE
> > message to VDUSE during dma_unmap (vduse_dev_unmap_page).
> >
> > If we don't, we're probably fine.
> >
> 
> It seems not feasible. This message will be also used in the
> virtio-vdpa case to notify userspace to unmap some pages during
> consistent dma unmapping. Maybe we can document it to make sure the
> users can handle the message correctly.

Just to make sure I understand your point.

Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
coherent DMA?

For 1) you probably need a workqueue to do that since dma unmap can
be done in irq or bh context. And if userspace doesn't do the unmap, it
can still access the bounce buffer (if you don't zap pte)?

Thanks


> 
> Thanks,
> Yongji
> 
>
Yongji Xie Dec. 29, 2020, 10:26 a.m. UTC | #15
On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
>
> ----- Original Message -----
> > On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > >
> > > On 2020/12/28 下午4:14, Yongji Xie wrote:
> > > >> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
> > > >> is expected to be synchronous. This need to be solved by tweaking the
> > > >> current VDUSE API or we can re-visit to go with descriptors relaying
> > > >> first.
> > > >>
> > > > Actually all vdpa related operations are synchronous in current
> > > > implementation. The ops.set_map/dma_map/dma_unmap should not return
> > > > until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
> > > > by userspace. Could it solve this problem?
> > >
> > >
> > >   I was thinking whether or not we need to generate IOTLB_INVALIDATE
> > > message to VDUSE during dma_unmap (vduse_dev_unmap_page).
> > >
> > > If we don't, we're probably fine.
> > >
> >
> > It seems not feasible. This message will be also used in the
> > virtio-vdpa case to notify userspace to unmap some pages during
> > consistent dma unmapping. Maybe we can document it to make sure the
> > users can handle the message correctly.
>
> Just to make sure I understand your point.
>
> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
> coherent DMA?
>
> For 1) you probably need a workqueue to do that since dma unmap can
> be done in irq or bh context. And if usrspace does't do the unmap, it
> can still access the bounce buffer (if you don't zap pte)?
>

I plan to do it in the coherent DMA case. It's true that userspace can
access the dma buffer if userspace doesn't do the unmap. But the dma
pages would not be freed and reused unless user space called munmap()
for them.

Thanks,
Yongji
Jason Wang Dec. 30, 2020, 6:10 a.m. UTC | #16
On 2020/12/29 下午6:26, Yongji Xie wrote:
> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>>
>> ----- Original Message -----
>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>
>>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
>>>>>> is expected to be synchronous. This need to be solved by tweaking the
>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
>>>>>> first.
>>>>>>
>>>>> Actually all vdpa related operations are synchronous in current
>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
>>>>> by userspace. Could it solve this problem?
>>>>
>>>>    I was thinking whether or not we need to generate IOTLB_INVALIDATE
>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
>>>>
>>>> If we don't, we're probably fine.
>>>>
>>> It seems not feasible. This message will be also used in the
>>> virtio-vdpa case to notify userspace to unmap some pages during
>>> consistent dma unmapping. Maybe we can document it to make sure the
>>> users can handle the message correctly.
>> Just to make sure I understand your point.
>>
>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
>> coherent DMA?
>>
>> For 1) you probably need a workqueue to do that since dma unmap can
>> be done in irq or bh context. And if usrspace does't do the unmap, it
>> can still access the bounce buffer (if you don't zap pte)?
>>
> I plan to do it in the coherent DMA case.


Any reason for treating coherent DMA differently?


> It's true that userspace can
> access the dma buffer if userspace doesn't do the unmap. But the dma
> pages would not be freed and reused unless user space called munmap()
> for them.


I wonder whether or not we could recycle IOVA in this case to avoid the
IOTLB_UNMAP message.

Thanks


>
> Thanks,
> Yongji
>
Yongji Xie Dec. 30, 2020, 7:09 a.m. UTC | #17
On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/29 下午6:26, Yongji Xie wrote:
> > On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >>
> >> ----- Original Message -----
> >>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>
> >>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
> >>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
> >>>>>> is expected to be synchronous. This need to be solved by tweaking the
> >>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
> >>>>>> first.
> >>>>>>
> >>>>> Actually all vdpa related operations are synchronous in current
> >>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
> >>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
> >>>>> by userspace. Could it solve this problem?
> >>>>
> >>>>    I was thinking whether or not we need to generate IOTLB_INVALIDATE
> >>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
> >>>>
> >>>> If we don't, we're probably fine.
> >>>>
> >>> It seems not feasible. This message will be also used in the
> >>> virtio-vdpa case to notify userspace to unmap some pages during
> >>> consistent dma unmapping. Maybe we can document it to make sure the
> >>> users can handle the message correctly.
> >> Just to make sure I understand your point.
> >>
> >> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
> >> coherent DMA?
> >>
> >> For 1) you probably need a workqueue to do that since dma unmap can
> >> be done in irq or bh context. And if usrspace does't do the unmap, it
> >> can still access the bounce buffer (if you don't zap pte)?
> >>
> > I plan to do it in the coherent DMA case.
>
>
> Any reason for treating coherent DMA differently?
>

Now the memory of the bounce buffer is allocated page by page in the
page fault handler. So it can't be used in coherent DMA mapping case
which needs some memory with contiguous virtual addresses. I can use
vmalloc() to do allocation for the bounce buffer instead. But it might
cause some memory waste. Any suggestion?

>
> > It's true that userspace can
> > access the dma buffer if userspace doesn't do the unmap. But the dma
> > pages would not be freed and reused unless user space called munmap()
> > for them.
>
>
> I wonder whether or not we could recycle IOVA in this case to avoid the
> IOTLB_UMAP message.
>

We can achieve that if we use vmalloc() to do allocation for the
bounce buffer which can be used in the coherent DMA mapping case. But
it looks like we still have no way to avoid the IOTLB_UNMAP message in
the vhost-vdpa case.

Thanks,
Yongji
Jason Wang Dec. 30, 2020, 8:41 a.m. UTC | #18
On 2020/12/30 下午3:09, Yongji Xie wrote:
> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2020/12/29 下午6:26, Yongji Xie wrote:
>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>
>>>> ----- Original Message -----
>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the
>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
>>>>>>>> first.
>>>>>>>>
>>>>>>> Actually all vdpa related operations are synchronous in current
>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
>>>>>>> by userspace. Could it solve this problem?
>>>>>>     I was thinking whether or not we need to generate IOTLB_INVALIDATE
>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
>>>>>>
>>>>>> If we don't, we're probably fine.
>>>>>>
>>>>> It seems not feasible. This message will be also used in the
>>>>> virtio-vdpa case to notify userspace to unmap some pages during
>>>>> consistent dma unmapping. Maybe we can document it to make sure the
>>>>> users can handle the message correctly.
>>>> Just to make sure I understand your point.
>>>>
>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
>>>> coherent DMA?
>>>>
>>>> For 1) you probably need a workqueue to do that since dma unmap can
>>>> be done in irq or bh context. And if usrspace does't do the unmap, it
>>>> can still access the bounce buffer (if you don't zap pte)?
>>>>
>>> I plan to do it in the coherent DMA case.
>>
>> Any reason for treating coherent DMA differently?
>>
> Now the memory of the bounce buffer is allocated page by page in the
> page fault handler. So it can't be used in coherent DMA mapping case
> which needs some memory with contiguous virtual addresses. I can use
> vmalloc() to do allocation for the bounce buffer instead. But it might
> cause some memory waste. Any suggestion?


I may miss something. But I don't see a relationship between the 
IOTLB_UNMAP and vmalloc().


>
>>> It's true that userspace can
>>> access the dma buffer if userspace doesn't do the unmap. But the dma
>>> pages would not be freed and reused unless user space called munmap()
>>> for them.
>>
>> I wonder whether or not we could recycle IOVA in this case to avoid the
>> IOTLB_UMAP message.
>>
> We can achieve that if we use vmalloc() to do allocation for the
> bounce buffer which can be used in coherent DMA mapping case. But
> looks like we still have no way to avoid the IOTLB_UMAP message in
> vhost-vdpa case.


I think that's fine. For virtio-vdpa, from VDUSE userspace perspective, 
it works like a driver that is using SWIOTLB in this case.

Thanks


>
> Thanks,
> Yongji
>
Yongji Xie Dec. 30, 2020, 10:12 a.m. UTC | #19
On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/30 下午3:09, Yongji Xie wrote:
> > On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2020/12/29 下午6:26, Yongji Xie wrote:
> >>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>
> >>>> ----- Original Message -----
> >>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
> >>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
> >>>>>>>> is expected to be synchronous. This need to be solved by tweaking the
> >>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
> >>>>>>>> first.
> >>>>>>>>
> >>>>>>> Actually all vdpa related operations are synchronous in current
> >>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
> >>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
> >>>>>>> by userspace. Could it solve this problem?
> >>>>>>     I was thinking whether or not we need to generate IOTLB_INVALIDATE
> >>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
> >>>>>>
> >>>>>> If we don't, we're probably fine.
> >>>>>>
> >>>>> It seems not feasible. This message will be also used in the
> >>>>> virtio-vdpa case to notify userspace to unmap some pages during
> >>>>> consistent dma unmapping. Maybe we can document it to make sure the
> >>>>> users can handle the message correctly.
> >>>> Just to make sure I understand your point.
> >>>>
> >>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
> >>>> coherent DMA?
> >>>>
> >>>> For 1) you probably need a workqueue to do that since dma unmap can
> >>>> be done in irq or bh context. And if usrspace does't do the unmap, it
> >>>> can still access the bounce buffer (if you don't zap pte)?
> >>>>
> >>> I plan to do it in the coherent DMA case.
> >>
> >> Any reason for treating coherent DMA differently?
> >>
> > Now the memory of the bounce buffer is allocated page by page in the
> > page fault handler. So it can't be used in coherent DMA mapping case
> > which needs some memory with contiguous virtual addresses. I can use
> > vmalloc() to do allocation for the bounce buffer instead. But it might
> > cause some memory waste. Any suggestion?
>
>
> I may miss something. But I don't see a relationship between the
> IOTLB_UNMAP and vmalloc().
>

In the vmalloc() case, the coherent DMA page will be taken from the
memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore
during coherent DMA unmapping because the vmalloc'ed memory which
has been mapped into userspace address space during initialization can
be reused. And userspace should not unmap the region until we destroy
the device.

>
> >
> >>> It's true that userspace can
> >>> access the dma buffer if userspace doesn't do the unmap. But the dma
> >>> pages would not be freed and reused unless user space called munmap()
> >>> for them.
> >>
> >> I wonder whether or not we could recycle IOVA in this case to avoid the
> >> IOTLB_UMAP message.
> >>
> > We can achieve that if we use vmalloc() to do allocation for the
> > bounce buffer which can be used in coherent DMA mapping case. But
> > looks like we still have no way to avoid the IOTLB_UMAP message in
> > vhost-vdpa case.
>
>
> I think that's fine. For virtio-vdpa, from VDUSE userspace perspective,
> it works like a driver that is using SWIOTLB in this case.
>

OK, will do it in v3.

Thanks,
Yongji
Jason Wang Dec. 31, 2020, 2:49 a.m. UTC | #20
On 2020/12/30 下午6:12, Yongji Xie wrote:
> On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2020/12/30 下午3:09, Yongji Xie wrote:
>>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2020/12/29 下午6:26, Yongji Xie wrote:
>>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> ----- Original Message -----
>>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
>>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
>>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the
>>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
>>>>>>>>>> first.
>>>>>>>>>>
>>>>>>>>> Actually all vdpa related operations are synchronous in current
>>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
>>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
>>>>>>>>> by userspace. Could it solve this problem?
>>>>>>>>      I was thinking whether or not we need to generate IOTLB_INVALIDATE
>>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
>>>>>>>>
>>>>>>>> If we don't, we're probably fine.
>>>>>>>>
>>>>>>> It seems not feasible. This message will be also used in the
>>>>>>> virtio-vdpa case to notify userspace to unmap some pages during
>>>>>>> consistent dma unmapping. Maybe we can document it to make sure the
>>>>>>> users can handle the message correctly.
>>>>>> Just to make sure I understand your point.
>>>>>>
>>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
>>>>>> coherent DMA?
>>>>>>
>>>>>> For 1) you probably need a workqueue to do that since dma unmap can
>>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it
>>>>>> can still access the bounce buffer (if you don't zap pte)?
>>>>>>
>>>>> I plan to do it in the coherent DMA case.
>>>> Any reason for treating coherent DMA differently?
>>>>
>>> Now the memory of the bounce buffer is allocated page by page in the
>>> page fault handler. So it can't be used in coherent DMA mapping case
>>> which needs some memory with contiguous virtual addresses. I can use
>>> vmalloc() to do allocation for the bounce buffer instead. But it might
>>> cause some memory waste. Any suggestion?
>>
>> I may miss something. But I don't see a relationship between the
>> IOTLB_UNMAP and vmalloc().
>>
> In the vmalloc() case, the coherent DMA page will be taken from the
> memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore
> during coherent DMA unmapping because those vmalloc'ed memory which
> has been mapped into userspace address space during initialization can
> be reused. And userspace should not unmap the region until we destroy
> the device.


Just to make sure I understand. My understanding is that IOTLB_UNMAP is
only needed when there's a change to the mapping from IOVA to page.

So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to 
free list to be used by the next IOVA allocating. IOTLB_UNMAP could be 
avoided.

So we are not limited by how the pages are actually allocated?

Thanks


>
>>>>> It's true that userspace can
>>>>> access the dma buffer if userspace doesn't do the unmap. But the dma
>>>>> pages would not be freed and reused unless user space called munmap()
>>>>> for them.
>>>> I wonder whether or not we could recycle IOVA in this case to avoid the
>>>> IOTLB_UMAP message.
>>>>
>>> We can achieve that if we use vmalloc() to do allocation for the
>>> bounce buffer which can be used in coherent DMA mapping case. But
>>> looks like we still have no way to avoid the IOTLB_UMAP message in
>>> vhost-vdpa case.
>>
>> I think that's fine. For virtio-vdpa, from VDUSE userspace perspective,
>> it works like a driver that is using SWIOTLB in this case.
>>
> OK, will do it in v3.
>
> Thanks,
> Yongji
>
Yongji Xie Dec. 31, 2020, 5:15 a.m. UTC | #21
On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/30 下午6:12, Yongji Xie wrote:
> > On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2020/12/30 下午3:09, Yongji Xie wrote:
> >>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2020/12/29 下午6:26, Yongji Xie wrote:
> >>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> ----- Original Message -----
> >>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
> >>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
> >>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the
> >>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
> >>>>>>>>>> first.
> >>>>>>>>>>
> >>>>>>>>> Actually all vdpa related operations are synchronous in current
> >>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
> >>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
> >>>>>>>>> by userspace. Could it solve this problem?
> >>>>>>>>      I was thinking whether or not we need to generate IOTLB_INVALIDATE
> >>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
> >>>>>>>>
> >>>>>>>> If we don't, we're probably fine.
> >>>>>>>>
> >>>>>>> It seems not feasible. This message will be also used in the
> >>>>>>> virtio-vdpa case to notify userspace to unmap some pages during
> >>>>>>> consistent dma unmapping. Maybe we can document it to make sure the
> >>>>>>> users can handle the message correctly.
> >>>>>> Just to make sure I understand your point.
> >>>>>>
> >>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
> >>>>>> coherent DMA?
> >>>>>>
> >>>>>> For 1) you probably need a workqueue to do that since dma unmap can
> >>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it
> >>>>>> can still access the bounce buffer (if you don't zap pte)?
> >>>>>>
> >>>>> I plan to do it in the coherent DMA case.
> >>>> Any reason for treating coherent DMA differently?
> >>>>
> >>> Now the memory of the bounce buffer is allocated page by page in the
> >>> page fault handler. So it can't be used in coherent DMA mapping case
> >>> which needs some memory with contiguous virtual addresses. I can use
> >>> vmalloc() to do allocation for the bounce buffer instead. But it might
> >>> cause some memory waste. Any suggestion?
> >>
> >> I may miss something. But I don't see a relationship between the
> >> IOTLB_UNMAP and vmalloc().
> >>
> > In the vmalloc() case, the coherent DMA page will be taken from the
> > memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore
> > during coherent DMA unmapping because those vmalloc'ed memory which
> > has been mapped into userspace address space during initialization can
> > be reused. And userspace should not unmap the region until we destroy
> > the device.
>
>
> Just to make sure I understand. My understanding is that IOTLB_UNMAP is
> only needed when there's a change the mapping from IOVA to page.
>

Yes, that's true.

> So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to
> free list to be used by the next IOVA allocating. IOTLB_UNMAP could be
> avoided.
>
> So we are not limited by how the pages are actually allocated?
>

In coherent DMA cases, we need to return some memory with contiguous
kernel virtual addresses. That is the reason why we need vmalloc()
here. If we allocate the memory page by page, the corresponding kernel
virtual addresses in a contiguous IOVA range might not be contiguous.
And in streaming DMA cases, there is no limit. So another choice is
using vmalloc'ed memory only for coherent DMA cases.

Not sure if this is clear for you.

Thanks,
Yongji
Jason Wang Dec. 31, 2020, 5:49 a.m. UTC | #22
On 2020/12/31 下午1:15, Yongji Xie wrote:
> On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2020/12/30 下午6:12, Yongji Xie wrote:
>>> On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2020/12/30 下午3:09, Yongji Xie wrote:
>>>>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> On 2020/12/29 下午6:26, Yongji Xie wrote:
>>>>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>> ----- Original Message -----
>>>>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
>>>>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
>>>>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the
>>>>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
>>>>>>>>>>>> first.
>>>>>>>>>>>>
>>>>>>>>>>> Actually all vdpa related operations are synchronous in current
>>>>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
>>>>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
>>>>>>>>>>> by userspace. Could it solve this problem?
>>>>>>>>>>       I was thinking whether or not we need to generate IOTLB_INVALIDATE
>>>>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
>>>>>>>>>>
>>>>>>>>>> If we don't, we're probably fine.
>>>>>>>>>>
>>>>>>>>> It seems not feasible. This message will be also used in the
>>>>>>>>> virtio-vdpa case to notify userspace to unmap some pages during
>>>>>>>>> consistent dma unmapping. Maybe we can document it to make sure the
>>>>>>>>> users can handle the message correctly.
>>>>>>>> Just to make sure I understand your point.
>>>>>>>>
>>>>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
>>>>>>>> coherent DMA?
>>>>>>>>
>>>>>>>> For 1) you probably need a workqueue to do that since dma unmap can
>>>>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it
>>>>>>>> can still access the bounce buffer (if you don't zap pte)?
>>>>>>>>
>>>>>>> I plan to do it in the coherent DMA case.
>>>>>> Any reason for treating coherent DMA differently?
>>>>>>
>>>>> Now the memory of the bounce buffer is allocated page by page in the
>>>>> page fault handler. So it can't be used in coherent DMA mapping case
>>>>> which needs some memory with contiguous virtual addresses. I can use
>>>>> vmalloc() to do allocation for the bounce buffer instead. But it might
>>>>> cause some memory waste. Any suggestion?
>>>> I may miss something. But I don't see a relationship between the
>>>> IOTLB_UNMAP and vmalloc().
>>>>
>>> In the vmalloc() case, the coherent DMA page will be taken from the
>>> memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore
>>> during coherent DMA unmapping because those vmalloc'ed memory which
>>> has been mapped into userspace address space during initialization can
>>> be reused. And userspace should not unmap the region until we destroy
>>> the device.
>>
>> Just to make sure I understand. My understanding is that IOTLB_UNMAP is
>> only needed when there's a change the mapping from IOVA to page.
>>
> Yes, that's true.
>
>> So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to
>> free list to be used by the next IOVA allocating. IOTLB_UNMAP could be
>> avoided.
>>
>> So we are not limited by how the pages are actually allocated?
>>
> In coherent DMA cases, we need to return some memory with contiguous
> kernel virtual addresses. That is the reason why we need vmalloc()
> here. If we allocate the memory page by page, the corresponding kernel
> virtual addresses in a contiguous IOVA range might not be contiguous.


Yes, but we can do that as what has been done in the series 
(alloc_pages_exact()). Or do you mean it would be a little bit hard to 
recycle IOVA/pages here?

Thanks


> And in streaming DMA cases, there is no limit. So another choice is
> using vmalloc'ed memory only for coherent DMA cases.
>
> Not sure if this is clear for you.
>
> Thanks,
> Yongji
>
Yongji Xie Dec. 31, 2020, 6:52 a.m. UTC | #23
On Thu, Dec 31, 2020 at 1:50 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/31 下午1:15, Yongji Xie wrote:
> > On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2020/12/30 下午6:12, Yongji Xie wrote:
> >>> On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2020/12/30 下午3:09, Yongji Xie wrote:
> >>>>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> On 2020/12/29 下午6:26, Yongji Xie wrote:
> >>>>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>> ----- Original Message -----
> >>>>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
> >>>>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
> >>>>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the
> >>>>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
> >>>>>>>>>>>> first.
> >>>>>>>>>>>>
> >>>>>>>>>>> Actually all vdpa related operations are synchronous in current
> >>>>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
> >>>>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
> >>>>>>>>>>> by userspace. Could it solve this problem?
> >>>>>>>>>>       I was thinking whether or not we need to generate IOTLB_INVALIDATE
> >>>>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
> >>>>>>>>>>
> >>>>>>>>>> If we don't, we're probably fine.
> >>>>>>>>>>
> >>>>>>>>> It seems not feasible. This message will be also used in the
> >>>>>>>>> virtio-vdpa case to notify userspace to unmap some pages during
> >>>>>>>>> consistent dma unmapping. Maybe we can document it to make sure the
> >>>>>>>>> users can handle the message correctly.
> >>>>>>>> Just to make sure I understand your point.
> >>>>>>>>
> >>>>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
> >>>>>>>> coherent DMA?
> >>>>>>>>
> >>>>>>>> For 1) you probably need a workqueue to do that since dma unmap can
> >>>>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it
> >>>>>>>> can still access the bounce buffer (if you don't zap pte)?
> >>>>>>>>
> >>>>>>> I plan to do it in the coherent DMA case.
> >>>>>> Any reason for treating coherent DMA differently?
> >>>>>>
> >>>>> Now the memory of the bounce buffer is allocated page by page in the
> >>>>> page fault handler. So it can't be used in coherent DMA mapping case
> >>>>> which needs some memory with contiguous virtual addresses. I can use
> >>>>> vmalloc() to do allocation for the bounce buffer instead. But it might
> >>>>> cause some memory waste. Any suggestion?
> >>>> I may miss something. But I don't see a relationship between the
> >>>> IOTLB_UNMAP and vmalloc().
> >>>>
> >>> In the vmalloc() case, the coherent DMA page will be taken from the
> >>> memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore
> >>> during coherent DMA unmapping because those vmalloc'ed memory which
> >>> has been mapped into userspace address space during initialization can
> >>> be reused. And userspace should not unmap the region until we destroy
> >>> the device.
> >>
> >> Just to make sure I understand. My understanding is that IOTLB_UNMAP is
> >> only needed when there's a change the mapping from IOVA to page.
> >>
> > Yes, that's true.
> >
> >> So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to
> >> free list to be used by the next IOVA allocating. IOTLB_UNMAP could be
> >> avoided.
> >>
> >> So we are not limited by how the pages are actually allocated?
> >>
> > In coherent DMA cases, we need to return some memory with contiguous
> > kernel virtual addresses. That is the reason why we need vmalloc()
> > here. If we allocate the memory page by page, the corresponding kernel
> > virtual addresses in a contiguous IOVA range might not be contiguous.
>
>
> Yes, but we can do that as what has been done in the series
> (alloc_pages_exact()). Or do you mean it would be a little bit hard to
> recycle IOVA/pages here?
>

Yes, it might be hard to reuse the memory. For example, we first
allocate 1 IOVA/page during dma_map, then the IOVA is freed during
dma_unmap. Actually we can't reuse this single page if we need a
two-page area in the next IOVA allocation. So the best way is to use
IOTLB_UNMAP to free this single page during dma_unmap too.

Thanks,
Yongji
Jason Wang Dec. 31, 2020, 7:11 a.m. UTC | #24
On 2020/12/31 下午2:52, Yongji Xie wrote:
> On Thu, Dec 31, 2020 at 1:50 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2020/12/31 下午1:15, Yongji Xie wrote:
>>> On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2020/12/30 下午6:12, Yongji Xie wrote:
>>>>> On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> On 2020/12/30 下午3:09, Yongji Xie wrote:
>>>>>>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>> On 2020/12/29 下午6:26, Yongji Xie wrote:
>>>>>>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>>>> ----- Original Message -----
>>>>>>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
>>>>>>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
>>>>>>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the
>>>>>>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
>>>>>>>>>>>>>> first.
>>>>>>>>>>>>>>
>>>>>>>>>>>>> Actually all vdpa related operations are synchronous in current
>>>>>>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
>>>>>>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
>>>>>>>>>>>>> by userspace. Could it solve this problem?
>>>>>>>>>>>>        I was thinking whether or not we need to generate IOTLB_INVALIDATE
>>>>>>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
>>>>>>>>>>>>
>>>>>>>>>>>> If we don't, we're probably fine.
>>>>>>>>>>>>
>>>>>>>>>>> It seems not feasible. This message will be also used in the
>>>>>>>>>>> virtio-vdpa case to notify userspace to unmap some pages during
>>>>>>>>>>> consistent dma unmapping. Maybe we can document it to make sure the
>>>>>>>>>>> users can handle the message correctly.
>>>>>>>>>> Just to make sure I understand your point.
>>>>>>>>>>
>>>>>>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
>>>>>>>>>> coherent DMA?
>>>>>>>>>>
>>>>>>>>>> For 1) you probably need a workqueue to do that since dma unmap can
>>>>>>>>>> be done in irq or bh context. And if userspace doesn't do the unmap, it
>>>>>>>>>> can still access the bounce buffer (if you don't zap pte)?
>>>>>>>>>>
>>>>>>>>> I plan to do it in the coherent DMA case.
>>>>>>>> Any reason for treating coherent DMA differently?
>>>>>>>>
>>>>>>> Now the memory of the bounce buffer is allocated page by page in the
>>>>>>> page fault handler. So it can't be used in coherent DMA mapping case
>>>>>>> which needs some memory with contiguous virtual addresses. I can use
>>>>>>> vmalloc() to do allocation for the bounce buffer instead. But it might
>>>>>>> cause some memory waste. Any suggestion?
>>>>>> I may miss something. But I don't see a relationship between the
>>>>>> IOTLB_UNMAP and vmalloc().
>>>>>>
>>>>> In the vmalloc() case, the coherent DMA page will be taken from the
>>>>> memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore
>>>>> during coherent DMA unmapping because those vmalloc'ed memory which
>>>>> has been mapped into userspace address space during initialization can
>>>>> be reused. And userspace should not unmap the region until we destroy
>>>>> the device.
>>>> Just to make sure I understand. My understanding is that IOTLB_UNMAP is
>>>> only needed when there's a change the mapping from IOVA to page.
>>>>
>>> Yes, that's true.
>>>
>>>> So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to
>>>> free list to be used by the next IOVA allocating. IOTLB_UNMAP could be
>>>> avoided.
>>>>
>>>> So we are not limited by how the pages are actually allocated?
>>>>
>>> In coherent DMA cases, we need to return some memory with contiguous
>>> kernel virtual addresses. That is the reason why we need vmalloc()
>>> here. If we allocate the memory page by page, the corresponding kernel
>>> virtual addresses in a contiguous IOVA range might not be contiguous.
>>
>> Yes, but we can do that as what has been done in the series
>> (alloc_pages_exact()). Or do you mean it would be a little bit hard to
>> recycle IOVA/pages here?
>>
> Yes, it might be hard to reuse the memory. For example, we firstly
> allocate 1 IOVA/page during dma_map, then the IOVA is freed during
> dma_unmap. Actually we can't reuse this single page if we need a
> two-pages area in the next IOVA allocating. So the best way is using
> IOTLB_UNMAP to free this single page during dma_unmap too.
>
> Thanks,
> Yongji


I get you now. Then I agree that let's go with IOTLB_UNMAP.

Thanks


>
Yongji Xie Dec. 31, 2020, 8 a.m. UTC | #25
On Thu, Dec 31, 2020 at 3:12 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2020/12/31 下午2:52, Yongji Xie wrote:
> > On Thu, Dec 31, 2020 at 1:50 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2020/12/31 下午1:15, Yongji Xie wrote:
> >>> On Thu, Dec 31, 2020 at 10:49 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2020/12/30 下午6:12, Yongji Xie wrote:
> >>>>> On Wed, Dec 30, 2020 at 4:41 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> On 2020/12/30 下午3:09, Yongji Xie wrote:
> >>>>>>> On Wed, Dec 30, 2020 at 2:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>> On 2020/12/29 下午6:26, Yongji Xie wrote:
> >>>>>>>>> On Tue, Dec 29, 2020 at 5:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>>>> ----- Original Message -----
> >>>>>>>>>>> On Mon, Dec 28, 2020 at 4:43 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>>>>>> On 2020/12/28 下午4:14, Yongji Xie wrote:
> >>>>>>>>>>>>>> I see. So all the above two questions are because VHOST_IOTLB_INVALIDATE
> >>>>>>>>>>>>>> is expected to be synchronous. This need to be solved by tweaking the
> >>>>>>>>>>>>>> current VDUSE API or we can re-visit to go with descriptors relaying
> >>>>>>>>>>>>>> first.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>> Actually all vdpa related operations are synchronous in current
> >>>>>>>>>>>>> implementation. The ops.set_map/dma_map/dma_unmap should not return
> >>>>>>>>>>>>> until the VDUSE_UPDATE_IOTLB/VDUSE_INVALIDATE_IOTLB message is replied
> >>>>>>>>>>>>> by userspace. Could it solve this problem?
> >>>>>>>>>>>>        I was thinking whether or not we need to generate IOTLB_INVALIDATE
> >>>>>>>>>>>> message to VDUSE during dma_unmap (vduse_dev_unmap_page).
> >>>>>>>>>>>>
> >>>>>>>>>>>> If we don't, we're probably fine.
> >>>>>>>>>>>>
> >>>>>>>>>>> It seems not feasible. This message will be also used in the
> >>>>>>>>>>> virtio-vdpa case to notify userspace to unmap some pages during
> >>>>>>>>>>> consistent dma unmapping. Maybe we can document it to make sure the
> >>>>>>>>>>> users can handle the message correctly.
> >>>>>>>>>> Just to make sure I understand your point.
> >>>>>>>>>>
> >>>>>>>>>> Do you mean you plan to notify the unmap of 1) streaming DMA or 2)
> >>>>>>>>>> coherent DMA?
> >>>>>>>>>>
> >>>>>>>>>> For 1) you probably need a workqueue to do that since dma unmap can
> >>>>>>>>>> be done in irq or bh context. And if usrspace does't do the unmap, it
> >>>>>>>>>> can still access the bounce buffer (if you don't zap pte)?
> >>>>>>>>>>
> >>>>>>>>> I plan to do it in the coherent DMA case.
> >>>>>>>> Any reason for treating coherent DMA differently?
> >>>>>>>>
> >>>>>>> Now the memory of the bounce buffer is allocated page by page in the
> >>>>>>> page fault handler. So it can't be used in coherent DMA mapping case
> >>>>>>> which needs some memory with contiguous virtual addresses. I can use
> >>>>>>> vmalloc() to do allocation for the bounce buffer instead. But it might
> >>>>>>> cause some memory waste. Any suggestion?
> >>>>>> I may miss something. But I don't see a relationship between the
> >>>>>> IOTLB_UNMAP and vmalloc().
> >>>>>>
> >>>>> In the vmalloc() case, the coherent DMA page will be taken from the
> >>>>> memory allocated by vmalloc(). So IOTLB_UNMAP is not needed anymore
> >>>>> during coherent DMA unmapping because those vmalloc'ed memory which
> >>>>> has been mapped into userspace address space during initialization can
> >>>>> be reused. And userspace should not unmap the region until we destroy
> >>>>> the device.
> >>>> Just to make sure I understand. My understanding is that IOTLB_UNMAP is
> >>>> only needed when there's a change the mapping from IOVA to page.
> >>>>
> >>> Yes, that's true.
> >>>
> >>>> So if we stick to the mapping, e.g during dma_unmap, we just put IOVA to
> >>>> free list to be used by the next IOVA allocating. IOTLB_UNMAP could be
> >>>> avoided.
> >>>>
> >>>> So we are not limited by how the pages are actually allocated?
> >>>>
> >>> In coherent DMA cases, we need to return some memory with contiguous
> >>> kernel virtual addresses. That is the reason why we need vmalloc()
> >>> here. If we allocate the memory page by page, the corresponding kernel
> >>> virtual addresses in a contiguous IOVA range might not be contiguous.
> >>
> >> Yes, but we can do that as what has been done in the series
> >> (alloc_pages_exact()). Or do you mean it would be a little bit hard to
> >> recycle IOVA/pages here?
> >>
> > Yes, it might be hard to reuse the memory. For example, we firstly
> > allocate 1 IOVA/page during dma_map, then the IOVA is freed during
> > dma_unmap. Actually we can't reuse this single page if we need a
> > two-pages area in the next IOVA allocating. So the best way is using
> > IOTLB_UNMAP to free this single page during dma_unmap too.
> >
> > Thanks,
> > Yongji
>
>
> I get you now. Then I agree that let's go with IOTLB_UNMAP.
>

Fine, will do it.

Thanks,
Yongji
diff mbox series

Patch

diff --git a/Documentation/driver-api/vduse.rst b/Documentation/driver-api/vduse.rst
index 623f7b040ccf..48e4b1ba353f 100644
--- a/Documentation/driver-api/vduse.rst
+++ b/Documentation/driver-api/vduse.rst
@@ -46,13 +46,26 @@  The following types of messages are provided by the VDUSE framework now:
 
 - VDUSE_GET_CONFIG: Read from device specific configuration space
 
+- VDUSE_UPDATE_IOTLB: Update the memory mapping in device IOTLB
+
+- VDUSE_INVALIDATE_IOTLB: Invalidate the memory mapping in device IOTLB
+
 Please see include/linux/vdpa.h for details.
 
-In the data path, VDUSE framework implements a MMU-based on-chip IOMMU
+The data path of a userspace vDPA device is implemented in different ways
+depending on the vdpa bus to which it is attached.
+
+In the virtio-vdpa case, the VDUSE framework implements an MMU-based on-chip IOMMU
 driver which supports mapping the kernel dma buffer to a userspace iova
 region dynamically. The userspace iova region can be created by passing
 the userspace vDPA device fd to mmap(2).
 
+In the vhost-vdpa case, the dma buffer resides in a userspace memory
+region which will be shared with the VDUSE userspace process via the file
+descriptor in the VDUSE_UPDATE_IOTLB message. The corresponding address
+mapping (IOVA of the dma buffer <-> VA of the memory region) is also
+included in this message.
+
 Besides, the eventfd mechanism is used to trigger interrupt callbacks and
 receive virtqueue kicks in userspace. The following ioctls on the userspace
 vDPA device fd are provided to support that:
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index b974333ed4e9..d24aaacb6008 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -34,6 +34,7 @@ 
 
 struct vduse_dev_msg {
 	struct vduse_dev_request req;
+	struct file *iotlb_file;
 	struct vduse_dev_response resp;
 	struct list_head list;
 	wait_queue_head_t waitq;
@@ -325,12 +326,80 @@  static int vduse_dev_set_vq_state(struct vduse_dev *dev,
 	return ret;
 }
 
+static int vduse_dev_update_iotlb(struct vduse_dev *dev, struct file *file,
+				u64 offset, u64 iova, u64 size, u8 perm)
+{
+	struct vduse_dev_msg *msg;
+	int ret;
+
+	if (!size)
+		return -EINVAL;
+
+	msg = vduse_dev_new_msg(dev, VDUSE_UPDATE_IOTLB);
+	msg->req.size = sizeof(struct vduse_iotlb);
+	msg->req.iotlb.offset = offset;
+	msg->req.iotlb.iova = iova;
+	msg->req.iotlb.size = size;
+	msg->req.iotlb.perm = perm;
+	msg->req.iotlb.fd = -1;
+	msg->iotlb_file = get_file(file);
+
+	ret = vduse_dev_msg_sync(dev, msg);
+	vduse_dev_msg_put(msg);
+	fput(file);
+
+	return ret;
+}
+
+static int vduse_dev_invalidate_iotlb(struct vduse_dev *dev,
+					u64 iova, u64 size)
+{
+	struct vduse_dev_msg *msg;
+	int ret;
+
+	if (!size)
+		return -EINVAL;
+
+	msg = vduse_dev_new_msg(dev, VDUSE_INVALIDATE_IOTLB);
+	msg->req.size = sizeof(struct vduse_iotlb);
+	msg->req.iotlb.iova = iova;
+	msg->req.iotlb.size = size;
+
+	ret = vduse_dev_msg_sync(dev, msg);
+	vduse_dev_msg_put(msg);
+
+	return ret;
+}
+
+static unsigned int perm_to_file_flags(u8 perm)
+{
+	unsigned int flags = 0;
+
+	switch (perm) {
+	case VHOST_ACCESS_WO:
+		flags |= O_WRONLY;
+		break;
+	case VHOST_ACCESS_RO:
+		flags |= O_RDONLY;
+		break;
+	case VHOST_ACCESS_RW:
+		flags |= O_RDWR;
+		break;
+	default:
+		WARN(1, "invalidate vhost IOTLB permission\n");
+		break;
+	}
+
+	return flags;
+}
+
 static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *file = iocb->ki_filp;
 	struct vduse_dev *dev = file->private_data;
 	struct vduse_dev_msg *msg;
-	int size = sizeof(struct vduse_dev_request);
+	unsigned int flags;
+	int fd, size = sizeof(struct vduse_dev_request);
 	ssize_t ret = 0;
 
 	if (iov_iter_count(to) < size)
@@ -349,6 +418,18 @@  static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		if (ret)
 			return ret;
 	}
+
+	if (msg->req.type == VDUSE_UPDATE_IOTLB && msg->req.iotlb.fd == -1) {
+		flags = perm_to_file_flags(msg->req.iotlb.perm);
+		fd = get_unused_fd_flags(flags);
+		if (fd < 0) {
+			vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
+			return fd;
+		}
+		fd_install(fd, get_file(msg->iotlb_file));
+		msg->req.iotlb.fd = fd;
+	}
+
 	ret = copy_to_iter(&msg->req, size, to);
 	if (ret != size) {
 		vduse_dev_enqueue_msg(dev, msg, &dev->send_list);
@@ -565,6 +646,69 @@  static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
 	vduse_dev_set_config(dev, offset, buf, len);
 }
 
+static void vduse_vdpa_invalidate_iotlb(struct vduse_dev *dev,
+					struct vhost_iotlb_msg *msg)
+{
+	vduse_dev_invalidate_iotlb(dev, msg->iova, msg->size);
+}
+
+static int vduse_vdpa_update_iotlb(struct vduse_dev *dev,
+					struct vhost_iotlb_msg *msg)
+{
+	u64 uaddr = msg->uaddr;
+	u64 iova = msg->iova;
+	u64 size = msg->size;
+	u64 offset;
+	struct vm_area_struct *vma;
+	int ret;
+
+	while (uaddr < msg->uaddr + msg->size) {
+		vma = find_vma(current->mm, uaddr);
+		ret = -EINVAL;
+		if (!vma)
+			goto err;
+
+		size = min(msg->size, vma->vm_end - uaddr);
+		offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
+		if (vma->vm_file && (vma->vm_flags & VM_SHARED)) {
+			ret = vduse_dev_update_iotlb(dev, vma->vm_file, offset,
+							iova, size, msg->perm);
+			if (ret)
+				goto err;
+		}
+		iova += size;
+		uaddr += size;
+	}
+	return 0;
+err:
+	vduse_dev_invalidate_iotlb(dev, msg->iova, iova - msg->iova);
+	return ret;
+}
+
+static int vduse_vdpa_process_iotlb_msg(struct vdpa_device *vdpa,
+					struct vhost_iotlb_msg *msg)
+{
+	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+	int ret = 0;
+
+	switch (msg->type) {
+	case VHOST_IOTLB_UPDATE:
+		ret = vduse_vdpa_update_iotlb(dev, msg);
+		break;
+	case VHOST_IOTLB_INVALIDATE:
+		vduse_vdpa_invalidate_iotlb(dev, msg);
+		break;
+	case VHOST_IOTLB_BATCH_BEGIN:
+	case VHOST_IOTLB_BATCH_END:
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 static void vduse_vdpa_free(struct vdpa_device *vdpa)
 {
 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
@@ -597,6 +741,7 @@  static const struct vdpa_config_ops vduse_vdpa_config_ops = {
 	.set_status		= vduse_vdpa_set_status,
 	.get_config		= vduse_vdpa_get_config,
 	.set_config		= vduse_vdpa_set_config,
+	.process_iotlb_msg	= vduse_vdpa_process_iotlb_msg,
 	.free			= vduse_vdpa_free,
 };
 
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index 873305dfd93f..c5080851f140 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -21,6 +21,8 @@  enum vduse_req_type {
 	VDUSE_GET_STATUS,
 	VDUSE_SET_CONFIG,
 	VDUSE_GET_CONFIG,
+	VDUSE_UPDATE_IOTLB,
+	VDUSE_INVALIDATE_IOTLB,
 };
 
 struct vduse_vq_num {
@@ -51,6 +53,14 @@  struct vduse_dev_config_data {
 	__u8 data[VDUSE_CONFIG_DATA_LEN];
 };
 
+struct vduse_iotlb {
+	__u32 fd;
+	__u64 offset;
+	__u64 iova;
+	__u64 size;
+	__u8 perm;
+};
+
 struct vduse_dev_request {
 	__u32 type; /* request type */
 	__u32 unique; /* request id */
@@ -62,6 +72,7 @@  struct vduse_dev_request {
 		struct vduse_vq_ready vq_ready; /* virtqueue ready status */
 		struct vduse_vq_state vq_state; /* virtqueue state */
 		struct vduse_dev_config_data config; /* virtio device config space */
+		struct vduse_iotlb iotlb; /* iotlb message */
 		__u64 features; /* virtio features */
 		__u8 status; /* device status */
 	};