
[v2,1/3] dma-buf: Add ioctl to query mmap coherency/cache info

Message ID 20220815211516.3169470-2-robdclark@gmail.com (mailing list archive)
State New, archived
Series dma-buf: map-info support

Commit Message

Rob Clark Aug. 15, 2022, 9:15 p.m. UTC
From: Rob Clark <robdclark@chromium.org>

This is a fairly narrowly focused interface, providing a way for a VMM
in userspace to tell the guest kernel what pgprot settings to use when
mapping a buffer to guest userspace.

For buffers that get mapped into guest userspace, virglrenderer returns
a dma-buf fd to the VMM (crosvm or qemu).  In addition to mapping the
pages into the guest VM, it needs to report to drm/virtio in the guest
the cache settings to use for guest userspace.  In particular, on some
architectures, creating aliased mappings with different cache attributes
is frowned upon, so it is important that the guest mappings have the
same cache attributes as any potential host mappings.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
v2: Combine with coherency, as that is a related concept.. and it is
    relevant to the VMM whether coherent access without the SYNC ioctl
    is possible; set map_info at export time to make it more clear
    that it applies for the lifetime of the dma-buf (for any mmap
    created via the dma-buf)

 drivers/dma-buf/dma-buf.c    | 63 +++++++++++++++++++++++++++------
 include/linux/dma-buf.h      | 11 ++++++
 include/uapi/linux/dma-buf.h | 68 ++++++++++++++++++++++++++++++++++++
 3 files changed, 132 insertions(+), 10 deletions(-)
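
For reference, a minimal userspace sketch of how a VMM could use the new
query, assuming it already holds a dma-buf fd returned by virglrenderer
(the ioctl, struct, and enum values below are the ones added by this
series; error handling is trimmed):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/dma-buf.h>

/* Sketch: query the coherency/cache info added by this series. */
static int query_map_info(int dmabuf_fd)
{
	struct dma_buf_info info = {
		.param = DMA_BUF_INFO_MAP_INFO,
	};

	if (ioctl(dmabuf_fd, DMA_BUF_IOCTL_INFO, &info) < 0)
		return -1;	/* e.g. ENOTTY on kernels without this series */

	switch (info.value) {
	case DMA_BUF_MAP_INCOHERENT:
		/* the VMM should not map this buffer into the guest */
		return -1;
	case DMA_BUF_COHERENT_WC:
		printf("map to guest as writecombine\n");
		break;
	case DMA_BUF_COHERENT_CACHED:
		printf("map to guest as cached\n");
		break;
	}

	return 0;
}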

Comments

Christian König Aug. 16, 2022, 8:27 a.m. UTC | #1
Am 15.08.22 um 23:15 schrieb Rob Clark:
> From: Rob Clark <robdclark@chromium.org>
>
> This is a fairly narrowly focused interface, providing a way for a VMM
> in userspace to tell the guest kernel what pgprot settings to use when
> mapping a buffer to guest userspace.
>
> For buffers that get mapped into guest userspace, virglrenderer returns
> a dma-buf fd to the VMM (crosvm or qemu).  In addition to mapping the
> pages into the guest VM, it needs to report to drm/virtio in the guest
> the cache settings to use for guest userspace.  In particular, on some
> architectures, creating aliased mappings with different cache attributes
> is frowned upon, so it is important that the guest mappings have the
> same cache attributes as any potential host mappings.
>
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> ---
> v2: Combine with coherency, as that is a related concept.. and it is
>      relevant to the VMM whether coherent access without the SYNC ioctl
>      is possible; set map_info at export time to make it more clear
>      that it applies for the lifetime of the dma-buf (for any mmap
>      created via the dma-buf)

Well, exactly that's a conceptual NAK from my side.

The caching information can change at any time. For CPU mappings even 
without further notice from the exporter.

If the hardware can't use the caching information from the host CPU page 
tables directly then that pretty much completely breaks the concept that 
the exporter is responsible for setting up those page tables.

Regards,
Christian.

>
>   drivers/dma-buf/dma-buf.c    | 63 +++++++++++++++++++++++++++------
>   include/linux/dma-buf.h      | 11 ++++++
>   include/uapi/linux/dma-buf.h | 68 ++++++++++++++++++++++++++++++++++++
>   3 files changed, 132 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index 32f55640890c..262c4706f721 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c
> @@ -125,6 +125,32 @@ static struct file_system_type dma_buf_fs_type = {
>   	.kill_sb = kill_anon_super,
>   };
>   
> +static int __dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
> +{
> +	int ret;
> +
> +	/* check if buffer supports mmap */
> +	if (!dmabuf->ops->mmap)
> +		return -EINVAL;
> +
> +	ret = dmabuf->ops->mmap(dmabuf, vma);
> +
> +	/*
> +	 * If the exporter claims to support coherent access, ensure the
> +	 * pgprot flags match the claim.
> +	 */
> +	if ((dmabuf->map_info != DMA_BUF_MAP_INCOHERENT) && !ret) {
> +		pgprot_t wc_prot = pgprot_writecombine(vma->vm_page_prot);
> +		if (dmabuf->map_info == DMA_BUF_COHERENT_WC) {
> +			WARN_ON_ONCE(pgprot_val(vma->vm_page_prot) != pgprot_val(wc_prot));
> +		} else {
> +			WARN_ON_ONCE(pgprot_val(vma->vm_page_prot) == pgprot_val(wc_prot));
> +		}
> +	}
> +
> +	return ret;
> +}
> +
>   static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
>   {
>   	struct dma_buf *dmabuf;
> @@ -134,16 +160,12 @@ static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
>   
>   	dmabuf = file->private_data;
>   
> -	/* check if buffer supports mmap */
> -	if (!dmabuf->ops->mmap)
> -		return -EINVAL;
> -
>   	/* check for overflowing the buffer's size */
>   	if (vma->vm_pgoff + vma_pages(vma) >
>   	    dmabuf->size >> PAGE_SHIFT)
>   		return -EINVAL;
>   
> -	return dmabuf->ops->mmap(dmabuf, vma);
> +	return __dma_buf_mmap(dmabuf, vma);
>   }
>   
>   static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence)
> @@ -326,6 +348,27 @@ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf)
>   	return 0;
>   }
>   
> +static long dma_buf_info(struct dma_buf *dmabuf, void __user *uarg)
> +{
> +	struct dma_buf_info arg;
> +
> +	if (copy_from_user(&arg, uarg, sizeof(arg)))
> +		return -EFAULT;
> +
> +	switch (arg.param) {
> +	case DMA_BUF_INFO_MAP_INFO:
> +		arg.value = dmabuf->map_info;
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	if (copy_to_user(uarg, &arg, sizeof(arg)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
>   static long dma_buf_ioctl(struct file *file,
>   			  unsigned int cmd, unsigned long arg)
>   {
> @@ -369,6 +412,9 @@ static long dma_buf_ioctl(struct file *file,
>   	case DMA_BUF_SET_NAME_B:
>   		return dma_buf_set_name(dmabuf, (const char __user *)arg);
>   
> +	case DMA_BUF_IOCTL_INFO:
> +		return dma_buf_info(dmabuf, (void __user *)arg);
> +
>   	default:
>   		return -ENOTTY;
>   	}
> @@ -530,6 +576,7 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
>   	dmabuf->priv = exp_info->priv;
>   	dmabuf->ops = exp_info->ops;
>   	dmabuf->size = exp_info->size;
> +	dmabuf->map_info = exp_info->map_info;
>   	dmabuf->exp_name = exp_info->exp_name;
>   	dmabuf->owner = exp_info->owner;
>   	spin_lock_init(&dmabuf->name_lock);
> @@ -1245,10 +1292,6 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
>   	if (WARN_ON(!dmabuf || !vma))
>   		return -EINVAL;
>   
> -	/* check if buffer supports mmap */
> -	if (!dmabuf->ops->mmap)
> -		return -EINVAL;
> -
>   	/* check for offset overflow */
>   	if (pgoff + vma_pages(vma) < pgoff)
>   		return -EOVERFLOW;
> @@ -1262,7 +1305,7 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
>   	vma_set_file(vma, dmabuf->file);
>   	vma->vm_pgoff = pgoff;
>   
> -	return dmabuf->ops->mmap(dmabuf, vma);
> +	return __dma_buf_mmap(dmabuf, vma);
>   }
>   EXPORT_SYMBOL_NS_GPL(dma_buf_mmap, DMA_BUF);
>   
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index 71731796c8c3..37923c8d5c24 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -23,6 +23,8 @@
>   #include <linux/dma-fence.h>
>   #include <linux/wait.h>
>   
> +#include <uapi/linux/dma-buf.h>
> +
>   struct device;
>   struct dma_buf;
>   struct dma_buf_attachment;
> @@ -307,6 +309,13 @@ struct dma_buf {
>   	 */
>   	size_t size;
>   
> +	/**
> +	 * @map_info:
> +	 *
> +	 * CPU mapping/coherency information for the buffer.
> +	 */
> +	enum dma_buf_map_info map_info;
> +
>   	/**
>   	 * @file:
>   	 *
> @@ -533,6 +542,7 @@ struct dma_buf_attachment {
>    * @ops:	Attach allocator-defined dma buf ops to the new buffer
>    * @size:	Size of the buffer - invariant over the lifetime of the buffer
>    * @flags:	mode flags for the file
> + * @map_info:	CPU mapping/coherency information for the buffer
>    * @resv:	reservation-object, NULL to allocate default one
>    * @priv:	Attach private data of allocator to this buffer
>    *
> @@ -545,6 +555,7 @@ struct dma_buf_export_info {
>   	const struct dma_buf_ops *ops;
>   	size_t size;
>   	int flags;
> +	enum dma_buf_map_info map_info;
>   	struct dma_resv *resv;
>   	void *priv;
>   };
> diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
> index b1523cb8ab30..07b403ffdb43 100644
> --- a/include/uapi/linux/dma-buf.h
> +++ b/include/uapi/linux/dma-buf.h
> @@ -85,6 +85,72 @@ struct dma_buf_sync {
>   
>   #define DMA_BUF_NAME_LEN	32
>   
> +/**
> + * enum dma_buf_map_info - CPU mapping info
> + *
> + * This enum describes coherency of a userspace mapping of the dmabuf.
> + *
> + * Importing devices should check dma_buf::map_info flag and reject an
> + * import if unsupported.  For example, if the exporting device uses
> + * @DMA_BUF_COHERENT_CACHED but the importing device does not support
> + * CPU cache coherency, the dma-buf import should fail.
> + */
> +enum dma_buf_map_info {
> +	/**
> +	 * @DMA_BUF_MAP_INCOHERENT: CPU mapping is incoherent.
> +	 *
> +	 * Use of DMA_BUF_IOCTL_SYNC is required for CPU-managed coherency.
> +	 */
> +	DMA_BUF_MAP_INCOHERENT,
> +
> +	/**
> +	 * @DMA_BUF_COHERENT_WC: CPU mapping is coherent but not cached.
> +	 *
> +	 * A cpu mmap'ing is coherent, and DMA_BUF_IOCTL_SYNC is not required.
> +	 * However, fences may still be required for synchronizing access.  I.e.
> +	 * coherency can only be relied upon by an explicit-fencing userspace.
> +	 * An implicit-sync userspace must still use DMA_BUF_IOCTL_SYNC.
> +	 *
> +	 * The cpu mapping is writecombine.
> +	 */
> +	DMA_BUF_COHERENT_WC,
> +
> +	/**
> +	 * @DMA_BUF_COHERENT_CACHED: CPU mapping is coherent and CPU cached.
> +	 *
> +	 * A cpu mmap'ing is coherent, and DMA_BUF_IOCTL_SYNC is not required.
> +	 * However, fences may still be required for synchronizing access.  I.e.
> +	 * coherency can only be relied upon by an explicit-fencing userspace.
> +	 * An implicit-sync userspace must still use DMA_BUF_IOCTL_SYNC.
> +	 *
> +	 * The cpu mapping is cached.
> +	 */
> +	DMA_BUF_COHERENT_CACHED,
> +};
> +
> +/**
> + * struct dma_buf_info - Query info about the buffer.
> + */
> +struct dma_buf_info {
> +
> +#define DMA_BUF_INFO_MAP_INFO    1
> +
> +	/**
> +	 * @param: Which param to query
> +	 *
> +	 * DMA_BUF_INFO_MAP_INFO:
> +	 *     Returns enum dma_buf_map_info, describing the coherency and
> +	 *     caching of a CPU mapping of the buffer.
> +	 */
> +	__u32 param;
> +	__u32 pad;
> +
> +	/**
> +	 * @value: Return value of the query.
> +	 */
> +	__u64 value;
> +};
> +
>   #define DMA_BUF_BASE		'b'
>   #define DMA_BUF_IOCTL_SYNC	_IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)
>   
> @@ -95,4 +161,6 @@ struct dma_buf_sync {
>   #define DMA_BUF_SET_NAME_A	_IOW(DMA_BUF_BASE, 1, __u32)
>   #define DMA_BUF_SET_NAME_B	_IOW(DMA_BUF_BASE, 1, __u64)
>   
> +#define DMA_BUF_IOCTL_INFO	_IOWR(DMA_BUF_BASE, 2, struct dma_buf_info)
> +
>   #endif
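
On the exporter side, the new field is filled in at export time. A minimal
sketch, assuming a hypothetical driver object type "struct my_obj" with
exporter ops "my_dmabuf_ops" (both illustrative, not from this series):

/* Hypothetical exporter sketch: advertise coherent writecombine mmaps */
static struct dma_buf *my_export(struct my_obj *obj)
{
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);

	exp_info.ops      = &my_dmabuf_ops;	/* assumed exporter ops */
	exp_info.size     = obj->size;		/* assumed buffer object */
	exp_info.flags    = O_RDWR;
	exp_info.priv     = obj;
	exp_info.map_info = DMA_BUF_COHERENT_WC; /* new field from this series */

	return dma_buf_export(&exp_info);	/* ERR_PTR on failure */
}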
Rob Clark Aug. 16, 2022, 2:26 p.m. UTC | #2
On Tue, Aug 16, 2022 at 1:27 AM Christian König
<christian.koenig@amd.com> wrote:
>
> Am 15.08.22 um 23:15 schrieb Rob Clark:
> > From: Rob Clark <robdclark@chromium.org>
> >
> > This is a fairly narrowly focused interface, providing a way for a VMM
> > in userspace to tell the guest kernel what pgprot settings to use when
> > mapping a buffer to guest userspace.
> >
> > For buffers that get mapped into guest userspace, virglrenderer returns
> > a dma-buf fd to the VMM (crosvm or qemu).  In addition to mapping the
> > pages into the guest VM, it needs to report to drm/virtio in the guest
> > the cache settings to use for guest userspace.  In particular, on some
> > architectures, creating aliased mappings with different cache attributes
> > is frowned upon, so it is important that the guest mappings have the
> > same cache attributes as any potential host mappings.
> >
> > Signed-off-by: Rob Clark <robdclark@chromium.org>
> > ---
> > v2: Combine with coherency, as that is a related concept.. and it is
> >      relevant to the VMM whether coherent access without the SYNC ioctl
> >      is possible; set map_info at export time to make it more clear
> >      that it applies for the lifetime of the dma-buf (for any mmap
> >      created via the dma-buf)
>
> Well, exactly that's a conceptual NAK from my side.
>
> The caching information can change at any time. For CPU mappings even
> without further notice from the exporter.

You should look before you criticize, as I left you a way out.. the
idea was that DMA_BUF_MAP_INCOHERENT should indicate that the buffer
cannot be mapped to the guest.  We could ofc add more DMA_BUF_MAP_*
values if something else would suit you better.  But the goal is to
give the VMM enough information to dtrt, or return an error if mapping
to guest is not possible.  That seems better than assuming mapping to
guest will work and guessing about cache attrs

BR,
-R

> If the hardware can't use the caching information from the host CPU page
> tables directly then that pretty much completely breaks the concept that
> the exporter is responsible for setting up those page tables.
>
> Regards,
> Christian.
>
> > [quoted patch snipped]
Christian König Aug. 16, 2022, 4:50 p.m. UTC | #3
Am 16.08.22 um 16:26 schrieb Rob Clark:
> On Tue, Aug 16, 2022 at 1:27 AM Christian König
> <christian.koenig@amd.com> wrote:
>> Am 15.08.22 um 23:15 schrieb Rob Clark:
>>> From: Rob Clark <robdclark@chromium.org>
>>>
>>> This is a fairly narrowly focused interface, providing a way for a VMM
>>> in userspace to tell the guest kernel what pgprot settings to use when
>>> mapping a buffer to guest userspace.
>>>
>>> For buffers that get mapped into guest userspace, virglrenderer returns
>>> a dma-buf fd to the VMM (crosvm or qemu).  In addition to mapping the
>>> pages into the guest VM, it needs to report to drm/virtio in the guest
>>> the cache settings to use for guest userspace.  In particular, on some
>>> architectures, creating aliased mappings with different cache attributes
>>> is frowned upon, so it is important that the guest mappings have the
>>> same cache attributes as any potential host mappings.
>>>
>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>> ---
>>> v2: Combine with coherency, as that is a related concept.. and it is
>>>       relevant to the VMM whether coherent access without the SYNC ioctl
>>>       is possible; set map_info at export time to make it more clear
>>>       that it applies for the lifetime of the dma-buf (for any mmap
>>>       created via the dma-buf)
>> Well, exactly that's a conceptual NAK from my side.
>>
>> The caching information can change at any time. For CPU mappings even
>> without further notice from the exporter.
> You should look before you criticize, as I left you a way out.. the
> idea was that DMA_BUF_MAP_INCOHERENT should indicate that the buffer
> cannot be mapped to the guest.  We could ofc add more DMA_BUF_MAP_*
> values if something else would suit you better.  But the goal is to
> give the VMM enough information to dtrt, or return an error if mapping
> to guest is not possible.  That seems better than assuming mapping to
> guest will work and guessing about cache attrs

Well I'm not rejecting the implementation, I'm rejecting this from the 
conceptual point of view.

We intentionally gave the exporter full control over the CPU mappings.
This approach here breaks that now.

I haven't seen the full detailed reason why we should do that and to be 
honest KVM seems to mess with things it is not supposed to touch.

For example, the page reference count of mappings marked with VM_IO is a
complete no-go. This is very strong evidence that this was based on
rather dangerous half knowledge about the background of the handling here.

So as long as I don't see a full explanation of why KVM is grabbing
references to pages while faulting them, and why we manually need to
forward the caching while the hardware documentation indicates otherwise,
I will be rejecting this whole approach.

Regards,
Christian.

>
> BR,
> -R
>
>> If the hardware can't use the caching information from the host CPU page
>> tables directly then that pretty much completely breaks the concept that
>> the exporter is responsible for setting up those page tables.
>>
>> Regards,
>> Christian.
>>
>>> [quoted patch snipped]
Rob Clark Aug. 16, 2022, 5:29 p.m. UTC | #4
On Tue, Aug 16, 2022 at 9:51 AM Christian König
<christian.koenig@amd.com> wrote:
>
> Am 16.08.22 um 16:26 schrieb Rob Clark:
> > On Tue, Aug 16, 2022 at 1:27 AM Christian König
> > <christian.koenig@amd.com> wrote:
> >> Am 15.08.22 um 23:15 schrieb Rob Clark:
> >>> From: Rob Clark <robdclark@chromium.org>
> >>>
> >>> This is a fairly narrowly focused interface, providing a way for a VMM
> >>> in userspace to tell the guest kernel what pgprot settings to use when
> >>> mapping a buffer to guest userspace.
> >>>
> >>> For buffers that get mapped into guest userspace, virglrenderer returns
> >>> a dma-buf fd to the VMM (crosvm or qemu).  In addition to mapping the
> >>> pages into the guest VM, it needs to report to drm/virtio in the guest
> >>> the cache settings to use for guest userspace.  In particular, on some
> >>> architectures, creating aliased mappings with different cache attributes
> >>> is frowned upon, so it is important that the guest mappings have the
> >>> same cache attributes as any potential host mappings.
> >>>
> >>> Signed-off-by: Rob Clark <robdclark@chromium.org>
> >>> ---
> >>> v2: Combine with coherency, as that is a related concept.. and it is
> >>>       relevant to the VMM whether coherent access without the SYNC ioctl
> >>>       is possible; set map_info at export time to make it more clear
> >>>       that it applies for the lifetime of the dma-buf (for any mmap
> >>>       created via the dma-buf)
> >> Well, exactly that's a conceptual NAK from my side.
> >>
> >> The caching information can change at any time. For CPU mappings even
> >> without further notice from the exporter.
> > You should look before you criticize, as I left you a way out.. the
> > idea was that DMA_BUF_MAP_INCOHERENT should indicate that the buffer
> > cannot be mapped to the guest.  We could ofc add more DMA_BUF_MAP_*
> > values if something else would suit you better.  But the goal is to
> > give the VMM enough information to dtrt, or return an error if mapping
> > to guest is not possible.  That seems better than assuming mapping to
> > guest will work and guessing about cache attrs
>
> Well I'm not rejecting the implementation, I'm rejecting this from the
> conceptual point of view.
>
> We intentionally gave the exporter full control over the CPU mappings.
> This approach here breaks that now.
>
> I haven't seen the full detailed reason why we should do that and to be
> honest KVM seems to mess with things it is not supposed to touch.
>
> For example, the page reference count of mappings marked with VM_IO is a
> complete no-go. This is very strong evidence that this was based on
> rather dangerous half knowledge about the background of the handling here.
>
> So as long as I don't see a full explanation of why KVM is grabbing
> references to pages while faulting them, and why we manually need to
> forward the caching while the hardware documentation indicates otherwise,
> I will be rejecting this whole approach.

Didn't we cover this on the previous iteration already?  From a host
kernel PoV these are just normal host userspace mappings.  The
userspace VMM mapping becomes the "physical address" in the guest and
the stage 2 translation tables map it to the guest userspace.

The resulting cache attrs from the combination of S1 and S2 translation
can differ.  So ideally we set up the S2 pgtables in the guest aligned
with the host userspace mappings.
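
A toy sketch of the combining rule assumed here (arm64 without FEAT_S2FWB:
the more restrictive of the stage-1 and stage-2 memory attributes wins);
the names below are illustrative, not kernel code:

/*
 * Illustration only: a cached S1 mapping combined with a non-cacheable
 * S2 entry ends up effectively non-cacheable, hence the desire to align
 * the S2 attrs with the host userspace mapping.
 */
enum mem_attr { ATTR_DEVICE, ATTR_NORMAL_NC, ATTR_NORMAL_WB };

static enum mem_attr combine_s1_s2(enum mem_attr s1, enum mem_attr s2)
{
	return (s1 < s2) ? s1 : s2;	/* more restrictive attr wins */
}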

BR,
-R

>
> Regards,
> Christian.
>
> >
> > BR,
> > -R
> >
> >> If the hardware can't use the caching information from the host CPU page
> >> tables directly then that pretty much completely breaks the concept that
> >> the exporter is responsible for setting up those page tables.
> >>
> >> Regards,
> >> Christian.
> >>
> >>> [quoted patch snipped]
Christian König Aug. 17, 2022, 9:57 a.m. UTC | #5
Am 16.08.22 um 19:29 schrieb Rob Clark:
> On Tue, Aug 16, 2022 at 9:51 AM Christian König
> <christian.koenig@amd.com> wrote:
>> Am 16.08.22 um 16:26 schrieb Rob Clark:
>>> On Tue, Aug 16, 2022 at 1:27 AM Christian König
>>> <christian.koenig@amd.com> wrote:
>>>> Am 15.08.22 um 23:15 schrieb Rob Clark:
>>>>> From: Rob Clark <robdclark@chromium.org>
>>>>>
>>>>> This is a fairly narrowly focused interface, providing a way for a VMM
>>>>> in userspace to tell the guest kernel what pgprot settings to use when
>>>>> mapping a buffer to guest userspace.
>>>>>
>>>>> For buffers that get mapped into guest userspace, virglrenderer returns
>>>>> a dma-buf fd to the VMM (crosvm or qemu).  In addition to mapping the
>>>>> pages into the guest VM, it needs to report to drm/virtio in the guest
>>>>> the cache settings to use for guest userspace.  In particular, on some
>>>>> architectures, creating aliased mappings with different cache attributes
>>>>> is frowned upon, so it is important that the guest mappings have the
>>>>> same cache attributes as any potential host mappings.
>>>>>
>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>>>> ---
>>>>> v2: Combine with coherency, as that is a related concept.. and it is
>>>>>        relevant to the VMM whether coherent access without the SYNC ioctl
>>>>>        is possible; set map_info at export time to make it more clear
>>>>>        that it applies for the lifetime of the dma-buf (for any mmap
>>>>>        created via the dma-buf)
>>>> Well, exactly that's a conceptual NAK from my side.
>>>>
>>>> The caching information can change at any time. For CPU mappings even
>>>> without further notice from the exporter.
>>> You should look before you criticize, as I left you a way out.. the
>>> idea was that DMA_BUF_MAP_INCOHERENT should indicate that the buffer
>>> cannot be mapped to the guest.  We could ofc add more DMA_BUF_MAP_*
>>> values if something else would suit you better.  But the goal is to
>>> give the VMM enough information to dtrt, or return an error if mapping
>>> to guest is not possible.  That seems better than assuming mapping to
>>> guest will work and guessing about cache attrs
>> Well I'm not rejecting the implementation, I'm rejecting this from the
>> conceptual point of view.
>>
>> We intentionally gave the exporter full control over the CPU mappings.
>> This approach here breaks that now.
>>
>> I haven't seen the full detailed reason why we should do that and to be
>> honest KVM seems to mess with things it is not supposed to touch.
>>
>> For example, the page reference count of mappings marked with VM_IO is a
>> complete no-go. This is very strong evidence that this was based on
>> rather dangerous half knowledge about the background of the handling here.
>>
>> So as long as I don't see a full explanation of why KVM is grabbing
>> references to pages while faulting them, and why we manually need to
>> forward the caching while the hardware documentation indicates otherwise,
>> I will be rejecting this whole approach.
> Didn't we cover this on the previous iteration already?  From a host
> kernel PoV these are just normal host userspace mappings.  The
> userspace VMM mapping becomes the "physical address" in the guest and
> the stage 2 translation tables map it to the guest userspace.
>
> The resulting cache attrs from the combination of S1 and S2 translation
> can differ.  So ideally we set up the S2 pgtables in the guest aligned
> with the host userspace mappings.

Well exactly that is not very convincing.

What you want to do is use one channel for the address and a
different one for the cache attrs; that's not something I would
recommend doing in general.

Instead, the client pgtables should be set up in a way that the host
can overwrite them.

Regards,
Christian.

>
> BR,
> -R
>
>> Regards,
>> Christian.
>>
>>> BR,
>>> -R
>>>
>>>> If the hardware can't use the caching information from the host CPU page
>>>> tables directly then that pretty much completely breaks the concept that
>>>> the exporter is responsible for setting up those page tables.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> [quoted patch snipped]
>>>>> +      * coherency can only be relied upon by an explicit-fencing userspace.
>>>>> +      * An implicit-sync userspace must still use DMA_BUF_IOCTL_SYNC.
>>>>> +      *
>>>>> +      * The cpu mapping is cached.
>>>>> +      */
>>>>> +     DMA_BUF_COHERENT_CACHED,
>>>>> +};
>>>>> +
>>>>> +/**
>>>>> + * struct dma_buf_info - Query info about the buffer.
>>>>> + */
>>>>> +struct dma_buf_info {
>>>>> +
>>>>> +#define DMA_BUF_INFO_MAP_INFO    1
>>>>> +
>>>>> +     /**
>>>>> +      * @param: Which param to query
>>>>> +      *
>>>>> +      * DMA_BUF_INFO_MAP_INFO:
>>>>> +      *     Returns enum dma_buf_map_info, describing the coherency and
>>>>> +      *     caching of a CPU mapping of the buffer.
>>>>> +      */
>>>>> +     __u32 param;
>>>>> +     __u32 pad;
>>>>> +
>>>>> +     /**
>>>>> +      * @value: Return value of the query.
>>>>> +      */
>>>>> +     __u64 value;
>>>>> +};
>>>>> +
>>>>>     #define DMA_BUF_BASE                'b'
>>>>>     #define DMA_BUF_IOCTL_SYNC  _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)
>>>>>
>>>>> @@ -95,4 +161,6 @@ struct dma_buf_sync {
>>>>>     #define DMA_BUF_SET_NAME_A  _IOW(DMA_BUF_BASE, 1, __u32)
>>>>>     #define DMA_BUF_SET_NAME_B  _IOW(DMA_BUF_BASE, 1, __u64)
>>>>>
>>>>> +#define DMA_BUF_IOCTL_INFO   _IOWR(DMA_BUF_BASE, 2, struct dma_buf_info)
>>>>> +
>>>>>     #endif
Rob Clark Aug. 17, 2022, 1:44 p.m. UTC | #6
On Wed, Aug 17, 2022 at 2:57 AM Christian König
<christian.koenig@amd.com> wrote:
>
>
>
> Am 16.08.22 um 19:29 schrieb Rob Clark:
> > On Tue, Aug 16, 2022 at 9:51 AM Christian König
> > <christian.koenig@amd.com> wrote:
> >> Am 16.08.22 um 16:26 schrieb Rob Clark:
> >>> On Tue, Aug 16, 2022 at 1:27 AM Christian König
> >>> <christian.koenig@amd.com> wrote:
> >>>> Am 15.08.22 um 23:15 schrieb Rob Clark:
> >>>>> From: Rob Clark <robdclark@chromium.org>
> >>>>>
> >>>>> This is a fairly narrowly focused interface, providing a way for a VMM
> >>>>> in userspace to tell the guest kernel what pgprot settings to use when
> >>>>> mapping a buffer to guest userspace.
> >>>>>
> >>>>> For buffers that get mapped into guest userspace, virglrenderer returns
> >>>>> a dma-buf fd to the VMM (crosvm or qemu).  In addition to mapping the
> >>>>> pages into the guest VM, it needs to report to drm/virtio in the guest
> >>>>> the cache settings to use for guest userspace.  In particular, on some
> >>>>> architectures, creating aliased mappings with different cache attributes
> >>>>> is frowned upon, so it is important that the guest mappings have the
> >>>>> same cache attributes as any potential host mappings.
> >>>>>
> >>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
> >>>>> ---
> >>>>> v2: Combine with coherency, as that is a related concept.. and it is
> >>>>>        relevant to the VMM whether coherent access without the SYNC ioctl
> >>>>>        is possible; set map_info at export time to make it more clear
> >>>>>        that it applies for the lifetime of the dma-buf (for any mmap
> >>>>>        created via the dma-buf)
> >>>> Well, exactly that's a conceptual NAK from my side.
> >>>>
> >>>> The caching information can change at any time. For CPU mappings even
> >>>> without further notice from the exporter.
> >>> You should look before you criticize, as I left you a way out.. the
> >>> idea was that DMA_BUF_MAP_INCOHERENT should indicate that the buffer
> >>> cannot be mapped to the guest.  We could ofc add more DMA_BUF_MAP_*
> >>> values if something else would suit you better.  But the goal is to
> >>> give the VMM enough information to dtrt, or return an error if mapping
> >>> to guest is not possible.  That seems better than assuming mapping to
> >>> guest will work and guessing about cache attrs
> >> Well I'm not rejecting the implementation, I'm rejecting this from the
> >> conceptual point of view.
> >>
> >> We intentional gave the exporter full control over the CPU mappings.
> >> This approach here breaks that now.
> >>
> >> I haven't seen the full detailed reason why we should do that and to be
> >> honest KVM seems to mess with things it is not supposed to touch.
> >>
> >> For example the page reference count of mappings marked with VM_IO is a
> >> complete no-go. This is a very strong evidence that this was based on
> >> rather dangerous halve knowledge about the background of the handling here.
> >>
> >> So as long as I don't see a full explanation why KVM is grabbing
> >> reference to pages while faulting them and why we manually need to
> >> forward the caching while the hardware documentation indicates otherwise
> >> I will be rejecting this whole approach.
> > Didn't we cover this on the previous iteration already.  From a host
> > kernel PoV these are just normal host userspace mappings.  The
> > userspace VMM mapping becomes the "physical address" in the guest and
> > the stage 2 translation tables map it to the guest userspace.
> >
> > The resulting cache attrs from combination of S1 and S2 translation
> > can differ.  So ideally we setup the S2 pgtables in guest aligned with
> > host userspace mappings
>
> Well exactly that is not very convincing.
>
> What you want to do is to use one channel for the address and a
> different one for the cache attrs, that's not something I would
> recommend doing in general.

How would that work.. mmap() is the channel for the address, we'd need
to introduce a new syscall that returned additional information?

> Instead the client pgtables should be setup in a way so that host can
> overwrite them.

How?  That is completely not how VMs work.  Even if the host knew
where the pgtables were and somehow magically knew the various guest
userspace VAs, it would be racy.
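To make the intended flow concrete, here is a minimal sketch of the
VMM side querying the proposed ioctl on the dma-buf fd it gets back
from virglrenderer (error handling trimmed; the policy shown is
illustrative, not part of this series):

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/dma-buf.h>

/* Ask the exporter how CPU mmap()s of this dma-buf behave. */
static int vmm_check_map_info(int dmabuf_fd)
{
	struct dma_buf_info info = {
		.param = DMA_BUF_INFO_MAP_INFO,
	};

	if (ioctl(dmabuf_fd, DMA_BUF_IOCTL_INFO, &info))
		return -errno;

	switch (info.value) {
	case DMA_BUF_COHERENT_WC:
	case DMA_BUF_COHERENT_CACHED:
		/* Safe to map into the guest; forward the value so the
		 * guest kernel can pick matching attributes for guest
		 * userspace mappings.
		 */
		return (int)info.value;
	case DMA_BUF_MAP_INCOHERENT:
	default:
		/* No stable attributes to mirror, refuse to map. */
		return -EINVAL;
	}
}

The point is just that the VMM can now fail the map request instead
of guessing about attributes.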

BR,
-R

> [SNIP]
Christian König Aug. 18, 2022, 11:20 a.m. UTC | #7
Am 17.08.22 um 15:44 schrieb Rob Clark:
> On Wed, Aug 17, 2022 at 2:57 AM Christian König
> <christian.koenig@amd.com> wrote:
>> [SNIP]
>>
>>> The resulting cache attrs from combination of S1 and S2 translation
>>> can differ.  So ideally we setup the S2 pgtables in guest aligned with
>>> host userspace mappings
>>
>> Well exactly that is not very convincing.
>>
>> What you want to do is to use one channel for the address and a
>> different one for the cache attrs, that's not something I would
>> recommend doing in general.
> How would that work.. mmap() is the channel for the address, we'd need
> to introduce a new syscall that returned additional information?

The channel for the address is not mmap(), but rather the page faults. 
mmap() is just the function setting up that channel.

The page faults then insert both the address and the caching
attributes (at least on x86).

That we then need to forward the caching attributes manually once more 
seems really misplaced.

>> Instead the client pgtables should be setup in a way so that host can
>> overwrite them.
> How?  That is completely not how VMs work.  Even if the host knew
> where the pgtables were and somehow magically knew the various guest
> > userspace VAs, it would be racy.

Well you mentioned that the client page tables can be set up in a way
that the host page tables determine what caching to use. As far as I can
see this is what we should use here.
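To illustrate that model concretely: a typical exporter bakes the
attributes into vm_page_prot in its mmap callback, and they are then
applied whenever the PTEs are inserted.  A rough sketch (the
example_buffer structure and its base_pfn are placeholders):

static int example_exporter_mmap(struct dma_buf *dmabuf,
				 struct vm_area_struct *vma)
{
	struct example_buffer *buf = dmabuf->priv;

	/* The exporter, not the importer, decides the caching here... */
	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

	/* ...and the attributes travel with the PTEs, whether they are
	 * inserted up front like this or later from a fault handler.
	 */
	return remap_pfn_range(vma, vma->vm_start, buf->base_pfn,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}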

Regards,
Christian.

> [SNIP]
Rob Clark Aug. 18, 2022, 2:25 p.m. UTC | #8
On Thu, Aug 18, 2022 at 4:21 AM Christian König
<christian.koenig@amd.com> wrote:
>
> Am 17.08.22 um 15:44 schrieb Rob Clark:
> > On Wed, Aug 17, 2022 at 2:57 AM Christian König
> > <christian.koenig@amd.com> wrote:
> >> [SNIP]
> >>
> >>> The resulting cache attrs from combination of S1 and S2 translation
> >>> can differ.  So ideally we setup the S2 pgtables in guest aligned with
> >>> host userspace mappings
> >>
> >> Well exactly that is not very convincing.
> >>
> >> What you want to do is to use one channel for the address and a
> >> different one for the cache attrs, that's not something I would
> >> recommend doing in general.
> > How would that work.. mmap() is the channel for the address, we'd need
> > to introduce a new syscall that returned additional information?
>
> The channel for the address is not mmap(), but rather the page faults.
> mmap() is just the function setting up that channel.
>
> The page faults then insert both the address and the caching
> attributes (at least on x86).

This is true on arm64 as well, but only in the S1 tables (which I
would have to assume is the case on x86 as well).

> That we then need to forward the caching attributes manually once more
> seems really misplaced.
>
> >> Instead the client pgtables should be setup in a way so that host can
> >> overwrite them.
> > How?  That is completely not how VMs work.  Even if the host knew
> > where the pgtables were and somehow magically knew the various guest
> > userspace VAs, it would be racy.
>
> Well you mentioned that the client page tables can be set up in a way
> that the host page tables determine what caching to use. As far as I can
> see this is what we should use here.

On arm64/aarch64, they *can*.. but the system (on some versions of
armv8) can also be configured to let S2 determine the attributes.  And
apparently there are benefits to this (it avoids unnecessary cache
flushing in the host, AFAIU).  This is the case where we need this new
API.

IMO it is fine for the exporter to return a value indicating that the
attributes change dynamically or that S1 attributes must somehow be
used by the hw.  This would at least let the VMM return an error in
cases where S1 attrs cannot be relied on.  But there are enough
exporters where the cache attrs are static for the life of the buffer.
So even if you need to return DMA_BUF_MAP_I_DONT_KNOW, maybe that is
fine (if x86 can always rely on S1 attrs), or at least will let the
VMM return an error rather than just blindly assuming things will
work.

But it makes no sense to reject the whole idea just because of some
exporters (which may not even need this).  There is always room to let
them return a map-info value that describes the situation or
limitations to the VMM.
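For example, with a static value the guest side boils down to
something like the following sketch (the helper name and how the
value gets plumbed from the VMM into the guest driver are
hypothetical):

/* Guest side: mirror the host attributes reported by the VMM. */
static pgprot_t virtio_gpu_map_prot(enum dma_buf_map_info map_info,
				    pgprot_t prot)
{
	switch (map_info) {
	case DMA_BUF_COHERENT_WC:
		return pgprot_writecombine(prot);
	case DMA_BUF_COHERENT_CACHED:
		return prot;	/* normal cached mapping */
	default:
		/* DMA_BUF_MAP_INCOHERENT or an unknown value: the VMM
		 * should have refused to map the buffer, so stay on
		 * the conservative side.
		 */
		return pgprot_noncached(prot);
	}
}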

BR,
-R

> [SNIP]
Christian König Aug. 18, 2022, 2:54 p.m. UTC | #9
Am 18.08.22 um 16:25 schrieb Rob Clark:
> On Thu, Aug 18, 2022 at 4:21 AM Christian König
> <christian.koenig@amd.com> wrote:
>> Am 17.08.22 um 15:44 schrieb Rob Clark:
>>> On Wed, Aug 17, 2022 at 2:57 AM Christian König
>>> <christian.koenig@amd.com> wrote:
>>>> [SNIP]
>>>>
>>>>> The resulting cache attrs from combination of S1 and S2 translation
>>>>> can differ.  So ideally we setup the S2 pgtables in guest aligned with
>>>>> host userspace mappings
>>>>
>>>> Well exactly that is not very convincing.
>>>>
>>>> What you want to do is to use one channel for the address and a
>>>> different one for the cache attrs, that's not something I would
>>>> recommend doing in general.
>>> How would that work.. mmap() is the channel for the address, we'd need
>>> to introduce a new syscall that returned additional information?
>> The channel for the address is not mmap(), but rather the page faults.
>> mmap() is just the function setting up that channel.
>>
>> The page faults then insert both the address and the caching
>> attributes (at least on x86).
> This is true on arm64 as well, but only in the S1 tables (which I
> would have to assume is the case on x86 as well).
>
>> That we then need to forward the caching attributes manually once more
>> seems really misplaced.
>>
>>>> Instead the client pgtables should be setup in a way so that host can
>>>> overwrite them.
>>> How?  That is completely not how VMs work.  Even if the host knew
>>> where the pgtables were and somehow magically knew the various guest
>>> userspace VAs, it would be racy.
>>>> Well you mentioned that the client page tables can be set up in a way
>>>> that the host page tables determine what caching to use. As far as I can
>>>> see this is what we should use here.
> On arm64/aarch64, they *can*.. but the system (on some versions of
> armv8) can also be configured to let S2 determine the attributes.  And
> apparently there are benefits to this (it avoids unnecessary cache
> flushing in the host, AFAIU).  This is the case where we need this new
> API.
>
> IMO it is fine for the exporter to return a value indicating that the
> attributes change dynamically or that S1 attributes must somehow be
> used by the hw.  This would at least let the VMM return an error in
> cases where S1 attrs cannot be relied on.  But there are enough
> exporters where the cache attrs are static for the life of the buffer.
> So even if you need to return DMA_BUF_MAP_I_DONT_KNOW, maybe that is
> fine (if x86 can always rely on S1 attrs), or at least will let the
> VMM return an error rather than just blindly assuming things will
> work.
>
> But it makes no sense to reject the whole idea just because of some
> exporters (which may not even need this).  There is always room to let
> them return a map-info value that describes the situation or
> limitations to the VMM.

Well, rejecting it does make sense as far as I can see.

This is a very specific workaround for a platform problem that only
matters there, but it increases complexity for everybody.

If we have no other choice for working around the problem, then I
would say ok, we add an ARM-specific workaround.

But as long as that's not the case, the whole idea is pretty clearly a
NAK from my side.

Regards,
Christian.

> [SNIP]
>>>>>>>>>       #define DMA_BUF_SET_NAME_B  _IOW(DMA_BUF_BASE, 1, __u64)
>>>>>>>>>
>>>>>>>>> +#define DMA_BUF_IOCTL_INFO   _IOWR(DMA_BUF_BASE, 2, struct dma_buf_info)
>>>>>>>>> +
>>>>>>>>>       #endif
Rob Clark Aug. 18, 2022, 3:01 p.m. UTC | #10
On Thu, Aug 18, 2022 at 7:54 AM Christian König
<christian.koenig@amd.com> wrote:
>
> Am 18.08.22 um 16:25 schrieb Rob Clark:
> > On Thu, Aug 18, 2022 at 4:21 AM Christian König
> > <christian.koenig@amd.com> wrote:
> >> Am 17.08.22 um 15:44 schrieb Rob Clark:
> >>> On Wed, Aug 17, 2022 at 2:57 AM Christian König
> >>> <christian.koenig@amd.com> wrote:
> >>>> [SNIP]
> >>>>
> >>>> The resulting cache attrs from combination of S1 and S2 translation
> >>>> can differ.  So ideally we setup the S2 pgtables in guest aligned with
> >>>> host userspace mappings
> >>>> Well exactly that is not very convincing.
> >>>>
> >>>> What you want to do is to use one channel for the address and a
> >>>> different one for the cache attrs, that's not something I would
> >>>> recommend doing in general.
> >>> How would that work.. mmap() is the channel for the address, we'd need
> >>> to introduce a new syscall that returned additional information?
> >> The channel for the address is not mmap(), but rather the page faults.
> >> mmap() is just the function setting up that channel.
> >>
> >> The page faults then insert both the address as well as the caching
> >> attributes (at least on x86).
> > This is true on arm64 as well, but only in the S1 tables (which I
> > would have to assume is the case on x86 as well)
> >
> >> That we then need to forward the caching attributes manually once more
> >> seems really misplaced.
> >>
> >>>> Instead the client pgtables should be setup in a way so that host can
> >>>> overwrite them.
> >>> How?  That is completely not how VMs work.  Even if the host knew
> >>> where the pgtables were and somehow magically knew the various guest
> >>> userspace VAs, it would be racey.
> >> Well you mentioned that the client page tables can be setup in a way
> >> that the host page tables determine what caching to use. As far as I can
> >> see this is what we should use here.
> > On arm64/aarch64, they *can*.. but the system (on some versions of
> > armv8) can also be configured to let S2 determine the attributes.  And
> > apparently there are benefits to this (avoids unnecessary cache
> > flushing in the host, AFAIU.)  This is the case where we need this new
> > api.
> >
> > IMO it is fine for the exporter to return a value indicating that the
> > attributes change dynamically or that S1 attributes must somehow be
> > used by the hw.  This would at least let the VMM return an error in
> > cases where S1 attrs cannot be relied on.  But there are enough
> > exporters where the cache attrs are static for the life of the buffer.
> > So even if you need to return DMA_BUF_MAP_I_DONT_KNOW, maybe that is
> > fine (if x86 can always rely on S1 attrs), or at least will let the
> > VMM return an error rather than just blindly assuming things will
> > work.
> >
> > But it makes no sense to reject the whole idea just because of some
> > exporters (which may not even need this).  There is always room to let
> > them return a map-info value that describes the situation or
> > limitations to the VMM.
>
> Well it does make sense as far as I can see.
>
> This is a very specific workaround for a platform problem which only
> matters there, but increases complexity for everybody.

I'm not sure how this adds complexity for everybody... or at least the
intention was that the default value for the new enum matches the
current status quo, so there is no need to plumb anything through every
single exporter.
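
To make that concrete, a minimal exporter-side sketch (assuming the
enum numbering from the patch, where DMA_BUF_MAP_INCOHERENT is the zero
value; my_dmabuf_ops and obj are hypothetical):

	/*
	 * An exporter that opts in sets map_info before export.  Everyone
	 * else keeps the zero-initialized default, DMA_BUF_MAP_INCOHERENT,
	 * which is today's "assume SYNC is required" behavior.
	 */
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);

	exp_info.ops = &my_dmabuf_ops;
	exp_info.size = obj->size;
	exp_info.flags = O_RDWR;
	exp_info.priv = obj;
	exp_info.map_info = DMA_BUF_COHERENT_WC; /* only if actually WC-coherent */

	dmabuf = dma_buf_export(&exp_info);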

BR,
-R

> If we don't have any other choice on the problem to work around that I
> would say ok we add an ARM specific workaround.
>
> But as long as that's not the case the whole idea is pretty clearly a
> NAK from my side.
>
> Regards,
> Christian.
>
> >
> > BR,
> > -R
> >
> >> Regards,
> >> Christian.
> >>
> >>> BR,
> >>> -R
> >>>
> >>>> Regards,
> >>>> Christian.
> >>>>
> >>>>> BR,
> >>>>> -R
> >>>>>
> >>>>>> Regards,
> >>>>>> Christian.
> >>>>>>
> >>>>>>> BR,
> >>>>>>> -R
> >>>>>>>
> >>>>>>>> If the hardware can't use the caching information from the host CPU page
> >>>>>>>> tables directly then that pretty much completely breaks the concept that
> >>>>>>>> the exporter is responsible for setting up those page tables.
> >>>>>>>>
> >>>>>>>> Regards,
> >>>>>>>> Christian.
> >>>>>>>>
> >>>>>>>>> [SNIP - quoted patch; see the full copy quoted in comment #11 below]
>
Daniel Vetter Sept. 7, 2022, 4:43 p.m. UTC | #11
On Tue, Aug 16, 2022 at 06:50:54PM +0200, Christian König wrote:
> Am 16.08.22 um 16:26 schrieb Rob Clark:
> > On Tue, Aug 16, 2022 at 1:27 AM Christian König
> > <christian.koenig@amd.com> wrote:
> > > Am 15.08.22 um 23:15 schrieb Rob Clark:
> > > > From: Rob Clark <robdclark@chromium.org>
> > > > 
> > > > This is a fairly narrowly focused interface, providing a way for a VMM
> > > > in userspace to tell the guest kernel what pgprot settings to use when
> > > > mapping a buffer to guest userspace.
> > > > 
> > > > For buffers that get mapped into guest userspace, virglrenderer returns
> > > > a dma-buf fd to the VMM (crosvm or qemu).  In addition to mapping the
> > > > pages into the guest VM, it needs to report to drm/virtio in the guest
> > > > the cache settings to use for guest userspace.  In particular, on some
> > > > architectures, creating aliased mappings with different cache attributes
> > > > is frowned upon, so it is important that the guest mappings have the
> > > > same cache attributes as any potential host mappings.
> > > > 
> > > > Signed-off-by: Rob Clark <robdclark@chromium.org>
> > > > ---
> > > > v2: Combine with coherency, as that is a related concept.. and it is
> > > >       relevant to the VMM whether coherent access without the SYNC ioctl
> > > >       is possible; set map_info at export time to make it more clear
> > > >       that it applies for the lifetime of the dma-buf (for any mmap
> > > >       created via the dma-buf)
> > > Well, exactly that's a conceptual NAK from my side.
> > > 
> > > The caching information can change at any time. For CPU mappings even
> > > without further notice from the exporter.
> > You should look before you criticize, as I left you a way out.. the
> > idea was that DMA_BUF_MAP_INCOHERENT should indicate that the buffer
> > cannot be mapped to the guest.  We could ofc add more DMA_BUF_MAP_*
> > values if something else would suit you better.  But the goal is to
> > give the VMM enough information to dtrt, or return an error if mapping
> > to guest is not possible.  That seems better than assuming mapping to
> > guest will work and guessing about cache attrs
> 
> Well I'm not rejecting the implementation, I'm rejecting this from the
> conceptual point of view.
> 
> We intentionally gave the exporter full control over the CPU mappings.
> This approach here breaks that now.
> 
> I haven't seen the full, detailed reason why we should do that, and to be
> honest KVM seems to mess with things it is not supposed to touch.
> 
> For example, touching the page reference count of mappings marked with
> VM_IO is a complete no-go. This is very strong evidence that this was based
> on rather dangerous half-knowledge about the background of the handling here.

Wut?

KVM grabs page references of VM_IO vma? I thought the issue was that we
still had some bo/dma-buf vma that didn't set either VM_IO or VM_PFNMAP,
and not that kvm was just outright breaking every core mm contract there
is.

Is this really what's going on in that other thread about "fixing" ttm?
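
(For context, the contract in question - a sketch of the vma flags a
dma-buf/TTM-style mmap is expected to set, assuming the usual TTM
convention, so that core mm and consumers like KVM treat the pages as
PFN-mapped I/O memory with no struct-page refcounting:)

	/* in the exporter's mmap implementation */
	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;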
-Daniel

> So as long as I don't see a full explanation of why KVM is grabbing
> references to pages while faulting them in, and why we need to manually
> forward the caching while the hardware documentation indicates otherwise,
> I will be rejecting this whole approach.
> 
> Regards,
> Christian.
> 
> > 
> > BR,
> > -R
> > 
> > > If the hardware can't use the caching information from the host CPU page
> > > tables directly then that pretty much completely breaks the concept that
> > > the exporter is responsible for setting up those page tables.
> > > 
> > > Regards,
> > > Christian.
> > > 
> > > >    drivers/dma-buf/dma-buf.c    | 63 +++++++++++++++++++++++++++------
> > > >    include/linux/dma-buf.h      | 11 ++++++
> > > >    include/uapi/linux/dma-buf.h | 68 ++++++++++++++++++++++++++++++++++++
> > > >    3 files changed, 132 insertions(+), 10 deletions(-)
> > > > 
> > > > diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> > > > index 32f55640890c..262c4706f721 100644
> > > > --- a/drivers/dma-buf/dma-buf.c
> > > > +++ b/drivers/dma-buf/dma-buf.c
> > > > @@ -125,6 +125,32 @@ static struct file_system_type dma_buf_fs_type = {
> > > >        .kill_sb = kill_anon_super,
> > > >    };
> > > > 
> > > > +static int __dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
> > > > +{
> > > > +     int ret;
> > > > +
> > > > +     /* check if buffer supports mmap */
> > > > +     if (!dmabuf->ops->mmap)
> > > > +             return -EINVAL;
> > > > +
> > > > +     ret = dmabuf->ops->mmap(dmabuf, vma);
> > > > +
> > > > +     /*
> > > > +      * If the exporter claims to support coherent access, ensure the
> > > > +      * pgprot flags match the claim.
> > > > +      */
> > > > +     if ((dmabuf->map_info != DMA_BUF_MAP_INCOHERENT) && !ret) {
> > > > +             pgprot_t wc_prot = pgprot_writecombine(vma->vm_page_prot);
> > > > +             if (dmabuf->map_info == DMA_BUF_COHERENT_WC) {
> > > > +                     WARN_ON_ONCE(pgprot_val(vma->vm_page_prot) != pgprot_val(wc_prot));
> > > > +             } else {
> > > > +                     WARN_ON_ONCE(pgprot_val(vma->vm_page_prot) == pgprot_val(wc_prot));
> > > > +             }
> > > > +     }
> > > > +
> > > > +     return ret;
> > > > +}
> > > > +
> > > >    static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
> > > >    {
> > > >        struct dma_buf *dmabuf;
> > > > @@ -134,16 +160,12 @@ static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
> > > > 
> > > >        dmabuf = file->private_data;
> > > > 
> > > > -     /* check if buffer supports mmap */
> > > > -     if (!dmabuf->ops->mmap)
> > > > -             return -EINVAL;
> > > > -
> > > >        /* check for overflowing the buffer's size */
> > > >        if (vma->vm_pgoff + vma_pages(vma) >
> > > >            dmabuf->size >> PAGE_SHIFT)
> > > >                return -EINVAL;
> > > > 
> > > > -     return dmabuf->ops->mmap(dmabuf, vma);
> > > > +     return __dma_buf_mmap(dmabuf, vma);
> > > >    }
> > > > 
> > > >    static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence)
> > > > @@ -326,6 +348,27 @@ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf)
> > > >        return 0;
> > > >    }
> > > > 
> > > > +static long dma_buf_info(struct dma_buf *dmabuf, void __user *uarg)
> > > > +{
> > > > +     struct dma_buf_info arg;
> > > > +
> > > > +     if (copy_from_user(&arg, uarg, sizeof(arg)))
> > > > +             return -EFAULT;
> > > > +
> > > > +     switch (arg.param) {
> > > > +     case DMA_BUF_INFO_MAP_INFO:
> > > > +             arg.value = dmabuf->map_info;
> > > > +             break;
> > > > +     default:
> > > > +             return -EINVAL;
> > > > +     }
> > > > +
> > > > +     if (copy_to_user(uarg, &arg, sizeof(arg)))
> > > > +             return -EFAULT;
> > > > +
> > > > +     return 0;
> > > > +}
> > > > +
> > > >    static long dma_buf_ioctl(struct file *file,
> > > >                          unsigned int cmd, unsigned long arg)
> > > >    {
> > > > @@ -369,6 +412,9 @@ static long dma_buf_ioctl(struct file *file,
> > > >        case DMA_BUF_SET_NAME_B:
> > > >                return dma_buf_set_name(dmabuf, (const char __user *)arg);
> > > > 
> > > > +     case DMA_BUF_IOCTL_INFO:
> > > > +             return dma_buf_info(dmabuf, (void __user *)arg);
> > > > +
> > > >        default:
> > > >                return -ENOTTY;
> > > >        }
> > > > @@ -530,6 +576,7 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
> > > >        dmabuf->priv = exp_info->priv;
> > > >        dmabuf->ops = exp_info->ops;
> > > >        dmabuf->size = exp_info->size;
> > > > +     dmabuf->map_info = exp_info->map_info;
> > > >        dmabuf->exp_name = exp_info->exp_name;
> > > >        dmabuf->owner = exp_info->owner;
> > > >        spin_lock_init(&dmabuf->name_lock);
> > > > @@ -1245,10 +1292,6 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
> > > >        if (WARN_ON(!dmabuf || !vma))
> > > >                return -EINVAL;
> > > > 
> > > > -     /* check if buffer supports mmap */
> > > > -     if (!dmabuf->ops->mmap)
> > > > -             return -EINVAL;
> > > > -
> > > >        /* check for offset overflow */
> > > >        if (pgoff + vma_pages(vma) < pgoff)
> > > >                return -EOVERFLOW;
> > > > @@ -1262,7 +1305,7 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
> > > >        vma_set_file(vma, dmabuf->file);
> > > >        vma->vm_pgoff = pgoff;
> > > > 
> > > > -     return dmabuf->ops->mmap(dmabuf, vma);
> > > > +     return __dma_buf_mmap(dmabuf, vma);
> > > >    }
> > > >    EXPORT_SYMBOL_NS_GPL(dma_buf_mmap, DMA_BUF);
> > > > 
> > > > diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> > > > index 71731796c8c3..37923c8d5c24 100644
> > > > --- a/include/linux/dma-buf.h
> > > > +++ b/include/linux/dma-buf.h
> > > > @@ -23,6 +23,8 @@
> > > >    #include <linux/dma-fence.h>
> > > >    #include <linux/wait.h>
> > > > 
> > > > +#include <uapi/linux/dma-buf.h>
> > > > +
> > > >    struct device;
> > > >    struct dma_buf;
> > > >    struct dma_buf_attachment;
> > > > @@ -307,6 +309,13 @@ struct dma_buf {
> > > >         */
> > > >        size_t size;
> > > > 
> > > > +     /**
> > > > +      * @map_info:
> > > > +      *
> > > > +      * CPU mapping/coherency information for the buffer.
> > > > +      */
> > > > +     enum dma_buf_map_info map_info;
> > > > +
> > > >        /**
> > > >         * @file:
> > > >         *
> > > > @@ -533,6 +542,7 @@ struct dma_buf_attachment {
> > > >     * @ops:    Attach allocator-defined dma buf ops to the new buffer
> > > >     * @size:   Size of the buffer - invariant over the lifetime of the buffer
> > > >     * @flags:  mode flags for the file
> > > > + * @map_info:        CPU mapping/coherency information for the buffer
> > > >     * @resv:   reservation-object, NULL to allocate default one
> > > >     * @priv:   Attach private data of allocator to this buffer
> > > >     *
> > > > @@ -545,6 +555,7 @@ struct dma_buf_export_info {
> > > >        const struct dma_buf_ops *ops;
> > > >        size_t size;
> > > >        int flags;
> > > > +     enum dma_buf_map_info map_info;
> > > >        struct dma_resv *resv;
> > > >        void *priv;
> > > >    };
> > > > diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
> > > > index b1523cb8ab30..07b403ffdb43 100644
> > > > --- a/include/uapi/linux/dma-buf.h
> > > > +++ b/include/uapi/linux/dma-buf.h
> > > > @@ -85,6 +85,72 @@ struct dma_buf_sync {
> > > > 
> > > >    #define DMA_BUF_NAME_LEN    32
> > > > 
> > > > +/**
> > > > + * enum dma_buf_map_info - CPU mapping info
> > > > + *
> > > > + * This enum describes coherency of a userspace mapping of the dmabuf.
> > > > + *
> > > > + * Importing devices should check dma_buf::map_info flag and reject an
> > > > + * import if unsupported.  For example, if the exporting device uses
> > > > + * @DMA_BUF_COHERENT_CACHED but the importing device does not support
> > > > + * CPU cache coherency, the dma-buf import should fail.
> > > > + */
> > > > +enum dma_buf_map_info {
> > > > +     /**
> > > > +      * @DMA_BUF_MAP_INCOHERENT: CPU mapping is incoherent.
> > > > +      *
> > > > +      * Use of DMA_BUF_IOCTL_SYNC is required for CPU managed coherency.
> > > > +      */
> > > > +     DMA_BUF_MAP_INCOHERENT,
> > > > +
> > > > +     /**
> > > > +      * @DMA_BUF_COHERENT_WC: CPU mapping is coherent but not cached.
> > > > +      *
> > > > +      * A cpu mmap'ing is coherent, and DMA_BUF_IOCTL_SYNC is not required.
> > > > +      * However fences may still be required for synchronizing access.  I.e.
> > > > +      * coherency can only be relied upon by an explicit-fencing userspace.
> > > > +      * An implicit-sync userspace must still use DMA_BUF_IOCTL_SYNC.
> > > > +      *
> > > > +      * The cpu mapping is writecombine.
> > > > +      */
> > > > +     DMA_BUF_COHERENT_WC,
> > > > +
> > > > +     /**
> > > > +      * @DMA_BUF_COHERENT_CACHED: CPU mapping is coherent and CPU cached.
> > > > +      *
> > > > +      * A cpu mmap'ing is coherent, and DMA_BUF_IOCTL_SYNC is not required.
> > > > +      * However fences may still be required for synchronizing access.  I.e.
> > > > +      * coherency can only be relied upon by an explicit-fencing userspace.
> > > > +      * An implicit-sync userspace must still use DMA_BUF_IOCTL_SYNC.
> > > > +      *
> > > > +      * The cpu mapping is cached.
> > > > +      */
> > > > +     DMA_BUF_COHERENT_CACHED,
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct dma_buf_info - Query info about the buffer.
> > > > + */
> > > > +struct dma_buf_info {
> > > > +
> > > > +#define DMA_BUF_INFO_MAP_INFO    1
> > > > +
> > > > +     /**
> > > > +      * @param: Which param to query
> > > > +      *
> > > > +      * DMA_BUF_INFO_MAP_INFO:
> > > > +      *     Returns enum dma_buf_map_info, describing the coherency and
> > > > +      *     caching of a CPU mapping of the buffer.
> > > > +      */
> > > > +     __u32 param;
> > > > +     __u32 pad;
> > > > +
> > > > +     /**
> > > > +      * @value: Return value of the query.
> > > > +      */
> > > > +     __u64 value;
> > > > +};
> > > > +
> > > >    #define DMA_BUF_BASE                'b'
> > > >    #define DMA_BUF_IOCTL_SYNC  _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)
> > > > 
> > > > @@ -95,4 +161,6 @@ struct dma_buf_sync {
> > > >    #define DMA_BUF_SET_NAME_A  _IOW(DMA_BUF_BASE, 1, __u32)
> > > >    #define DMA_BUF_SET_NAME_B  _IOW(DMA_BUF_BASE, 1, __u64)
> > > > 
> > > > +#define DMA_BUF_IOCTL_INFO   _IOWR(DMA_BUF_BASE, 2, struct dma_buf_info)
> > > > +
> > > >    #endif
>
Daniel Vetter Sept. 7, 2022, 4:55 p.m. UTC | #12
On Thu, Aug 18, 2022 at 08:01:53AM -0700, Rob Clark wrote:
> On Thu, Aug 18, 2022 at 7:54 AM Christian König
> <christian.koenig@amd.com> wrote:
> >
> > Am 18.08.22 um 16:25 schrieb Rob Clark:
> > > On Thu, Aug 18, 2022 at 4:21 AM Christian König
> > > <christian.koenig@amd.com> wrote:
> > >> Am 17.08.22 um 15:44 schrieb Rob Clark:
> > >>> On Wed, Aug 17, 2022 at 2:57 AM Christian König
> > >>> <christian.koenig@amd.com> wrote:
> > >>>> [SNIP]
> > >>>>
> > >>>> The resulting cache attrs from combination of S1 and S2 translation
> > >>>> can differ.  So ideally we setup the S2 pgtables in guest aligned with
> > >>>> host userspace mappings
> > >>>> Well exactly that is not very convincing.
> > >>>>
> > >>>> What you want to do is to use one channel for the address and a
> > >>>> different one for the cache attrs, that's not something I would
> > >>>> recommend doing in general.
> > >>> How would that work.. mmap() is the channel for the address, we'd need
> > >>> to introduce a new syscall that returned additional information?
> > >> The channel for the address is not mmap(), but rather the page faults.
> > >> mmap() is just the function setting up that channel.
> > >>
> > >> The page faults then insert both the address as well as the caching
> > >> attributes (at least on x86).
> > > This is true on arm64 as well, but only in the S1 tables (which I
> > > would have to assume is the case on x86 as well)
> > >
> > >> That we then need to forward the caching attributes manually once more
> > >> seems really misplaced.
> > >>
> > >>>> Instead the client pgtables should be setup in a way so that host can
> > >>>> overwrite them.
> > >>> How?  That is completely not how VMs work.  Even if the host knew
> > >>> where the pgtables were and somehow magically knew the various guest
> > >>> userspace VAs, it would be racey.
> > >> Well you mentioned that the client page tables can be setup in a way
> > >> that the host page tables determine what caching to use. As far as I can
> > >> see this is what we should use here.
> > > On arm64/aarch64, they *can*.. but the system (on some versions of
> > > armv8) can also be configured to let S2 determine the attributes.  And
> > > apparently there are benefits to this (avoids unnecessary cache
> > > flushing in the host, AFAIU.)  This is the case where we need this new
> > > api.
> > >
> > > IMO it is fine for the exporter to return a value indicating that the
> > > attributes change dynamically or that S1 attributes must somehow be
> > > used by the hw.  This would at least let the VMM return an error in
> > > cases where S1 attrs cannot be relied on.  But there are enough
> > > exporters where the cache attrs are static for the life of the buffer.
> > > So even if you need to return DMA_BUF_MAP_I_DONT_KNOW, maybe that is
> > > fine (if x86 can always rely on S1 attrs), or at least will let the
> > > VMM return an error rather than just blindly assuming things will
> > > work.
> > >
> > > But it makes no sense to reject the whole idea just because of some
> > > exporters (which may not even need this).  There is always room to let
> > > them return a map-info value that describes the situation or
> > > limitations to the VMM.
> >
> > Well it does make sense as far as I can see.
> >
> > This is a very specific workaround for a platform problem which only
> > matters there, but increases complexity for everybody.
> 
> > I'm not sure how this adds complexity for everybody... or at least the
> > intention was that the default value for the new enum matches the
> > current status quo, so there is no need to plumb anything through every
> > single exporter.

I think what König freaks out about here, and I think it's the same
concern I have, is that this is for _all_ dma-buf exporters.

Yeah I know we're having this "anything might not be implemented" escape
hatch, but we're also slowly working to get that fixed and make dma-buf
implementations more consistent. So adding a fully generic dma-buf ioctl
which is of very limited use outside of arm64 VMs in a funky configuration
(where the guest fully controls the caching mode in its ptes) seems
misplaced. The usual way we do these very special things is:

- Importer upcasts the dma-buf to the exporter's buffer type by checking
  the ops structure pointer (see the sketch after this list).
- There is no dma-buf interface anymore really, and you can do very
  specific stuff like xgmi mappings, or virtio uuid, or whatever.
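
A rough sketch of that upcast pattern, with a hypothetical exporter
"foo" (not code from any real driver):

	static bool foo_is_our_dmabuf(struct dma_buf *dmabuf)
	{
		/* only trust exporter-private state if the ops match */
		return dmabuf->ops == &foo_dmabuf_ops;
	}

	static struct foo_bo *foo_dmabuf_to_bo(struct dma_buf *dmabuf)
	{
		if (!foo_is_our_dmabuf(dmabuf))
			return ERR_PTR(-EINVAL);
		/* the exporter stashed its object in ->priv at export time */
		return dmabuf->priv;
	}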

I think this should also work like that, which means the query ioctl
should most likely be on some kvm/vm-specific interface, and not on
generic dma-buf.

Also, if we add this interface somewhere in the kvm world, then that would
also be a natural place to implement stuff like "the guest can't actually
overwrite caching modes, ignore this all" and anything else. Ideally this
thing exists only where it's actually needed, i.e. this specific vm+arm64
case - not anywhere else, because experience says that if dma-buf can be
abused in some way, people will.

Also I'd really like to understand the full flow here too; some vague
hand-waving that "apparently it has some cache flushing benefits" is not
enough if this is really only for performance. My experience has been
that when a less privileged entity can control caching, you actually
have to flush more often, not less - at least depending upon the
hardware (some recent intel igpu chips have this issue where, due to
userspace overwriting cache control too much, we have to defensively
flush caches again for everything - hw design is going to get fixed
again soon).
-Daniel

> 
> BR,
> -R
> 
> > If we don't have any other choice on the problem to work around that I
> > would say ok we add an ARM specific workaround.
> >
> > But as long as that's not the case the whole idea is pretty clearly a
> > NAK from my side.
> >
> > Regards,
> > Christian.
> >
> > >
> > > BR,
> > > -R
> > >
> > >> Regards,
> > >> Christian.
> > >>
> > >>> BR,
> > >>> -R
> > >>>
> > >>>> Regards,
> > >>>> Christian.
> > >>>>
> > >>>>> BR,
> > >>>>> -R
> > >>>>>
> > >>>>>> Regards,
> > >>>>>> Christian.
> > >>>>>>
> > >>>>>>> BR,
> > >>>>>>> -R
> > >>>>>>>
> > >>>>>>>> If the hardware can't use the caching information from the host CPU page
> > >>>>>>>> tables directly then that pretty much completely breaks the concept that
> > >>>>>>>> the exporter is responsible for setting up those page tables.
> > >>>>>>>>
> > >>>>>>>> Regards,
> > >>>>>>>> Christian.
> > >>>>>>>>
> > >>>>>>>>> [SNIP - quoted patch; see the full copy quoted in comment #11 above]
> >
Rob Clark Sept. 8, 2022, 5:24 a.m. UTC | #13
On Wed, Sep 7, 2022 at 9:55 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Thu, Aug 18, 2022 at 08:01:53AM -0700, Rob Clark wrote:
> > On Thu, Aug 18, 2022 at 7:54 AM Christian König
> > <christian.koenig@amd.com> wrote:
> > >
> > > Am 18.08.22 um 16:25 schrieb Rob Clark:
> > > > On Thu, Aug 18, 2022 at 4:21 AM Christian König
> > > > <christian.koenig@amd.com> wrote:
> > > >> Am 17.08.22 um 15:44 schrieb Rob Clark:
> > > >>> On Wed, Aug 17, 2022 at 2:57 AM Christian König
> > > >>> <christian.koenig@amd.com> wrote:
> > > >>>> [SNIP]
> > > >>>>
> > > >>>> The resulting cache attrs from combination of S1 and S2 translation
> > > >>>> can differ.  So ideally we setup the S2 pgtables in guest aligned with
> > > >>>> host userspace mappings
> > > >>>> Well exactly that is not very convincing.
> > > >>>>
> > > >>>> What you want to do is to use one channel for the address and a
> > > >>>> different one for the cache attrs, that's not something I would
> > > >>>> recommend doing in general.
> > > >>> How would that work.. mmap() is the channel for the address, we'd need
> > > >>> to introduce a new syscall that returned additional information?
> > > >> The channel for the address is not mmap(), but rather the page faults.
> > > >> mmap() is just the function setting up that channel.
> > > >>
> > > >> The page faults then insert both the address as well as the caching
> > > >> attributes (at least on x86).
> > > > This is true on arm64 as well, but only in the S1 tables (which I
> > > > would have to assume is the case on x86 as well)
> > > >
> > > >> That we then need to forward the caching attributes manually once more
> > > >> seems really misplaced.
> > > >>
> > > >>>> Instead the client pgtables should be setup in a way so that host can
> > > >>>> overwrite them.
> > > >>> How?  That is completely not how VMs work.  Even if the host knew
> > > >>> where the pgtables were and somehow magically knew the various guest
> > > >>> userspace VAs, it would be racey.
> > > >> Well you mentioned that the client page tables can be setup in a way
> > > >> that the host page tables determine what caching to use. As far as I can
> > > >> see this is what we should use here.
> > > > On arm64/aarch64, they *can*.. but the system (on some versions of
> > > > armv8) can also be configured to let S2 determine the attributes.  And
> > > > apparently there are benefits to this (avoids unnecessary cache
> > > > flushing in the host, AFAIU.)  This is the case where we need this new
> > > > api.
> > > >
> > > > IMO it is fine for the exporter to return a value indicating that the
> > > > attributes change dynamically or that S1 attributes must somehow be
> > > > used by the hw.  This would at least let the VMM return an error in
> > > > cases where S1 attrs cannot be relied on.  But there are enough
> > > > exporters where the cache attrs are static for the life of the buffer.
> > > > So even if you need to return DMA_BUF_MAP_I_DONT_KNOW, maybe that is
> > > > fine (if x86 can always rely on S1 attrs), or at least will let the
> > > > VMM return an error rather than just blindly assuming things will
> > > > work.
> > > >
> > > > But it makes no sense to reject the whole idea just because of some
> > > > exporters (which may not even need this).  There is always room to let
> > > > them return a map-info value that describes the situation or
> > > > limitations to the VMM.
> > >
> > > Well it does make sense as far as I can see.
> > >
> > > This is a very specific workaround for a platform problem which only
> > > matters there, but increases complexity for everybody.
> >
> > I'm not sure how this adds complexity for everybody... or at least the
> > intention was that the default value for the new enum matches the
> > current status quo, so there is no need to plumb anything through every
> > single exporter.
>
> I think what König freaks out about here, and I think it's the same
> concern I have, is that this is for _all_ dma-buf exporters.
>
> Yeah I know we're having this "anything might not be implemented" escape
> hatch, but we're also slowly working to get that fixed and make dma-buf
> implementations more consistent. So adding a fully generic dma-buf ioctl
> which is of very limited use outside of arm64 VMs in a funky configuration
> (where the guest fully controls the caching mode in its ptes) seems
> misplaced. The usual way we do these very special things is:

I'm not sure that it is necessarily arm64 specific (or rather, I guess
there are additional archs where virtualization works).

But this isn't *only* solving caching modes; it also lets the dmabuf
exporter indicate to userspace when it shouldn't even try to map the
buffer into the guest (i.e. cases where you really do require the SYNC
ioctl), which seems useful.
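
On the guest side, a rough sketch of what drm/virtio could do with the
reported value (hypothetical helper, not part of this patch):

	/* returns 0 on success, -EINVAL if the buffer must not be mapped */
	static int guest_map_prot(u32 map_info, pgprot_t *prot)
	{
		switch (map_info) {
		case DMA_BUF_COHERENT_CACHED:
			/* a normal cached mapping matches the host side */
			return 0;
		case DMA_BUF_COHERENT_WC:
			*prot = pgprot_writecombine(*prot);
			return 0;
		case DMA_BUF_MAP_INCOHERENT:
		default:
			/* don't even try to map this one into the guest */
			return -EINVAL;
		}
	}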

> - Importer upcasts the dma-buf to the exporter's buffer type by checking
>   the ops structure pointer.

The "importer" in this case is userspace ;-)

> - There is no dma-buf interface anymore really, and you can do very
>   specific stuff like xgmi mappings, or virtio uuid, or whatever.
>
> I think this should also work like that, which means the query ioctl
> should most likely be on some kvm/vm-specific interface, and not on
> generic dma-buf.

A kvm specific ioctl would seem.. odd, given that kvm doesn't
otherwise have anything to do with dmabuf fd's.  The kvm interaction
is simply "make these pages that are mmap'd into the VMM visible in
the guest".

If I were to do something else, it would be a drm/msm specific ioctl.
But that only solves the issue for one driver.. maybe that is ok, I'm
not sure if any of the other non-dGPU drivers that can appear on
anything other than x86 support cached coherent buffers.

But still, having a way to indicate to userspace whether or not SYNC
ioctl is required seems useful.  Ie. rather than assume that because
x86 doesn't need this, no one does, let's provide userspace VMM with
the information it needs.  (The VMM already has arch specific
knowledge, so it should be aware of archs where it doesn't need to
care about getting guest cache attrs correct.)
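
Ie. the translation the VMM would do when answering the guest's map
request is roughly the sketch below (one side is the enum added by
this series; the VIRTIO_GPU_MAP_CACHE_* names on the other side are
from the existing virtio-gpu uapi):

#include <linux/dma-buf.h>
#include <linux/virtio_gpu.h>

static int map_info_to_virtio_caching(__u64 map_info)
{
	switch (map_info) {
	case DMA_BUF_COHERENT_CACHED:
		return VIRTIO_GPU_MAP_CACHE_CACHED;
	case DMA_BUF_COHERENT_WC:
		return VIRTIO_GPU_MAP_CACHE_WC;
	case DMA_BUF_MAP_INCOHERENT:
	default:
		return -1;	/* don't map into the guest at all */
	}
}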

> Also if we add this interface somewhere in the kvm world then that would
> also be a natural place to implement stuff like "the guest can't actually
> overwrite caching modes, ignore this all" and anything else. Ideally this
> thing exists only where it's actually needed, i.e. this specific vm+arm64
> cases. Not anywhere else, because experience says that if dma-buf can be
> abused in some way, people will.
>
> Also I'd really like to understand the full flow here too, some vague
> hand-waving that "apparently it has some cache flushing benefits" is a bit
> too vague, if this is really only for performance. Like my experience has
> been that when a less privileged entity can control caching, then you
> actually have to flush more often, not less. At least depending upon
> hardware (some recent intel igpu chips have this issue where due to
> userspace overwriting cache control too much we have to defensively flush
> caches again for everything - hw design is going to get fixed again soon).

See for ex, https://patchwork.kernel.org/project/linux-arm-kernel/patch/20200915170442.131635-1-alexandru.elisei@arm.com/

BR,
-R

> -Daniel
>
> >
> > BR,
> > -R
> >
> > > If we don't have any other choice on the problem to work around that I
> > > would say ok we add an ARM specific workaround.
> > >
> > > But as long as that's not the case the whole idea is pretty clearly a
> > > NAK from my side.
> > >
> > > Regards,
> > > Christian.
> > >
> > > >
> > > > BR,
> > > > -R
> > > >
> > > >> Regards,
> > > >> Christian.
> > > >>
> > > >>> BR,
> > > >>> -R
> > > >>>
> > > >>>> Regards,
> > > >>>> Christian.
> > > >>>>
> > > >>>>> BR,
> > > >>>>> -R
> > > >>>>>
> > > >>>>>> Regards,
> > > >>>>>> Christian.
> > > >>>>>>
> > > >>>>>>> BR,
> > > >>>>>>> -R
> > > >>>>>>>
> > > >>>>>>>> If the hardware can't use the caching information from the host CPU page
> > > >>>>>>>> tables directly then that pretty much completely breaks the concept that
> > > >>>>>>>> the exporter is responsible for setting up those page tables.
> > > >>>>>>>>
> > > >>>>>>>> Regards,
> > > >>>>>>>> Christian.
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Christian König Sept. 8, 2022, 6:01 a.m. UTC | #14
On 07.09.22 at 18:43, Daniel Vetter wrote:
> On Tue, Aug 16, 2022 at 06:50:54PM +0200, Christian König wrote:
>> On 16.08.22 at 16:26, Rob Clark wrote:
>>> On Tue, Aug 16, 2022 at 1:27 AM Christian König
>>> <christian.koenig@amd.com> wrote:
>>>> On 15.08.22 at 23:15, Rob Clark wrote:
>>>>> From: Rob Clark <robdclark@chromium.org>
>>>>>
>>>>> This is a fairly narrowly focused interface, providing a way for a VMM
>>>>> in userspace to tell the guest kernel what pgprot settings to use when
>>>>> mapping a buffer to guest userspace.
>>>>>
>>>>> For buffers that get mapped into guest userspace, virglrenderer returns
>>>>> a dma-buf fd to the VMM (crosvm or qemu).  In addition to mapping the
>>>>> pages into the guest VM, it needs to report to drm/virtio in the guest
>>>>> the cache settings to use for guest userspace.  In particular, on some
>>>>> architectures, creating aliased mappings with different cache attributes
>>>>> is frowned upon, so it is important that the guest mappings have the
>>>>> same cache attributes as any potential host mappings.
>>>>>
>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>>>> ---
>>>>> v2: Combine with coherency, as that is a related concept.. and it is
>>>>>        relevant to the VMM whether coherent access without the SYNC ioctl
>>>>>        is possible; set map_info at export time to make it more clear
>>>>>        that it applies for the lifetime of the dma-buf (for any mmap
>>>>>        created via the dma-buf)
>>>> Well, exactly that's a conceptual NAK from my side.
>>>>
>>>> The caching information can change at any time. For CPU mappings even
>>>> without further notice from the exporter.
>>> You should look before you criticize, as I left you a way out.. the
>>> idea was that DMA_BUF_MAP_INCOHERENT should indicate that the buffer
>>> cannot be mapped to the guest.  We could ofc add more DMA_BUF_MAP_*
>>> values if something else would suit you better.  But the goal is to
>>> give the VMM enough information to dtrt, or return an error if mapping
>>> to guest is not possible.  That seems better than assuming mapping to
>>> guest will work and guessing about cache attrs
>> Well I'm not rejecting the implementation, I'm rejecting this from the
>> conceptual point of view.
>>
>> We intentionally gave the exporter full control over the CPU mappings. This
>> approach here breaks that now.
>>
>> I haven't seen the full detailed reason why we should do that and to be
>> honest KVM seems to mess with things it is not supposed to touch.
>>
>> For example the page reference count of mappings marked with VM_IO is a
>> complete no-go. This is very strong evidence that this was based on rather
>> dangerous half-knowledge about the background of the handling here.
> Wut?

Yep, that was also my initial reaction.

> KVM grabs page references of VM_IO vma? I thought the issue was that we
> still had some bo/dma-buf vma that didn't set either VM_IO or VM_PFNMAP,
> and not that kvm was just outright breaking every core mm contract there
> is.
>
> Is this really what's going on in that other thread about "fixing" ttm?

At least it seems so. I haven't looked at the actual KVM code in detail, 
but from what I've seen it looks like KVM implements similar 
functionality to GUP to figure out which struct page a virtual address 
points to, but without the correct checks GUP does.

I absolutely don't understand either the why or the what. Especially for
the use case of KVM, grabbing a page reference doesn't seem to make any
sense.

My suspicion is that this should have been an MMU notifier instead, but 
this is really just a gut feeling. Maybe there is a good reason KVM 
grabs this reference.

Christian.

> -Daniel
>
>> So as long as I don't see a full explanation why KVM is grabbing references
>> to pages while faulting them and why we need to manually forward the caching
>> while the hardware documentation indicates otherwise, I will be rejecting
>> this whole approach.
>>
>> Regards,
>> Christian.
>>
>>> BR,
>>> -R
>>>
>>>> If the hardware can't use the caching information from the host CPU page
>>>> tables directly then that pretty much completely breaks the concept that
>>>> the exporter is responsible for setting up those page tables.
>>>>
>>>> Regards,
>>>> Christian.
diff mbox series

Patch

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 32f55640890c..262c4706f721 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -125,6 +125,32 @@  static struct file_system_type dma_buf_fs_type = {
 	.kill_sb = kill_anon_super,
 };
 
+static int __dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
+{
+	int ret;
+
+	/* check if buffer supports mmap */
+	if (!dmabuf->ops->mmap)
+		return -EINVAL;
+
+	ret = dmabuf->ops->mmap(dmabuf, vma);
+
+	/*
+	 * If the exporter claims to support coherent access, ensure the
+	 * pgprot flags match the claim.
+	 */
+	if ((dmabuf->map_info != DMA_BUF_MAP_INCOHERENT) && !ret) {
+		pgprot_t wc_prot = pgprot_writecombine(vma->vm_page_prot);
+		if (dmabuf->map_info == DMA_BUF_COHERENT_WC) {
+			WARN_ON_ONCE(pgprot_val(vma->vm_page_prot) != pgprot_val(wc_prot));
+		} else {
+			WARN_ON_ONCE(pgprot_val(vma->vm_page_prot) == pgprot_val(wc_prot));
+		}
+	}
+
+	return ret;
+}
+
 static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
 {
 	struct dma_buf *dmabuf;
@@ -134,16 +160,12 @@  static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
 
 	dmabuf = file->private_data;
 
-	/* check if buffer supports mmap */
-	if (!dmabuf->ops->mmap)
-		return -EINVAL;
-
 	/* check for overflowing the buffer's size */
 	if (vma->vm_pgoff + vma_pages(vma) >
 	    dmabuf->size >> PAGE_SHIFT)
 		return -EINVAL;
 
-	return dmabuf->ops->mmap(dmabuf, vma);
+	return __dma_buf_mmap(dmabuf, vma);
 }
 
 static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence)
@@ -326,6 +348,27 @@  static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf)
 	return 0;
 }
 
+static long dma_buf_info(struct dma_buf *dmabuf, void __user *uarg)
+{
+	struct dma_buf_info arg;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	switch (arg.param) {
+	case DMA_BUF_INFO_MAP_INFO:
+		arg.value = dmabuf->map_info;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (copy_to_user(uarg, &arg, sizeof(arg)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static long dma_buf_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long arg)
 {
@@ -369,6 +412,9 @@  static long dma_buf_ioctl(struct file *file,
 	case DMA_BUF_SET_NAME_B:
 		return dma_buf_set_name(dmabuf, (const char __user *)arg);
 
+	case DMA_BUF_IOCTL_INFO:
+		return dma_buf_info(dmabuf, (void __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -530,6 +576,7 @@  struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
 	dmabuf->priv = exp_info->priv;
 	dmabuf->ops = exp_info->ops;
 	dmabuf->size = exp_info->size;
+	dmabuf->map_info = exp_info->map_info;
 	dmabuf->exp_name = exp_info->exp_name;
 	dmabuf->owner = exp_info->owner;
 	spin_lock_init(&dmabuf->name_lock);
@@ -1245,10 +1292,6 @@  int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
 	if (WARN_ON(!dmabuf || !vma))
 		return -EINVAL;
 
-	/* check if buffer supports mmap */
-	if (!dmabuf->ops->mmap)
-		return -EINVAL;
-
 	/* check for offset overflow */
 	if (pgoff + vma_pages(vma) < pgoff)
 		return -EOVERFLOW;
@@ -1262,7 +1305,7 @@  int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
 	vma_set_file(vma, dmabuf->file);
 	vma->vm_pgoff = pgoff;
 
-	return dmabuf->ops->mmap(dmabuf, vma);
+	return __dma_buf_mmap(dmabuf, vma);
 }
 EXPORT_SYMBOL_NS_GPL(dma_buf_mmap, DMA_BUF);
 
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 71731796c8c3..37923c8d5c24 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -23,6 +23,8 @@ 
 #include <linux/dma-fence.h>
 #include <linux/wait.h>
 
+#include <uapi/linux/dma-buf.h>
+
 struct device;
 struct dma_buf;
 struct dma_buf_attachment;
@@ -307,6 +309,13 @@  struct dma_buf {
 	 */
 	size_t size;
 
+	/**
+	 * @map_info:
+	 *
+	 * CPU mapping/coherency information for the buffer.
+	 */
+	enum dma_buf_map_info map_info;
+
 	/**
 	 * @file:
 	 *
@@ -533,6 +542,7 @@  struct dma_buf_attachment {
  * @ops:	Attach allocator-defined dma buf ops to the new buffer
  * @size:	Size of the buffer - invariant over the lifetime of the buffer
  * @flags:	mode flags for the file
+ * @map_info:	CPU mapping/coherency information for the buffer
  * @resv:	reservation-object, NULL to allocate default one
  * @priv:	Attach private data of allocator to this buffer
  *
@@ -545,6 +555,7 @@  struct dma_buf_export_info {
 	const struct dma_buf_ops *ops;
 	size_t size;
 	int flags;
+	enum dma_buf_map_info map_info;
 	struct dma_resv *resv;
 	void *priv;
 };
diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
index b1523cb8ab30..07b403ffdb43 100644
--- a/include/uapi/linux/dma-buf.h
+++ b/include/uapi/linux/dma-buf.h
@@ -85,6 +85,72 @@  struct dma_buf_sync {
 
 #define DMA_BUF_NAME_LEN	32
 
+/**
+ * enum dma_buf_map_info - CPU mapping info
+ *
+ * This enum describes coherency of a userspace mapping of the dmabuf.
+ *
+ * Importing devices should check dma_buf::map_info flag and reject an
+ * import if unsupported.  For example, if the exporting device uses
+ * @DMA_BUF_COHERENT_CACHED but the importing device does not support
+ * CPU cache coherency, the dma-buf import should fail.
+ */
+enum dma_buf_map_info {
+	/**
+	 * @DMA_BUF_MAP_INCOHERENT: CPU mapping is incoherent.
+	 *
+	 * Use of DMA_BUF_IOCTL_SYNC is required for CPU managed coherency.
+	 */
+	DMA_BUF_MAP_INCOHERENT,
+
+	/**
+	 * @DMA_BUF_COHERENT_WC: CPU mapping is coherent but not cached.
+	 *
+	 * A cpu mmap'ing is coherent, and DMA_BUF_IOCTL_SYNC is not required.
+	 * However fences may be still required for synchronizing access.  Ie.
+	 * coherency can only be relied upon by an explicit-fencing userspace.
+	 * An implicit-sync userspace must still use DMA_BUF_IOCTL_SYNC.
+	 *
+	 * The cpu mapping is writecombine.
+	 */
+	DMA_BUF_COHERENT_WC,
+
+	/**
+	 * @DMA_BUF_COHERENT_CACHED: CPU mapping is coherent and CPU cached.
+	 *
+	 * A cpu mmap'ing is coherent, and DMA_BUF_IOCTL_SYNC is not required.
+	 * However fences may be still required for synchronizing access.  Ie.
+	 * coherency can only be relied upon by an explicit-fencing userspace.
+	 * An implicit-sync userspace must still use DMA_BUF_IOCTL_SYNC.
+	 *
+	 * The cpu mapping is cached.
+	 */
+	DMA_BUF_COHERENT_CACHED,
+};
+
+/**
+ * struct dma_buf_info - Query info about the buffer.
+ */
+struct dma_buf_info {
+
+#define DMA_BUF_INFO_MAP_INFO    1
+
+	/**
+	 * @param: Which param to query
+	 *
+	 * DMA_BUF_INFO_MAP_INFO:
+	 *     Returns enum dma_buf_map_info, describing the coherency and
+	 *     caching of a CPU mapping of the buffer.
+	 */
+	__u32 param;
+	__u32 pad;
+
+	/**
+	 * @value: Return value of the query.
+	 */
+	__u64 value;
+};
+
 #define DMA_BUF_BASE		'b'
 #define DMA_BUF_IOCTL_SYNC	_IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)
 
@@ -95,4 +161,6 @@  struct dma_buf_sync {
 #define DMA_BUF_SET_NAME_A	_IOW(DMA_BUF_BASE, 1, __u32)
 #define DMA_BUF_SET_NAME_B	_IOW(DMA_BUF_BASE, 1, __u64)
 
+#define DMA_BUF_IOCTL_INFO	_IOWR(DMA_BUF_BASE, 2, struct dma_buf_info)
+
 #endif
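
For illustration, the DMA_BUF_MAP_INCOHERENT case above corresponds to
the bracketing userspace must already do with the existing SYNC ioctl,
roughly (a sketch, error handling trimmed):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/dma-buf.h>

/* CPU write access to an incoherent buffer: cache maintenance is
 * driven by DMA_BUF_IOCTL_SYNC around the access.
 */
static int cpu_write_incoherent(int dmabuf_fd, void *map, size_t len)
{
	struct dma_buf_sync sync = {
		.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE,
	};

	if (ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync) < 0)
		return -1;

	memset(map, 0, len);	/* the actual CPU access */

	sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
	return ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);
}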