[RESEND,v2,4/5] drm/msm: add DRM_MSM_GEM_SYNC_CACHE for non-coherent cache maintenance

Message ID 20201114151717.5369-5-jonathan@marek.ca (mailing list archive)
State New, archived
Series drm/msm: support for host-cached BOs

Commit Message

Jonathan Marek Nov. 14, 2020, 3:17 p.m. UTC
This makes it possible to use the non-coherent cached MSM_BO_CACHED mode,
which otherwise doesn't provide any method for cleaning/invalidating the
cache to sync with the device.

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
---
 drivers/gpu/drm/msm/msm_drv.c | 21 +++++++++++++++++++++
 drivers/gpu/drm/msm/msm_drv.h |  2 ++
 drivers/gpu/drm/msm/msm_gem.c | 23 +++++++++++++++++++++++
 include/uapi/drm/msm_drm.h    | 20 ++++++++++++++++++++
 4 files changed, 66 insertions(+)

Comments

Christoph Hellwig Nov. 14, 2020, 4:24 p.m. UTC | #1
On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> +		size_t range_start, size_t range_end)
> +{
> +	struct msm_gem_object *msm_obj = to_msm_bo(obj);
> +	struct device *dev = msm_obj->base.dev->dev;
> +
> +	/* exit early if get_pages() hasn't been called yet */
> +	if (!msm_obj->pages)
> +		return;
> +
> +	/* TODO: sync only the specified range */
> +
> +	if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> +		dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> +				msm_obj->sgt->nents, DMA_TO_DEVICE);
> +	}
> +
> +	if (flags & MSM_GEM_SYNC_FOR_CPU) {
> +		dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> +				msm_obj->sgt->nents, DMA_FROM_DEVICE);
> +	}

Splitting this helper from the only caller is rather strange, especially
with the two unused arguments.  And I think the way this is specified
to take a range, but ignoring it is actively dangerous.  User space will
rely on it syncing everything sooner or later and then you are stuck.
So just define a sync all primitive for now, and if you really need a
range sync and have actually implemented it add a new ioctl for that.
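
For reference, the "sync everything" shape suggested here would essentially be the patch's helper with the range arguments dropped. A minimal sketch, reusing the names from the patch rather than code from any revision of the series:

void msm_gem_sync_cache_all(struct drm_gem_object *obj, uint32_t flags)
{
	struct msm_gem_object *msm_obj = to_msm_bo(obj);
	struct device *dev = msm_obj->base.dev->dev;

	/* nothing to sync if get_pages() was never called */
	if (!msm_obj->pages)
		return;

	/* clean CPU caches before the device reads the buffer */
	if (flags & MSM_GEM_SYNC_FOR_DEVICE)
		dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
				msm_obj->sgt->nents, DMA_TO_DEVICE);

	/* invalidate CPU caches before the CPU reads what the device wrote */
	if (flags & MSM_GEM_SYNC_FOR_CPU)
		dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
				msm_obj->sgt->nents, DMA_FROM_DEVICE);
}

The matching ioctl struct would then carry only a handle and flags, leaving room to add a separate range-based ioctl once a real range implementation exists.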
Rob Clark Nov. 14, 2020, 6:46 p.m. UTC | #2
On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> > +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> > +             size_t range_start, size_t range_end)
> > +{
> > +     struct msm_gem_object *msm_obj = to_msm_bo(obj);
> > +     struct device *dev = msm_obj->base.dev->dev;
> > +
> > +     /* exit early if get_pages() hasn't been called yet */
> > +     if (!msm_obj->pages)
> > +             return;
> > +
> > +     /* TODO: sync only the specified range */
> > +
> > +     if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> > +             dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> > +                             msm_obj->sgt->nents, DMA_TO_DEVICE);
> > +     }
> > +
> > +     if (flags & MSM_GEM_SYNC_FOR_CPU) {
> > +             dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> > +                             msm_obj->sgt->nents, DMA_FROM_DEVICE);
> > +     }
>
> Splitting this helper from the only caller is rather strange, especially
> with the two unused arguments.  And I think the way this is specified
> to take a range, but ignoring it is actively dangerous.  User space will
> rely on it syncing everything sooner or later and then you are stuck.
> So just define a sync all primitive for now, and if you really need a
> range sync and have actually implemented it add a new ioctl for that.

We do already have a split of ioctl "layer" which enforces valid ioctl
params, etc, and gem (or other) module code which is called by the
ioctl func.  So I think it is fine to keep this split here.  (Also, I
think at some point there will be a uring type of ioctl alternative
which would re-use the same gem func.)

But I do agree that the range should be respected or added later..
drm_ioctl() dispatch is well prepared for extending ioctls.

And I assume there should be some validation that the range is aligned
to cache-line?  Or can we flush a partial cache line?

BR,
-R
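
On the validation question, a hypothetical ioctl-level check might reject ranges that are not aligned to the DMA cache-line size or that fall outside the object. This is a sketch only, not part of the posted series:

static int msm_gem_sync_cache_validate(struct drm_gem_object *obj,
		struct drm_msm_gem_sync_cache *args)
{
	size_t align = dma_get_cache_alignment();

	/* require cache-line aligned bounds so a partial line never
	 * straddles CPU- and GPU-owned data
	 */
	if (!IS_ALIGNED(args->offset, align) || !IS_ALIGNED(args->end, align))
		return -EINVAL;

	/* range must be non-empty and within the object */
	if (args->end <= args->offset || args->end > obj->size)
		return -EINVAL;

	return 0;
}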
Jonathan Marek Nov. 14, 2020, 6:54 p.m. UTC | #3
On 11/14/20 1:46 PM, Rob Clark wrote:
> On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <hch@lst.de> wrote:
>>
>> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
>>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
>>> +             size_t range_start, size_t range_end)
>>> +{
>>> +     struct msm_gem_object *msm_obj = to_msm_bo(obj);
>>> +     struct device *dev = msm_obj->base.dev->dev;
>>> +
>>> +     /* exit early if get_pages() hasn't been called yet */
>>> +     if (!msm_obj->pages)
>>> +             return;
>>> +
>>> +     /* TODO: sync only the specified range */
>>> +
>>> +     if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
>>> +             dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
>>> +                             msm_obj->sgt->nents, DMA_TO_DEVICE);
>>> +     }
>>> +
>>> +     if (flags & MSM_GEM_SYNC_FOR_CPU) {
>>> +             dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
>>> +                             msm_obj->sgt->nents, DMA_FROM_DEVICE);
>>> +     }
>>
>> Splitting this helper from the only caller is rather strange, especially
>> with the two unused arguments.  And I think the way this is specified
>> to take a range, but ignoring it is actively dangerous.  User space will
>> rely on it syncing everything sooner or later and then you are stuck.
>> So just define a sync all primitive for now, and if you really need a
>> range sync and have actually implemented it add a new ioctl for that.
> 
> We do already have a split of ioctl "layer" which enforces valid ioctl
> params, etc, and gem (or other) module code which is called by the
> ioctl func.  So I think it is fine to keep this split here.  (Also, I
> think at some point there will be a uring type of ioctl alternative
> which would re-use the same gem func.)
> 
> But I do agree that the range should be respected or added later..
> drm_ioctl() dispatch is well prepared for extending ioctls.
> 
> And I assume there should be some validation that the range is aligned
> to cache-line?  Or can we flush a partial cache line?
> 

The range is intended to be "sync at least this range", so that 
userspace doesn't have to worry about details like that.

> BR,
> -R
>
Rob Clark Nov. 14, 2020, 7:39 p.m. UTC | #4
On Sat, Nov 14, 2020 at 10:58 AM Jonathan Marek <jonathan@marek.ca> wrote:
>
> On 11/14/20 1:46 PM, Rob Clark wrote:
> > On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <hch@lst.de> wrote:
> >>
> >> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> >>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> >>> +             size_t range_start, size_t range_end)
> >>> +{
> >>> +     struct msm_gem_object *msm_obj = to_msm_bo(obj);
> >>> +     struct device *dev = msm_obj->base.dev->dev;
> >>> +
> >>> +     /* exit early if get_pages() hasn't been called yet */
> >>> +     if (!msm_obj->pages)
> >>> +             return;
> >>> +
> >>> +     /* TODO: sync only the specified range */
> >>> +
> >>> +     if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> >>> +             dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> >>> +                             msm_obj->sgt->nents, DMA_TO_DEVICE);
> >>> +     }
> >>> +
> >>> +     if (flags & MSM_GEM_SYNC_FOR_CPU) {
> >>> +             dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> >>> +                             msm_obj->sgt->nents, DMA_FROM_DEVICE);
> >>> +     }
> >>
> >> Splitting this helper from the only caller is rather strange, especially
> >> with the two unused arguments.  And I think the way this is specified
> >> to take a range, but ignoring it is actively dangerous.  User space will
> >> rely on it syncing everything sooner or later and then you are stuck.
> >> So just define a sync all primitive for now, and if you really need a
> >> range sync and have actually implemented it add a new ioctl for that.
> >
> > We do already have a split of ioctl "layer" which enforces valid ioctl
> > params, etc, and gem (or other) module code which is called by the
> > ioctl func.  So I think it is fine to keep this split here.  (Also, I
> > think at some point there will be a uring type of ioctl alternative
> > which would re-use the same gem func.)
> >
> > But I do agree that the range should be respected or added later..
> > drm_ioctl() dispatch is well prepared for extending ioctls.
> >
> > And I assume there should be some validation that the range is aligned
> > to cache-line?  Or can we flush a partial cache line?
> >
>
> The range is intended to be "sync at least this range", so that
> userspace doesn't have to worry about details like that.
>

I don't think userspace can *not* worry about details like that.
Consider a case where the cpu and gpu are simultaneously accessing
different parts of a buffer (for ex, sub-allocation).  There needs to
be cache-line separation between the two.

BR,
-R
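
To make the sub-allocation case concrete: a userspace allocator carving slices out of a single MSM_BO_CACHED BO would have to round each slice to cache-line granularity so that flushing or invalidating one slice cannot touch its neighbour's data. A simplified sketch; the cache-line size would have to come from the kernel, which is exactly the open question here:

/* hand out cache-line separated slices from a cached BO */
static uint64_t suballoc(uint64_t *cursor, uint64_t size, uint64_t cacheline)
{
	/* round the start up to a cache-line boundary */
	uint64_t offset = (*cursor + cacheline - 1) & ~(cacheline - 1);

	/* the next slice starts aligned again, so the padded tail line
	 * belongs to this slice alone */
	*cursor = offset + size;
	return offset;
}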
Jonathan Marek Nov. 14, 2020, 8:07 p.m. UTC | #5
On 11/14/20 2:39 PM, Rob Clark wrote:
> On Sat, Nov 14, 2020 at 10:58 AM Jonathan Marek <jonathan@marek.ca> wrote:
>>
>> On 11/14/20 1:46 PM, Rob Clark wrote:
>>> On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <hch@lst.de> wrote:
>>>>
>>>> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
>>>>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
>>>>> +             size_t range_start, size_t range_end)
>>>>> +{
>>>>> +     struct msm_gem_object *msm_obj = to_msm_bo(obj);
>>>>> +     struct device *dev = msm_obj->base.dev->dev;
>>>>> +
>>>>> +     /* exit early if get_pages() hasn't been called yet */
>>>>> +     if (!msm_obj->pages)
>>>>> +             return;
>>>>> +
>>>>> +     /* TODO: sync only the specified range */
>>>>> +
>>>>> +     if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
>>>>> +             dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
>>>>> +                             msm_obj->sgt->nents, DMA_TO_DEVICE);
>>>>> +     }
>>>>> +
>>>>> +     if (flags & MSM_GEM_SYNC_FOR_CPU) {
>>>>> +             dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
>>>>> +                             msm_obj->sgt->nents, DMA_FROM_DEVICE);
>>>>> +     }
>>>>
>>>> Splitting this helper from the only caller is rather strange, especially
>>>> with the two unused arguments.  And I think the way this is specified
>>>> to take a range, but ignoring it is actively dangerous.  User space will
>>>> rely on it syncing everything sooner or later and then you are stuck.
>>>> So just define a sync all primitive for now, and if you really need a
>>>> range sync and have actually implemented it add a new ioctl for that.
>>>
>>> We do already have a split of ioctl "layer" which enforces valid ioctl
>>> params, etc, and gem (or other) module code which is called by the
>>> ioctl func.  So I think it is fine to keep this split here.  (Also, I
>>> think at some point there will be a uring type of ioctl alternative
>>> which would re-use the same gem func.)
>>>
>>> But I do agree that the range should be respected or added later..
>>> drm_ioctl() dispatch is well prepared for extending ioctls.
>>>
>>> And I assume there should be some validation that the range is aligned
>>> to cache-line?  Or can we flush a partial cache line?
>>>
>>
>> The range is intended to be "sync at least this range", so that
>> userspace doesn't have to worry about details like that.
>>
> 
> I don't think userspace can *not* worry about details like that.
> Consider a case where the cpu and gpu are simultaneously accessing
> different parts of a buffer (for ex, sub-allocation).  There needs to
> be cache-line separation between the two.
> 

Right.. and it also seems like we can't get away with just 
flushing/invalidating the whole thing.

qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like 
dma_sync_single_for_cpu() does deal in some way with the partial cache 
line case, although I'm not sure that means we can have a 
nonCoherentAtomSize=1.

> BR,
> -R
>
Rob Clark Nov. 14, 2020, 8:48 p.m. UTC | #6
On Sat, Nov 14, 2020 at 12:10 PM Jonathan Marek <jonathan@marek.ca> wrote:
>
> On 11/14/20 2:39 PM, Rob Clark wrote:
> > On Sat, Nov 14, 2020 at 10:58 AM Jonathan Marek <jonathan@marek.ca> wrote:
> >>
> >> On 11/14/20 1:46 PM, Rob Clark wrote:
> >>> On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <hch@lst.de> wrote:
> >>>>
> >>>> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> >>>>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> >>>>> +             size_t range_start, size_t range_end)
> >>>>> +{
> >>>>> +     struct msm_gem_object *msm_obj = to_msm_bo(obj);
> >>>>> +     struct device *dev = msm_obj->base.dev->dev;
> >>>>> +
> >>>>> +     /* exit early if get_pages() hasn't been called yet */
> >>>>> +     if (!msm_obj->pages)
> >>>>> +             return;
> >>>>> +
> >>>>> +     /* TODO: sync only the specified range */
> >>>>> +
> >>>>> +     if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> >>>>> +             dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> >>>>> +                             msm_obj->sgt->nents, DMA_TO_DEVICE);
> >>>>> +     }
> >>>>> +
> >>>>> +     if (flags & MSM_GEM_SYNC_FOR_CPU) {
> >>>>> +             dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> >>>>> +                             msm_obj->sgt->nents, DMA_FROM_DEVICE);
> >>>>> +     }
> >>>>
> >>>> Splitting this helper from the only caller is rather strange, especially
> >>>> with the two unused arguments.  And I think the way this is specified
> >>>> to take a range, but ignoring it is actively dangerous.  User space will
> >>>> rely on it syncing everything sooner or later and then you are stuck.
> >>>> So just define a sync all primitive for now, and if you really need a
> >>>> range sync and have actually implemented it add a new ioctl for that.
> >>>
> >>> We do already have a split of ioctl "layer" which enforces valid ioctl
> >>> params, etc, and gem (or other) module code which is called by the
> >>> ioctl func.  So I think it is fine to keep this split here.  (Also, I
> >>> think at some point there will be a uring type of ioctl alternative
> >>> which would re-use the same gem func.)
> >>>
> >>> But I do agree that the range should be respected or added later..
> >>> drm_ioctl() dispatch is well prepared for extending ioctls.
> >>>
> >>> And I assume there should be some validation that the range is aligned
> >>> to cache-line?  Or can we flush a partial cache line?
> >>>
> >>
> >> The range is intended to be "sync at least this range", so that
> >> userspace doesn't have to worry about details like that.
> >>
> >
> > I don't think userspace can *not* worry about details like that.
> > Consider a case where the cpu and gpu are simultaneously accessing
> > different parts of a buffer (for ex, sub-allocation).  There needs to
> > be cache-line separation between the two.
> >
>
> Right.. and it also seems like we can't get away with just
> flushing/invalidating the whole thing.
>
> qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
> dma_sync_single_for_cpu() does deal in some way with the partial cache
> line case, although I'm not sure that means we can have a
> nonCoherentAtomSize=1.
>

flush/inv the whole thing could be a useful first step, or at least I
can think of some uses for it.  But if it isn't useful for how vk sees
the world, then maybe we should just implement the range properly from
the get-go.  (And I *think* requiring the range to be aligned to
cacheline boundaries.. it is always easier from a kernel uabi PoV to
loosen restrictions later than the other way around.)

BR,
-R
Jordan Crouse Nov. 16, 2020, 5:25 p.m. UTC | #7
On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> This makes it possible to use the non-coherent cached MSM_BO_CACHED mode,
> which otherwise doesn't provide any method for cleaning/invalidating the
> cache to sync with the device.
> 
> Signed-off-by: Jonathan Marek <jonathan@marek.ca>
> ---
>  drivers/gpu/drm/msm/msm_drv.c | 21 +++++++++++++++++++++
>  drivers/gpu/drm/msm/msm_drv.h |  2 ++
>  drivers/gpu/drm/msm/msm_gem.c | 23 +++++++++++++++++++++++
>  include/uapi/drm/msm_drm.h    | 20 ++++++++++++++++++++
>  4 files changed, 66 insertions(+)
> 
> diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
> index bae48afca82e..3f17acdf6594 100644
> --- a/drivers/gpu/drm/msm/msm_drv.c
> +++ b/drivers/gpu/drm/msm/msm_drv.c
> @@ -959,6 +959,26 @@ static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data,
>  	return msm_submitqueue_remove(file->driver_priv, id);
>  }
>  
> +static int msm_ioctl_gem_sync_cache(struct drm_device *dev, void *data,
> +		struct drm_file *file)
> +{
> +	struct drm_msm_gem_sync_cache *args = data;
> +	struct drm_gem_object *obj;
> +
> +	if (args->flags & ~MSM_GEM_SYNC_CACHE_FLAGS)
> +		return -EINVAL;
> +
> +	obj = drm_gem_object_lookup(file, args->handle);
> +	if (!obj)
> +		return -ENOENT;
> +
> +	msm_gem_sync_cache(obj, args->flags, args->offset, args->end);
> +
> +	drm_gem_object_put(obj);
> +
> +	return 0;
> +}
> +
>  static const struct drm_ioctl_desc msm_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(MSM_GET_PARAM,    msm_ioctl_get_param,    DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(MSM_GEM_NEW,      msm_ioctl_gem_new,      DRM_RENDER_ALLOW),
> @@ -971,6 +991,7 @@ static const struct drm_ioctl_desc msm_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_NEW,   msm_ioctl_submitqueue_new,   DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_CLOSE, msm_ioctl_submitqueue_close, DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_QUERY, msm_ioctl_submitqueue_query, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(MSM_GEM_SYNC_CACHE,    msm_ioctl_gem_sync_cache,    DRM_RENDER_ALLOW),
>  };
>  
>  static const struct vm_operations_struct vm_ops = {
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index 22ebecb28349..f170f843010e 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -318,6 +318,8 @@ void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu);
>  void msm_gem_active_put(struct drm_gem_object *obj);
>  int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout);
>  int msm_gem_cpu_fini(struct drm_gem_object *obj);
> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> +		size_t range_start, size_t range_end);
>  void msm_gem_free_object(struct drm_gem_object *obj);
>  int msm_gem_new_handle(struct drm_device *dev, struct drm_file *file,
>  		uint32_t size, uint32_t flags, uint32_t *handle, char *name);
> diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
> index 3d8254b5de16..039738696f9a 100644
> --- a/drivers/gpu/drm/msm/msm_gem.c
> +++ b/drivers/gpu/drm/msm/msm_gem.c
> @@ -797,6 +797,29 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj)
>  	return 0;
>  }
>  
> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> +		size_t range_start, size_t range_end)
> +{
> +	struct msm_gem_object *msm_obj = to_msm_bo(obj);
> +	struct device *dev = msm_obj->base.dev->dev;
> +

On a6xx targets with I/O coherency you can skip this (assuming that the IOMMU
flag has been set for this buffer). I'm not sure if one of the other patches
already does the bypass but I mention it here since this is the point of
no-return.

Jordan

> +	/* exit early if get_pages() hasn't been called yet */
> +	if (!msm_obj->pages)
> +		return;
> +
> +	/* TODO: sync only the specified range */
> +
> +	if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> +		dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> +				msm_obj->sgt->nents, DMA_TO_DEVICE);
> +	}
> +
> +	if (flags & MSM_GEM_SYNC_FOR_CPU) {
> +		dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> +				msm_obj->sgt->nents, DMA_FROM_DEVICE);
> +	}
> +}
> +
>  #ifdef CONFIG_DEBUG_FS
>  static void describe_fence(struct dma_fence *fence, const char *type,
>  		struct seq_file *m)
> diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
> index 474497e8743a..c8288f328528 100644
> --- a/include/uapi/drm/msm_drm.h
> +++ b/include/uapi/drm/msm_drm.h
> @@ -319,6 +319,24 @@ struct drm_msm_submitqueue_query {
>  	__u32 pad;
>  };
>  
> +/*
> + * Host cache maintenance (relevant for MSM_BO_CACHED); the driver may
> + * perform a clean+invalidate (flush) even when only a clean is requested
> + */
> +
> +#define MSM_GEM_SYNC_FOR_DEVICE		0x1
> +#define MSM_GEM_SYNC_FOR_CPU		0x2
> +
> +#define MSM_GEM_SYNC_CACHE_FLAGS	(MSM_GEM_SYNC_FOR_DEVICE | \
> +					 MSM_GEM_SYNC_FOR_CPU)
> +
> +struct drm_msm_gem_sync_cache {
> +	__u32 handle;
> +	__u32 flags;
> +	__u64 offset;
> +	__u64 end;      /* offset + size */
> +};
> +
>  #define DRM_MSM_GET_PARAM              0x00
>  /* placeholder:
>  #define DRM_MSM_SET_PARAM              0x01
> @@ -336,6 +354,7 @@ struct drm_msm_submitqueue_query {
>  #define DRM_MSM_SUBMITQUEUE_NEW        0x0A
>  #define DRM_MSM_SUBMITQUEUE_CLOSE      0x0B
>  #define DRM_MSM_SUBMITQUEUE_QUERY      0x0C
> +#define DRM_MSM_GEM_SYNC_CACHE         0x0D
>  
>  #define DRM_IOCTL_MSM_GET_PARAM        DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param)
>  #define DRM_IOCTL_MSM_GEM_NEW          DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new)
> @@ -348,6 +367,7 @@ struct drm_msm_submitqueue_query {
>  #define DRM_IOCTL_MSM_SUBMITQUEUE_NEW    DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue)
>  #define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32)
>  #define DRM_IOCTL_MSM_SUBMITQUEUE_QUERY  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_QUERY, struct drm_msm_submitqueue_query)
> +#define DRM_IOCTL_MSM_GEM_SYNC_CACHE     DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_SYNC_CACHE, struct drm_msm_gem_sync_cache)
>  
>  #if defined(__cplusplus)
>  }
> -- 
> 2.26.1
> 
> _______________________________________________
> Freedreno mailing list
> Freedreno@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/freedreno
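
A sketch of the bypass Jordan mentions above: if the BO is mapped I/O-coherently, the CPU caches are already coherent with the GPU and the helper can return early. The flag name below is purely illustrative; the real check would depend on how the earlier patches in the series track coherency:

void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
		size_t range_start, size_t range_end)
{
	struct msm_gem_object *msm_obj = to_msm_bo(obj);
	struct device *dev = msm_obj->base.dev->dev;

	/* exit early if get_pages() hasn't been called yet */
	if (!msm_obj->pages)
		return;

	/* hypothetical flag: I/O-coherent BOs need no CPU cache maintenance */
	if (msm_obj->flags & MSM_BO_IO_COHERENT)
		return;

	/* ... dma_sync_sg_for_device()/dma_sync_sg_for_cpu() as in the patch ... */
}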
Jordan Crouse Nov. 16, 2020, 5:27 p.m. UTC | #8
On Sat, Nov 14, 2020 at 11:39:45AM -0800, Rob Clark wrote:
> On Sat, Nov 14, 2020 at 10:58 AM Jonathan Marek <jonathan@marek.ca> wrote:
> >
> > On 11/14/20 1:46 PM, Rob Clark wrote:
> > > On Sat, Nov 14, 2020 at 8:24 AM Christoph Hellwig <hch@lst.de> wrote:
> > >>
> > >> On Sat, Nov 14, 2020 at 10:17:12AM -0500, Jonathan Marek wrote:
> > >>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
> > >>> +             size_t range_start, size_t range_end)
> > >>> +{
> > >>> +     struct msm_gem_object *msm_obj = to_msm_bo(obj);
> > >>> +     struct device *dev = msm_obj->base.dev->dev;
> > >>> +
> > >>> +     /* exit early if get_pages() hasn't been called yet */
> > >>> +     if (!msm_obj->pages)
> > >>> +             return;
> > >>> +
> > >>> +     /* TODO: sync only the specified range */
> > >>> +
> > >>> +     if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
> > >>> +             dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
> > >>> +                             msm_obj->sgt->nents, DMA_TO_DEVICE);
> > >>> +     }
> > >>> +
> > >>> +     if (flags & MSM_GEM_SYNC_FOR_CPU) {
> > >>> +             dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
> > >>> +                             msm_obj->sgt->nents, DMA_FROM_DEVICE);
> > >>> +     }
> > >>
> > >> Splitting this helper from the only caller is rather strange, especially
> > >> with the two unused arguments.  And I think the way this is specified
> > >> to take a range, but ignoring it is actively dangerous.  User space will
> > >> rely on it syncing everything sooner or later and then you are stuck.
> > >> So just define a sync all primitive for now, and if you really need a
> > >> range sync and have actually implemented it add a new ioctl for that.
> > >
> > > We do already have a split of ioctl "layer" which enforces valid ioctl
> > > params, etc, and gem (or other) module code which is called by the
> > > ioctl func.  So I think it is fine to keep this split here.  (Also, I
> > > think at some point there will be a uring type of ioctl alternative
> > > which would re-use the same gem func.)
> > >
> > > But I do agree that the range should be respected or added later..
> > > drm_ioctl() dispatch is well prepared for extending ioctls.
> > >
> > > And I assume there should be some validation that the range is aligned
> > > to cache-line?  Or can we flush a partial cache line?
> > >
> >
> > The range is intended to be "sync at least this range", so that
> > userspace doesn't have to worry about details like that.
> >
> 
> I don't think userspace can *not* worry about details like that.
> Consider a case where the cpu and gpu are simultaneously accessing
> different parts of a buffer (for ex, sub-allocation).  There needs to
> be cache-line separation between the two.

There is at least one compute conformance test that I can think of that does
exactly this.

Jordan

> BR,
> -R
> _______________________________________________
> Freedreno mailing list
> Freedreno@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/freedreno
Christoph Hellwig Nov. 16, 2020, 5:33 p.m. UTC | #9
On Sat, Nov 14, 2020 at 03:07:20PM -0500, Jonathan Marek wrote:
> qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like 
> dma_sync_single_for_cpu() does deal in some way with the partial cache line 
> case, although I'm not sure that means we can have a nonCoherentAtomSize=1.

No, it doesn't.  You need to ensure ownership is managed at
dma_get_cache_alignment() granularity.
Rob Clark Nov. 16, 2020, 5:50 p.m. UTC | #10
On Mon, Nov 16, 2020 at 9:33 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Sat, Nov 14, 2020 at 03:07:20PM -0500, Jonathan Marek wrote:
> > qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
> > dma_sync_single_for_cpu() does deal in some way with the partial cache line
> > case, although I'm not sure that means we can have a nonCoherentAtomSize=1.
>
> No, it doesn't.  You need to ensure ownership is managed at
> dma_get_cache_alignment() granularity.

my guess is nonCoherentAtomSize=1 only works in the case of cache
coherent buffers

BR,
-R
Jonathan Marek Nov. 16, 2020, 5:52 p.m. UTC | #11
On 11/16/20 12:50 PM, Rob Clark wrote:
> On Mon, Nov 16, 2020 at 9:33 AM Christoph Hellwig <hch@lst.de> wrote:
>>
>> On Sat, Nov 14, 2020 at 03:07:20PM -0500, Jonathan Marek wrote:
>>> qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
>>> dma_sync_single_for_cpu() does deal in some way with the partial cache line
>>> case, although I'm not sure that means we can have a nonCoherentAtomSize=1.
>>
>> No, it doesn't.  You need to ensure ownership is managed at
>> dma_get_cache_alignment() granularity.
> 
> my guess is nonCoherentAtomSize=1 only works in the case of cache
> coherent buffers
> 

nonCoherentAtomSize doesn't apply to coherent memory (as the name 
implies), I guess qcom's driver is just wrong about having 
nonCoherentAtomSize=1.

Jordan just mentioned there is at least one conformance test for this, I 
wonder if it just doesn't test it well enough, or just doesn't test the 
non-coherent memory type?
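
For the Vulkan side of this, nonCoherentAtomSize is the granularity userspace must use when flushing or invalidating mapped non-coherent memory, so advertising 1 is only safe if the underlying memory type really is coherent. A rough illustration of how a range is rounded out to that granularity before calling the API (the rounding and variable names here are just an example):

	VkDeviceSize atom = limits.nonCoherentAtomSize;
	VkDeviceSize begin = (offset / atom) * atom;
	VkDeviceSize end = ((offset + size + atom - 1) / atom) * atom;

	VkMappedMemoryRange range = {
		.sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
		.memory = mem,		/* the VkDeviceMemory backing the BO */
		.offset = begin,
		.size   = end - begin,
	};
	vkFlushMappedMemoryRanges(device, 1, &range);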
Rob Clark Nov. 29, 2020, 6:51 p.m. UTC | #12
On Mon, Nov 16, 2020 at 9:55 AM Jonathan Marek <jonathan@marek.ca> wrote:
>
> On 11/16/20 12:50 PM, Rob Clark wrote:
> > On Mon, Nov 16, 2020 at 9:33 AM Christoph Hellwig <hch@lst.de> wrote:
> >>
> >> On Sat, Nov 14, 2020 at 03:07:20PM -0500, Jonathan Marek wrote:
> >>> qcom's vulkan driver has nonCoherentAtomSize=1, and it looks like
> >>> dma_sync_single_for_cpu() does deal in some way with the partial cache line
> >>> case, although I'm not sure that means we can have a nonCoherentAtomSize=1.
> >>
> >> No, it doesn't.  You need to ensure ownership is managed at
> >> dma_get_cache_alignment() granularity.
> >
> > my guess is nonCoherentAtomSize=1 only works in the case of cache
> > coherent buffers
> >
>
> nonCoherentAtomSize doesn't apply to coherent memory (as the name
> implies), I guess qcom's driver is just wrong about having
> nonCoherentAtomSize=1.
>
> Jordan just mentioned there is at least one conformance test for this, I
> wonder if it just doesn't test it well enough, or just doesn't test the
> non-coherent memory type?

I was *assuming* (but could be wrong) that Jordan was referring to an
opencl cts test?

At any rate, it is sounding like you should add a
`MSM_PARAM_CACHE_ALIGNMENT` type of param that returns
dma_get_cache_alignment(), and then properly implement offset/end

BR,
-R
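
Taken together, a range-respecting implementation might round the user-supplied window out to dma_get_cache_alignment() and then sync only the overlapping portion of each scatterlist entry. The sketch below is a hypothetical illustration of that direction, not code from a later revision of the series:

static void msm_gem_sync_range(struct device *dev, struct sg_table *sgt,
		size_t start, size_t end, bool for_device)
{
	size_t align = dma_get_cache_alignment();
	struct scatterlist *sg;
	size_t cur = 0;
	int i;

	/* ownership is handed over at cache-line granularity */
	start = rounddown(start, align);
	end = roundup(end, align);

	for_each_sgtable_dma_sg(sgt, sg, i) {
		size_t len = sg_dma_len(sg);
		size_t first = max(start, cur);
		size_t last = min(end, cur + len);

		/* sync only the part of this sg entry inside [start, end) */
		if (first < last) {
			dma_addr_t addr = sg_dma_address(sg) + (first - cur);

			if (for_device)
				dma_sync_single_for_device(dev, addr,
						last - first, DMA_TO_DEVICE);
			else
				dma_sync_single_for_cpu(dev, addr,
						last - first, DMA_FROM_DEVICE);
		}
		cur += len;
	}
}

Userspace would query the same alignment (for example via the MSM_PARAM_CACHE_ALIGNMENT idea above) to know how to place its sub-allocations.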
Patch

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index bae48afca82e..3f17acdf6594 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -959,6 +959,26 @@  static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data,
 	return msm_submitqueue_remove(file->driver_priv, id);
 }
 
+static int msm_ioctl_gem_sync_cache(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct drm_msm_gem_sync_cache *args = data;
+	struct drm_gem_object *obj;
+
+	if (args->flags & ~MSM_GEM_SYNC_CACHE_FLAGS)
+		return -EINVAL;
+
+	obj = drm_gem_object_lookup(file, args->handle);
+	if (!obj)
+		return -ENOENT;
+
+	msm_gem_sync_cache(obj, args->flags, args->offset, args->end);
+
+	drm_gem_object_put(obj);
+
+	return 0;
+}
+
 static const struct drm_ioctl_desc msm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(MSM_GET_PARAM,    msm_ioctl_get_param,    DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(MSM_GEM_NEW,      msm_ioctl_gem_new,      DRM_RENDER_ALLOW),
@@ -971,6 +991,7 @@  static const struct drm_ioctl_desc msm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_NEW,   msm_ioctl_submitqueue_new,   DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_CLOSE, msm_ioctl_submitqueue_close, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_QUERY, msm_ioctl_submitqueue_query, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(MSM_GEM_SYNC_CACHE,    msm_ioctl_gem_sync_cache,    DRM_RENDER_ALLOW),
 };
 
 static const struct vm_operations_struct vm_ops = {
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 22ebecb28349..f170f843010e 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -318,6 +318,8 @@  void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu);
 void msm_gem_active_put(struct drm_gem_object *obj);
 int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout);
 int msm_gem_cpu_fini(struct drm_gem_object *obj);
+void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
+		size_t range_start, size_t range_end);
 void msm_gem_free_object(struct drm_gem_object *obj);
 int msm_gem_new_handle(struct drm_device *dev, struct drm_file *file,
 		uint32_t size, uint32_t flags, uint32_t *handle, char *name);
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 3d8254b5de16..039738696f9a 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -797,6 +797,29 @@  int msm_gem_cpu_fini(struct drm_gem_object *obj)
 	return 0;
 }
 
+void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags,
+		size_t range_start, size_t range_end)
+{
+	struct msm_gem_object *msm_obj = to_msm_bo(obj);
+	struct device *dev = msm_obj->base.dev->dev;
+
+	/* exit early if get_pages() hasn't been called yet */
+	if (!msm_obj->pages)
+		return;
+
+	/* TODO: sync only the specified range */
+
+	if (flags & MSM_GEM_SYNC_FOR_DEVICE) {
+		dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
+				msm_obj->sgt->nents, DMA_TO_DEVICE);
+	}
+
+	if (flags & MSM_GEM_SYNC_FOR_CPU) {
+		dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
+				msm_obj->sgt->nents, DMA_FROM_DEVICE);
+	}
+}
+
 #ifdef CONFIG_DEBUG_FS
 static void describe_fence(struct dma_fence *fence, const char *type,
 		struct seq_file *m)
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
index 474497e8743a..c8288f328528 100644
--- a/include/uapi/drm/msm_drm.h
+++ b/include/uapi/drm/msm_drm.h
@@ -319,6 +319,24 @@  struct drm_msm_submitqueue_query {
 	__u32 pad;
 };
 
+/*
+ * Host cache maintenance (relevant for MSM_BO_CACHED); the driver may
+ * perform a clean+invalidate (flush) even when only a clean is requested
+ */
+
+#define MSM_GEM_SYNC_FOR_DEVICE		0x1
+#define MSM_GEM_SYNC_FOR_CPU		0x2
+
+#define MSM_GEM_SYNC_CACHE_FLAGS	(MSM_GEM_SYNC_FOR_DEVICE | \
+					 MSM_GEM_SYNC_FOR_CPU)
+
+struct drm_msm_gem_sync_cache {
+	__u32 handle;
+	__u32 flags;
+	__u64 offset;
+	__u64 end;      /* offset + size */
+};
+
 #define DRM_MSM_GET_PARAM              0x00
 /* placeholder:
 #define DRM_MSM_SET_PARAM              0x01
@@ -336,6 +354,7 @@  struct drm_msm_submitqueue_query {
 #define DRM_MSM_SUBMITQUEUE_NEW        0x0A
 #define DRM_MSM_SUBMITQUEUE_CLOSE      0x0B
 #define DRM_MSM_SUBMITQUEUE_QUERY      0x0C
+#define DRM_MSM_GEM_SYNC_CACHE         0x0D
 
 #define DRM_IOCTL_MSM_GET_PARAM        DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param)
 #define DRM_IOCTL_MSM_GEM_NEW          DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new)
@@ -348,6 +367,7 @@  struct drm_msm_submitqueue_query {
 #define DRM_IOCTL_MSM_SUBMITQUEUE_NEW    DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue)
 #define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32)
 #define DRM_IOCTL_MSM_SUBMITQUEUE_QUERY  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_QUERY, struct drm_msm_submitqueue_query)
+#define DRM_IOCTL_MSM_GEM_SYNC_CACHE     DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_SYNC_CACHE, struct drm_msm_gem_sync_cache)
 
 #if defined(__cplusplus)
 }