diff mbox

[3/4] drm/nouveau: hook up cache sync functions

Message ID 1400483458-9648-4-git-send-email-acourbot@nvidia.com (mailing list archive)
State New, archived
Headers show

Commit Message

Alexandre Courbot May 19, 2014, 7:10 a.m. UTC
From: Lucas Stach <dev@lynxeye.de>

Signed-off-by: Lucas Stach <dev@lynxeye.de>
[acourbot@nvidia.com: make conditional and platform-friendly]
Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
---
 drivers/gpu/drm/nouveau/nouveau_bo.c  | 32 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_bo.h  | 20 ++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_gem.c |  8 +++++++-
 3 files changed, 59 insertions(+), 1 deletion(-)

Comments

Thierry Reding May 19, 2014, 8:46 a.m. UTC | #1
On Mon, May 19, 2014 at 04:10:57PM +0900, Alexandre Courbot wrote:
> From: Lucas Stach <dev@lynxeye.de>
> 
> Signed-off-by: Lucas Stach <dev@lynxeye.de>
> [acourbot@nvidia.com: make conditional and platform-friendly]
> Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>

Perhaps having a propery commit message here would be good.

> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
[...]
> +#ifdef NOUVEAU_NEED_CACHE_SYNC
> +void
> +nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo)
> +{
> +	struct nouveau_device *device;
> +	struct ttm_tt *ttm = nvbo->bo.ttm;
> +
> +	device = nouveau_dev(nouveau_bdev(ttm->bdev)->dev);
> +
> +	if (nvbo->bo.ttm && nvbo->bo.ttm->caching_state == tt_cached)
> +		ttm_dma_tt_cache_sync_for_cpu((struct ttm_dma_tt *)nvbo->bo.ttm,
> +					      nv_device_base(device));

Can we be certain at this point that the struct ttm_tt is in fact a
struct ttm_dma_tt?

> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.h b/drivers/gpu/drm/nouveau/nouveau_bo.h
[...]
> +#if IS_ENABLED(CONFIG_ARCH_TEGRA)
> +#define NOUVEAU_NEED_CACHE_SYNC
> +#endif

I know I gave this as an example myself when we discussed this offline,
but I'm now thinking that this might actually be better off in Kconfig.

> +#ifdef NOUVEAU_NEED_CACHE_SYNC
> +void nouveau_bo_sync_for_cpu(struct nouveau_bo *);
> +void nouveau_bo_sync_for_device(struct nouveau_bo *);
> +#else
> +static inline void
> +nouveau_bo_sync_for_cpu(struct nouveau_bo *)
> +{
> +}
> +
> +static inline void
> +nouveau_bo_sync_for_device(struct nouveau_bo *)
> +{
> +}
> +#endif
> +
> +

There's a gratuituous blank line here.

> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index c90c0dc0afe8..b7e42fdc9634 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -897,7 +897,13 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
>  	ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
>  	spin_unlock(&nvbo->bo.bdev->fence_lock);
>  	drm_gem_object_unreference_unlocked(gem);
> -	return ret;
> +
> +	if (ret)
> +		return ret;
> +
> +	nouveau_bo_sync_for_cpu(nvbo);
> +
> +	return 0;
>  }

This could be rewritten as:

	if (!ret)
		nouveau_bo_sync_for_cpu(nvbo);

	return ret;

Which would be slightly shorter.

On second thought, perhaps part of nouveau_gem_ioctl_cpu_prep() could be
refactored into a separate function to make this more symmetric. If we
put that in nouveau_bo.c and name it nouveau_bo_wait() for example, the
dummies can go away and both nouveau_bo_sync_for_{cpu,device}() can be
made static. I also think that's cleaner because it has both variants of
the nouveau_bo_sync_for_*() calls in the same file.

Thierry
Lucas Stach May 19, 2014, 9:31 a.m. UTC | #2
Am Montag, den 19.05.2014, 16:10 +0900 schrieb Alexandre Courbot:
> From: Lucas Stach <dev@lynxeye.de>
> 
> Signed-off-by: Lucas Stach <dev@lynxeye.de>
> [acourbot@nvidia.com: make conditional and platform-friendly]
> Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
> ---
>  drivers/gpu/drm/nouveau/nouveau_bo.c  | 32 ++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/nouveau/nouveau_bo.h  | 20 ++++++++++++++++++++
>  drivers/gpu/drm/nouveau/nouveau_gem.c |  8 +++++++-
>  3 files changed, 59 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index b6dc85c614be..0886f47e5244 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -407,6 +407,8 @@ nouveau_bo_validate(struct nouveau_bo *nvbo, bool interruptible,
>  {
>  	int ret;
>  
> +	nouveau_bo_sync_for_device(nvbo);
> +
>  	ret = ttm_bo_validate(&nvbo->bo, &nvbo->placement,
>  			      interruptible, no_wait_gpu);
>  	if (ret)
> @@ -487,6 +489,36 @@ nouveau_bo_invalidate_caches(struct ttm_bo_device *bdev, uint32_t flags)
>  	return 0;
>  }
>  
> +#ifdef NOUVEAU_NEED_CACHE_SYNC

I don't like this ifdef at all. I know calling this functions will add a
little overhead to x86 where it isn't strictly required, but I think
it's negligible.

When I looked at them the dma_sync_single_for_[device|cpu] functions
which are called here map out to just a drain of the PCI store buffer on
x86, which should be fast enough to be done unconditionally. They won't
so any time-consuming cache synchronization on PCI coherent arches.

Regards,
Lucas
Lucas Stach May 19, 2014, 9:44 a.m. UTC | #3
Am Montag, den 19.05.2014, 10:46 +0200 schrieb Thierry Reding:
> On Mon, May 19, 2014 at 04:10:57PM +0900, Alexandre Courbot wrote:
> > From: Lucas Stach <dev@lynxeye.de>
> > 
> > Signed-off-by: Lucas Stach <dev@lynxeye.de>
> > [acourbot@nvidia.com: make conditional and platform-friendly]
> > Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
> 
> Perhaps having a propery commit message here would be good.
> 
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> [...]
> > +#ifdef NOUVEAU_NEED_CACHE_SYNC
> > +void
> > +nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo)
> > +{
> > +	struct nouveau_device *device;
> > +	struct ttm_tt *ttm = nvbo->bo.ttm;
> > +
> > +	device = nouveau_dev(nouveau_bdev(ttm->bdev)->dev);
> > +
> > +	if (nvbo->bo.ttm && nvbo->bo.ttm->caching_state == tt_cached)
> > +		ttm_dma_tt_cache_sync_for_cpu((struct ttm_dma_tt *)nvbo->bo.ttm,
> > +					      nv_device_base(device));
> 
> Can we be certain at this point that the struct ttm_tt is in fact a
> struct ttm_dma_tt?

Yes, for all cases except AGP, where things are mapped WC anyways (so
caching_state is always != cached) nouveau_bo_driver uses
nouveau_ttm_tt_create() as its ttm_tt_create callback. This in turn
calls nouveau_sgdma_create_ttm() which uses ttm_dma_tt_init()
unconditionally.

Regards,
Lucas
Alexandre Courbot May 23, 2014, 6 a.m. UTC | #4
On Mon, May 19, 2014 at 5:46 PM, Thierry Reding
<thierry.reding@gmail.com> wrote:
> On Mon, May 19, 2014 at 04:10:57PM +0900, Alexandre Courbot wrote:
>> From: Lucas Stach <dev@lynxeye.de>
>>
>> Signed-off-by: Lucas Stach <dev@lynxeye.de>
>> [acourbot@nvidia.com: make conditional and platform-friendly]
>> Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
>
> Perhaps having a propery commit message here would be good.
>
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> [...]
>> +#ifdef NOUVEAU_NEED_CACHE_SYNC
>> +void
>> +nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo)
>> +{
>> +     struct nouveau_device *device;
>> +     struct ttm_tt *ttm = nvbo->bo.ttm;
>> +
>> +     device = nouveau_dev(nouveau_bdev(ttm->bdev)->dev);
>> +
>> +     if (nvbo->bo.ttm && nvbo->bo.ttm->caching_state == tt_cached)
>> +             ttm_dma_tt_cache_sync_for_cpu((struct ttm_dma_tt *)nvbo->bo.ttm,
>> +                                           nv_device_base(device));
>
> Can we be certain at this point that the struct ttm_tt is in fact a
> struct ttm_dma_tt?
>
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.h b/drivers/gpu/drm/nouveau/nouveau_bo.h
> [...]
>> +#if IS_ENABLED(CONFIG_ARCH_TEGRA)
>> +#define NOUVEAU_NEED_CACHE_SYNC
>> +#endif
>
> I know I gave this as an example myself when we discussed this offline,
> but I'm now thinking that this might actually be better off in Kconfig.
>
>> +#ifdef NOUVEAU_NEED_CACHE_SYNC
>> +void nouveau_bo_sync_for_cpu(struct nouveau_bo *);
>> +void nouveau_bo_sync_for_device(struct nouveau_bo *);
>> +#else
>> +static inline void
>> +nouveau_bo_sync_for_cpu(struct nouveau_bo *)
>> +{
>> +}
>> +
>> +static inline void
>> +nouveau_bo_sync_for_device(struct nouveau_bo *)
>> +{
>> +}
>> +#endif
>> +
>> +
>
> There's a gratuituous blank line here.

Fixed.

>
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
>> index c90c0dc0afe8..b7e42fdc9634 100644
>> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
>> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
>> @@ -897,7 +897,13 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
>>       ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
>>       spin_unlock(&nvbo->bo.bdev->fence_lock);
>>       drm_gem_object_unreference_unlocked(gem);
>> -     return ret;
>> +
>> +     if (ret)
>> +             return ret;
>> +
>> +     nouveau_bo_sync_for_cpu(nvbo);
>> +
>> +     return 0;
>>  }
>
> This could be rewritten as:
>
>         if (!ret)
>                 nouveau_bo_sync_for_cpu(nvbo);
>
>         return ret;
>
> Which would be slightly shorter.

I prefer to have a clear, easy to read code flow here by keeping
error-handling within conditions (and not the other way round). This
kind of optimization is very well done by the compiler.

>
> On second thought, perhaps part of nouveau_gem_ioctl_cpu_prep() could be
> refactored into a separate function to make this more symmetric. If we
> put that in nouveau_bo.c and name it nouveau_bo_wait() for example, the
> dummies can go away and both nouveau_bo_sync_for_{cpu,device}() can be
> made static. I also think that's cleaner because it has both variants of
> the nouveau_bo_sync_for_*() calls in the same file.

Yep, agreed. I will give it a try in the next version of the series.

Thanks,
Alex.
Alexandre Courbot May 23, 2014, 6:01 a.m. UTC | #5
On Mon, May 19, 2014 at 6:31 PM, Lucas Stach <l.stach@pengutronix.de> wrote:
> Am Montag, den 19.05.2014, 16:10 +0900 schrieb Alexandre Courbot:
>> From: Lucas Stach <dev@lynxeye.de>
>>
>> Signed-off-by: Lucas Stach <dev@lynxeye.de>
>> [acourbot@nvidia.com: make conditional and platform-friendly]
>> Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
>> ---
>>  drivers/gpu/drm/nouveau/nouveau_bo.c  | 32 ++++++++++++++++++++++++++++++++
>>  drivers/gpu/drm/nouveau/nouveau_bo.h  | 20 ++++++++++++++++++++
>>  drivers/gpu/drm/nouveau/nouveau_gem.c |  8 +++++++-
>>  3 files changed, 59 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
>> index b6dc85c614be..0886f47e5244 100644
>> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
>> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
>> @@ -407,6 +407,8 @@ nouveau_bo_validate(struct nouveau_bo *nvbo, bool interruptible,
>>  {
>>       int ret;
>>
>> +     nouveau_bo_sync_for_device(nvbo);
>> +
>>       ret = ttm_bo_validate(&nvbo->bo, &nvbo->placement,
>>                             interruptible, no_wait_gpu);
>>       if (ret)
>> @@ -487,6 +489,36 @@ nouveau_bo_invalidate_caches(struct ttm_bo_device *bdev, uint32_t flags)
>>       return 0;
>>  }
>>
>> +#ifdef NOUVEAU_NEED_CACHE_SYNC
>
> I don't like this ifdef at all. I know calling this functions will add a
> little overhead to x86 where it isn't strictly required, but I think
> it's negligible.
>
> When I looked at them the dma_sync_single_for_[device|cpu] functions
> which are called here map out to just a drain of the PCI store buffer on
> x86, which should be fast enough to be done unconditionally. They won't
> so any time-consuming cache synchronization on PCI coherent arches.

If Ben agrees with it I am also perfectly fine with getting rid of this #ifdef.
diff mbox

Patch

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index b6dc85c614be..0886f47e5244 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -407,6 +407,8 @@  nouveau_bo_validate(struct nouveau_bo *nvbo, bool interruptible,
 {
 	int ret;
 
+	nouveau_bo_sync_for_device(nvbo);
+
 	ret = ttm_bo_validate(&nvbo->bo, &nvbo->placement,
 			      interruptible, no_wait_gpu);
 	if (ret)
@@ -487,6 +489,36 @@  nouveau_bo_invalidate_caches(struct ttm_bo_device *bdev, uint32_t flags)
 	return 0;
 }
 
+#ifdef NOUVEAU_NEED_CACHE_SYNC
+void
+nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo)
+{
+	struct nouveau_device *device;
+	struct ttm_tt *ttm = nvbo->bo.ttm;
+
+	device = nouveau_dev(nouveau_bdev(ttm->bdev)->dev);
+
+	if (nvbo->bo.ttm && nvbo->bo.ttm->caching_state == tt_cached)
+		ttm_dma_tt_cache_sync_for_cpu((struct ttm_dma_tt *)nvbo->bo.ttm,
+					      nv_device_base(device));
+}
+
+void
+nouveau_bo_sync_for_device(struct nouveau_bo *nvbo)
+{
+	struct ttm_tt *ttm = nvbo->bo.ttm;
+
+	if (ttm && ttm->caching_state == tt_cached) {
+		struct nouveau_device *device;
+
+		device = nouveau_dev(nouveau_bdev(ttm->bdev)->dev);
+
+		ttm_dma_tt_cache_sync_for_device((struct ttm_dma_tt *)ttm,
+						 nv_device_base(device));
+	}
+}
+#endif
+
 static int
 nouveau_bo_init_mem_type(struct ttm_bo_device *bdev, uint32_t type,
 			 struct ttm_mem_type_manager *man)
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.h b/drivers/gpu/drm/nouveau/nouveau_bo.h
index ff17c1f432fc..ead214931223 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.h
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.h
@@ -89,6 +89,26 @@  int  nouveau_bo_vma_add(struct nouveau_bo *, struct nouveau_vm *,
 			struct nouveau_vma *);
 void nouveau_bo_vma_del(struct nouveau_bo *, struct nouveau_vma *);
 
+#if IS_ENABLED(CONFIG_ARCH_TEGRA)
+#define NOUVEAU_NEED_CACHE_SYNC
+#endif
+
+#ifdef NOUVEAU_NEED_CACHE_SYNC
+void nouveau_bo_sync_for_cpu(struct nouveau_bo *);
+void nouveau_bo_sync_for_device(struct nouveau_bo *);
+#else
+static inline void
+nouveau_bo_sync_for_cpu(struct nouveau_bo *)
+{
+}
+
+static inline void
+nouveau_bo_sync_for_device(struct nouveau_bo *)
+{
+}
+#endif
+
+
 /* TODO: submit equivalent to TTM generic API upstream? */
 static inline void __iomem *
 nvbo_kmap_obj_iovirtual(struct nouveau_bo *nvbo)
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index c90c0dc0afe8..b7e42fdc9634 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -897,7 +897,13 @@  nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
 	ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
 	spin_unlock(&nvbo->bo.bdev->fence_lock);
 	drm_gem_object_unreference_unlocked(gem);
-	return ret;
+
+	if (ret)
+		return ret;
+
+	nouveau_bo_sync_for_cpu(nvbo);
+
+	return 0;
 }
 
 int