diff mbox

[4/6] drm/nouveau: introduce NOUVEAU_GEM_TILE_WCUS

Message ID 1377648050-6649-5-git-send-email-dev@lynxeye.de (mailing list archive)
State New, archived
Headers show

Commit Message

Lucas Stach Aug. 28, 2013, midnight UTC
This flag allows userspace to give the kernel a hint that it should use
a non-snooped resource. To guarantee coherency at all times mappings
into userspace are done write combined, so userspace should avoid
reading back from those resources.

Signed-off-by: Lucas Stach <dev@lynxeye.de>
---
On x86 an optimized userspace can save up on snoop traffic in the
system, on ARM the benefits are potentially much larger, as we can save
the manual cache flush/invalidate.
---
 drivers/gpu/drm/nouveau/nouveau_bo.c | 11 ++++++++++-
 drivers/gpu/drm/nouveau/nouveau_bo.h |  1 +
 include/uapi/drm/nouveau_drm.h       |  1 +
 3 files changed, 12 insertions(+), 1 deletion(-)

Comments

Ben Skeggs Aug. 28, 2013, 7:11 a.m. UTC | #1
On Wed, Aug 28, 2013 at 10:00 AM, Lucas Stach <dev@lynxeye.de> wrote:
> This flag allows userspace to give the kernel a hint that it should use
> a non-snooped resource. To guarantee coherency at all times mappings
> into userspace are done write combined, so userspace should avoid
> reading back from those resources.
Do any other combinations of cached/uncached and snooped/non-snooped
make any sense?  If so, perhaps we want to split the flags.

>
> Signed-off-by: Lucas Stach <dev@lynxeye.de>
> ---
> On x86 an optimized userspace can save up on snoop traffic in the
> system, on ARM the benefits are potentially much larger, as we can save
> the manual cache flush/invalidate.
> ---
>  drivers/gpu/drm/nouveau/nouveau_bo.c | 11 ++++++++++-
>  drivers/gpu/drm/nouveau/nouveau_bo.h |  1 +
>  include/uapi/drm/nouveau_drm.h       |  1 +
>  3 files changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index f4a2eb9..c5fcbcc 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -231,6 +231,12 @@ nouveau_bo_new(struct drm_device *dev, int size, int align,
>
>         nouveau_bo_fixup_align(nvbo, flags, &align, &size);
>         nvbo->bo.mem.num_pages = size >> PAGE_SHIFT;
> +
> +       if (tile_flags & NOUVEAU_GEM_TILE_WCUS)
> +               nvbo->valid_caching = TTM_PL_FLAG_UNCACHED | TTM_PL_FLAG_WC;
> +       else
> +               nvbo->valid_caching = TTM_PL_MASK_CACHING;
> +
>         nouveau_bo_placement_set(nvbo, flags, 0);
>
>         acc_size = ttm_bo_dma_acc_size(&drm->ttm.bdev, size,
> @@ -292,7 +298,7 @@ void
>  nouveau_bo_placement_set(struct nouveau_bo *nvbo, uint32_t type, uint32_t busy)
>  {
>         struct ttm_placement *pl = &nvbo->placement;
> -       uint32_t flags = TTM_PL_MASK_CACHING |
> +       uint32_t flags = nvbo->valid_caching |
>                 (nvbo->pin_refcnt ? TTM_PL_FLAG_NO_EVICT : 0);
>
>         pl->placement = nvbo->placements;
> @@ -1554,6 +1560,9 @@ nouveau_bo_vma_add(struct nouveau_bo *nvbo, struct nouveau_vm *vm,
>         if (nvbo->bo.mem.mem_type == TTM_PL_VRAM)
>                 nouveau_vm_map(vma, nvbo->bo.mem.mm_node);
>         else if (nvbo->bo.mem.mem_type == TTM_PL_TT) {
> +               if (!(nvbo->valid_caching & TTM_PL_FLAG_CACHED))
> +                       vma->access |= NV_MEM_ACCESS_NOSNOOP;
> +
>                 if (node->sg)
>                         nouveau_vm_map_sg_table(vma, 0, size, node);
>                 else
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.h b/drivers/gpu/drm/nouveau/nouveau_bo.h
> index 653dbbb..2ecf8b7 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.h
> @@ -9,6 +9,7 @@ struct nouveau_bo {
>         struct ttm_buffer_object bo;
>         struct ttm_placement placement;
>         u32 valid_domains;
> +       u32 valid_caching;
>         u32 placements[3];
>         u32 busy_placements[3];
>         struct ttm_bo_kmap_obj kmap;
> diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
> index 2a5769f..4948eee2 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -36,6 +36,7 @@
>  #define NOUVEAU_GEM_TILE_32BPP       0x00000002
>  #define NOUVEAU_GEM_TILE_ZETA        0x00000004
>  #define NOUVEAU_GEM_TILE_NONCONTIG   0x00000008
> +#define NOUVEAU_GEM_TILE_WCUS        0x00000010 /* write-combined, unsnooped */
>
>  struct drm_nouveau_gem_info {
>         uint32_t handle;
> --
> 1.8.3.1
>
Lucas Stach Aug. 28, 2013, 7:39 a.m. UTC | #2
Am Mittwoch, den 28.08.2013, 17:11 +1000 schrieb Ben Skeggs:
> On Wed, Aug 28, 2013 at 10:00 AM, Lucas Stach <dev@lynxeye.de> wrote:
> > This flag allows userspace to give the kernel a hint that it should use
> > a non-snooped resource. To guarantee coherency at all times mappings
> > into userspace are done write combined, so userspace should avoid
> > reading back from those resources.
> Do any other combinations of cached/uncached and snooped/non-snooped
> make any sense?  If so, perhaps we want to split the flags.
> 
Thought about that and I came to the conclusion that it isn't worth the
hassle. If we split it then things get more complicated on x86, where we
would have to invalidate caches manually with all the related
performance implications.

So I think it's a lot easier for userspace writers to just set the WCUS
flag on resources where they can promise not to touch the resource for
reading (AFAIR Christoph wanted this flag mostly for resources that the
driver isn't going to touch ever), or where they can happily live with
uncached reading.

> >
> > Signed-off-by: Lucas Stach <dev@lynxeye.de>
> > ---
> > On x86 an optimized userspace can save up on snoop traffic in the
> > system, on ARM the benefits are potentially much larger, as we can save
> > the manual cache flush/invalidate.
> > ---
> >  drivers/gpu/drm/nouveau/nouveau_bo.c | 11 ++++++++++-
> >  drivers/gpu/drm/nouveau/nouveau_bo.h |  1 +
> >  include/uapi/drm/nouveau_drm.h       |  1 +
> >  3 files changed, 12 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> > index f4a2eb9..c5fcbcc 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> > +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> > @@ -231,6 +231,12 @@ nouveau_bo_new(struct drm_device *dev, int size, int align,
> >
> >         nouveau_bo_fixup_align(nvbo, flags, &align, &size);
> >         nvbo->bo.mem.num_pages = size >> PAGE_SHIFT;
> > +
> > +       if (tile_flags & NOUVEAU_GEM_TILE_WCUS)
> > +               nvbo->valid_caching = TTM_PL_FLAG_UNCACHED | TTM_PL_FLAG_WC;
> > +       else
> > +               nvbo->valid_caching = TTM_PL_MASK_CACHING;
> > +
> >         nouveau_bo_placement_set(nvbo, flags, 0);
> >
> >         acc_size = ttm_bo_dma_acc_size(&drm->ttm.bdev, size,
> > @@ -292,7 +298,7 @@ void
> >  nouveau_bo_placement_set(struct nouveau_bo *nvbo, uint32_t type, uint32_t busy)
> >  {
> >         struct ttm_placement *pl = &nvbo->placement;
> > -       uint32_t flags = TTM_PL_MASK_CACHING |
> > +       uint32_t flags = nvbo->valid_caching |
> >                 (nvbo->pin_refcnt ? TTM_PL_FLAG_NO_EVICT : 0);
> >
> >         pl->placement = nvbo->placements;
> > @@ -1554,6 +1560,9 @@ nouveau_bo_vma_add(struct nouveau_bo *nvbo, struct nouveau_vm *vm,
> >         if (nvbo->bo.mem.mem_type == TTM_PL_VRAM)
> >                 nouveau_vm_map(vma, nvbo->bo.mem.mm_node);
> >         else if (nvbo->bo.mem.mem_type == TTM_PL_TT) {
> > +               if (!(nvbo->valid_caching & TTM_PL_FLAG_CACHED))
> > +                       vma->access |= NV_MEM_ACCESS_NOSNOOP;
> > +
> >                 if (node->sg)
> >                         nouveau_vm_map_sg_table(vma, 0, size, node);
> >                 else
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.h b/drivers/gpu/drm/nouveau/nouveau_bo.h
> > index 653dbbb..2ecf8b7 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_bo.h
> > +++ b/drivers/gpu/drm/nouveau/nouveau_bo.h
> > @@ -9,6 +9,7 @@ struct nouveau_bo {
> >         struct ttm_buffer_object bo;
> >         struct ttm_placement placement;
> >         u32 valid_domains;
> > +       u32 valid_caching;
> >         u32 placements[3];
> >         u32 busy_placements[3];
> >         struct ttm_bo_kmap_obj kmap;
> > diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
> > index 2a5769f..4948eee2 100644
> > --- a/include/uapi/drm/nouveau_drm.h
> > +++ b/include/uapi/drm/nouveau_drm.h
> > @@ -36,6 +36,7 @@
> >  #define NOUVEAU_GEM_TILE_32BPP       0x00000002
> >  #define NOUVEAU_GEM_TILE_ZETA        0x00000004
> >  #define NOUVEAU_GEM_TILE_NONCONTIG   0x00000008
> > +#define NOUVEAU_GEM_TILE_WCUS        0x00000010 /* write-combined, unsnooped */
> >
> >  struct drm_nouveau_gem_info {
> >         uint32_t handle;
> > --
> > 1.8.3.1
> >
diff mbox

Patch

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index f4a2eb9..c5fcbcc 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -231,6 +231,12 @@  nouveau_bo_new(struct drm_device *dev, int size, int align,
 
 	nouveau_bo_fixup_align(nvbo, flags, &align, &size);
 	nvbo->bo.mem.num_pages = size >> PAGE_SHIFT;
+
+	if (tile_flags & NOUVEAU_GEM_TILE_WCUS)
+		nvbo->valid_caching = TTM_PL_FLAG_UNCACHED | TTM_PL_FLAG_WC;
+	else
+		nvbo->valid_caching = TTM_PL_MASK_CACHING;
+
 	nouveau_bo_placement_set(nvbo, flags, 0);
 
 	acc_size = ttm_bo_dma_acc_size(&drm->ttm.bdev, size,
@@ -292,7 +298,7 @@  void
 nouveau_bo_placement_set(struct nouveau_bo *nvbo, uint32_t type, uint32_t busy)
 {
 	struct ttm_placement *pl = &nvbo->placement;
-	uint32_t flags = TTM_PL_MASK_CACHING |
+	uint32_t flags = nvbo->valid_caching |
 		(nvbo->pin_refcnt ? TTM_PL_FLAG_NO_EVICT : 0);
 
 	pl->placement = nvbo->placements;
@@ -1554,6 +1560,9 @@  nouveau_bo_vma_add(struct nouveau_bo *nvbo, struct nouveau_vm *vm,
 	if (nvbo->bo.mem.mem_type == TTM_PL_VRAM)
 		nouveau_vm_map(vma, nvbo->bo.mem.mm_node);
 	else if (nvbo->bo.mem.mem_type == TTM_PL_TT) {
+		if (!(nvbo->valid_caching & TTM_PL_FLAG_CACHED))
+			vma->access |= NV_MEM_ACCESS_NOSNOOP;
+
 		if (node->sg)
 			nouveau_vm_map_sg_table(vma, 0, size, node);
 		else
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.h b/drivers/gpu/drm/nouveau/nouveau_bo.h
index 653dbbb..2ecf8b7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.h
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.h
@@ -9,6 +9,7 @@  struct nouveau_bo {
 	struct ttm_buffer_object bo;
 	struct ttm_placement placement;
 	u32 valid_domains;
+	u32 valid_caching;
 	u32 placements[3];
 	u32 busy_placements[3];
 	struct ttm_bo_kmap_obj kmap;
diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
index 2a5769f..4948eee2 100644
--- a/include/uapi/drm/nouveau_drm.h
+++ b/include/uapi/drm/nouveau_drm.h
@@ -36,6 +36,7 @@ 
 #define NOUVEAU_GEM_TILE_32BPP       0x00000002
 #define NOUVEAU_GEM_TILE_ZETA        0x00000004
 #define NOUVEAU_GEM_TILE_NONCONTIG   0x00000008
+#define NOUVEAU_GEM_TILE_WCUS        0x00000010 /* write-combined, unsnooped */
 
 struct drm_nouveau_gem_info {
 	uint32_t handle;