
[v2] drm/i915: Add ppgtt->kunmap_page_dma vfunc

Message ID 1463573166-7008-1-git-send-email-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Tvrtko Ursulin May 18, 2016, 12:06 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Rather than asking itself "am I a Broadwell, am I a Cherryview,
or am I neither of the two" on low-level page table operations,
like inserting and clearing PTEs, add a new vfunc kunmap_page_dma
and set it to the appropriate flavour at ppgtt init time.
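
In short (condensed from the diff below), the per-unmap platform check:

	if (IS_CHERRYVIEW(dev) || IS_BROXTON(dev))
		drm_clflush_virt_range(vaddr, PAGE_SIZE);
	kunmap_atomic(vaddr);

becomes two flavours, with the right one picked once at ppgtt init:

	static void kunmap_page_dma(void *vaddr)
	{
		kunmap_atomic(vaddr);
	}

	static void kunmap_page_dma_flush(void *vaddr)
	{
		drm_clflush_virt_range(vaddr, PAGE_SIZE);
		kunmap_atomic(vaddr);
	}

	if (IS_CHERRYVIEW(dev_priv) || IS_BROXTON(dev_priv))
		ppgtt->kunmap_page_dma = kunmap_page_dma_flush;
	else
		ppgtt->kunmap_page_dma = kunmap_page_dma;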

v2: Fix platform condition and group the vfunc init closer together.
    (Daniele Ceraolo Spurio)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 59 ++++++++++++++++++++++---------------
 drivers/gpu/drm/i915/i915_gem_gtt.h |  1 +
 2 files changed, 37 insertions(+), 23 deletions(-)

Comments

Chris Wilson May 18, 2016, 12:22 p.m. UTC | #1
On Wed, May 18, 2016 at 01:06:06PM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Rather than asking itself "am I a Broadwell, am I a Cherryview,
> or am I neither of the two" on low-level page table operations,
> like inserting and clearing PTEs, add a new vfunc kunmap_page_dma
> and set it to the appropriate flavour at ppgtt init time.
> 
> v2: Fix platform condition and group the vfunc init closer together.
>     (Daniele Ceraolo Spurio)

Or we can take a different approach and use a WC mapping for the page.

The patch is a bit messy since we need to feed the vm, not the device,
into the unmap function, but the guts are:

@@ -323,19 +323,21 @@ static gen6_pte_t iris_pte_encode(dma_addr_t addr,
        return pte;
 }
 
-static int __setup_page_dma(struct drm_device *dev,
-                           struct i915_page_dma *p, gfp_t flags)
+static int __setup_page_dma(struct i915_address_space *vm,
+                           struct i915_page_dma *p,
+                           gfp_t flags)
 {
-       struct device *device = &dev->pdev->dev;
-
        p->page = alloc_page(flags);
        if (!p->page)
                return -ENOMEM;
 
-       p->daddr = dma_map_page(device,
-                               p->page, 0, 4096, PCI_DMA_BIDIRECTIONAL);
+       if (vm->pt_kmap_wc)
+               set_pages_array_wc(&p->page, 1);
+
+       p->daddr = dma_map_page(vm->dma, p->page, 0, PAGE_SIZE,
+                               PCI_DMA_BIDIRECTIONAL);
 
-       if (dma_mapping_error(device, p->daddr)) {
+       if (dma_mapping_error(vm->dma, p->daddr)) {
                __free_page(p->page);
                return -EINVAL;
        }
@@ -343,94 +345,89 @@ static int __setup_page_dma(struct drm_device *dev,
        return 0;
 }
 
-static int setup_page_dma(struct drm_device *dev, struct i915_page_dma *p)
+static int setup_page_dma(struct i915_address_space *vm,
+                         struct i915_page_dma *p)
 {
-       return __setup_page_dma(dev, p, I915_GFP_DMA);
+       return __setup_page_dma(vm, p, I915_GFP_DMA);
 }
 
-static void cleanup_page_dma(struct drm_device *dev, struct i915_page_dma *p)
+static void cleanup_page_dma(struct i915_address_space *vm,
+                            struct i915_page_dma *p)
 {
-       if (WARN_ON(!p->page))
-               return;
+       dma_unmap_page(vm->dma, p->daddr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+
+       if (vm->pt_kmap_wc)
+               set_pages_array_wb(&p->page, 1);
 
-       dma_unmap_page(&dev->pdev->dev, p->daddr, 4096, PCI_DMA_BIDIRECTIONAL);
        __free_page(p->page);
-       memset(p, 0, sizeof(*p));
 }
 


@@ -1484,8 +1475,16 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
        ppgtt->base.bind_vma = ppgtt_bind_vma;
        ppgtt->debug_dump = gen8_dump_ppgtt;
 
+       /* There are only few exceptions for gen >=6. chv and bxt.
+        * And we are not sure about the latter so play safe for now.
+        */
+       if (IS_CHERRYVIEW(ppgtt->base.dev) || IS_BROXTON(ppgtt->base.dev)) {
+               ppgtt->base.pt_kmap_wc = true;
+               ppgtt->base.pt_kmap_prot = pgprot_writecombine(PAGE_KERNEL_IO);
+       }
+

Advantage: we avoid the clflush after every update
Disadvantage: we invoke set_memory_*() on every page used by the ppgtt.
(To reduce that cost, I have in mind keeping a pagevec cache of WC
pages.)
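
For illustration, such a cache could be as simple as the sketch below
(the wc_stash structure and helper names are hypothetical, not from any
posted series; locking, sizing and the init path are omitted).
__setup_page_dma()/cleanup_page_dma() would then call
wc_stash_get()/wc_stash_put() instead of alloc_page()/__free_page()
when vm->pt_kmap_wc is set:

	#define WC_STASH_SIZE 32			/* hypothetical capacity */

	struct wc_stash {
		struct page *pages[WC_STASH_SIZE];	/* pages already set to WC */
		int nr;
	};

	static struct page *wc_stash_get(struct wc_stash *stash, gfp_t gfp)
	{
		struct page *page;

		/* Reuse a page whose kernel mapping is already WC. */
		if (stash->nr)
			return stash->pages[--stash->nr];

		page = alloc_page(gfp);
		if (page)
			set_pages_array_wc(&page, 1);	/* the expensive part */
		return page;
	}

	static void wc_stash_put(struct wc_stash *stash, struct page *page)
	{
		/* Keep the page WC for the next allocation if there is room. */
		if (stash->nr < WC_STASH_SIZE) {
			stash->pages[stash->nr++] = page;
			return;
		}

		set_pages_array_wb(&page, 1);		/* restore WB before freeing */
		__free_page(page);
	}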

Sadly, we can't just use kmap_atomic_prot() as it is 32-bit only!!!

Note that I started with exactly this patch (using a kunmap vfunc) many
moons ago and switched to the pgprot_t based approach instead.
-Chris
Tvrtko Ursulin May 18, 2016, 1:31 p.m. UTC | #2
On 18/05/16 13:22, Chris Wilson wrote:
> On Wed, May 18, 2016 at 01:06:06PM +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Rather than asking itself "am I a Broadwell, am I a Cherryview,
>> or am I neither of the two" on low-level page table operations,
>> like inserting and clearing PTEs, add a new vfunc kunmap_page_dma
>> and set it to the appropriate flavour at ppgtt init time.
>>
>> v2: Fix platform condition and group the vfunc init closer together.
>>      (Daniele Ceraolo Spurio)
>
> Or we can take a different approach and use a WC mapping for the page.
>
> The patch is a bit messy since we need to feed the vm, not the device,
> into the unmap function, but the guts are:
>
> @@ -323,19 +323,21 @@ static gen6_pte_t iris_pte_encode(dma_addr_t addr,
>          return pte;
>   }
>
> -static int __setup_page_dma(struct drm_device *dev,
> -                           struct i915_page_dma *p, gfp_t flags)
> +static int __setup_page_dma(struct i915_address_space *vm,
> +                           struct i915_page_dma *p,
> +                           gfp_t flags)
>   {
> -       struct device *device = &dev->pdev->dev;
> -
>          p->page = alloc_page(flags);
>          if (!p->page)
>                  return -ENOMEM;
>
> -       p->daddr = dma_map_page(device,
> -                               p->page, 0, 4096, PCI_DMA_BIDIRECTIONAL);
> +       if (vm->pt_kmap_wc)
> +               set_pages_array_wc(&p->page, 1);
> +
> +       p->daddr = dma_map_page(vm->dma, p->page, 0, PAGE_SIZE,
> +                               PCI_DMA_BIDIRECTIONAL);
>
> -       if (dma_mapping_error(device, p->daddr)) {
> +       if (dma_mapping_error(vm->dma, p->daddr)) {
>                  __free_page(p->page);
>                  return -EINVAL;
>          }
> @@ -343,94 +345,89 @@ static int __setup_page_dma(struct drm_device *dev,
>          return 0;
>   }
>
> -static int setup_page_dma(struct drm_device *dev, struct i915_page_dma *p)
> +static int setup_page_dma(struct i915_address_space *vm,
> +                         struct i915_page_dma *p)
>   {
> -       return __setup_page_dma(dev, p, I915_GFP_DMA);
> +       return __setup_page_dma(vm, p, I915_GFP_DMA);
>   }
>
> -static void cleanup_page_dma(struct drm_device *dev, struct i915_page_dma *p)
> +static void cleanup_page_dma(struct i915_address_space *vm,
> +                            struct i915_page_dma *p)
>   {
> -       if (WARN_ON(!p->page))
> -               return;
> +       dma_unmap_page(vm->dma, p->daddr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
> +
> +       if (vm->pt_kmap_wc)
> +               set_pages_array_wb(&p->page, 1);
>
> -       dma_unmap_page(&dev->pdev->dev, p->daddr, 4096, PCI_DMA_BIDIRECTIONAL);
>          __free_page(p->page);
> -       memset(p, 0, sizeof(*p));
>   }
>
>
>
> @@ -1484,8 +1475,16 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>          ppgtt->base.bind_vma = ppgtt_bind_vma;
>          ppgtt->debug_dump = gen8_dump_ppgtt;
>
> +       /* There are only few exceptions for gen >=6. chv and bxt.
> +        * And we are not sure about the latter so play safe for now.
> +        */
> +       if (IS_CHERRYVIEW(ppgtt->base.dev) || IS_BROXTON(ppgtt->base.dev)) {
> +               ppgtt->base.pt_kmap_wc = true;
> +               ppgtt->base.pt_kmap_prot = pgprot_writecombine(PAGE_KERNEL_IO);
> +       }
> +
>
> Advantage: we avoid the clflush after every update
> Disadvantage: we invoke set_memory_*() on every page used by the ppgtt.
> (To reduce that cost, I have in mind keeping a pagevec cache of WC
> pages.)
>
> Sadly, we can't just use kmap_atomic_prot() as it is 32-bit only!!!
>
> Note that I started with exactly this patch (using a kunmap vfunc) many
> moons ago and switched to the pgprot_t based approach instead.

Is the pattern of writes such that write-combining makes a difference 
here versus an explicit flush? Even taking into consideration the 
complexity of the pagevec cache you mention, which the set_memory_wc 
route requires?

The "many moons ago" comment makes me think we could have at least had 
an interim vfunc solution in place all this time.

Regards,

Tvrtko
Chris Wilson May 18, 2016, 1:51 p.m. UTC | #3
On Wed, May 18, 2016 at 02:31:28PM +0100, Tvrtko Ursulin wrote:
> 
> On 18/05/16 13:22, Chris Wilson wrote:
> >Advantage: we avoid the clflush after every update
> >Disadvantage: we invoke set_memory_*() on every page used by the ppgtt.
> >(To reduce that cost, I have in mind keeping a pagevec cache of WC
> >pages.)
> >
> >Sadly, we can't just use kmap_atomic_prot() as it is 32-bit only!!!
> >
> >Note that I started with exactly this patch (using a kunmap vfunc) many
> >moons ago and switched to the pgprot_t based approach instead.
> 
> Is the pattern of writes such that write-combining makes a difference
> here versus an explicit flush? Even taking into consideration the
> complexity of the pagevec cache you mention, which the set_memory_wc
> route requires?

Yes, I think the WC mapping is the preferred path here (versus doing a
sweep over the whole page to clflush). The most frequent updates tend to
be framebuffers being passed between clients (since we close and unbind
the VMA every time), which are ~8 whole pages each. (We can expect to
have to flush about 10 pages due to misalignment, so even if we were
able to match WC speed we would still do 25% more clflushes - 10 pages
flushed for every ~8 written.)

It's not required for the set_memory_wc route, it will just be a nice
addition. The principal issue should be fixed already in mm/vmalloc.c -
and note that we already have the issue of set_memory_uc/_wb being
invoked per context for no good reason.
 
> The "many moons ago" comment makes me think we could have at least
> had an interim vfunc solution in place all this time.

Similarly, there is a queue of patches trying to tackle some of the
regressions over the last couple of years that are now lost because no
one dares review them.
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 7eab619a3eb2..892f8b41dca7 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -361,29 +361,30 @@  static void *kmap_page_dma(struct i915_page_dma *p)
 	return kmap_atomic(p->page);
 }
 
+static void kunmap_page_dma(void *vaddr)
+{
+	kunmap_atomic(vaddr);
+}
+
 /* We use the flushing unmap only with ppgtt structures:
  * page directories, page tables and scratch pages.
  */
-static void kunmap_page_dma(struct drm_device *dev, void *vaddr)
+static void kunmap_page_dma_flush(void *vaddr)
 {
-	/* There are only few exceptions for gen >=6. chv and bxt.
-	 * And we are not sure about the latter so play safe for now.
-	 */
-	if (IS_CHERRYVIEW(dev) || IS_BROXTON(dev))
-		drm_clflush_virt_range(vaddr, PAGE_SIZE);
+	drm_clflush_virt_range(vaddr, PAGE_SIZE);
 
 	kunmap_atomic(vaddr);
 }
 
 #define kmap_px(px) kmap_page_dma(px_base(px))
-#define kunmap_px(ppgtt, vaddr) kunmap_page_dma((ppgtt)->base.dev, (vaddr))
+#define kunmap_px(ppgtt, vaddr) (ppgtt)->kunmap_page_dma((vaddr))
 
 #define setup_px(dev, px) setup_page_dma((dev), px_base(px))
 #define cleanup_px(dev, px) cleanup_page_dma((dev), px_base(px))
-#define fill_px(dev, px, v) fill_page_dma((dev), px_base(px), (v))
-#define fill32_px(dev, px, v) fill_page_dma_32((dev), px_base(px), (v))
+#define fill_px(ppgtt, px, v) fill_page_dma((ppgtt), px_base(px), (v))
+#define fill32_px(ppgtt, px, v) fill_page_dma_32((ppgtt), px_base(px), (v))
 
-static void fill_page_dma(struct drm_device *dev, struct i915_page_dma *p,
+static void fill_page_dma(struct i915_hw_ppgtt *ppgtt, struct i915_page_dma *p,
 			  const uint64_t val)
 {
 	int i;
@@ -392,17 +393,17 @@  static void fill_page_dma(struct drm_device *dev, struct i915_page_dma *p,
 	for (i = 0; i < 512; i++)
 		vaddr[i] = val;
 
-	kunmap_page_dma(dev, vaddr);
+	ppgtt->kunmap_page_dma(vaddr);
 }
 
-static void fill_page_dma_32(struct drm_device *dev, struct i915_page_dma *p,
-			     const uint32_t val32)
+static void fill_page_dma_32(struct i915_hw_ppgtt *ppgtt,
+			     struct i915_page_dma *p, const uint32_t val32)
 {
 	uint64_t v = val32;
 
 	v = v << 32 | val32;
 
-	fill_page_dma(dev, p, v);
+	fill_page_dma(ppgtt, p, v);
 }
 
 static struct i915_page_scratch *alloc_scratch_page(struct drm_device *dev)
@@ -480,7 +481,7 @@  static void gen8_initialize_pt(struct i915_address_space *vm,
 	scratch_pte = gen8_pte_encode(px_dma(vm->scratch_page),
 				      I915_CACHE_LLC, true);
 
-	fill_px(vm->dev, pt, scratch_pte);
+	fill_px(i915_vm_to_ppgtt(vm), pt, scratch_pte);
 }
 
 static void gen6_initialize_pt(struct i915_address_space *vm,
@@ -493,7 +494,7 @@  static void gen6_initialize_pt(struct i915_address_space *vm,
 	scratch_pte = vm->pte_encode(px_dma(vm->scratch_page),
 				     I915_CACHE_LLC, true, 0);
 
-	fill32_px(vm->dev, pt, scratch_pte);
+	fill32_px(i915_vm_to_ppgtt(vm), pt, scratch_pte);
 }
 
 static struct i915_page_directory *alloc_pd(struct drm_device *dev)
@@ -540,7 +541,7 @@  static void gen8_initialize_pd(struct i915_address_space *vm,
 
 	scratch_pde = gen8_pde_encode(px_dma(vm->scratch_pt), I915_CACHE_LLC);
 
-	fill_px(vm->dev, pd, scratch_pde);
+	fill_px(i915_vm_to_ppgtt(vm), pd, scratch_pde);
 }
 
 static int __pdp_init(struct drm_device *dev,
@@ -621,7 +622,7 @@  static void gen8_initialize_pdp(struct i915_address_space *vm,
 
 	scratch_pdpe = gen8_pdpe_encode(px_dma(vm->scratch_pd), I915_CACHE_LLC);
 
-	fill_px(vm->dev, pdp, scratch_pdpe);
+	fill_px(i915_vm_to_ppgtt(vm), pdp, scratch_pdpe);
 }
 
 static void gen8_initialize_pml4(struct i915_address_space *vm,
@@ -632,7 +633,7 @@  static void gen8_initialize_pml4(struct i915_address_space *vm,
 	scratch_pml4e = gen8_pml4e_encode(px_dma(vm->scratch_pdp),
 					  I915_CACHE_LLC);
 
-	fill_px(vm->dev, pml4, scratch_pml4e);
+	fill_px(i915_vm_to_ppgtt(vm), pml4, scratch_pml4e);
 }
 
 static void
@@ -1512,12 +1513,9 @@  static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt *ppgtt)
  */
 static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 {
+	struct drm_i915_private *dev_priv = to_i915(ppgtt->base.dev);
 	int ret;
 
-	ret = gen8_init_scratch(&ppgtt->base);
-	if (ret)
-		return ret;
-
 	ppgtt->base.start = 0;
 	ppgtt->base.cleanup = gen8_ppgtt_cleanup;
 	ppgtt->base.allocate_va_range = gen8_alloc_va_range;
@@ -1527,6 +1525,18 @@  static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 	ppgtt->base.bind_vma = ppgtt_bind_vma;
 	ppgtt->debug_dump = gen8_dump_ppgtt;
 
+	/* There are only few exceptions for gen >=6. chv and bxt.
+	 * And we are not sure about the latter so play safe for now.
+	 */
+	if (IS_CHERRYVIEW(dev_priv) || IS_BROXTON(dev_priv))
+		ppgtt->kunmap_page_dma = kunmap_page_dma_flush;
+	else
+		ppgtt->kunmap_page_dma = kunmap_page_dma;
+
+	ret = gen8_init_scratch(&ppgtt->base);
+	if (ret)
+		return ret;
+
 	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
 		ret = setup_px(ppgtt->base.dev, &ppgtt->pml4);
 		if (ret)
@@ -2073,6 +2083,9 @@  static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 	int ret;
 
 	ppgtt->base.pte_encode = ggtt->base.pte_encode;
+
+	ppgtt->kunmap_page_dma = kunmap_page_dma;
+
 	if (IS_GEN6(dev)) {
 		ppgtt->switch_mm = gen6_mm_switch;
 	} else if (IS_HASWELL(dev)) {
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 62be77cac5cd..b36b997406c6 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -379,6 +379,7 @@  struct i915_hw_ppgtt {
 
 	gen6_pte_t __iomem *pd_addr;
 
+	void (*kunmap_page_dma)(void *vaddr);
 	int (*enable)(struct i915_hw_ppgtt *ppgtt);
 	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
 			 struct drm_i915_gem_request *req);