Message ID | 1463573166-7008-1-git-send-email-tvrtko.ursulin@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
On Wed, May 18, 2016 at 01:06:06PM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> Rather than asking itself "am I a Broadwell, am I a Cherryview,
> or am I neither of the two" on low-level page table operations,
> like inserting and clearing PTEs, add a new vfunc kunmap_page_dma
> and set it to the appropriate flavour at ppgtt init time.
>
> v2: Fix platform condition and group vfunc init more together.
> (Daniele Ceraolo Spurio)

Or we can take a different approach and use a WC mapping for the page.

The patch is a bit messy since we need to feed the vm, not the device,
into the unmap function, but the guts are:

@@ -323,19 +323,21 @@ static gen6_pte_t iris_pte_encode(dma_addr_t addr,
 	return pte;
 }
 
-static int __setup_page_dma(struct drm_device *dev,
-			    struct i915_page_dma *p, gfp_t flags)
+static int __setup_page_dma(struct i915_address_space *vm,
+			    struct i915_page_dma *p,
+			    gfp_t flags)
 {
-	struct device *device = &dev->pdev->dev;
-
 	p->page = alloc_page(flags);
 	if (!p->page)
 		return -ENOMEM;
 
-	p->daddr = dma_map_page(device,
-				p->page, 0, 4096, PCI_DMA_BIDIRECTIONAL);
+	if (vm->pt_kmap_wc)
+		set_pages_array_wc(&p->page, 1);
+
+	p->daddr = dma_map_page(vm->dma, p->page, 0, PAGE_SIZE,
+				PCI_DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(device, p->daddr)) {
+	if (dma_mapping_error(vm->dma, p->daddr)) {
 		__free_page(p->page);
 		return -EINVAL;
 	}
@@ -343,94 +345,89 @@ static int __setup_page_dma(struct drm_device *dev,
 	return 0;
 }
 
-static int setup_page_dma(struct drm_device *dev, struct i915_page_dma *p)
+static int setup_page_dma(struct i915_address_space *vm,
+			  struct i915_page_dma *p)
 {
-	return __setup_page_dma(dev, p, I915_GFP_DMA);
+	return __setup_page_dma(vm, p, I915_GFP_DMA);
 }
 
-static void cleanup_page_dma(struct drm_device *dev, struct i915_page_dma *p)
+static void cleanup_page_dma(struct i915_address_space *vm,
+			     struct i915_page_dma *p)
 {
-	if (WARN_ON(!p->page))
-		return;
+	dma_unmap_page(vm->dma, p->daddr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+
+	if (vm->pt_kmap_wc)
+		set_pages_array_wb(&p->page, 1);
 
-	dma_unmap_page(&dev->pdev->dev, p->daddr, 4096, PCI_DMA_BIDIRECTIONAL);
 	__free_page(p->page);
-	memset(p, 0, sizeof(*p));
 }
 
@@ -1484,8 +1475,16 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 	ppgtt->base.bind_vma = ppgtt_bind_vma;
 	ppgtt->debug_dump = gen8_dump_ppgtt;
 
+	/* There are only few exceptions for gen >=6. chv and bxt.
+	 * And we are not sure about the latter so play safe for now.
+	 */
+	if (IS_CHERRYVIEW(ppgtt->base.dev) || IS_BROXTON(ppgtt->base.dev)) {
+		ppgtt->base.pt_kmap_wc = true;
+		ppgtt->base.pt_kmap_prot = pgprot_writecombine(PAGE_KERNEL_IO);
+	}
+

Advantage: we avoid the clflush after every update.
Disadvantage: we invoke set_memory_*() on every page used by the ppgtt.
(To reduce that cost, I have in mind keeping a pagevec cache of WC
pages.)

Sadly, we can't just use kmap_atomic_prot() as it is 32-bit only!!!

Note that I started with exactly this patch (using a kunmap vfunc) many
moons ago and switched to the pgprot_t based approach instead.
-Chris
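The pagevec cache of WC pages is only mentioned above, not shown. As a
rough sketch of the idea, assuming a simple spinlock-guarded stash of
pages that are already write-combining (the `wc_page_stash` structure
and helper names are hypothetical, not from Chris's tree):

```c
#include <asm/cacheflush.h>	/* set_pages_array_wc()/_wb() on x86 */
#include <linux/gfp.h>
#include <linux/pagevec.h>
#include <linux/spinlock.h>

struct wc_page_stash {
	spinlock_t lock;	/* init with spin_lock_init() */
	struct pagevec pvec;	/* init with pagevec_init() */
};

static struct page *wc_stash_get(struct wc_page_stash *stash)
{
	struct page *page = NULL;

	spin_lock(&stash->lock);
	if (pagevec_count(&stash->pvec))
		page = stash->pvec.pages[--stash->pvec.nr];
	spin_unlock(&stash->lock);

	if (!page) {
		/* Slow path: pay for the WC conversion and TLB flush. */
		page = alloc_page(GFP_KERNEL);
		if (page && set_pages_array_wc(&page, 1)) {
			__free_page(page);
			page = NULL;
		}
	}
	return page;
}

static void wc_stash_put(struct wc_page_stash *stash, struct page *page)
{
	spin_lock(&stash->lock);
	if (pagevec_space(&stash->pvec)) {
		pagevec_add(&stash->pvec, page);
		page = NULL;
	}
	spin_unlock(&stash->lock);

	if (page) {
		/* Stash full: convert back to WB before freeing. */
		set_pages_array_wb(&page, 1);
		__free_page(page);
	}
}
```

Pages handed out by wc_stash_get() could then be fed to dma_map_page()
exactly as in __setup_page_dma() above, skipping the per-page
set_pages_array_wc() in the common case.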
On 18/05/16 13:22, Chris Wilson wrote:
> On Wed, May 18, 2016 at 01:06:06PM +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Rather than asking itself "am I a Broadwell, am I a Cherryview,
>> or am I neither of the two" on low-level page table operations,
>> like inserting and clearing PTEs, add a new vfunc kunmap_page_dma
>> and set it to the appropriate flavour at ppgtt init time.
>>
>> v2: Fix platform condition and group vfunc init more together.
>> (Daniele Ceraolo Spurio)
>
> Or we can take a different approach and use a WC mapping for the page.
>
> Advantage: we avoid the clflush after every update.
> Disadvantage: we invoke set_memory_*() on every page used by the ppgtt.
> (To reduce that cost, I have in mind keeping a pagevec cache of WC
> pages.)
>
> Sadly, we can't just use kmap_atomic_prot() as it is 32-bit only!!!
>
> Note that I started with exactly this patch (using a kunmap vfunc) many
> moons ago and switched to the pgprot_t based approach instead.

Is the pattern of writes such that write-combine makes a difference
here versus an explicit flush, taking into consideration the
complexity of the pagevec cache you mention is required for the
set_memory_wc route?

The "many moons ago" comment makes me think we could at least have had
an interim vfunc solution in place all this time.

Regards,

Tvrtko
On Wed, May 18, 2016 at 02:31:28PM +0100, Tvrtko Ursulin wrote:
> On 18/05/16 13:22, Chris Wilson wrote:
>> Advantage: we avoid the clflush after every update.
>> Disadvantage: we invoke set_memory_*() on every page used by the ppgtt.
>> (To reduce that cost, I have in mind keeping a pagevec cache of WC
>> pages.)
>>
>> Sadly, we can't just use kmap_atomic_prot() as it is 32-bit only!!!
>>
>> Note that I started with exactly this patch (using a kunmap vfunc) many
>> moons ago and switched to the pgprot_t based approach instead.
>
> Is the pattern of writes such that write-combine makes a difference
> here versus an explicit flush, taking into consideration the
> complexity of the pagevec cache you mention is required for the
> set_memory_wc route?

Yes, I think the WCB is the preferred path here (versus doing a sweep
over the whole page to clflush). The most frequent updates tend to be
framebuffers being passed between clients (since we close and unbind
the VMA every time), which are ~8 whole pages each. (We can expect to
have to flush about 10 pages due to misalignment, so even if we were
able to match WC we would still do 25% more clflushes.)

The pagevec cache is not required for the set_memory_wc route; it
would just be a nice addition. The principal issue should be fixed
already in mm/vmalloc.c - and note that we already have the issue with
set_memory_uc/_wb being invoked per context for no good reason.

> The "many moons ago" comment makes me think we could at least have had
> an interim vfunc solution in place all this time.

Similarly, there is a queue of patches trying to tackle some of the
regressions over the last couple of years that are now lost because no
one dares review them.
-Chris
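The trade-off being discussed can be made concrete with a simplified
sketch of the two PTE update flavours. This is illustrative only: the
types mirror i915 usage, but the two functions themselves are
hypothetical and not from either patch:

```c
#include <asm/barrier.h>
#include <drm/drm_cache.h>
#include <linux/types.h>

typedef u64 gen8_pte_t;	/* as in i915_gem_gtt.h */

/* WB-mapped page table on CHV/BXT: the GPU's page walker is not
 * coherent with the CPU cache, so every dirtied cacheline has to be
 * clflushed after writing the PTEs.
 */
static void update_ptes_clflush(gen8_pte_t *vaddr, gen8_pte_t pte,
				unsigned int first, unsigned int count)
{
	unsigned int i;

	for (i = first; i < first + count; i++)
		vaddr[i] = pte;

	drm_clflush_virt_range(vaddr + first, count * sizeof(*vaddr));
}

/* WC-mapped page table: the stores bypass the cache through the CPU's
 * write-combining buffers, so a write barrier to drain them is all
 * that is needed before the GPU reads the PTEs.
 */
static void update_ptes_wc(gen8_pte_t *vaddr, gen8_pte_t pte,
			   unsigned int first, unsigned int count)
{
	unsigned int i;

	for (i = first; i < first + count; i++)
		vaddr[i] = pte;

	wmb();
}
```

With the WC flavour the cost moves from every update (the clflush
sweep) to page allocation time (set_pages_array_wc()), which is what
motivates the pagevec cache sketched earlier.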
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 7eab619a3eb2..892f8b41dca7 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -361,29 +361,30 @@ static void *kmap_page_dma(struct i915_page_dma *p)
 	return kmap_atomic(p->page);
 }
 
+static void kunmap_page_dma(void *vaddr)
+{
+	kunmap_atomic(vaddr);
+}
+
 /* We use the flushing unmap only with ppgtt structures:
  * page directories, page tables and scratch pages.
  */
-static void kunmap_page_dma(struct drm_device *dev, void *vaddr)
+static void kunmap_page_dma_flush(void *vaddr)
 {
-	/* There are only few exceptions for gen >=6. chv and bxt.
-	 * And we are not sure about the latter so play safe for now.
-	 */
-	if (IS_CHERRYVIEW(dev) || IS_BROXTON(dev))
-		drm_clflush_virt_range(vaddr, PAGE_SIZE);
+	drm_clflush_virt_range(vaddr, PAGE_SIZE);
 
 	kunmap_atomic(vaddr);
 }
 
 #define kmap_px(px) kmap_page_dma(px_base(px))
-#define kunmap_px(ppgtt, vaddr) kunmap_page_dma((ppgtt)->base.dev, (vaddr))
+#define kunmap_px(ppgtt, vaddr) (ppgtt)->kunmap_page_dma((vaddr))
 
 #define setup_px(dev, px) setup_page_dma((dev), px_base(px))
 #define cleanup_px(dev, px) cleanup_page_dma((dev), px_base(px))
-#define fill_px(dev, px, v) fill_page_dma((dev), px_base(px), (v))
-#define fill32_px(dev, px, v) fill_page_dma_32((dev), px_base(px), (v))
+#define fill_px(ppgtt, px, v) fill_page_dma((ppgtt), px_base(px), (v))
+#define fill32_px(ppgtt, px, v) fill_page_dma_32((ppgtt), px_base(px), (v))
 
-static void fill_page_dma(struct drm_device *dev, struct i915_page_dma *p,
+static void fill_page_dma(struct i915_hw_ppgtt *ppgtt, struct i915_page_dma *p,
 			  const uint64_t val)
 {
 	int i;
@@ -392,17 +393,17 @@ static void fill_page_dma(struct drm_device *dev, struct i915_page_dma *p,
 	for (i = 0; i < 512; i++)
 		vaddr[i] = val;
 
-	kunmap_page_dma(dev, vaddr);
+	ppgtt->kunmap_page_dma(vaddr);
 }
 
-static void fill_page_dma_32(struct drm_device *dev, struct i915_page_dma *p,
-			     const uint32_t val32)
+static void fill_page_dma_32(struct i915_hw_ppgtt *ppgtt,
+			     struct i915_page_dma *p, const uint32_t val32)
 {
 	uint64_t v = val32;
 
 	v = v << 32 | val32;
 
-	fill_page_dma(dev, p, v);
+	fill_page_dma(ppgtt, p, v);
 }
 
 static struct i915_page_scratch *alloc_scratch_page(struct drm_device *dev)
@@ -480,7 +481,7 @@ static void gen8_initialize_pt(struct i915_address_space *vm,
 	scratch_pte = gen8_pte_encode(px_dma(vm->scratch_page),
 				      I915_CACHE_LLC, true);
 
-	fill_px(vm->dev, pt, scratch_pte);
+	fill_px(i915_vm_to_ppgtt(vm), pt, scratch_pte);
 }
 
 static void gen6_initialize_pt(struct i915_address_space *vm,
@@ -493,7 +494,7 @@ static void gen6_initialize_pt(struct i915_address_space *vm,
 	scratch_pte = vm->pte_encode(px_dma(vm->scratch_page),
 				     I915_CACHE_LLC, true, 0);
 
-	fill32_px(vm->dev, pt, scratch_pte);
+	fill32_px(i915_vm_to_ppgtt(vm), pt, scratch_pte);
 }
 
 static struct i915_page_directory *alloc_pd(struct drm_device *dev)
@@ -540,7 +541,7 @@ static void gen8_initialize_pd(struct i915_address_space *vm,
 	scratch_pde = gen8_pde_encode(px_dma(vm->scratch_pt),
 				      I915_CACHE_LLC);
 
-	fill_px(vm->dev, pd, scratch_pde);
+	fill_px(i915_vm_to_ppgtt(vm), pd, scratch_pde);
 }
 
 static int __pdp_init(struct drm_device *dev,
@@ -621,7 +622,7 @@ static void gen8_initialize_pdp(struct i915_address_space *vm,
 	scratch_pdpe = gen8_pdpe_encode(px_dma(vm->scratch_pd),
 					I915_CACHE_LLC);
 
-	fill_px(vm->dev, pdp, scratch_pdpe);
+	fill_px(i915_vm_to_ppgtt(vm), pdp, scratch_pdpe);
 }
 
 static void gen8_initialize_pml4(struct i915_address_space *vm,
@@ -632,7 +633,7 @@ static void gen8_initialize_pml4(struct i915_address_space *vm,
 	scratch_pml4e = gen8_pml4e_encode(px_dma(vm->scratch_pdp),
 					  I915_CACHE_LLC);
 
-	fill_px(vm->dev, pml4, scratch_pml4e);
+	fill_px(i915_vm_to_ppgtt(vm), pml4, scratch_pml4e);
 }
 
 static void
@@ -1512,12 +1513,9 @@ static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt *ppgtt)
  */
 static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 {
+	struct drm_i915_private *dev_priv = to_i915(ppgtt->base.dev);
 	int ret;
 
-	ret = gen8_init_scratch(&ppgtt->base);
-	if (ret)
-		return ret;
-
 	ppgtt->base.start = 0;
 	ppgtt->base.cleanup = gen8_ppgtt_cleanup;
 	ppgtt->base.allocate_va_range = gen8_alloc_va_range;
@@ -1527,6 +1525,18 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 	ppgtt->base.bind_vma = ppgtt_bind_vma;
 	ppgtt->debug_dump = gen8_dump_ppgtt;
 
+	/* There are only few exceptions for gen >=6. chv and bxt.
+	 * And we are not sure about the latter so play safe for now.
+	 */
+	if (IS_CHERRYVIEW(dev_priv) || IS_BROXTON(dev_priv))
+		ppgtt->kunmap_page_dma = kunmap_page_dma_flush;
+	else
+		ppgtt->kunmap_page_dma = kunmap_page_dma;
+
+	ret = gen8_init_scratch(&ppgtt->base);
+	if (ret)
+		return ret;
+
 	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
 		ret = setup_px(ppgtt->base.dev, &ppgtt->pml4);
 		if (ret)
@@ -2073,6 +2083,9 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 	int ret;
 
 	ppgtt->base.pte_encode = ggtt->base.pte_encode;
+
+	ppgtt->kunmap_page_dma = kunmap_page_dma;
+
 	if (IS_GEN6(dev)) {
 		ppgtt->switch_mm = gen6_mm_switch;
 	} else if (IS_HASWELL(dev)) {
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 62be77cac5cd..b36b997406c6 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -379,6 +379,7 @@ struct i915_hw_ppgtt {
 
 	gen6_pte_t __iomem *pd_addr;
 
+	void (*kunmap_page_dma)(void *vaddr);
 	int (*enable)(struct i915_hw_ppgtt *ppgtt);
 	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
 			 struct drm_i915_gem_request *req);
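After this patch, callers reach the right unmap flavour through the
kunmap_px() macro with no per-platform check at the call site. A
condensed sketch of the resulting call pattern (the function below is
hypothetical and depends on the patch's kmap_px()/kunmap_px() macros;
the real gen8 clear/insert paths are more involved):

```c
/* kunmap_px() expands to (ppgtt)->kunmap_page_dma(vaddr), which was
 * set once in gen8_ppgtt_init()/gen6_ppgtt_init() to either
 * kunmap_page_dma or kunmap_page_dma_flush.
 */
static void example_write_scratch_ptes(struct i915_hw_ppgtt *ppgtt,
				       struct i915_page_table *pt,
				       gen8_pte_t scratch_pte,
				       unsigned int first,
				       unsigned int count)
{
	gen8_pte_t *pt_vaddr = kmap_px(pt);
	unsigned int i;

	for (i = first; i < first + count; i++)
		pt_vaddr[i] = scratch_pte;

	/* No IS_CHERRYVIEW()/IS_BROXTON() check here any more. */
	kunmap_px(ppgtt, pt_vaddr);
}
```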