Message ID | 1344696088-24760-23-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Sat, Aug 11, 2012 at 03:41:21PM +0100, Chris Wilson wrote: > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> What about putting kmap/unmap abstractions into obj->ops (like the dma_buf interface already has)? Since the pwrite/pread code is already rather branch heave I hope we don't see the overhead of the indirect call even in microbenchmarks (haven't checked). And this way we would also neatly wrap up dma_bufs for pwrite (if anyone ever really wants that ...). The kmap(_atomic) for stolen mem backed objects would boil down to doing the pointer arithmetic, kunmap would be just a noop. Cheers, Daniel > --- > drivers/gpu/drm/i915/i915_gem.c | 169 +++++++++++++++++++++++++-------------- > 1 file changed, 111 insertions(+), 58 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 552f95b..a2fb2aa 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -664,19 +664,17 @@ out: > * needs_clflush_before is set and flushes out any written cachelines after > * writing if needs_clflush is set. */ > static int > -shmem_pwrite_fast(struct page *page, int shmem_page_offset, int page_length, > +shmem_pwrite_fast(char *vaddr, int shmem_page_offset, int page_length, > char __user *user_data, > bool page_do_bit17_swizzling, > bool needs_clflush_before, > bool needs_clflush_after) > { > - char *vaddr; > int ret; > > if (unlikely(page_do_bit17_swizzling)) > return -EINVAL; > > - vaddr = kmap_atomic(page); > if (needs_clflush_before) > drm_clflush_virt_range(vaddr + shmem_page_offset, > page_length); > @@ -686,7 +684,6 @@ shmem_pwrite_fast(struct page *page, int shmem_page_offset, int page_length, > if (needs_clflush_after) > drm_clflush_virt_range(vaddr + shmem_page_offset, > page_length); > - kunmap_atomic(vaddr); > > return ret ? 
-EFAULT : 0; > } > @@ -694,16 +691,14 @@ shmem_pwrite_fast(struct page *page, int shmem_page_offset, int page_length, > /* Only difference to the fast-path function is that this can handle bit17 > * and uses non-atomic copy and kmap functions. */ > static int > -shmem_pwrite_slow(struct page *page, int shmem_page_offset, int page_length, > +shmem_pwrite_slow(char *vaddr, int shmem_page_offset, int page_length, > char __user *user_data, > bool page_do_bit17_swizzling, > bool needs_clflush_before, > bool needs_clflush_after) > { > - char *vaddr; > int ret; > > - vaddr = kmap(page); > if (unlikely(needs_clflush_before || page_do_bit17_swizzling)) > shmem_clflush_swizzled_range(vaddr + shmem_page_offset, > page_length, > @@ -720,7 +715,6 @@ shmem_pwrite_slow(struct page *page, int shmem_page_offset, int page_length, > shmem_clflush_swizzled_range(vaddr + shmem_page_offset, > page_length, > page_do_bit17_swizzling); > - kunmap(page); > > return ret ? -EFAULT : 0; > } > @@ -731,6 +725,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, > struct drm_i915_gem_pwrite *args, > struct drm_file *file) > { > + struct drm_i915_private *dev_priv = dev->dev_private; > ssize_t remain; > loff_t offset; > char __user *user_data; > @@ -770,74 +765,132 @@ i915_gem_shmem_pwrite(struct drm_device *dev, > if (ret) > return ret; > > - i915_gem_object_pin_pages(obj); > - > offset = args->offset; > obj->dirty = 1; > > - for_each_sg(obj->pages->sgl, sg, obj->pages->nents, i) { > - struct page *page; > - int partial_cacheline_write; > + if (obj->stolen) { > + while (remain > 0) { > + char *vaddr; > + int partial_cacheline_write; > > - if (i < offset >> PAGE_SHIFT) > - continue; > + /* Operation in this page > + * > + * shmem_page_offset = offset within page in shmem file > + * page_length = bytes to copy for this page > + */ > + shmem_page_offset = offset_in_page(offset); > > - if (remain <= 0) > - break; > + page_length = remain; > + if ((shmem_page_offset + page_length) > PAGE_SIZE) > + 
page_length = PAGE_SIZE - shmem_page_offset; > > - /* Operation in this page > - * > - * shmem_page_offset = offset within page in shmem file > - * page_length = bytes to copy for this page > - */ > - shmem_page_offset = offset_in_page(offset); > + /* If we don't overwrite a cacheline completely we need to be > + * careful to have up-to-date data by first clflushing. Don't > + * overcomplicate things and flush the entire patch. */ > + partial_cacheline_write = needs_clflush_before && > + ((shmem_page_offset | page_length) > + & (boot_cpu_data.x86_clflush_size - 1)); > > - page_length = remain; > - if ((shmem_page_offset + page_length) > PAGE_SIZE) > - page_length = PAGE_SIZE - shmem_page_offset; > + vaddr = (char *)(dev_priv->mm.stolen_base + obj->stolen->start + offset); > + page_do_bit17_swizzling = obj_do_bit17_swizzling && > + ((uintptr_t)vaddr & (1 << 17)) != 0; > > - /* If we don't overwrite a cacheline completely we need to be > - * careful to have up-to-date data by first clflushing. Don't > - * overcomplicate things and flush the entire patch. 
*/ > - partial_cacheline_write = needs_clflush_before && > - ((shmem_page_offset | page_length) > - & (boot_cpu_data.x86_clflush_size - 1)); > + ret = shmem_pwrite_fast(vaddr, shmem_page_offset, page_length, > + user_data, page_do_bit17_swizzling, > + partial_cacheline_write, > + needs_clflush_after); > > - page = sg_page(sg); > - page_do_bit17_swizzling = obj_do_bit17_swizzling && > - (page_to_phys(page) & (1 << 17)) != 0; > + if (ret == 0) > + goto next_stolen; > > - ret = shmem_pwrite_fast(page, shmem_page_offset, page_length, > - user_data, page_do_bit17_swizzling, > - partial_cacheline_write, > - needs_clflush_after); > - if (ret == 0) > - goto next_page; > + hit_slowpath = 1; > + mutex_unlock(&dev->struct_mutex); > > - hit_slowpath = 1; > - mutex_unlock(&dev->struct_mutex); > - ret = shmem_pwrite_slow(page, shmem_page_offset, page_length, > - user_data, page_do_bit17_swizzling, > - partial_cacheline_write, > - needs_clflush_after); > + ret = shmem_pwrite_slow(vaddr, shmem_page_offset, page_length, > + user_data, page_do_bit17_swizzling, > + partial_cacheline_write, > + needs_clflush_after); > > - mutex_lock(&dev->struct_mutex); > + mutex_lock(&dev->struct_mutex); > + if (ret) > + goto out; > > -next_page: > - set_page_dirty(page); > - mark_page_accessed(page); > +next_stolen: > + remain -= page_length; > + user_data += page_length; > + offset += page_length; > + } > + } else { > + i915_gem_object_pin_pages(obj); > > - if (ret) > - goto out; > + for_each_sg(obj->pages->sgl, sg, obj->pages->nents, i) { > + struct page *page; > + char *vaddr; > + int partial_cacheline_write; > > - remain -= page_length; > - user_data += page_length; > - offset += page_length; > + if (i < offset >> PAGE_SHIFT) > + continue; > + > + if (remain <= 0) > + break; > + > + /* Operation in this page > + * > + * shmem_page_offset = offset within page in shmem file > + * page_length = bytes to copy for this page > + */ > + shmem_page_offset = offset_in_page(offset); > + > + page_length = 
remain; > + if ((shmem_page_offset + page_length) > PAGE_SIZE) > + page_length = PAGE_SIZE - shmem_page_offset; > + > + /* If we don't overwrite a cacheline completely we need to be > + * careful to have up-to-date data by first clflushing. Don't > + * overcomplicate things and flush the entire patch. */ > + partial_cacheline_write = needs_clflush_before && > + ((shmem_page_offset | page_length) > + & (boot_cpu_data.x86_clflush_size - 1)); > + > + page = sg_page(sg); > + page_do_bit17_swizzling = obj_do_bit17_swizzling && > + (page_to_phys(page) & (1 << 17)) != 0; > + > + vaddr = kmap_atomic(page); > + ret = shmem_pwrite_fast(vaddr, shmem_page_offset, page_length, > + user_data, page_do_bit17_swizzling, > + partial_cacheline_write, > + needs_clflush_after); > + > + kunmap_atomic(vaddr); > + > + if (ret == 0) > + goto next_page; > + > + hit_slowpath = 1; > + mutex_unlock(&dev->struct_mutex); > + > + vaddr = kmap(page); > + ret = shmem_pwrite_slow(vaddr, shmem_page_offset, page_length, > + user_data, page_do_bit17_swizzling, > + partial_cacheline_write, > + needs_clflush_after); > + kunmap(page); > + > + mutex_lock(&dev->struct_mutex); > + if (ret) > + goto out_unpin; > + > +next_page: > + remain -= page_length; > + user_data += page_length; > + offset += page_length; > + } > +out_unpin: > + i915_gem_object_unpin_pages(obj); > } > > out: > - i915_gem_object_unpin_pages(obj); > - > if (hit_slowpath) { > /* Fixup: Kill any reinstated backing storage pages */ > if (obj->madv == __I915_MADV_PURGED) > -- > 1.7.10.4 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Mon, 20 Aug 2012 21:56:08 +0200, Daniel Vetter <daniel@ffwll.ch> wrote: > On Sat, Aug 11, 2012 at 03:41:21PM +0100, Chris Wilson wrote: > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > What about putting kmap/unmap abstractions into obj->ops (like the dma_buf > interface already has)? Since the pwrite/pread code is already rather > branch heavy I hope we don't see the overhead of the indirect call even > in microbenchmarks (haven't checked). And this way we would also neatly > wrap up dma_bufs for pwrite (if anyone ever really wants that ...). > > The kmap(_atomic) for stolen mem backed objects would boil down to doing > the pointer arithmetic, kunmap would be just a noop. Sounds nice, I'll cook something up and allocate yet another pointer in drm_i915_gem_object for the typed ops. I wonder if we can unify the phys and the dma_buf... -Chris
On Mon, 20 Aug 2012 21:56:08 +0200, Daniel Vetter <daniel@ffwll.ch> wrote: > On Sat, Aug 11, 2012 at 03:41:21PM +0100, Chris Wilson wrote: > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > What about putting kmap/unmap abstractions into obj->ops (like the dma_buf > interface already has)? Since the pwrite/pread code is already rather > branch heavy I hope we don't see the overhead of the indirect call even > in microbenchmarks (haven't checked). And this way we would also neatly > wrap up dma_bufs for pwrite (if anyone ever really wants that ...). > > The kmap(_atomic) for stolen mem backed objects would boil down to doing > the pointer arithmetic, kunmap would be just a noop. Tried doing so. The lack of struct page for the stolen makes it more cumbersome than it is worth, and, worse, confusing. -Chris
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 552f95b..a2fb2aa 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -664,19 +664,17 @@ out: * needs_clflush_before is set and flushes out any written cachelines after * writing if needs_clflush is set. */ static int -shmem_pwrite_fast(struct page *page, int shmem_page_offset, int page_length, +shmem_pwrite_fast(char *vaddr, int shmem_page_offset, int page_length, char __user *user_data, bool page_do_bit17_swizzling, bool needs_clflush_before, bool needs_clflush_after) { - char *vaddr; int ret; if (unlikely(page_do_bit17_swizzling)) return -EINVAL; - vaddr = kmap_atomic(page); if (needs_clflush_before) drm_clflush_virt_range(vaddr + shmem_page_offset, page_length); @@ -686,7 +684,6 @@ shmem_pwrite_fast(struct page *page, int shmem_page_offset, int page_length, if (needs_clflush_after) drm_clflush_virt_range(vaddr + shmem_page_offset, page_length); - kunmap_atomic(vaddr); return ret ? -EFAULT : 0; } @@ -694,16 +691,14 @@ shmem_pwrite_fast(struct page *page, int shmem_page_offset, int page_length, /* Only difference to the fast-path function is that this can handle bit17 * and uses non-atomic copy and kmap functions. */ static int -shmem_pwrite_slow(struct page *page, int shmem_page_offset, int page_length, +shmem_pwrite_slow(char *vaddr, int shmem_page_offset, int page_length, char __user *user_data, bool page_do_bit17_swizzling, bool needs_clflush_before, bool needs_clflush_after) { - char *vaddr; int ret; - vaddr = kmap(page); if (unlikely(needs_clflush_before || page_do_bit17_swizzling)) shmem_clflush_swizzled_range(vaddr + shmem_page_offset, page_length, @@ -720,7 +715,6 @@ shmem_pwrite_slow(struct page *page, int shmem_page_offset, int page_length, shmem_clflush_swizzled_range(vaddr + shmem_page_offset, page_length, page_do_bit17_swizzling); - kunmap(page); return ret ? 
-EFAULT : 0; } @@ -731,6 +725,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, struct drm_i915_gem_pwrite *args, struct drm_file *file) { + struct drm_i915_private *dev_priv = dev->dev_private; ssize_t remain; loff_t offset; char __user *user_data; @@ -770,74 +765,132 @@ i915_gem_shmem_pwrite(struct drm_device *dev, if (ret) return ret; - i915_gem_object_pin_pages(obj); - offset = args->offset; obj->dirty = 1; - for_each_sg(obj->pages->sgl, sg, obj->pages->nents, i) { - struct page *page; - int partial_cacheline_write; + if (obj->stolen) { + while (remain > 0) { + char *vaddr; + int partial_cacheline_write; - if (i < offset >> PAGE_SHIFT) - continue; + /* Operation in this page + * + * shmem_page_offset = offset within page in shmem file + * page_length = bytes to copy for this page + */ + shmem_page_offset = offset_in_page(offset); - if (remain <= 0) - break; + page_length = remain; + if ((shmem_page_offset + page_length) > PAGE_SIZE) + page_length = PAGE_SIZE - shmem_page_offset; - /* Operation in this page - * - * shmem_page_offset = offset within page in shmem file - * page_length = bytes to copy for this page - */ - shmem_page_offset = offset_in_page(offset); + /* If we don't overwrite a cacheline completely we need to be + * careful to have up-to-date data by first clflushing. Don't + * overcomplicate things and flush the entire patch. */ + partial_cacheline_write = needs_clflush_before && + ((shmem_page_offset | page_length) + & (boot_cpu_data.x86_clflush_size - 1)); - page_length = remain; - if ((shmem_page_offset + page_length) > PAGE_SIZE) - page_length = PAGE_SIZE - shmem_page_offset; + vaddr = (char *)(dev_priv->mm.stolen_base + obj->stolen->start + offset); + page_do_bit17_swizzling = obj_do_bit17_swizzling && + ((uintptr_t)vaddr & (1 << 17)) != 0; - /* If we don't overwrite a cacheline completely we need to be - * careful to have up-to-date data by first clflushing. Don't - * overcomplicate things and flush the entire patch. 
*/ - partial_cacheline_write = needs_clflush_before && - ((shmem_page_offset | page_length) - & (boot_cpu_data.x86_clflush_size - 1)); + ret = shmem_pwrite_fast(vaddr, shmem_page_offset, page_length, + user_data, page_do_bit17_swizzling, + partial_cacheline_write, + needs_clflush_after); - page = sg_page(sg); - page_do_bit17_swizzling = obj_do_bit17_swizzling && - (page_to_phys(page) & (1 << 17)) != 0; + if (ret == 0) + goto next_stolen; - ret = shmem_pwrite_fast(page, shmem_page_offset, page_length, - user_data, page_do_bit17_swizzling, - partial_cacheline_write, - needs_clflush_after); - if (ret == 0) - goto next_page; + hit_slowpath = 1; + mutex_unlock(&dev->struct_mutex); - hit_slowpath = 1; - mutex_unlock(&dev->struct_mutex); - ret = shmem_pwrite_slow(page, shmem_page_offset, page_length, - user_data, page_do_bit17_swizzling, - partial_cacheline_write, - needs_clflush_after); + ret = shmem_pwrite_slow(vaddr, shmem_page_offset, page_length, + user_data, page_do_bit17_swizzling, + partial_cacheline_write, + needs_clflush_after); - mutex_lock(&dev->struct_mutex); + mutex_lock(&dev->struct_mutex); + if (ret) + goto out; -next_page: - set_page_dirty(page); - mark_page_accessed(page); +next_stolen: + remain -= page_length; + user_data += page_length; + offset += page_length; + } + } else { + i915_gem_object_pin_pages(obj); - if (ret) - goto out; + for_each_sg(obj->pages->sgl, sg, obj->pages->nents, i) { + struct page *page; + char *vaddr; + int partial_cacheline_write; - remain -= page_length; - user_data += page_length; - offset += page_length; + if (i < offset >> PAGE_SHIFT) + continue; + + if (remain <= 0) + break; + + /* Operation in this page + * + * shmem_page_offset = offset within page in shmem file + * page_length = bytes to copy for this page + */ + shmem_page_offset = offset_in_page(offset); + + page_length = remain; + if ((shmem_page_offset + page_length) > PAGE_SIZE) + page_length = PAGE_SIZE - shmem_page_offset; + + /* If we don't overwrite a cacheline 
completely we need to be + * careful to have up-to-date data by first clflushing. Don't + * overcomplicate things and flush the entire patch. */ + partial_cacheline_write = needs_clflush_before && + ((shmem_page_offset | page_length) + & (boot_cpu_data.x86_clflush_size - 1)); + + page = sg_page(sg); + page_do_bit17_swizzling = obj_do_bit17_swizzling && + (page_to_phys(page) & (1 << 17)) != 0; + + vaddr = kmap_atomic(page); + ret = shmem_pwrite_fast(vaddr, shmem_page_offset, page_length, + user_data, page_do_bit17_swizzling, + partial_cacheline_write, + needs_clflush_after); + + kunmap_atomic(vaddr); + + if (ret == 0) + goto next_page; + + hit_slowpath = 1; + mutex_unlock(&dev->struct_mutex); + + vaddr = kmap(page); + ret = shmem_pwrite_slow(vaddr, shmem_page_offset, page_length, + user_data, page_do_bit17_swizzling, + partial_cacheline_write, + needs_clflush_after); + kunmap(page); + + mutex_lock(&dev->struct_mutex); + if (ret) + goto out_unpin; + +next_page: + remain -= page_length; + user_data += page_length; + offset += page_length; + } +out_unpin: + i915_gem_object_unpin_pages(obj); } out: - i915_gem_object_unpin_pages(obj); - if (hit_slowpath) { /* Fixup: Kill any reinstated backing storage pages */ if (obj->madv == __I915_MADV_PURGED)
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_gem.c | 169 +++++++++++++++++++++++++-------------- 1 file changed, 111 insertions(+), 58 deletions(-)