diff mbox

drm/i915: Support to create uncached user mapping for a Gem object

Message ID 1414060436-29442-1-git-send-email-akash.goel@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

akash.goel@intel.com Oct. 23, 2014, 10:33 a.m. UTC
From: Akash Goel <akash.goel@intel.com>

This patch provides support to create uncached virtual mappings for a Gem
object. It intends to provide the same funtionality of 'mmap_gtt' interface
without the constraints of a limited aperture space, but provided clients
handles the linear to tile conversion on their own.
This is for improving the CPU write operation performance, as with such
mapping, writes are almost 50% faster than with mmap_gtt. Also it avoids the
Cache flush after update from CPU side, when object is passed onto GPU, which
will be the case if regular mmap ioctl interface is used.
This type of mapping is specially useful in case of sub-region update,
i.e. when only a portion of the object is to be updated.
To ensure the cache coherency, before using this mapping, the GTT domain has
been reused here. This provides the required Cache flush if the object is in
CPU domain or synchronization against the concurrent rendering. Although the
access through an uncached mmap shall automatically invalidate the cache lines,
but this may not be true for non temporal write instructions and also not all
pages of the object be updated at any given point of time through this mapping.
Having a call to get_pages in set_to_gtt_domain function, as added by Chris in
the earlier patch, would guarantee the clflush and so there will be no cache-
lines holding the data for the object before it is accessed through this map.
A new field 'flags' has been added to 'drm_i915_gem_mmap' structure, used in
gem_mmap ioctl, which allows to convey the required mapping type as uncached.
User can query the driver for the support of this mapping through the
get_params. For that a new define I915_PARAM_HAS_UC_MMAP has been added.

Change-Id: Ie883942f9e689525f72fe9a8d3780c3a9faa769a
Signed-off-by: Akash Goel <akash.goel@intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c |  3 +++
 drivers/gpu/drm/i915/i915_gem.c | 17 +++++++++++++++++
 include/uapi/drm/i915_drm.h     |  4 ++++
 3 files changed, 24 insertions(+)

Comments

Chris Wilson Oct. 23, 2014, 10:54 a.m. UTC | #1
On Thu, Oct 23, 2014 at 04:03:56PM +0530, akash.goel@intel.com wrote:
> From: Akash Goel <akash.goel@intel.com>
> 
> This patch provides support to create uncached virtual mappings for a Gem
> object. It intends to provide the same funtionality of 'mmap_gtt' interface
> without the constraints of a limited aperture space, but provided clients
> handles the linear to tile conversion on their own.
> This is for improving the CPU write operation performance, as with such
> mapping, writes are almost 50% faster than with mmap_gtt.

That seems like it should be easy to demonstrate with igt/gem_gtt_speed.
You should also copy igt/gem_mmap_gtt to test the interface, and extend
igt/gem_concurrent_blt to make sure that mmap(wc)+domain vs the GPU is
correct.

> Also it avoids the
> Cache flush after update from CPU side, when object is passed onto GPU, which
> will be the case if regular mmap ioctl interface is used.
                             ^CPU

> This type of mapping is specially useful in case of sub-region update,
> i.e. when only a portion of the object is to be updated.
> To ensure the cache coherency, before using this mapping, the GTT domain has
> been reused here. This provides the required Cache flush if the object is in
> CPU domain or synchronization against the concurrent rendering. Although the
> access through an uncached mmap shall automatically invalidate the cache lines,
> but this may not be true for non temporal write instructions and also not all
> pages of the object be updated at any given point of time through this mapping.
> Having a call to get_pages in set_to_gtt_domain function, as added by Chris in
> the earlier patch, would guarantee the clflush and so there will be no cache-
> lines holding the data for the object before it is accessed through this map.
> A new field 'flags' has been added to 'drm_i915_gem_mmap' structure, used in
> gem_mmap ioctl, which allows to convey the required mapping type as uncached.
> User can query the driver for the support of this mapping through the
> get_params. For that a new define I915_PARAM_HAS_UC_MMAP has been added.

I quibble over using the term uncached here, as what we mean is
write-combining.  They are quite different in terms of memory access!

I've put together a usecase in
http://cgit.freedesktop.org/~ickle/xf86-video-intel/commit/?h=wc-mmap&id=118e98a2d448469064cdcedf2013a91c3a69dab4
which I'll test shortly. It uses the new mmapping to allow direct access
to larger than aperture buffers, prefers the unbound access for linear
buffers and allows manual detiled uploads on !llc platforms.

> Change-Id: Ie883942f9e689525f72fe9a8d3780c3a9faa769a
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |  3 +++
>  drivers/gpu/drm/i915/i915_gem.c | 17 +++++++++++++++++
>  include/uapi/drm/i915_drm.h     |  4 ++++
>  3 files changed, 24 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 1b39807..2d8191a 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1027,6 +1027,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
>  	case I915_PARAM_CMD_PARSER_VERSION:
>  		value = i915_cmd_parser_get_version();
>  		break;
> +	case I915_PARAM_HAS_UC_MMAP:
> +		value = 1;
> +		break;
>  	default:
>  		DRM_DEBUG("Unknown parameter %d\n", param->param);
>  		return -EINVAL;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1b192d4..16b267b 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1492,6 +1492,23 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
>  	addr = vm_mmap(obj->filp, 0, args->size,
>  		       PROT_READ | PROT_WRITE, MAP_SHARED,
>  		       args->offset);
> +
> +	if (args->flags & I915_GEM_USE_UNCACHED_MMAP) {
> +		struct mm_struct *mm = current->mm;
> +		struct vm_area_struct *vma;

Newline please between variables and code.

> +		down_write(&mm->mmap_sem);
> +		vma = find_vma(mm, addr);
> +		if (!vma) {
> +			drm_gem_object_unreference_unlocked(obj);
> +			return -EINVAL;
Eek returning with mmap_sem held.

> +		}
> +		/* Change the page attribute to uncached (along with
> +		 * write-combinning to get better performance) */
> +		vma->vm_page_prot =
> +			pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
> +		up_write(&mm->mmap_sem);

Perhaps,

down_write();
vma = find_vma(mm, addr);
if (vma && cpu_has_pat) 
	vma->vm_page_prot = pgprot_writecombing();
else
	addr = ERR_PTR(-ENODEV);
up_write();

I am pretty sure we can't do this trick on really old CPUs!
-Chris
Chris Wilson Oct. 23, 2014, 11:03 a.m. UTC | #2
On Thu, Oct 23, 2014 at 04:03:56PM +0530, akash.goel@intel.com wrote:
>  drivers/gpu/drm/i915/i915_dma.c |  3 +++
>  drivers/gpu/drm/i915/i915_gem.c | 17 +++++++++++++++++
>  include/uapi/drm/i915_drm.h     |  4 ++++
>  3 files changed, 24 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 1b39807..2d8191a 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1027,6 +1027,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
>  	case I915_PARAM_CMD_PARSER_VERSION:
>  		value = i915_cmd_parser_get_version();
>  		break;
> +	case I915_PARAM_HAS_UC_MMAP:
> +		value = 1;
> +		break;
>  	default:
>  		DRM_DEBUG("Unknown parameter %d\n", param->param);
>  		return -EINVAL;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1b192d4..16b267b 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1492,6 +1492,23 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
>  	addr = vm_mmap(obj->filp, 0, args->size,
>  		       PROT_READ | PROT_WRITE, MAP_SHARED,
>  		       args->offset);

Forgot to mutter about:

if (args->flags & INVALID_FLAGS)
	return -EINVAL;

and the param then makes better sense as
I915_PARAM_HAS_EXT_MMAP (extended mmap interface, which we can then
version)

In future we can test for EXT_MMAP support and
mmap(RANDOM_FLAG) returning EINVAL for any new extension
flags.

> +
> +	if (args->flags & I915_GEM_USE_UNCACHED_MMAP) {
> +		struct mm_struct *mm = current->mm;
> +		struct vm_area_struct *vma;
> +		down_write(&mm->mmap_sem);
> +		vma = find_vma(mm, addr);
> +		if (!vma) {
> +			drm_gem_object_unreference_unlocked(obj);
> +			return -EINVAL;
> +		}
> +		/* Change the page attribute to uncached (along with
> +		 * write-combinning to get better performance) */
> +		vma->vm_page_prot =
> +			pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
> +		up_write(&mm->mmap_sem);
> +	}
> +
>  	drm_gem_object_unreference_unlocked(obj);
>  	if (IS_ERR((void *)addr))
>  		return addr;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index ff57f07..3d0b1c0 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -340,6 +340,7 @@ typedef struct drm_i915_irq_wait {
>  #define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
>  #define I915_PARAM_HAS_WT     	 	 27
>  #define I915_PARAM_CMD_PARSER_VERSION	 28
> +#define I915_PARAM_HAS_UC_MMAP  	 29
>  
>  typedef struct drm_i915_getparam {
>  	int param;
> @@ -487,6 +488,9 @@ struct drm_i915_gem_mmap {
>  	 * This is a fixed-size type for 32/64 compatibility.
>  	 */
>  	__u64 addr_ptr;
> +
> +#define I915_GEM_USE_UNCACHED_MMAP (1<<0)

#define I915_MMAP_WC is shorter
-Chris
Chris Wilson Oct. 23, 2014, 11:56 a.m. UTC | #3
On Thu, Oct 23, 2014 at 04:03:56PM +0530, akash.goel@intel.com wrote:
> From: Akash Goel <akash.goel@intel.com>
> 

> This is for improving the CPU write operation performance, as with such
> mapping, writes are almost 50% faster than with mmap_gtt. Also it avoids the

I doubt it is the actual write that is faster. For example,

   gtt       wc    Operation
--------  ------   ---------
424000.0    1.30   ShmPutImage 10x10 square 
 29500.0    1.42   ShmPutImage 100x100 square 
  1510.0    0.95   ShmPutImage 500x500 square 

It seems to reduce the overhead for small transfers (with an already
mmaped pointer). That's interesting certainly, and probably a Clue for
further improving performance. But it looks like peak throughput is
limited by memory bandwidth, which has been my experience with the GTT
mmap thus far.

I have some doubts as to whether it is coherent with the display though,
and so whether it is truly write-combining...
-Chris
Chris Wilson Oct. 23, 2014, 1:23 p.m. UTC | #4
On Thu, Oct 23, 2014 at 12:56:46PM +0100, Chris Wilson wrote:
> On Thu, Oct 23, 2014 at 04:03:56PM +0530, akash.goel@intel.com wrote:
> > From: Akash Goel <akash.goel@intel.com>
> > 
> 
> > This is for improving the CPU write operation performance, as with such
> > mapping, writes are almost 50% faster than with mmap_gtt. Also it avoids the
> 
> I doubt it is the actual write that is faster. For example,
> 
>    gtt       wc    Operation
> --------  ------   ---------
> 424000.0    1.30   ShmPutImage 10x10 square 
>  29500.0    1.42   ShmPutImage 100x100 square 
>   1510.0    0.95   ShmPutImage 500x500 square 

Oops, bugs beware. I wasn't calling the ioctl correctly.
Having re-run the test, I now get:

   gtt       wc    Operation
--------  ------   ---------
424000.0    1.30   ShmPutImage 10x10 square 
 29500.0    1.44   ShmPutImage 100x100 square 
  1510.0    1.38   ShmPutImage 500x500 square 

Supporting your claims. Very nice.
-Chris
Daniel Vetter Oct. 24, 2014, 8:40 a.m. UTC | #5
On Thu, Oct 23, 2014 at 04:03:56PM +0530, akash.goel@intel.com wrote:
> From: Akash Goel <akash.goel@intel.com>
> 
> This patch provides support to create uncached virtual mappings for a Gem
> object. It intends to provide the same funtionality of 'mmap_gtt' interface
> without the constraints of a limited aperture space, but provided clients
> handles the linear to tile conversion on their own.
> This is for improving the CPU write operation performance, as with such
> mapping, writes are almost 50% faster than with mmap_gtt. Also it avoids the
> Cache flush after update from CPU side, when object is passed onto GPU, which
> will be the case if regular mmap ioctl interface is used.
> This type of mapping is specially useful in case of sub-region update,
> i.e. when only a portion of the object is to be updated.
> To ensure the cache coherency, before using this mapping, the GTT domain has
> been reused here. This provides the required Cache flush if the object is in
> CPU domain or synchronization against the concurrent rendering. Although the
> access through an uncached mmap shall automatically invalidate the cache lines,
> but this may not be true for non temporal write instructions and also not all
> pages of the object be updated at any given point of time through this mapping.
> Having a call to get_pages in set_to_gtt_domain function, as added by Chris in
> the earlier patch, would guarantee the clflush and so there will be no cache-
> lines holding the data for the object before it is accessed through this map.
> A new field 'flags' has been added to 'drm_i915_gem_mmap' structure, used in
> gem_mmap ioctl, which allows to convey the required mapping type as uncached.
> User can query the driver for the support of this mapping through the
> get_params. For that a new define I915_PARAM_HAS_UC_MMAP has been added.
> 
> Change-Id: Ie883942f9e689525f72fe9a8d3780c3a9faa769a
> Signed-off-by: Akash Goel <akash.goel@intel.com>

Yeah, I like this. And Chris has already gone ahead and added all the
functional tests. So here's what seems to still be missing:
- Reviewing Chris' patches. Akash, can you please do that? This kind of
  cross-review usually works best.
- Polish for Akash' patch according to Chris' review. Commit message also
  needs to gain the performance data. And we need to have a "does PAT
  work" test I think like Chris suggested.
- ioctl argument check tests for the cpu mmap ioctl. We currently have
  absotutely nothing here (one of the very few cases left). So Akash, can
  you please take the AR to write a new igt testcase which does all the
  usual argument checking test like invalid buffer, invalid flags?

Cheers, Daniel
Chris Wilson Oct. 24, 2014, 9:23 a.m. UTC | #6
On Fri, Oct 24, 2014 at 10:40:10AM +0200, Daniel Vetter wrote:
> - Polish for Akash' patch according to Chris' review. Commit message also
>   needs to gain the performance data. And we need to have a "does PAT
>   work" test I think like Chris suggested.

Done, see v2 I sent. Commit message contains relative performance data,
and that holds true for snb,ivb,byt,bdw. pnv didn't show improvement
and I suspect neither will gen5-. Performance in igt/gem_gtt_speed and
x11perf show similar trends.
-Chris
Chad Versace Dec. 10, 2014, 4:48 a.m. UTC | #7
On 10/23/2014 06:23 AM, Chris Wilson wrote:
> On Thu, Oct 23, 2014 at 12:56:46PM +0100, Chris Wilson wrote:
>> On Thu, Oct 23, 2014 at 04:03:56PM +0530, akash.goel@intel.com wrote:
>>> From: Akash Goel <akash.goel@intel.com>
>>>
>>
>>> This is for improving the CPU write operation performance, as with such
>>> mapping, writes are almost 50% faster than with mmap_gtt. Also it avoids the


> Oops, bugs beware. I wasn't calling the ioctl correctly.
> Having re-run the test, I now get:
> 
>    gtt       wc    Operation
> --------  ------   ---------
> 424000.0    1.30   ShmPutImage 10x10 square 
>  29500.0    1.44   ShmPutImage 100x100 square 
>   1510.0    1.38   ShmPutImage 500x500 square 
> 
> Supporting your claims. Very nice.

What are the units for these numbers?
Chris Wilson Dec. 10, 2014, 8:02 a.m. UTC | #8
On Tue, Dec 09, 2014 at 08:48:38PM -0800, Chad Versace wrote:
> 
> 
> On 10/23/2014 06:23 AM, Chris Wilson wrote:
> > On Thu, Oct 23, 2014 at 12:56:46PM +0100, Chris Wilson wrote:
> >> On Thu, Oct 23, 2014 at 04:03:56PM +0530, akash.goel@intel.com wrote:
> >>> From: Akash Goel <akash.goel@intel.com>
> >>>
> >>
> >>> This is for improving the CPU write operation performance, as with such
> >>> mapping, writes are almost 50% faster than with mmap_gtt. Also it avoids the
> 
> 
> > Oops, bugs beware. I wasn't calling the ioctl correctly.
> > Having re-run the test, I now get:
> > 
> >    gtt       wc    Operation
> > --------  ------   ---------
> > 424000.0    1.30   ShmPutImage 10x10 square 
> >  29500.0    1.44   ShmPutImage 100x100 square 
> >   1510.0    1.38   ShmPutImage 500x500 square 
> > 
> > Supporting your claims. Very nice.
> 
> What are the units for these numbers?

ops/s, relative factor on a baby byt.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 1b39807..2d8191a 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1027,6 +1027,9 @@  static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_CMD_PARSER_VERSION:
 		value = i915_cmd_parser_get_version();
 		break;
+	case I915_PARAM_HAS_UC_MMAP:
+		value = 1;
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1b192d4..16b267b 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1492,6 +1492,23 @@  i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
 	addr = vm_mmap(obj->filp, 0, args->size,
 		       PROT_READ | PROT_WRITE, MAP_SHARED,
 		       args->offset);
+
+	if (args->flags & I915_GEM_USE_UNCACHED_MMAP) {
+		struct mm_struct *mm = current->mm;
+		struct vm_area_struct *vma;
+		down_write(&mm->mmap_sem);
+		vma = find_vma(mm, addr);
+		if (!vma) {
+			drm_gem_object_unreference_unlocked(obj);
+			return -EINVAL;
+		}
+		/* Change the page attribute to uncached (along with
+		 * write-combinning to get better performance) */
+		vma->vm_page_prot =
+			pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+		up_write(&mm->mmap_sem);
+	}
+
 	drm_gem_object_unreference_unlocked(obj);
 	if (IS_ERR((void *)addr))
 		return addr;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index ff57f07..3d0b1c0 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -340,6 +340,7 @@  typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
 #define I915_PARAM_HAS_WT     	 	 27
 #define I915_PARAM_CMD_PARSER_VERSION	 28
+#define I915_PARAM_HAS_UC_MMAP  	 29
 
 typedef struct drm_i915_getparam {
 	int param;
@@ -487,6 +488,9 @@  struct drm_i915_gem_mmap {
 	 * This is a fixed-size type for 32/64 compatibility.
 	 */
 	__u64 addr_ptr;
+
+#define I915_GEM_USE_UNCACHED_MMAP (1<<0)
+	__u64 flags;
 };
 
 struct drm_i915_gem_mmap_gtt {