[RFC,3/5] drm/i915: Add support for CPU mapping to DRM_IOCTL_I915_GEM_MMAP_GTT
diff mbox

Message ID 1453820013-3908-4-git-send-email-tvrtko.ursulin@linux.intel.com
State New
Headers show

Commit Message

Tvrtko Ursulin Jan. 26, 2016, 2:53 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 96 ++++++++++++++++++++++++++++++++++++++---
 include/uapi/drm/i915_drm.h     |  3 ++
 2 files changed, 93 insertions(+), 6 deletions(-)

Comments

Chris Wilson Jan. 26, 2016, 3:10 p.m. UTC | #1
On Tue, Jan 26, 2016 at 02:53:31PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c | 96 ++++++++++++++++++++++++++++++++++++++---
>  include/uapi/drm/i915_drm.h     |  3 ++
>  2 files changed, 93 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index dacf6a0013c5..039d55a49fc6 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1954,6 +1954,60 @@ out:
>  	return i915_gem_ret_to_vm_ret(dev_priv, ret);
>  }
>  
> +static int
> +i915_gem_cpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	struct drm_i915_gem_object *obj = to_intel_bo(vma->vm_private_data);
> +	struct drm_device *dev = obj->base.dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
> +	pgoff_t page_offset;
> +	struct page *page;
> +	int ret;
> +
> +	/* We don't use vmf->pgoff since that has the fake offset */
> +	page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
> +			PAGE_SHIFT;
> +
> +	trace_i915_gem_object_fault(obj, page_offset, true, write);
> +
> +	intel_runtime_pm_get(dev_priv);
> +
> +	ret = i915_mutex_lock_interruptible(dev);
> +	if (ret)
> +		goto out;
> +
> +	ret = i915_gem_object_set_to_cpu_domain(obj, write);
> +	if (ret)
> +		goto out_unlock;

That was a mistake in the GTT gem_fault(). If you do this, we also want
the nonblocking wait for obvious reasons.

> +	ret = i915_gem_object_get_pages(obj);
> +	if (ret)
> +		goto out_unlock;
> +
> +	page = i915_gem_object_get_page(obj, page_offset);
> +	if (!page) {
> +		ret = -ERANGE;
> +		goto out_unlock;
> +	}
> +
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
> +			    page_to_pfn(page));

We don't have a page ref at this point, so this obj+page could be
freed (via the shrinker at least) before we insert it.

I would also be more interested in having a version that faulted the
entire object at once - though maybe we will see more random access in
future.

> +	intel_runtime_pm_put(dev_priv);
> +
> +	return i915_gem_ret_to_vm_ret(dev_priv, ret);
> +
> +out_unlock:
> +	mutex_unlock(&dev->struct_mutex);
> +out:
> +	intel_runtime_pm_put(dev_priv);
> +
> +	return i915_gem_ret_to_vm_ret(dev_priv, ret);
> +}
> +
>  /**
>   * i915_gem_release_mmap - remove physical page mappings
>   * @obj: obj in question
> @@ -2078,11 +2132,18 @@ static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
>  	drm_gem_free_mmap_offset(&obj->base);
>  }
>  
> -int
> -i915_gem_mmap_gtt(struct drm_file *file,
> -		  struct drm_device *dev,
> -		  uint32_t handle,
> -		  uint64_t *offset)
> +static const struct vm_operations_struct i915_gem_cpu_vm_ops = {
> +	.fault = i915_gem_cpu_fault,
> +	.open = drm_gem_vm_open,
> +	.close = drm_gem_vm_close,
> +};
> +
> +static int
> +i915_gem_mmap(struct drm_file *file,
> +	      struct drm_device *dev,
> +	      uint32_t handle,
> +	      uint32_t flags,
> +	      uint64_t *offset)
>  {
>  	struct drm_i915_gem_object *obj;
>  	int ret;
> @@ -2103,10 +2164,23 @@ i915_gem_mmap_gtt(struct drm_file *file,
>  		goto out;
>  	}
>  
> +	if (!obj->base.filp && (flags & I915_MMAP2_CPU)) {
> +		DRM_DEBUG("Attempting to mmap non-shm based object via CPU!\n");
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
>  	ret = i915_gem_object_create_mmap_offset(obj);
>  	if (ret)
>  		goto out;
>  
> +	if (flags & I915_MMAP2_CPU) {
> +		ret = drm_vma_node_set_vm_ops(&obj->base.vma_node,
> +					      &i915_gem_cpu_vm_ops);
> +		if (ret)
> +			goto out;
> +	}

We would also need a WC equivalent.

It looks fairly sane. I wanted this just a short while ago, but figured
out a way of using regular mmap() to give me the inheritance instead.
-Chris
Tvrtko Ursulin Jan. 26, 2016, 4:23 p.m. UTC | #2
On 26/01/16 15:10, Chris Wilson wrote:
> On Tue, Jan 26, 2016 at 02:53:31PM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gem.c | 96 ++++++++++++++++++++++++++++++++++++++---
>>   include/uapi/drm/i915_drm.h     |  3 ++
>>   2 files changed, 93 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index dacf6a0013c5..039d55a49fc6 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -1954,6 +1954,60 @@ out:
>>   	return i915_gem_ret_to_vm_ret(dev_priv, ret);
>>   }
>>
>> +static int
>> +i915_gem_cpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>> +{
>> +	struct drm_i915_gem_object *obj = to_intel_bo(vma->vm_private_data);
>> +	struct drm_device *dev = obj->base.dev;
>> +	struct drm_i915_private *dev_priv = dev->dev_private;
>> +	bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
>> +	pgoff_t page_offset;
>> +	struct page *page;
>> +	int ret;
>> +
>> +	/* We don't use vmf->pgoff since that has the fake offset */
>> +	page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
>> +			PAGE_SHIFT;
>> +
>> +	trace_i915_gem_object_fault(obj, page_offset, true, write);
>> +
>> +	intel_runtime_pm_get(dev_priv);
>> +
>> +	ret = i915_mutex_lock_interruptible(dev);
>> +	if (ret)
>> +		goto out;
>> +
>> +	ret = i915_gem_object_set_to_cpu_domain(obj, write);
>> +	if (ret)
>> +		goto out_unlock;
>
> That was a mistake in the GTT gem_fault(). If you do this, we also want
> the nonblocking wait for obvious reasons.

You suggest leaving it for userspace?

And how would a non-blocking wait work?

>
>> +	ret = i915_gem_object_get_pages(obj);
>> +	if (ret)
>> +		goto out_unlock;
>> +
>> +	page = i915_gem_object_get_page(obj, page_offset);
>> +	if (!page) {
>> +		ret = -ERANGE;
>> +		goto out_unlock;
>> +	}
>> +
>> +	mutex_unlock(&dev->struct_mutex);
>> +
>> +	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
>> +			    page_to_pfn(page));
>
> We don't have a page ref at this point, so this obj+page could be
> freed (via the shrinker at least) before we insert it.

Oh yeah, need to pin the pages..

> I would also be more interested in having a version that faulted the
> entire object at once - though maybe we will see more random access in
> future.

Yeah I did not want to concern myself with more code since this was a 
proof of concept anyway.

>> +	intel_runtime_pm_put(dev_priv);
>> +
>> +	return i915_gem_ret_to_vm_ret(dev_priv, ret);
>> +
>> +out_unlock:
>> +	mutex_unlock(&dev->struct_mutex);
>> +out:
>> +	intel_runtime_pm_put(dev_priv);
>> +
>> +	return i915_gem_ret_to_vm_ret(dev_priv, ret);
>> +}
>> +
>>   /**
>>    * i915_gem_release_mmap - remove physical page mappings
>>    * @obj: obj in question
>> @@ -2078,11 +2132,18 @@ static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
>>   	drm_gem_free_mmap_offset(&obj->base);
>>   }
>>
>> -int
>> -i915_gem_mmap_gtt(struct drm_file *file,
>> -		  struct drm_device *dev,
>> -		  uint32_t handle,
>> -		  uint64_t *offset)
>> +static const struct vm_operations_struct i915_gem_cpu_vm_ops = {
>> +	.fault = i915_gem_cpu_fault,
>> +	.open = drm_gem_vm_open,
>> +	.close = drm_gem_vm_close,
>> +};
>> +
>> +static int
>> +i915_gem_mmap(struct drm_file *file,
>> +	      struct drm_device *dev,
>> +	      uint32_t handle,
>> +	      uint32_t flags,
>> +	      uint64_t *offset)
>>   {
>>   	struct drm_i915_gem_object *obj;
>>   	int ret;
>> @@ -2103,10 +2164,23 @@ i915_gem_mmap_gtt(struct drm_file *file,
>>   		goto out;
>>   	}
>>
>> +	if (!obj->base.filp && (flags & I915_MMAP2_CPU)) {
>> +		DRM_DEBUG("Attempting to mmap non-shm based object via CPU!\n");
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>> +
>>   	ret = i915_gem_object_create_mmap_offset(obj);
>>   	if (ret)
>>   		goto out;
>>
>> +	if (flags & I915_MMAP2_CPU) {
>> +		ret = drm_vma_node_set_vm_ops(&obj->base.vma_node,
>> +					      &i915_gem_cpu_vm_ops);
>> +		if (ret)
>> +			goto out;
>> +	}
>
> We would also need a WC equivalent.
>
> It looks fairly sane. I wanted this just a short while ago, but figured
> out a way of using regular mmap() to give me the inheritance instead.

So would it be useful to cleanup and finish this work or not?

Regards,

Tvrtko
Chris Wilson Jan. 26, 2016, 4:59 p.m. UTC | #3
On Tue, Jan 26, 2016 at 04:23:28PM +0000, Tvrtko Ursulin wrote:
> 
> On 26/01/16 15:10, Chris Wilson wrote:
> >On Tue, Jan 26, 2016 at 02:53:31PM +0000, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>---
> >>  drivers/gpu/drm/i915/i915_gem.c | 96 ++++++++++++++++++++++++++++++++++++++---
> >>  include/uapi/drm/i915_drm.h     |  3 ++
> >>  2 files changed, 93 insertions(+), 6 deletions(-)
> >>
> >>diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >>index dacf6a0013c5..039d55a49fc6 100644
> >>--- a/drivers/gpu/drm/i915/i915_gem.c
> >>+++ b/drivers/gpu/drm/i915/i915_gem.c
> >>@@ -1954,6 +1954,60 @@ out:
> >>  	return i915_gem_ret_to_vm_ret(dev_priv, ret);
> >>  }
> >>
> >>+static int
> >>+i915_gem_cpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> >>+{
> >>+	struct drm_i915_gem_object *obj = to_intel_bo(vma->vm_private_data);
> >>+	struct drm_device *dev = obj->base.dev;
> >>+	struct drm_i915_private *dev_priv = dev->dev_private;
> >>+	bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
> >>+	pgoff_t page_offset;
> >>+	struct page *page;
> >>+	int ret;
> >>+
> >>+	/* We don't use vmf->pgoff since that has the fake offset */
> >>+	page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
> >>+			PAGE_SHIFT;
> >>+
> >>+	trace_i915_gem_object_fault(obj, page_offset, true, write);
> >>+
> >>+	intel_runtime_pm_get(dev_priv);
> >>+
> >>+	ret = i915_mutex_lock_interruptible(dev);
> >>+	if (ret)
> >>+		goto out;
> >>+
> >>+	ret = i915_gem_object_set_to_cpu_domain(obj, write);
> >>+	if (ret)
> >>+		goto out_unlock;
> >
> >That was a mistake in the GTT gem_fault(). If you do this, we also want
> >the nonblocking wait for obvious reasons.
> 
> You suggest leaving it for userspace?

It is userspace's responsibility. Page faults are random and do not
occur around every pointer access - userspace has to mark the domain
changes on its boundaries (and coordinate amongst its peers).
 
> And how would a non-blocking wait work?

Before set-to-cpu-domain, we do a wait_rendering_nonblocking which drops
and then reacquires the mutex. (That allows for multiple waiters which
tends to be the lowest hanging fruit with struct_mutex contention.) Then
set-to-cpu domain does a blocking wait to ensure nothing snuck in.

But I don't think we want this. And we can then reduce the
i915_mutex_lock_interruptible() to a plain mutex_lock_interruptible() as
we are not touching the GPU.

> >>+	ret = i915_gem_object_get_pages(obj);
> >>+	if (ret)
> >>+		goto out_unlock;
> >>+
> >>+	page = i915_gem_object_get_page(obj, page_offset);
> >>+	if (!page) {
> >>+		ret = -ERANGE;

ret = -EFAULT;

though it would definitely be a stack bug.

> >>+		goto out_unlock;
> >>+	}
> >>+
> >>+	mutex_unlock(&dev->struct_mutex);
> >>+
> >>+	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
> >>+			    page_to_pfn(page));
> >
> >We don't have a page ref at this point, so this obj+page could be
> >freed (via the shrinker at least) before we insert it.
> 
> Oh yeah, need to pin the pages..

But only whilst inserting. Once inserted they need to be evicted, and I
was wondering whether we should do the zap on put_pages(). If we don't,
it means that the shrinker is neutered.

> >I would also be more interested in having a version that faulted the
> >entire object at once - though maybe we will see more random access in
> >future.
> 
> Yeah I did not want to concern myself with more code since this was
> a proof of concept anyway.

No worries, the transformation is simple with a certain remap function.

> >It looks fairly sane. I wanted this just a short while ago, but figured
> >out a way of using regular mmap() to give me the inheritance instead.
> 
> So would it be useful to cleanup and finish this work or not?

I agree that it closes a big hole in the API - the ability to CPU mmap
non-shmemfs object (i.e. userptr, dmabuf). With a bit of polish we
should be able to offer something to take advantage of the existing GEM
infrastructure better than a regular CPU mmapping - though off the top
of my head, I don't have anything that is ratelimited by CPU pagefaults.

Another thing I realised was that this severely limits the mmap space on
32-bit systems, as the vma manager is unsigned long. The CPU mmapping was
a way around some of the restrictions. That would seem fairly easy to
lift (and I hope without consequence).
-Chris
Tvrtko Ursulin Jan. 27, 2016, 3:24 p.m. UTC | #4
On 26/01/16 16:59, Chris Wilson wrote:
> On Tue, Jan 26, 2016 at 04:23:28PM +0000, Tvrtko Ursulin wrote:
>>
>> On 26/01/16 15:10, Chris Wilson wrote:
>>> On Tue, Jan 26, 2016 at 02:53:31PM +0000, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> ---
>>>>   drivers/gpu/drm/i915/i915_gem.c | 96 ++++++++++++++++++++++++++++++++++++++---
>>>>   include/uapi/drm/i915_drm.h     |  3 ++
>>>>   2 files changed, 93 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>>> index dacf6a0013c5..039d55a49fc6 100644
>>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>>> @@ -1954,6 +1954,60 @@ out:
>>>>   	return i915_gem_ret_to_vm_ret(dev_priv, ret);
>>>>   }
>>>>
>>>> +static int
>>>> +i915_gem_cpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>>>> +{
>>>> +	struct drm_i915_gem_object *obj = to_intel_bo(vma->vm_private_data);
>>>> +	struct drm_device *dev = obj->base.dev;
>>>> +	struct drm_i915_private *dev_priv = dev->dev_private;
>>>> +	bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
>>>> +	pgoff_t page_offset;
>>>> +	struct page *page;
>>>> +	int ret;
>>>> +
>>>> +	/* We don't use vmf->pgoff since that has the fake offset */
>>>> +	page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
>>>> +			PAGE_SHIFT;
>>>> +
>>>> +	trace_i915_gem_object_fault(obj, page_offset, true, write);
>>>> +
>>>> +	intel_runtime_pm_get(dev_priv);
>>>> +
>>>> +	ret = i915_mutex_lock_interruptible(dev);
>>>> +	if (ret)
>>>> +		goto out;
>>>> +
>>>> +	ret = i915_gem_object_set_to_cpu_domain(obj, write);
>>>> +	if (ret)
>>>> +		goto out_unlock;
>>>
>>> That was a mistake in the GTT gem_fault(). If you do this, we also want
>>> the nonblocking wait for obvious reasons.
>>
>> You suggest leaving it for userspace?
>
> It is userspace's responsibility. Page faults are random and do not
> occur around every pointer access - userspace has to mark the domain
> changes on its boundaries (and coordinate amongst its peers).
>
>> And how would a non-blocking wait work?
>
> Before set-to-cpu-domain, we do a wait_rendering_nonblocking which drops
> and then reacquires the mutex. (That allows for multiple waiters which
> tends to be the lowest hanging fruit with struct_mutex contention.) Then
> set-to-cpu domain does a blocking wait to ensure nothing snuck in.
>
> But I don't think we want this. And we can then reduce the
> i915_mutex_lock_interruptible() to a plain mutex_lock_interruptible() as
> we are not touching the GPU.
>
>>>> +	ret = i915_gem_object_get_pages(obj);
>>>> +	if (ret)
>>>> +		goto out_unlock;
>>>> +
>>>> +	page = i915_gem_object_get_page(obj, page_offset);
>>>> +	if (!page) {
>>>> +		ret = -ERANGE;
>
> ret = -EFAULT;
>
> though it would definitely be a stack bug.
>
>>>> +		goto out_unlock;
>>>> +	}
>>>> +
>>>> +	mutex_unlock(&dev->struct_mutex);
>>>> +
>>>> +	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
>>>> +			    page_to_pfn(page));
>>>
>>> We don't have a page ref at this point, so this obj+page could be
>>> freed (via the shrinker at least) before we insert it.
>>
>> Oh yeah, need to pin the pages..
>
> But only whilst inserting. Once inserted they need to be evicted, and I
> was wondering whether we should do the zap on put_pages(). If we don't,
> it means that the shrinker is neutered.

Ah I forgot this in v2. So we would need something like 
obj->fault_mappable so that i915_gem_release_mmap runs, yes?

>>> I would also be more interested in having a version that faulted the
>>> entire object at once - though maybe we will see more random access in
>>> future.
>>
>> Yeah I did not want to concern myself with more code since this was
>> a proof of concept anyway.
>
> No worries, the transformation is simple with a certain remap function.
>
>>> It looks fairly sane. I wanted this just a short while ago, but figured
>>> out a way of using regular mmap() to give me the inheritance instead.
>>
>> So would it be useful to cleanup and finish this work or not?
>
> I agree that it closes a big hole in the API - the ability to CPU mmap
> non-shmemfs object (i.e. userptr, dmabuf). With a bit of polish we
> should be able to offer something to take advantage of the existing GEM
> infrastructure better than a regular CPU mmapping - though off the top
> of my head, I don't have anything that is ratelimited by CPU pagefaults.
>
> Another thing I realised was that this severely limits the mmap space on
> 32-bit systems, as the vma manager is unsigned long. The CPU mmapping was
> a way around some of the restrictions. That would seem fairly easy to
> lift (and I hope without consequence).

I did not manage to figure out what here limits the space on 32-bit systems?

Regards,

Tvrtko
Chris Wilson Jan. 27, 2016, 4:36 p.m. UTC | #5
On Wed, Jan 27, 2016 at 03:24:43PM +0000, Tvrtko Ursulin wrote:
> 
> On 26/01/16 16:59, Chris Wilson wrote:
> >Another thing I realised was that this severely limits the mmap space on
> >32-bit systems, as the vma manager is unsigned long. The CPU mmapping was
> >a way around some of the restrictions. That would seem fairly easy to
> >lift (and I hope without consequence).
> 
> I did not manage to figure out what here limits the space on 32-bit systems?

A hang-over of mine. We once used to exhaust the 32bit mmap space quite
easily. I was thinking drm_vma_manager had the same limit due to its use
of unsigned long, but it is counting in pages not bytes, so we have
44 bits of space available, which will hopefully be enough for the last
remaining 32bit system to grow old gracefully.
-Chris
Chris Wilson Jan. 27, 2016, 4:40 p.m. UTC | #6
On Wed, Jan 27, 2016 at 03:24:43PM +0000, Tvrtko Ursulin wrote:
> 
> On 26/01/16 16:59, Chris Wilson wrote:
> >On Tue, Jan 26, 2016 at 04:23:28PM +0000, Tvrtko Ursulin wrote:
> >>Oh yeah, need to pin the pages..
> >
> >But only whilst inserting. Once inserted they need to be evicted, and I
> >was wondering whether we should do the zap on put_pages(). If we don't,
> >it means that the shrinker is neutered.
> 
> Ah I forgot this in v2. So we would need something like
> obj->fault_mappable so that i915_gem_release_mmap runs, yes?

Something like that. I don't have a good idea, but I was wondering if
plonking a callback into the offset-node would be useful, and walking a
list of nodes on the object.
-Chris

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index dacf6a0013c5..039d55a49fc6 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1954,6 +1954,60 @@  out:
 	return i915_gem_ret_to_vm_ret(dev_priv, ret);
 }
 
+static int
+i915_gem_cpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct drm_i915_gem_object *obj = to_intel_bo(vma->vm_private_data);
+	struct drm_device *dev = obj->base.dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
+	pgoff_t page_offset;
+	struct page *page;
+	int ret;
+
+	/* We don't use vmf->pgoff since that has the fake offset */
+	page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
+			PAGE_SHIFT;
+
+	trace_i915_gem_object_fault(obj, page_offset, true, write);
+
+	intel_runtime_pm_get(dev_priv);
+
+	ret = i915_mutex_lock_interruptible(dev);
+	if (ret)
+		goto out;
+
+	ret = i915_gem_object_set_to_cpu_domain(obj, write);
+	if (ret)
+		goto out_unlock;
+
+	ret = i915_gem_object_get_pages(obj);
+	if (ret)
+		goto out_unlock;
+
+	page = i915_gem_object_get_page(obj, page_offset);
+	if (!page) {
+		ret = -ERANGE;
+		goto out_unlock;
+	}
+
+	mutex_unlock(&dev->struct_mutex);
+
+	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+			    page_to_pfn(page));
+
+	intel_runtime_pm_put(dev_priv);
+
+	return i915_gem_ret_to_vm_ret(dev_priv, ret);
+
+out_unlock:
+	mutex_unlock(&dev->struct_mutex);
+out:
+	intel_runtime_pm_put(dev_priv);
+
+	return i915_gem_ret_to_vm_ret(dev_priv, ret);
+}
+
 /**
  * i915_gem_release_mmap - remove physical page mappings
  * @obj: obj in question
@@ -2078,11 +2132,18 @@  static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
 	drm_gem_free_mmap_offset(&obj->base);
 }
 
-int
-i915_gem_mmap_gtt(struct drm_file *file,
-		  struct drm_device *dev,
-		  uint32_t handle,
-		  uint64_t *offset)
+static const struct vm_operations_struct i915_gem_cpu_vm_ops = {
+	.fault = i915_gem_cpu_fault,
+	.open = drm_gem_vm_open,
+	.close = drm_gem_vm_close,
+};
+
+static int
+i915_gem_mmap(struct drm_file *file,
+	      struct drm_device *dev,
+	      uint32_t handle,
+	      uint32_t flags,
+	      uint64_t *offset)
 {
 	struct drm_i915_gem_object *obj;
 	int ret;
@@ -2103,10 +2164,23 @@  i915_gem_mmap_gtt(struct drm_file *file,
 		goto out;
 	}
 
+	if (!obj->base.filp && (flags & I915_MMAP2_CPU)) {
+		DRM_DEBUG("Attempting to mmap non-shm based object via CPU!\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	ret = i915_gem_object_create_mmap_offset(obj);
 	if (ret)
 		goto out;
 
+	if (flags & I915_MMAP2_CPU) {
+		ret = drm_vma_node_set_vm_ops(&obj->base.vma_node,
+					      &i915_gem_cpu_vm_ops);
+		if (ret)
+			goto out;
+	}
+
 	*offset = drm_vma_node_offset_addr(&obj->base.vma_node);
 
 out:
@@ -2116,6 +2190,15 @@  unlock:
 	return ret;
 }
 
+int
+i915_gem_mmap_gtt(struct drm_file *file,
+		  struct drm_device *dev,
+		  uint32_t handle,
+		  uint64_t *offset)
+{
+	return i915_gem_mmap(file, dev, handle, 0, offset);
+}
+
 /**
  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
  * @dev: DRM device
@@ -2137,7 +2220,8 @@  i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
 {
 	struct drm_i915_gem_mmap_gtt *args = data;
 
-	return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
+	return i915_gem_mmap(file, dev, args->handle, args->flags,
+			     &args->offset);
 }
 
 /* Immediately discard the backing storage */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 6a19371391fa..359a36d604bb 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -528,6 +528,9 @@  struct drm_i915_gem_mmap_gtt {
 	 * This is a fixed-size type for 32/64 compatibility.
 	 */
 	__u64 offset;
+
+#define I915_MMAP2_CPU   0x1
+	__u64 flags;
 };
 
 struct drm_i915_gem_set_domain {