[172/190] drm/i915: Eliminate lots of iterations over the execobjects array

Message ID 1452510091-6833-31-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Jan. 11, 2016, 11:01 a.m. UTC
The major scaling bottleneck in execbuffer is the processing of the
execobjects. Creating an auxiliary list is inefficient compared to
using the execobject array we already have allocated.
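
A minimal sketch of what that means in practice (using the to_ptr()
helper and rsvd2 field from the patch below; declarations elided): the
VMA pointer is stashed in each execobject's reserved field at lookup
time, so later passes walk eb->exec[] directly instead of a side list.

	#define to_ptr(T, x) ((T *)(uintptr_t)(x))

	/* at lookup: remember which vma backs this execobject ... */
	entry->rsvd2 = (uintptr_t)vma;

	/* ... so any later pass over eb->exec[] needs no auxiliary list */
	vma = to_ptr(struct i915_vma, entry->rsvd2);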

Reservation is then split into phases. As we look up each VMA, we try
to bind it back into its previous (active) location. Only if that fails
do we add it to the unbound list for phase 2. In phase 2, we try to
bind all those objects that could not fit into their previous location,
falling back to retrying all objects and evicting the VM in case of
severe fragmentation. (This is the same as before, except that phase 1
is now done inline with looking up the VMA, avoiding an iteration over
the execobject array. In the ideal case, we eliminate the separate
reservation phase.) During the reservation phase, we only evict from
the VM between passes (rather than, as currently, while trying to fit
every new VMA). In testing with Unreal Engine's Atlantis demo, which
stresses the eviction logic on gen7-class hardware, this speeds up the
framerate by a factor of 2.
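
Illustrative sketch only of the shape of phase 2, using the
eb_reserve_vma()/eb->unbound names introduced by the patch but leaving
out the re-sorting of the list between passes. Phase 1 is the
eb_pin_vma()/eb_vma_misplaced() check performed while each VMA is
looked up; only misplaced VMAs ever reach eb->unbound.

static int sketch_reserve(struct i915_execbuffer *eb)
{
	struct i915_vma *vma;
	int pass = -1, ret;

	if (list_empty(&eb->unbound))
		return 0;

	do {
		/* try to bind everything that lost its old slot */
		list_for_each_entry(vma, &eb->unbound, exec_link) {
			ret = eb_reserve_vma(eb, vma, pass);
			if (ret)
				break;
		}
		if (ret != -ENOSPC || pass++ > 0)
			return ret;

		/* too fragmented: flush the whole VM and try again */
		ret = i915_gem_evict_vm(eb->vm);
		if (ret)
			return ret;
	} while (1);
}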

The second loop amalgamation is between move_to_gpu and move_to_active.
As we always submit the request, even if incomplete, we can use the
current request to track the active VMAs as we perform the required
flushes and synchronisation.
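
That is, one walk over eb->exec[] now does both jobs; schematically
(the real loop in eb_move_to_gpu() below also handles clflushes,
domain tracking and skips the sync for objects idle on other rings):

	for (i = 0; i < count; i++) {
		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);

		/* flushes/synchronisation, as move_to_gpu used to do */
		ret = i915_gem_object_sync(vma->obj, eb->request,
					   entry->flags & EXEC_OBJECT_WRITE);
		if (ret)
			return ret;

		/* and, in the same pass, the old move_to_active bookkeeping */
		i915_vma_move_to_active(vma, eb->request, entry->flags);
	}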

The next big advancement is to avoid copying back to the user any
execobjects and relocations that have not changed.
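
The mechanism is to mark only the entries whose offset actually moved,
reusing a spare low bit of the offset as the UPDATE flag, so the
copy-back loop in the ioctl can skip everything else; roughly:

	/* kernel side, while (re)binding: note only real moves */
	if (entry->offset != vma->node.start) {
		entry->offset = vma->node.start | UPDATE;
		eb->args->flags |= __EXEC_HAS_RELOC;
	}

	/* ioctl epilogue: write back nothing unless it changed */
	for (i = 0; i < args->buffer_count; i++) {
		if ((exec2_list[i].offset & UPDATE) == 0)
			continue;

		exec2_list[i].offset =
			gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK);
		if (__copy_to_user(&user_exec_list[i].offset,
				   &exec2_list[i].offset,
				   sizeof(user_exec_list[i].offset)))
			break;
	}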

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h            |    3 +-
 drivers/gpu/drm/i915/i915_gem.c            |    4 +-
 drivers/gpu/drm/i915/i915_gem_evict.c      |   71 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 1310 ++++++++++++++--------------
 drivers/gpu/drm/i915/i915_gem_gtt.c        |    2 +-
 drivers/gpu/drm/i915/i915_gem_gtt.h        |    4 +-
 6 files changed, 713 insertions(+), 681 deletions(-)

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2ceefce0e731..601ef7412cf9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2733,6 +2733,7 @@  int i915_gem_wait_ioctl(struct drm_device *dev, void *data,
 void i915_gem_load(struct drm_device *dev);
 void *i915_gem_object_alloc(struct drm_device *dev);
 void i915_gem_object_free(struct drm_i915_gem_object *obj);
+bool i915_gem_object_flush_active(struct drm_i915_gem_object *obj);
 void i915_gem_object_init(struct drm_i915_gem_object *obj,
 			 const struct drm_i915_gem_object_ops *ops);
 struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev,
@@ -3078,7 +3079,7 @@  int __must_check i915_gem_evict_something(struct drm_device *dev,
 					  unsigned long end,
 					  unsigned flags);
 int __must_check i915_gem_evict_for_vma(struct i915_vma *vma, unsigned flags);
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle);
+int i915_gem_evict_vm(struct i915_address_space *vm);
 
 /* belongs in i915_gem_gtt.h */
 static inline void i915_gem_chipset_flush(struct drm_device *dev)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3eeca1fb89d2..0bd6db4e83d9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2393,7 +2393,7 @@  out:
  * write domains, emitting any outstanding lazy request and retiring and
  * completed requests.
  */
-static bool i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
+bool i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
 {
 	int i;
 
@@ -2821,7 +2821,7 @@  i915_vma_insert(struct i915_vma *vma,
 			  size, obj->base.size,
 			  flags & PIN_MAPPABLE ? "mappable" : "total",
 			  end);
-		return -E2BIG;
+		return -ENOSPC;
 	}
 
 	ret = i915_gem_object_get_pages(obj);
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index d40bcb81c922..e71b89bac168 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -72,7 +72,7 @@  mark_free(struct i915_vma *vma, unsigned flags, struct list_head *unwind)
 	if (flags & PIN_NOFAULT && vma->obj->fault_mappable)
 		return false;
 
-	list_add(&vma->exec_list, unwind);
+	list_add(&vma->evict_link, unwind);
 	return drm_mm_scan_add_block(&vma->node);
 }
 
@@ -154,11 +154,11 @@  search_again:
 	while (!list_empty(&eviction_list)) {
 		vma = list_first_entry(&eviction_list,
 				       struct i915_vma,
-				       exec_list);
+				       evict_link);
 		ret = drm_mm_scan_remove_block(&vma->node);
 		BUG_ON(ret);
 
-		list_del(&vma->exec_list);
+		list_del(&vma->evict_link);
 	}
 
 	/* Can we unpin some objects such as idle hw contents,
@@ -201,16 +201,16 @@  found:
 	 * calling unbind (which may remove the active reference
 	 * of any of our objects, thus corrupting the list).
 	 */
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		if (drm_mm_scan_remove_block(&vma->node))
 			drm_gem_object_reference(&vma->obj->base);
 		else
-			list_del(&vma->exec_list);
+			list_del(&vma->evict_link);
 	}
 
 	/* Unbinding will emit any required flushes */
 	ret = 0;
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
@@ -261,14 +261,13 @@  i915_gem_evict_for_vma(struct i915_vma *target, unsigned flags)
 			break;
 		}
 
-		list_add(&vma->exec_list, &eviction_list);
+		list_add(&vma->evict_link, &eviction_list);
 		drm_gem_object_reference(&vma->obj->base);
 	}
 
 	ret = 0;
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
-		list_del(&vma->exec_list);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
 		drm_gem_object_unreference(&obj->base);
@@ -291,37 +290,49 @@  i915_gem_evict_for_vma(struct i915_vma *target, unsigned flags)
  * To clarify: This is for freeing up virtual address space, not for freeing
  * memory in e.g. the shrinker.
  */
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle)
+int i915_gem_evict_vm(struct i915_address_space *vm)
 {
+	struct list_head *phases[] = {
+		&vm->inactive_list,
+		&vm->active_list,
+		NULL
+	}, **phase;
+	struct list_head eviction_list;
 	struct i915_vma *vma, *next;
 	int ret;
 
 	WARN_ON(!mutex_is_locked(&vm->dev->struct_mutex));
 	trace_i915_gem_evict_vm(vm);
 
-	if (do_idle) {
-		/* Switch back to the default context in order to unpin
-		 * the existing context objects. However, such objects only
-		 * pin themselves inside the global GTT and performing the
-		 * switch otherwise is ineffective.
-		 */
-		if (i915_is_ggtt(vm)) {
-			ret = switch_to_pinned_context(to_i915(vm->dev));
-			if (ret)
-				return ret;
-		}
-
-		ret = i915_gpu_idle(vm->dev);
+	/* Switch back to the default context in order to unpin
+	 * the existing context objects. However, such objects only
+	 * pin themselves inside the global GTT and performing the
+	 * switch otherwise is ineffective.
+	 */
+	if (i915_is_ggtt(vm)) {
+		ret = switch_to_pinned_context(to_i915(vm->dev));
 		if (ret < 0)
 			return ret;
-
-		i915_gem_retire_requests(vm->dev);
-		WARN_ON(!list_empty(&vm->active_list));
 	}
 
-	list_for_each_entry_safe(vma, next, &vm->inactive_list, vm_link)
-		if (vma->pin_count == 0)
-			WARN_ON(i915_vma_unbind(vma));
+	INIT_LIST_HEAD(&eviction_list);
+	phase = phases;
+	do {
+		list_for_each_entry(vma, *phase, vm_link) {
+			if (vma->pin_count)
+				continue;
 
-	return 0;
+			list_add(&vma->evict_link, &eviction_list);
+			drm_gem_object_reference(&vma->obj->base);
+		}
+	} while (*++phase);
+
+	ret = 0;
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
+		struct drm_i915_gem_object *obj = vma->obj;
+		if (ret == 0)
+			ret = i915_vma_unbind(vma);
+		drm_gem_object_unreference(&obj->base);
+	}
+	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 2868e094f67c..f40d3254249a 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -40,6 +40,10 @@ 
 #define  __EXEC_OBJECT_HAS_FENCE (1U<<30)
 #define  __EXEC_OBJECT_NEEDS_MAP (1U<<29)
 #define  __EXEC_OBJECT_NEEDS_BIAS (1U<<28)
+#define __EB_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
+
+#define __EXEC_HAS_RELOC (1ULL<<31)
+#define UPDATE PIN_OFFSET_FIXED
 
 #define BATCH_OFFSET_BIAS (256*1024)
 
@@ -52,21 +56,44 @@  struct i915_execbuffer {
 	struct intel_context *ctx;
 	struct i915_address_space *vm;
 	struct i915_vma *batch_vma;
-	uint32_t batch_start_offset;
 	struct drm_i915_gem_request *request;
-	unsigned dispatch_flags;
-	bool need_relocs;
-	struct list_head vmas;
+	struct list_head unbound;
+	struct list_head relocs;
 	struct reloc_cache {
 		unsigned long vaddr;
 		unsigned page;
 		struct drm_mm_node node;
 		bool use_64bit_reloc;
+		bool has_llc;
+		bool has_fence;
 	} reloc_cache;
+	u64 invalid_flags;
+	u32 context_flags;
+	u32 dispatch_flags;
 	int lut_mask;
 	struct hlist_head *buckets;
 };
 
+#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+
+/* Used to convert any address to canonical form.
+ * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
+ * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
+ * addresses to be in a canonical form:
+ * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
+ * canonical form [63:48] == [47]."
+ */
+#define GEN8_HIGH_ADDRESS_BIT 47
+static inline uint64_t gen8_canonical_addr(uint64_t address)
+{
+	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+}
+
+static inline uint64_t gen8_noncanonical_addr(uint64_t address)
+{
+	return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
+}
+
 static int
 eb_create(struct i915_execbuffer *eb)
 {
@@ -91,78 +118,317 @@  eb_create(struct i915_execbuffer *eb)
 	return 0;
 }
 
+static bool
+eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
+		 const struct i915_vma *vma)
+{
+	if ((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0)
+		return true;
+
+	if (vma->node.size < entry->pad_to_size)
+		return true;
+
+	if (entry->alignment && vma->node.start & (entry->alignment - 1))
+		return true;
+
+	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
+	    vma->node.start < BATCH_OFFSET_BIAS)
+		return true;
+
+	if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
+	    (vma->node.start + vma->node.size - 1) >> 32)
+		return true;
+
+	return false;
+}
+
+static void
+eb_pin_vma(struct i915_execbuffer *eb,
+	   struct drm_i915_gem_exec_object2 *entry,
+	   struct i915_vma *vma)
+{
+	u64 flags;
+
+	flags = PIN_USER | PIN_NONBLOCK | PIN_OFFSET_FIXED;
+	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
+		flags |= PIN_GLOBAL;
+	if (entry->flags & EXEC_OBJECT_PINNED)
+		flags |= entry->offset;
+	else
+		flags |= vma->node.start;
+	if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
+		return;
+
+	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
+		if (unlikely(i915_vma_get_fence(vma))) {
+			i915_vma_unpin(vma);
+			return;
+		}
+
+		if (i915_vma_pin_fence(vma))
+			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+	}
+
+	if (entry->offset != vma->node.start) {
+		entry->offset = vma->node.start | UPDATE;
+		eb->args->flags |= __EXEC_HAS_RELOC;
+	}
+	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+}
+
 static inline void
 __eb_unreserve_vma(struct i915_vma *vma,
-		   const struct drm_i915_gem_exec_object2 *entry)
+		 const struct drm_i915_gem_exec_object2 *entry)
 {
+	GEM_BUG_ON((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0);
+
 	if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
 		i915_vma_unpin_fence(vma);
 
-	if (entry->flags & __EXEC_OBJECT_HAS_PIN)
-		__i915_vma_unpin(vma);
+	__i915_vma_unpin(vma);
 }
 
-static void
-eb_unreserve_vma(struct i915_vma *vma)
+static inline void
+eb_unreserve_vma(struct i915_vma *vma,
+		 struct drm_i915_gem_exec_object2 *entry)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	__eb_unreserve_vma(vma, entry);
-	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
+	if (entry->flags & __EXEC_OBJECT_HAS_PIN) {
+		__eb_unreserve_vma(vma, entry);
+		entry->flags &= ~__EB_RESERVED;
+	}
 }
 
-static void
-eb_reset(struct i915_execbuffer *eb)
+static int
+eb_add_vma(struct i915_execbuffer *eb,
+	   struct drm_i915_gem_exec_object2 *entry,
+	   struct i915_vma *vma)
 {
-	struct i915_vma *vma;
+	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		eb_unreserve_vma(vma);
-		vma->exec_entry = NULL;
-	}
+	GEM_BUG_ON(vma->closed);
 
-	if (eb->lut_mask >= 0)
-		memset(eb->buckets, 0,
-		       (1<<eb->lut_mask)*sizeof(struct hlist_head));
-}
+	if (unlikely(entry->flags & eb->invalid_flags))
+		return -EINVAL;
 
-#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+	if (unlikely(entry->alignment && !is_power_of_2(entry->alignment)))
+		return -EINVAL;
+
+	/* Offset can be used as input (EXEC_OBJECT_PINNED), reject
+	 * any non-page-aligned or non-canonical addresses.
+	 */
+	if (entry->flags & EXEC_OBJECT_PINNED) {
+		if (unlikely(entry->offset !=
+			     gen8_canonical_addr(entry->offset & PAGE_MASK)))
+			return -EINVAL;
+
+		/* From drm_mm perspective address space is continuous,
+		 * so from this point we're always using non-canonical
+		 * form internally.
+		 */
+		entry->offset = gen8_noncanonical_addr(entry->offset);
+	}
+
+	/* pad_to_size was once a reserved field, so sanitize it */
+	if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) {
+		if (unlikely(offset_in_page(entry->pad_to_size)))
+			return -EINVAL;
+	} else
+		entry->pad_to_size = 0;
 
-static bool
-eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
-{
 	if (unlikely(vma->exec_entry)) {
 		DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
-			  eb->exec[i].handle, i);
-		return false;
+			  entry->handle, (int)(entry - eb->exec));
+		return -EINVAL;
 	}
-	list_add_tail(&vma->exec_list, &eb->vmas);
 
-	vma->exec_entry = &eb->exec[i];
+	vma->exec_entry = entry;
+	entry->rsvd2 = (uintptr_t)vma;
+
 	if (eb->lut_mask >= 0) {
-		vma->exec_handle = eb->exec[i].handle;
+		vma->exec_handle = entry->handle;
 		hlist_add_head(&vma->exec_node,
-			       &eb->buckets[hash_32(vma->exec_handle,
+			       &eb->buckets[hash_32(entry->handle,
 						    eb->lut_mask)]);
 	}
 
-	eb->exec[i].rsvd2 = (uintptr_t)vma;
-	return true;
+	if (entry->relocation_count)
+		list_add_tail(&vma->reloc_link, &eb->relocs);
+
+	if (!eb->reloc_cache.has_fence) {
+		entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
+	} else {
+		if (entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
+		    vma->obj->tiling_mode != I915_TILING_NONE)
+			entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
+	}
+
+	if ((entry->flags & EXEC_OBJECT_PINNED) == 0)
+		entry->flags |= eb->context_flags;
+
+	ret = 0;
+	if (vma->node.size)
+		eb_pin_vma(eb, entry, vma);
+	if (eb_vma_misplaced(entry, vma)) {
+		eb_unreserve_vma(vma, entry);
+
+		list_add_tail(&vma->exec_link, &eb->unbound);
+		if (drm_mm_node_allocated(&vma->node))
+			ret = i915_vma_unbind(vma);
+	}
+	return ret;
+}
+
+static inline int use_cpu_reloc(const struct reloc_cache *cache,
+				const struct drm_i915_gem_object *obj)
+{
+	return (DBG_USE_CPU_RELOC ||
+		cache->has_llc ||
+		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_level != I915_CACHE_NONE);
+}
+
+static int
+eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int pass)
+{
+	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	u64 flags;
+	int ret;
+
+	flags = PIN_USER | PIN_NONBLOCK;
+	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
+		flags |= PIN_GLOBAL;
+
+	if (!drm_mm_node_allocated(&vma->node)) {
+		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
+		 * limit address to the first 4GBs for unflagged objects.
+		 */
+		if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0)
+			flags |= PIN_ZONE_4G;
+
+		if (vma->is_ggtt) {
+			if (entry->flags & __EXEC_OBJECT_NEEDS_MAP) {
+				flags |= PIN_MAPPABLE;
+			} else if (entry->relocation_count &&
+				   !use_cpu_reloc(&eb->reloc_cache, vma->obj)) {
+				if (pass <= 0)
+					flags |= PIN_MAPPABLE;
+			} else {
+				flags |= PIN_HIGH;
+			}
+		}
+
+		if (entry->flags & EXEC_OBJECT_PINNED)
+			flags |= entry->offset | PIN_OFFSET_FIXED;
+		else if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
+			flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
+	}
+
+	ret = i915_vma_pin(vma, entry->pad_to_size, entry->alignment, flags);
+	if (ret)
+		return ret;
+
+	if ((entry->offset & PIN_OFFSET_MASK) != vma->node.start) {
+		entry->offset = vma->node.start | UPDATE;
+		eb->args->flags |= __EXEC_HAS_RELOC;
+	}
+	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+
+	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
+		ret = i915_vma_get_fence(vma);
+		if (ret)
+			return ret;
+
+		if (i915_vma_pin_fence(vma))
+			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+	}
+
+	return 0;
 }
 
-static inline struct hlist_head *ht_head(struct intel_context *ctx, u32 handle)
+static int eb_reserve(struct i915_execbuffer *eb)
+{
+	const unsigned count = eb->args->buffer_count;
+	struct i915_vma *vma;
+	unsigned i;
+	int pass;
+	int ret;
+
+	/* Attempt to pin all of the buffers into the GTT.
+	 * This is done in 3 phases:
+	 *
+	 * 1a. Unbind all objects that do not match the GTT constraints for
+	 *     the execbuffer (fenceable, mappable, alignment etc).
+	 * 1b. Increment pin count for already bound objects.
+	 * 2.  Bind new objects.
+	 * 3.  Decrement pin count.
+	 *
+	 * This avoid unnecessary unbinding of later objects in order to make
+	 * room for the earlier objects *unless* we need to defragment.
+	 */
+	if (list_empty(&eb->unbound))
+		return 0;
+
+	pass = -1;
+	do {
+		struct list_head last;
+
+		i915_gem_retire_requests(eb->i915->dev);
+
+		list_for_each_entry(vma, &eb->unbound, exec_link) {
+			ret = eb_reserve_vma(eb, vma, pass);
+			if (ret)
+				break;
+		}
+		if (ret != -ENOSPC || pass++ > 0)
+			return ret;
+
+		/* Resort *all* the objects into priority order */
+		INIT_LIST_HEAD(&eb->unbound);
+		INIT_LIST_HEAD(&last);
+		for (i = 0; i < count; i++) {
+			struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+			vma = to_ptr(struct i915_vma, entry->rsvd2);
+
+			eb_unreserve_vma(vma, entry);
+
+			if (entry->flags & EXEC_OBJECT_PINNED)
+				list_add(&vma->exec_link, &eb->unbound);
+			else if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+				list_add_tail(&vma->exec_link, &eb->unbound);
+			else
+				list_add_tail(&vma->exec_link, &last);
+		}
+		list_splice_tail(&last, &eb->unbound);
+
+		/* Too fragmented, unbind everything and retry */
+		ret = i915_gem_evict_vm(eb->vm);
+		if (ret)
+			return ret;
+	} while (1);
+}
+
+static inline struct hlist_head *
+ht_head(const struct intel_context *ctx, u32 handle)
 {
 	return &ctx->vma_ht[hash_32(handle, ctx->vma_ht_bits)];
 }
 
+static int eb_batch_index(const struct i915_execbuffer *eb)
+{
+	return eb->args->buffer_count - 1;
+}
+
 static int
 eb_lookup_vmas(struct i915_execbuffer *eb)
 {
 	const int count = eb->args->buffer_count;
 	struct i915_vma *vma;
+	struct idr *idr;
 	int slow_pass = -1;
-	int i;
+	int i, ret;
 
-	INIT_LIST_HEAD(&eb->vmas);
+	INIT_LIST_HEAD(&eb->relocs);
+	INIT_LIST_HEAD(&eb->unbound);
 
 	if (unlikely(eb->ctx->vma_ht_size & 1))
 		flush_work(&eb->ctx->vma_ht_resize);
@@ -175,8 +441,9 @@  eb_lookup_vmas(struct i915_execbuffer *eb)
 			if (vma->ctx_handle != eb->exec[i].handle)
 				continue;
 
-			if (!eb_add_vma(eb, vma, i))
-				return -EINVAL;
+			ret = eb_add_vma(eb, &eb->exec[i], vma);
+			if (unlikely(ret))
+				return ret;
 
 			goto next_vma;
 		}
@@ -187,24 +454,25 @@  next_vma: ;
 	}
 
 	if (slow_pass < 0)
-		return 0;
+		goto out;
 
 	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
+	idr = &eb->file->object_idr;
 	for (i = slow_pass; i < count; i++) {
 		struct drm_i915_gem_object *obj;
 
 		if (eb->exec[i].rsvd2)
 			continue;
 
-		obj = to_intel_bo(idr_find(&eb->file->object_idr,
-					   eb->exec[i].handle));
+		obj = to_intel_bo(idr_find(idr, eb->exec[i].handle));
 		if (unlikely(obj == NULL)) {
 			spin_unlock(&eb->file->table_lock);
 			DRM_DEBUG("Invalid object handle %d at index %d\n",
 				  eb->exec[i].handle, i);
-			return -ENOENT;
+			ret = -ENOENT;
+			goto err;
 		}
 
 		eb->exec[i].rsvd2 = 1 | (uintptr_t)obj;
@@ -225,11 +493,12 @@  next_vma: ;
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
-		obj = to_ptr(struct drm_i915_gem_object, eb->exec[i].rsvd2 & ~1);
+		obj = to_ptr(typeof(*obj), eb->exec[i].rsvd2 & ~1);
 		vma = i915_gem_obj_lookup_or_create_vma(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
-			return PTR_ERR(vma);
+			ret = PTR_ERR(vma);
+			goto err;
 		}
 
 		/* First come, first served */
@@ -240,28 +509,24 @@  next_vma: ;
 				       ht_head(eb->ctx, eb->exec[i].handle));
 			eb->ctx->vma_ht_count++;
 			if (vma->is_ggtt) {
-				BUG_ON(obj->vma_hashed);
+				GEM_BUG_ON(obj->vma_hashed);
 				obj->vma_hashed = vma;
 			}
 		}
 
-		if (!eb_add_vma(eb, vma, i))
-			return -EINVAL;
+		ret = eb_add_vma(eb, &eb->exec[i], vma);
+		if (unlikely(ret))
+			goto err;
 	}
 	if (4*eb->ctx->vma_ht_count > 3*eb->ctx->vma_ht_size) {
 		eb->ctx->vma_ht_size |= 1;
 		queue_work(system_highpri_wq, &eb->ctx->vma_ht_resize);
 	}
 
-	return 0;
-}
-
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
-{
-	struct i915_vma *vma;
-
-	vma = to_ptr(struct i915_vma, eb->exec[eb->args->buffer_count-1].rsvd2);
+out:
+	/* take note of the batch buffer before we might reorder the lists */
+	i = eb_batch_index(eb);
+	eb->batch_vma = to_ptr(struct i915_vma, eb->exec[i].rsvd2);
 
 	/*
 	 * SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -272,14 +537,23 @@  eb_get_batch(struct i915_execbuffer *eb)
 	 * Note that actual hangs have only been observed on gen7, but for
 	 * paranoia do it everywhere.
 	 */
-	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
-		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if ((eb->exec[i].flags & EXEC_OBJECT_PINNED) == 0)
+		eb->exec[i].flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if (eb->reloc_cache.has_fence)
+		eb->exec[i].flags |= EXEC_OBJECT_NEEDS_FENCE;
 
-	return vma;
+	return eb_reserve(eb);
+
+err:
+	for (i = slow_pass; i < count; i++) {
+		if (eb->exec[i].rsvd2 & 1)
+			eb->exec[i].rsvd2 = 0;
+	}
+	return ret;
 }
 
 static struct i915_vma *
-eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
+eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle)
 {
 	if (eb->lut_mask < 0) {
 		if (handle >= -eb->lut_mask)
@@ -301,46 +575,51 @@  eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
 	}
 }
 
-static void eb_destroy(struct i915_execbuffer *eb)
+static void
+eb_reset(const struct i915_execbuffer *eb)
 {
-	struct i915_vma *vma;
+	const unsigned count = eb->args->buffer_count;
+	unsigned i;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		if (vma->exec_entry == NULL)
-			continue;
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
+
+		if (entry->flags & __EXEC_OBJECT_HAS_PIN)
+			__eb_unreserve_vma(vma, entry);
+		entry->flags &= ~eb->invalid_flags;
 
-		__eb_unreserve_vma(vma, vma->exec_entry);
 		vma->exec_entry = NULL;
 	}
 
 	if (eb->lut_mask >= 0)
-		kfree(eb->buckets);
+		memset(eb->buckets, 0,
+		       (1<<eb->lut_mask)*sizeof(struct hlist_head));
 }
 
-static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
+static void eb_destroy(const struct i915_execbuffer *eb)
 {
-	return (DBG_USE_CPU_RELOC ||
-		HAS_LLC(obj->base.dev) ||
-		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
-		obj->cache_level != I915_CACHE_NONE);
-}
+	const unsigned count = eb->args->buffer_count;
+	unsigned i;
 
-/* Used to convert any address to canonical form.
- * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
- * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
- * addresses to be in a canonical form:
- * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
- * canonical form [63:48] == [47]."
- */
-#define GEN8_HIGH_ADDRESS_BIT 47
-static inline uint64_t gen8_canonical_addr(uint64_t address)
-{
-	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
-}
+	if (eb->lut_mask >= 0)
+		kfree(eb->buckets);
 
-static inline uint64_t gen8_noncanonical_addr(uint64_t address)
-{
-	return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
+	if (eb->exec == NULL)
+		return;
+
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
+
+		if (vma == NULL || vma->exec_entry == NULL)
+			continue;
+
+		GEM_BUG_ON(vma->exec_entry != entry);
+		if (entry->flags & __EXEC_OBJECT_HAS_PIN)
+			__eb_unreserve_vma(vma, entry);
+		vma->exec_entry = NULL;
+	}
 }
 
 static inline uint64_t
@@ -355,7 +634,9 @@  static void reloc_cache_init(struct reloc_cache *cache,
 {
 	cache->page = -1;
 	cache->vaddr = 0;
+	cache->has_llc = HAS_LLC(i915);
 	cache->use_64bit_reloc = INTEL_INFO(i915)->gen >= 8;
+	cache->has_fence = INTEL_INFO(i915)->gen < 4;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -442,7 +723,7 @@  static void *reloc_iomap(struct drm_i915_gem_object *obj,
 		struct i915_vma *vma;
 		int ret;
 
-		if (use_cpu_reloc(obj))
+		if (use_cpu_reloc(cache, obj))
 			return NULL;
 
 		vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
@@ -542,10 +823,10 @@  repeat:
 	return 0;
 }
 
-static int
-eb_relocate_entry(struct i915_vma *vma,
-		  struct i915_execbuffer *eb,
-		  struct drm_i915_gem_relocation_entry *reloc)
+static uint64_t
+eb_relocate_entry(struct i915_execbuffer *eb,
+		  const struct i915_vma *vma,
+		  const struct drm_i915_gem_relocation_entry *reloc)
 {
 	struct i915_vma *target;
 	u64 target_offset;
@@ -618,318 +899,127 @@  eb_relocate_entry(struct i915_vma *vma,
 		return -EINVAL;
 	}
 
-	/* We can't wait for rendering with pagefaults disabled */
-	if (i915_gem_object_is_active(vma->obj) && pagefault_disabled())
-		return -EFAULT;
-
-	ret = relocate_entry(vma->obj, reloc, &eb->reloc_cache, target_offset);
-	if (ret)
-		return ret;
-
-	/* and update the user's relocation entry */
-	reloc->presumed_offset = target_offset;
-	return 0;
-}
-
-static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
-{
-#define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
-	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
-	struct drm_i915_gem_relocation_entry __user *user_relocs;
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	int remain, ret = 0;
-
-	user_relocs = to_user_ptr(entry->relocs_ptr);
-
-	remain = entry->relocation_count;
-	while (remain) {
-		struct drm_i915_gem_relocation_entry *r = stack_reloc;
-		int count = remain;
-		if (count > ARRAY_SIZE(stack_reloc))
-			count = ARRAY_SIZE(stack_reloc);
-		remain -= count;
-
-		if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		do {
-			u64 offset = r->presumed_offset;
-
-			ret = eb_relocate_entry(vma, eb, r);
-			if (ret)
-				goto out;
-
-			if (r->presumed_offset != offset &&
-			    __copy_to_user_inatomic(&user_relocs->presumed_offset,
-						    &r->presumed_offset,
-						    sizeof(r->presumed_offset))) {
-				ret = -EFAULT;
-				goto out;
-			}
-
-			user_relocs++;
-			r++;
-		} while (--count);
-	}
-
-out:
-	reloc_cache_reset(&eb->reloc_cache);
-	return ret;
-#undef N_RELOC
-}
-
-static int
-eb_relocate_vma_slow(struct i915_vma *vma,
-		     struct i915_execbuffer *eb,
-		     struct drm_i915_gem_relocation_entry *relocs)
-{
-	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	int i, ret = 0;
-
-	for (i = 0; i < entry->relocation_count; i++) {
-		ret = eb_relocate_entry(vma, eb, &relocs[i]);
-		if (ret)
-			break;
-	}
-	reloc_cache_reset(&eb->reloc_cache);
-	return ret;
-}
-
-static int eb_relocate(struct i915_execbuffer *eb)
-{
-	struct i915_vma *vma;
-	int ret = 0;
-
-	/* This is the fast path and we cannot handle a pagefault whilst
-	 * holding the struct mutex lest the user pass in the relocations
-	 * contained within a mmaped bo. For in such a case we, the page
-	 * fault handler would call i915_gem_fault() and we would try to
-	 * acquire the struct mutex again. Obviously this is bad and so
-	 * lockdep complains vehemently.
-	 */
-	pagefault_disable();
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		ret = eb_relocate_vma(vma, eb);
-		if (ret)
-			break;
-	}
-	pagefault_enable();
-
-	return ret;
-}
-
-static bool only_mappable_for_reloc(unsigned int flags)
-{
-	return (flags & (EXEC_OBJECT_NEEDS_FENCE | __EXEC_OBJECT_NEEDS_MAP)) ==
-		__EXEC_OBJECT_NEEDS_MAP;
-}
-
-static int
-eb_reserve_vma(struct i915_vma *vma,
-	       struct intel_engine_cs *ring,
-	       bool *need_reloc)
-{
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	uint64_t flags;
-	int ret;
-
-	flags = PIN_USER;
-	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
-		flags |= PIN_GLOBAL;
-
-	if (!drm_mm_node_allocated(&vma->node)) {
-		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
-		 * limit address to the first 4GBs for unflagged objects.
-		 */
-		if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0)
-			flags |= PIN_ZONE_4G;
-		if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
-			flags |= PIN_GLOBAL | PIN_MAPPABLE;
-		if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
-			flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
-		if (entry->flags & EXEC_OBJECT_PINNED)
-			flags |= entry->offset | PIN_OFFSET_FIXED;
-		if ((flags & PIN_MAPPABLE) == 0)
-			flags |= PIN_HIGH;
-	}
-
-	ret = i915_vma_pin(vma,
-			   entry->pad_to_size,
-			   entry->alignment,
-			   flags);
-	if ((ret == -ENOSPC || ret == -E2BIG) &&
-	    only_mappable_for_reloc(entry->flags))
-		ret = i915_vma_pin(vma,
-				   entry->pad_to_size,
-				   entry->alignment,
-				   flags & ~PIN_MAPPABLE);
-	if (ret)
-		return ret;
-
-	entry->flags |= __EXEC_OBJECT_HAS_PIN;
-
-	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
-		ret = i915_vma_get_fence(vma);
-		if (ret)
-			return ret;
-
-		if (i915_vma_pin_fence(vma))
-			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
-	}
-
-	if (entry->offset != vma->node.start) {
-		entry->offset = vma->node.start;
-		*need_reloc = true;
-	}
-
-	return 0;
-}
-
-static bool
-need_reloc_mappable(struct i915_vma *vma)
-{
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-
-	if (entry->relocation_count == 0)
-		return false;
-
-	if (!vma->is_ggtt)
-		return false;
-
-	/* See also use_cpu_reloc() */
-	if (HAS_LLC(vma->obj->base.dev))
-		return false;
-
-	if (vma->obj->base.write_domain == I915_GEM_DOMAIN_CPU)
-		return false;
-
-	return true;
-}
-
-static bool
-eb_vma_misplaced(struct i915_vma *vma)
-{
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-
-	WARN_ON(entry->flags & __EXEC_OBJECT_NEEDS_MAP && !vma->is_ggtt);
-
-	if (entry->alignment &&
-	    vma->node.start & (entry->alignment - 1))
-		return true;
-
-	if (vma->node.size < entry->pad_to_size)
-		return true;
-
-	if (entry->flags & EXEC_OBJECT_PINNED &&
-	    vma->node.start != entry->offset)
-		return true;
-
-	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
-	    vma->node.start < BATCH_OFFSET_BIAS)
-		return true;
-
-	/* avoid costly ping-pong once a batch bo ended up non-mappable */
-	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP && !vma->map_and_fenceable)
-		return !only_mappable_for_reloc(entry->flags);
+	/* We can't wait for rendering with pagefaults disabled */
+	if (i915_gem_object_is_active(vma->obj) && pagefault_disabled()) {
+		if (i915_gem_object_flush_active(vma->obj))
+			return -EBUSY;
+	}
 
-	if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
-	    (vma->node.start + vma->node.size - 1) >> 32)
-		return true;
+	ret = relocate_entry(vma->obj, reloc, &eb->reloc_cache, target_offset);
+	if (ret)
+		return ret;
 
-	return false;
+	/* and update the user's relocation entry */
+	return target_offset | 1;
 }
 
-static int eb_reserve(struct i915_execbuffer *eb)
+static int eb_relocate_vma(struct i915_execbuffer *eb,
+			   const struct i915_vma *vma)
 {
-	const bool has_fenced_gpu_access = INTEL_INFO(eb->i915)->gen < 4;
-	struct i915_vma *vma;
-	struct list_head ordered_vmas;
-	struct list_head pinned_vmas;
-	int retry;
-
-	INIT_LIST_HEAD(&ordered_vmas);
-	INIT_LIST_HEAD(&pinned_vmas);
-	while (!list_empty(&eb->vmas)) {
-		struct drm_i915_gem_exec_object2 *entry;
-		bool need_fence, need_mappable;
-
-		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
-		entry = vma->exec_entry;
-
-		if (eb->ctx->flags & CONTEXT_NO_ZEROMAP)
-			entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
-
-		if (!has_fenced_gpu_access)
-			entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
-		need_fence =
-			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
-			vma->obj->tiling_mode != I915_TILING_NONE;
-		need_mappable = need_fence || need_reloc_mappable(vma);
+#define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
+	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
+	struct drm_i915_gem_relocation_entry __user *user_relocs;
+	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	int remain;
 
-		if (entry->flags & EXEC_OBJECT_PINNED)
-			list_move_tail(&vma->exec_list, &pinned_vmas);
-		else if (need_mappable) {
-			entry->flags |= __EXEC_OBJECT_NEEDS_MAP;
-			list_move(&vma->exec_list, &ordered_vmas);
-		} else
-			list_move_tail(&vma->exec_list, &ordered_vmas);
-	}
-	list_splice(&ordered_vmas, &eb->vmas);
-	list_splice(&pinned_vmas, &eb->vmas);
+	user_relocs = to_user_ptr(entry->relocs_ptr);
+	remain = entry->relocation_count;
 
-	/* Attempt to pin all of the buffers into the GTT.
-	 * This is done in 3 phases:
-	 *
-	 * 1a. Unbind all objects that do not match the GTT constraints for
-	 *     the execbuffer (fenceable, mappable, alignment etc).
-	 * 1b. Increment pin count for already bound objects.
-	 * 2.  Bind new objects.
-	 * 3.  Decrement pin count.
-	 *
-	 * This avoid unnecessary unbinding of later objects in order to make
-	 * room for the earlier objects *unless* we need to defragment.
+	/*
+	 * We must check that the entire relocation array is safe
+	 * to read, but since we may need to update the presumed
+	 * offsets during execution, check for full write access.
 	 */
-	retry = 0;
-	do {
-		int ret = 0;
+	if (!access_ok(VERIFY_WRITE, user_relocs, remain*sizeof(*user_relocs)))
+		return -EFAULT;
 
-		/* Unbind any ill-fitting objects or pin. */
-		list_for_each_entry(vma, &eb->vmas, exec_list) {
-			if (!drm_mm_node_allocated(&vma->node))
-				continue;
+	do {
+		struct drm_i915_gem_relocation_entry *r = stack_reloc;
+		int count = min_t(int, remain, ARRAY_SIZE(stack_reloc));
 
-			if (eb_vma_misplaced(vma))
-				ret = i915_vma_unbind(vma);
-			else
-				ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
-			if (ret)
-				goto err;
+		if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]))) {
+			remain = -EFAULT;
+			goto out;
 		}
 
-		/* Bind fresh objects */
-		list_for_each_entry(vma, &eb->vmas, exec_list) {
-			if (drm_mm_node_allocated(&vma->node))
-				continue;
+		remain -= count;
+		do {
+			uint64_t offset = eb_relocate_entry(eb, vma, r);
+			if (offset == 0) {
+			} else if ((int64_t)offset < 0) {
+				remain = (int64_t)offset;
+				goto out;
+			} else {
+				offset &= ~1;
+				if (__copy_to_user_inatomic(&user_relocs[r-stack_reloc].presumed_offset,
+							    &offset,
+							    sizeof(offset))) {
+					remain = -EFAULT;
+					goto out;
+				}
+			}
+		} while (r++, --count);
+		user_relocs += ARRAY_SIZE(stack_reloc);
+	} while (remain);
+out:
+	reloc_cache_reset(&eb->reloc_cache);
+	return remain;
+#undef N_RELOC
+}
 
-			ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
-			if (ret)
-				goto err;
-		}
+static int
+eb_relocate_vma_slow(struct i915_execbuffer *eb,
+		     const struct i915_vma *vma)
+{
+	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	struct drm_i915_gem_relocation_entry *relocs =
+		to_ptr(typeof(*relocs), entry->relocs_ptr);
+	int i, ret;
 
+	for (i = 0; i < entry->relocation_count; i++) {
+		uint64_t offset = eb_relocate_entry(eb, vma, &relocs[i]);
+		if ((int64_t)offset < 0) {
+			ret = (int64_t)offset;
+			goto err;
+		}
+	}
+	ret = 0;
 err:
-		if (ret != -ENOSPC || retry++)
-			return ret;
+	reloc_cache_reset(&eb->reloc_cache);
+	return ret;
+}
 
-		/* Decrement pin count for bound objects */
-		list_for_each_entry(vma, &eb->vmas, exec_list)
-			eb_unreserve_vma(vma);
+static int eb_relocate(struct i915_execbuffer *eb)
+{
+	const struct i915_vma *vma;
+	int ret = 0;
 
-		ret = i915_gem_evict_vm(eb->vm, true);
-		if (ret)
-			return ret;
-	} while (1);
+	/* This is the fast path and we cannot handle a pagefault whilst
+	 * holding the struct mutex lest the user pass in the relocations
+	 * contained within a mmaped bo. For in such a case we, the page
+	 * fault handler would call i915_gem_fault() and we would try to
+	 * acquire the struct mutex again. Obviously this is bad and so
+	 * lockdep complains vehemently.
+	 */
+	pagefault_disable();
+	list_for_each_entry(vma, &eb->relocs, reloc_link) {
+retry:
+		ret = eb_relocate_vma(eb, vma);
+		if (ret == 0)
+			continue;
+
+		if (ret == -EBUSY) {
+			pagefault_enable();
+			ret = i915_gem_object_wait_rendering(vma->obj, false);
+			pagefault_disable();
+			if (ret == 0)
+				goto retry;
+		}
+		break;
+	}
+	pagefault_enable();
+
+	return ret;
 }
 
 static int eb_select_context(struct i915_execbuffer *eb)
@@ -950,49 +1040,52 @@  static int eb_select_context(struct i915_execbuffer *eb)
 	eb->ctx = ctx;
 	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->gtt.base;
 
+	eb->context_flags = 0;
+	if (ctx->flags & CONTEXT_NO_ZEROMAP)
+		eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS;
+
 	return 0;
 }
 
-static int
-eb_relocate_slow(struct i915_execbuffer *eb)
+static struct drm_i915_gem_relocation_entry *
+eb_copy_relocations(const struct i915_execbuffer *eb)
 {
+	const unsigned relocs_max = UINT_MAX / sizeof(struct drm_i915_gem_relocation_entry);
 	const unsigned count = eb->args->buffer_count;
-	struct drm_device *dev = eb->i915->dev;
 	struct drm_i915_gem_relocation_entry *reloc;
-	struct i915_vma *vma;
-	int *reloc_offset;
-	int i, total, ret;
-
-	/* We may process another execbuffer during the unlock... */
-	eb_reset(eb);
-	mutex_unlock(&dev->struct_mutex);
+	unsigned total, i;
 
 	total = 0;
-	for (i = 0; i < count; i++)
-		total += eb->exec[i].relocation_count;
-
-	reloc_offset = drm_malloc_ab(count, sizeof(*reloc_offset));
-	reloc = drm_malloc_ab(total, sizeof(*reloc));
-	if (reloc == NULL || reloc_offset == NULL) {
-		drm_free_large(reloc);
-		drm_free_large(reloc_offset);
-		mutex_lock(&dev->struct_mutex);
-		return -ENOMEM;
+	for (i = 0; i < count; i++) {
+		unsigned nreloc = eb->exec[i].relocation_count;
+
+		if (total > relocs_max - nreloc)
+			return ERR_PTR(-EINVAL);
+
+		total += nreloc;
 	}
+	if (total == 0)
+		return NULL;
+
+	reloc = drm_malloc_gfp(total, sizeof(*reloc), GFP_TEMPORARY);
+	if (reloc == NULL)
+		return ERR_PTR(-ENOMEM);
 
 	total = 0;
 	for (i = 0; i < count; i++) {
 		struct drm_i915_gem_relocation_entry __user *user_relocs;
+		unsigned nreloc = eb->exec[i].relocation_count, j;
 		u64 invalid_offset = (u64)-1;
-		int j;
+
+		if (nreloc == 0)
+			continue;
 
 		user_relocs = to_user_ptr(eb->exec[i].relocs_ptr);
 
 		if (copy_from_user(reloc+total, user_relocs,
-				   eb->exec[i].relocation_count * sizeof(*reloc))) {
-			ret = -EFAULT;
-			mutex_lock(&dev->struct_mutex);
-			goto err;
+				   nreloc * sizeof(*reloc))) {
+			drm_free_large(reloc);
+			return ERR_PTR(-EFAULT);
 		}
 
 		/* As we do not update the known relocation offsets after
@@ -1004,18 +1097,40 @@  eb_relocate_slow(struct i915_execbuffer *eb)
 		 * happened we would make the mistake of assuming that the
 		 * relocations were valid.
 		 */
-		for (j = 0; j < eb->exec[i].relocation_count; j++) {
+		for (j = 0; j < nreloc; j++) {
 			if (__copy_to_user(&user_relocs[j].presumed_offset,
 					   &invalid_offset,
 					   sizeof(invalid_offset))) {
-				ret = -EFAULT;
-				mutex_lock(&dev->struct_mutex);
-				goto err;
+				drm_free_large(reloc);
+				return ERR_PTR(-EFAULT);
 			}
 		}
 
-		reloc_offset[i] = total;
-		total += eb->exec[i].relocation_count;
+		eb->exec[i].relocs_ptr = (uintptr_t)(reloc + total);
+		total += nreloc;
+	}
+
+	return reloc;
+}
+
+static int eb_relocate_slow(struct i915_execbuffer *eb)
+{
+	struct drm_device *dev = eb->i915->dev;
+	struct drm_i915_gem_relocation_entry *reloc = NULL;
+	const struct i915_vma *vma;
+	int ret;
+
+repeat:
+	/* We may process another execbuffer during the unlock... */
+	eb_reset(eb);
+	mutex_unlock(&dev->struct_mutex);
+
+	if (reloc == NULL) {
+		reloc = eb_copy_relocations(eb);
+		if (IS_ERR(reloc)) {
+			mutex_lock(&dev->struct_mutex);
+			return PTR_ERR(reloc);
+		}
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
@@ -1033,13 +1148,8 @@  eb_relocate_slow(struct i915_execbuffer *eb)
 	if (ret)
 		goto err;
 
-	ret = eb_reserve(eb);
-	if (ret)
-		goto err;
-
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		int offset = vma->exec_entry - eb->exec;
-		ret = eb_relocate_vma_slow(vma, eb, reloc + reloc_offset[offset]);
+	list_for_each_entry(vma, &eb->relocs, reloc_link) {
+		ret = eb_relocate_vma_slow(eb, vma);
 		if (ret)
 			goto err;
 	}
@@ -1051,8 +1161,12 @@  eb_relocate_slow(struct i915_execbuffer *eb)
 	 */
 
 err:
+	if (ret == -EAGAIN) {
+		cond_resched();
+		goto repeat;
+	}
 	drm_free_large(reloc);
-	drm_free_large(reloc_offset);
+
 	return ret;
 }
 
@@ -1060,27 +1174,38 @@  static int
 eb_move_to_gpu(struct i915_execbuffer *eb)
 {
 	const unsigned other_rings = (~intel_engine_flag(eb->engine) & I915_BO_ACTIVE_MASK) << I915_BO_ACTIVE_SHIFT;
-	struct i915_vma *vma;
-	uint32_t flush_domains = 0;
+	const unsigned count = eb->args->buffer_count;
 	bool flush_chipset = false;
+	unsigned i;
 	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	for (i = 0; i < count; i++) {
+		const struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (obj->flags & other_rings) {
-			ret = i915_gem_object_sync(obj,
-						   eb->request,
-						   vma->exec_entry->flags & EXEC_OBJECT_WRITE);
+			ret = i915_gem_object_sync(obj, eb->request,
+						   entry->flags & EXEC_OBJECT_WRITE);
 			if (ret)
 				return ret;
 		}
 
-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
-			flush_chipset |= i915_gem_clflush_object(obj, false);
+		if (!i915_gem_object_is_active(obj)) {
+			if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
+				flush_chipset |= i915_gem_clflush_object(obj, false);
+
+			obj->base.write_domain = 0;
+			if (entry->flags & EXEC_OBJECT_WRITE)
+				obj->base.read_domains = 0;
+			obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
+		}
 
-		flush_domains |= obj->base.write_domain;
+		i915_vma_move_to_active(vma, eb->request, entry->flags);
+		__eb_unreserve_vma(vma, entry);
+		vma->exec_entry = NULL;
 	}
+	eb->exec = NULL;
 
 	if (flush_chipset)
 		i915_gem_chipset_flush(eb->i915->dev);
@@ -1115,79 +1240,6 @@  i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 	return true;
 }
 
-static int
-validate_exec_list(struct drm_device *dev,
-		   struct drm_i915_gem_exec_object2 *exec,
-		   int count)
-{
-	unsigned relocs_total = 0;
-	unsigned relocs_max = UINT_MAX / sizeof(struct drm_i915_gem_relocation_entry);
-	unsigned invalid_flags;
-	int i;
-
-	invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
-	if (USES_FULL_PPGTT(dev))
-		invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
-
-	for (i = 0; i < count; i++) {
-		char __user *ptr = to_user_ptr(exec[i].relocs_ptr);
-		int length; /* limited by fault_in_pages_readable() */
-
-		if (exec[i].flags & invalid_flags)
-			return -EINVAL;
-
-		/* Offset can be used as input (EXEC_OBJECT_PINNED), reject
-		 * any non-page-aligned or non-canonical addresses.
-		 */
-		if (exec[i].flags & EXEC_OBJECT_PINNED) {
-			if (exec[i].offset !=
-			    gen8_canonical_addr(exec[i].offset & PAGE_MASK))
-				return -EINVAL;
-
-			/* From drm_mm perspective address space is continuous,
-			 * so from this point we're always using non-canonical
-			 * form internally.
-			 */
-			exec[i].offset = gen8_noncanonical_addr(exec[i].offset);
-		}
-
-		if (exec[i].alignment && !is_power_of_2(exec[i].alignment))
-			return -EINVAL;
-
-		/* pad_to_size was once a reserved field, so sanitize it */
-		if (exec[i].flags & EXEC_OBJECT_PAD_TO_SIZE) {
-			if (offset_in_page(exec[i].pad_to_size))
-				return -EINVAL;
-		} else
-			exec[i].pad_to_size = 0;
-
-		/* First check for malicious input causing overflow in
-		 * the worst case where we need to allocate the entire
-		 * relocation tree as a single array.
-		 */
-		if (exec[i].relocation_count > relocs_max - relocs_total)
-			return -EINVAL;
-		relocs_total += exec[i].relocation_count;
-
-		length = exec[i].relocation_count *
-			sizeof(struct drm_i915_gem_relocation_entry);
-		/*
-		 * We must check that the entire relocation array is safe
-		 * to read, but since we may need to update the presumed
-		 * offsets during execution, check for full write access.
-		 */
-		if (!access_ok(VERIFY_WRITE, ptr, length))
-			return -EFAULT;
-
-		if (likely(!i915.prefault_disable)) {
-			if (fault_in_multipages_readable(ptr, length))
-				return -EFAULT;
-		}
-	}
-
-	return 0;
-}
-
 void i915_vma_move_to_active(struct i915_vma *vma,
 			     struct drm_i915_gem_request *req,
 			     unsigned flags)
@@ -1224,26 +1276,6 @@  void i915_vma_move_to_active(struct i915_vma *vma,
 	list_move_tail(&vma->vm_link, &vma->vm->active_list);
 }
 
-static void
-eb_move_to_active(struct i915_execbuffer *eb)
-{
-	struct i915_vma *vma;
-
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		struct drm_i915_gem_object *obj = vma->obj;
-		u32 old_read = obj->base.read_domains;
-		u32 old_write = obj->base.write_domain;
-
-		obj->base.write_domain = 0;
-		if (vma->exec_entry->flags & EXEC_OBJECT_WRITE)
-			obj->base.read_domains = 0;
-		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
-
-		i915_vma_move_to_active(vma, eb->request, vma->exec_entry->flags);
-		trace_i915_gem_object_change_domain(obj, old_read, old_write);
-	}
-}
-
 static int
 i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 {
@@ -1255,25 +1287,22 @@  i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 		return -EINVAL;
 	}
 
-	ret = intel_ring_begin(req, 4 * 3);
+	ret = intel_ring_begin(req, 4 * 2 + 2);
 	if (ret)
 		return ret;
 
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(4));
 	for (i = 0; i < 4; i++) {
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
 		intel_ring_emit_reg(ring, GEN7_SO_WRITE_OFFSET(i));
 		intel_ring_emit(ring, 0);
 	}
-
+	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_advance(ring);
 
 	return 0;
 }
 
-static struct i915_vma *
-eb_parse(struct i915_execbuffer *eb,
-	 struct drm_i915_gem_exec_object2 *shadow_exec_entry,
-	 bool is_master)
+static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 {
 	struct drm_i915_gem_object *shadow_batch_obj;
 	struct i915_vma *vma;
@@ -1304,11 +1333,11 @@  eb_parse(struct i915_execbuffer *eb,
 		goto err;
 	}
 
-	memset(shadow_exec_entry, 0, sizeof(*shadow_exec_entry));
-
-	vma->exec_entry = shadow_exec_entry;
+	vma->exec_entry =
+	       	memset(&eb->exec[eb->args->buffer_count++],
+		       0, sizeof(*vma->exec_entry));
 	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
-	list_add_tail(&vma->exec_list, &eb->vmas);
+	vma->exec_entry->rsvd2 = (uintptr_t)vma;
 
 err:
 	i915_gem_object_unpin_pages(shadow_batch_obj);
@@ -1324,70 +1353,81 @@  add_to_client(struct drm_i915_gem_request *req,
 }
 
 static int
-execbuf_submit(struct i915_execbuffer *eb)
+eb_set_constants_offset(struct i915_execbuffer *eb)
 {
-	struct intel_ring *ring = eb->request->ring;
 	struct drm_i915_private *dev_priv = eb->i915;
-	int instp_mode;
-	u32 instp_mask;
+	struct intel_ring *ring;
+	u32 mode, mask;
 	int ret;
 
-	ret = eb_move_to_gpu(eb);
-	if (ret)
-		return ret;
-
-	ret = i915_switch_context(eb->request);
-	if (ret)
-		return ret;
-
-	instp_mode = eb->args->flags & I915_EXEC_CONSTANTS_MASK;
-	instp_mask = I915_EXEC_CONSTANTS_MASK;
-	switch (instp_mode) {
+	mode = eb->args->flags & I915_EXEC_CONSTANTS_MASK;
+	switch (mode) {
 	case I915_EXEC_CONSTANTS_REL_GENERAL:
 	case I915_EXEC_CONSTANTS_ABSOLUTE:
 	case I915_EXEC_CONSTANTS_REL_SURFACE:
-		if (instp_mode != 0 && eb->engine->id != RCS) {
-			DRM_DEBUG("non-0 rel constants mode on non-RCS\n");
-			return -EINVAL;
-		}
-
-		if (instp_mode != dev_priv->relative_constants_mode) {
-			if (INTEL_INFO(dev_priv)->gen < 4) {
-				DRM_DEBUG("no rel constants on pre-gen4\n");
-				return -EINVAL;
-			}
-
-			if (INTEL_INFO(dev_priv)->gen > 5 &&
-			    instp_mode == I915_EXEC_CONSTANTS_REL_SURFACE) {
-				DRM_DEBUG("rel surface constants mode invalid on gen5+\n");
-				return -EINVAL;
-			}
-
-			/* The HW changed the meaning on this bit on gen6 */
-			if (INTEL_INFO(dev_priv)->gen >= 6)
-				instp_mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE;
-		}
 		break;
 	default:
-		DRM_DEBUG("execbuf with unknown constants: %d\n", instp_mode);
+		DRM_DEBUG("execbuf with unknown constants: %d\n", mode);
 		return -EINVAL;
 	}
 
-	if (eb->engine->id == RCS &&
-	    instp_mode != dev_priv->relative_constants_mode) {
-		ret = intel_ring_begin(eb->request, 4);
-		if (ret)
-			return ret;
+	if (mode == dev_priv->relative_constants_mode)
+		return 0;
+
+	if (eb->engine->id != RCS) {
+		DRM_DEBUG("non-0 rel constants mode on non-RCS\n");
+		return -EINVAL;
+	}
 
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, INSTPM);
-		intel_ring_emit(ring, instp_mask << 16 | instp_mode);
-		intel_ring_advance(ring);
+	if (INTEL_INFO(dev_priv)->gen < 4) {
+		DRM_DEBUG("no rel constants on pre-gen4\n");
+		return -EINVAL;
+	}
 
-		dev_priv->relative_constants_mode = instp_mode;
+	if (INTEL_INFO(dev_priv)->gen > 5 &&
+	    mode == I915_EXEC_CONSTANTS_REL_SURFACE) {
+		DRM_DEBUG("rel surface constants mode invalid on gen5+\n");
+		return -EINVAL;
 	}
 
+	/* The HW changed the meaning on this bit on gen6 */
+	mask = I915_EXEC_CONSTANTS_MASK;
+	if (INTEL_INFO(dev_priv)->gen >= 6)
+		mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE;
+
+	ret = intel_ring_begin(eb->request, 4);
+	if (ret)
+		return ret;
+
+	ring = eb->request->ring;
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit_reg(ring, INSTPM);
+	intel_ring_emit(ring, mask << 16 | mode);
+	intel_ring_advance(ring);
+
+	dev_priv->relative_constants_mode = mode;
+
+	return 0;
+}
+
+static int
+execbuf_submit(struct i915_execbuffer *eb)
+{
+	int ret;
+
+	ret = eb_move_to_gpu(eb);
+	if (ret)
+		return ret;
+
+	ret = i915_switch_context(eb->request);
+	if (ret)
+		return ret;
+
+	ret = eb_set_constants_offset(eb);
+	if (ret)
+		return ret;
+
 	if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) {
 		ret = i915_reset_gen7_sol_offsets(eb->request);
 		if (ret)
@@ -1396,15 +1436,13 @@  execbuf_submit(struct i915_execbuffer *eb)
 
 	ret = eb->engine->emit_bb_start(eb->request,
 					eb->batch_vma->node.start +
-					eb->batch_start_offset,
+					eb->args->batch_start_offset,
 					eb->args->batch_len,
 					eb->dispatch_flags);
 	if (ret)
 		return ret;
 
 	trace_i915_gem_ring_dispatch(eb->request, eb->dispatch_flags);
-
-	eb_move_to_active(eb);
 	add_to_client(eb->request, eb->file);
 
 	return 0;
@@ -1448,16 +1486,11 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 		       struct drm_i915_gem_exec_object2 *exec)
 {
 	struct i915_execbuffer eb;
-	struct drm_i915_gem_exec_object2 shadow_exec_entry;
 	int ret;
 
 	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
 
-	ret = validate_exec_list(dev, exec, args->buffer_count);
-	if (ret)
-		return ret;
-
 	eb.dispatch_flags = 0;
 	if (args->flags & I915_EXEC_SECURE) {
 		if (!file->is_master || !capable(CAP_SYS_ADMIN))
@@ -1485,7 +1518,11 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.file = file;
 	eb.args = args;
 	eb.exec = exec;
-	eb.need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+	if ((args->flags & I915_EXEC_NO_RELOC) == 0)
+		args->flags |= __EXEC_HAS_RELOC;
+	eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
+	if (USES_FULL_PPGTT(eb.i915))
+		eb.invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
 	reloc_cache_init(&eb.reloc_cache, eb.i915);
 
 	if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_DEFAULT)
@@ -1561,22 +1598,11 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 	if (ret)
 		goto err;
 
-	/* take note of the batch buffer before we might reorder the lists */
-	eb.batch_vma = eb_get_batch(&eb);
-
-	/* Move the objects en-masse into the GTT, evicting if necessary. */
-	ret = eb_reserve(&eb);
-	if (ret)
-		goto err;
-
 	/* The objects are in their final locations, apply the relocations. */
-	if (eb.need_relocs)
+	if (args->flags & __EXEC_HAS_RELOC) {
 		ret = eb_relocate(&eb);
-	if (ret) {
-		if (ret == -EFAULT) {
+		if (ret == -EAGAIN || ret == -EFAULT)
 			ret = eb_relocate_slow(&eb);
-			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
-		}
 		if (ret)
 			goto err;
 	}
@@ -1588,11 +1614,10 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 		goto err;
 	}
 
-	eb.batch_start_offset = args->batch_start_offset;
 	if (intel_engine_needs_cmd_parser(eb.engine) && args->batch_len) {
 		struct i915_vma *vma;
 
-		vma = eb_parse(&eb, &shadow_exec_entry, file->is_master);
+		vma = eb_parse(&eb, file->is_master);
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
 			goto err;
@@ -1609,7 +1634,7 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 			 * command parser has accepted.
 			 */
 			eb.dispatch_flags |= I915_DISPATCH_SECURE;
-			eb.batch_start_offset = 0;
+			eb.args->batch_start_offset = 0;
 			eb.batch_vma = vma;
 		}
 	}
@@ -1700,7 +1725,7 @@  i915_gem_execbuffer(struct drm_device *dev, void *data,
 
 	/* Copy in the exec list from userland */
 	exec_list = drm_malloc_ab(sizeof(*exec_list), args->buffer_count);
-	exec2_list = drm_malloc_ab(sizeof(*exec2_list), args->buffer_count);
+	exec2_list = drm_malloc_ab(sizeof(*exec2_list), args->buffer_count + 1);
 	if (exec_list == NULL || exec2_list == NULL) {
 		DRM_DEBUG("Failed to allocate exec list for %d buffers\n",
 			  args->buffer_count);
@@ -1743,24 +1768,22 @@  i915_gem_execbuffer(struct drm_device *dev, void *data,
 	i915_execbuffer2_set_context_id(exec2, 0);
 
 	ret = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list);
-	if (!ret) {
+	if (exec2.flags & __EXEC_HAS_RELOC) {
 		struct drm_i915_gem_exec_object __user *user_exec_list =
 			to_user_ptr(args->buffers_ptr);
 
 		/* Copy the new buffer offsets back to the user's exec list. */
 		for (i = 0; i < args->buffer_count; i++) {
+			if ((exec2_list[i].offset & UPDATE) == 0)
+				continue;
+
 			exec2_list[i].offset =
-				gen8_canonical_addr(exec2_list[i].offset);
-			ret = __copy_to_user(&user_exec_list[i].offset,
-					     &exec2_list[i].offset,
-					     sizeof(user_exec_list[i].offset));
-			if (ret) {
-				ret = -EFAULT;
-				DRM_DEBUG("failed to copy %d exec entries "
-					  "back to user (%d)\n",
-					  args->buffer_count, ret);
+				gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK);
+			exec2_list[i].offset &= PIN_OFFSET_MASK;
+			if (__copy_to_user(&user_exec_list[i].offset,
+					   &exec2_list[i].offset,
+					   sizeof(user_exec_list[i].offset)))
 				break;
-			}
 		}
 	}
 
@@ -1789,43 +1812,38 @@  i915_gem_execbuffer2(struct drm_device *dev, void *data,
 	}
 
 	exec2_list = drm_malloc_gfp(sizeof(*exec2_list),
-				    args->buffer_count,
+				    args->buffer_count + 1,
 				    GFP_TEMPORARY);
 	if (exec2_list == NULL) {
 		DRM_DEBUG("Failed to allocate exec list for %d buffers\n",
 			  args->buffer_count);
 		return -ENOMEM;
 	}
-	ret = copy_from_user(exec2_list,
-			     to_user_ptr(args->buffers_ptr),
-			     sizeof(*exec2_list) * args->buffer_count);
-	if (ret != 0) {
-		DRM_DEBUG("copy %d exec entries failed %d\n",
-			  args->buffer_count, ret);
+	if (copy_from_user(exec2_list,
+			   to_user_ptr(args->buffers_ptr),
+			   sizeof(*exec2_list) * args->buffer_count)) {
+		DRM_DEBUG("copy %d exec entries failed\n", args->buffer_count);
 		drm_free_large(exec2_list);
 		return -EFAULT;
 	}
 
 	ret = i915_gem_do_execbuffer(dev, file, args, exec2_list);
-	if (!ret) {
+	if (args->flags & __EXEC_HAS_RELOC) {
 		/* Copy the new buffer offsets back to the user's exec list. */
 		struct drm_i915_gem_exec_object2 __user *user_exec_list =
-				   to_user_ptr(args->buffers_ptr);
+			to_user_ptr(args->buffers_ptr);
 		int i;
 
 		for (i = 0; i < args->buffer_count; i++) {
+			if ((exec2_list[i].offset & UPDATE) == 0)
+				continue;
+
 			exec2_list[i].offset =
-				gen8_canonical_addr(exec2_list[i].offset);
-			ret = __copy_to_user(&user_exec_list[i].offset,
-					     &exec2_list[i].offset,
-					     sizeof(user_exec_list[i].offset));
-			if (ret) {
-				ret = -EFAULT;
-				DRM_DEBUG("failed to copy %d exec entries "
-					  "back to user\n",
-					  args->buffer_count);
+				gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK);
+			if (__copy_to_user(&user_exec_list[i].offset,
+					   &exec2_list[i].offset,
+					   sizeof(user_exec_list[i].offset)))
 				break;
-			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index cb3a6e272e22..a9b547e4ea6f 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -3594,7 +3594,7 @@  void *i915_vma_iomap(struct drm_i915_private *dev_priv,
 			int ret;
 
 			/* Too many areas already allocated? */
-			ret = i915_gem_evict_vm(vma->vm, true);
+			ret = i915_gem_evict_vm(vma->vm);
 			if (ret)
 				return ERR_PTR(ret);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 3080033b722c..6996d79175a0 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -244,7 +244,9 @@  struct i915_vma {
 	struct hlist_node obj_node;
 
 	/** This vma's place in the batchbuffer or on the eviction list */
-	struct list_head exec_list;
+	struct list_head exec_link;
+	struct list_head reloc_link;
+	struct list_head evict_link;
 
 	/**
 	 * Used for performing relocations during execbuffer insertion.