diff mbox

[v4,1/2] intel: 48b ppgtt support (EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag)

Message ID 1441290239-31462-2-git-send-email-michel.thierry@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michel Thierry Sept. 3, 2015, 2:23 p.m. UTC
Gen8+ supports 48-bit virtual addresses, but some objects must always be
allocated inside the 32-bit address range.

In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
General State Heap (GSH) or Instruction State Heap (ISH) must be in a
32-bit range, because the General State Offset and Instruction State Offset
are limited to 32-bits.

The i915 driver has been modified to provide a flag to set when the 4GB
limit is not necessary in a given bo (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
48-bit range will only be used when explicitly requested.

Callers to the existing drm_intel_bo_emit_reloc function should set the
use_48b_address_range flag beforehand, in order to use full ppgtt range.

v2: Make set/clear functions nops on pre-gen8 platforms, and use them
    internally in emit_reloc functions (Ben)
    s/48BADDRESS/48B_ADDRESS/ (Dave)
v3: Keep set/clear functions internal, no-one needs to use them directly.
v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
    before enabling set/clear function, print full offsets in debug
    statements, using port of lower_32_bits and upper_32_bits from linux
    kernel (Micha?)

References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
Cc: Ben Widawsky <ben@bwidawsk.net>
Cc: Micha? Winiarski <michal.winiarski@intel.com>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 include/drm/i915_drm.h    |  3 +-
 intel/intel_bufmgr.c      | 11 ++++++
 intel/intel_bufmgr.h      |  1 +
 intel/intel_bufmgr_gem.c  | 88 +++++++++++++++++++++++++++++++++++++----------
 intel/intel_bufmgr_priv.h | 14 ++++++++
 5 files changed, 97 insertions(+), 20 deletions(-)

Comments

Michał Winiarski Sept. 14, 2015, 1:54 p.m. UTC | #1
On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
> Gen8+ supports 48-bit virtual addresses, but some objects must always be
> allocated inside the 32-bit address range.
> 
> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
> General State Heap (GSH) or Instruction State Heap (ISH) must be in a
> 32-bit range, because the General State Offset and Instruction State Offset
> are limited to 32-bits.
> 
> The i915 driver has been modified to provide a flag to set when the 4GB
> limit is not necessary in a given bo (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
> 48-bit range will only be used when explicitly requested.
> 
> Callers to the existing drm_intel_bo_emit_reloc function should set the
> use_48b_address_range flag beforehand, in order to use full ppgtt range.
> 
> v2: Make set/clear functions nops on pre-gen8 platforms, and use them
>     internally in emit_reloc functions (Ben)
>     s/48BADDRESS/48B_ADDRESS/ (Dave)
> v3: Keep set/clear functions internal, no-one needs to use them directly.
> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>     before enabling set/clear function, print full offsets in debug
>     statements, using port of lower_32_bits and upper_32_bits from linux
>     kernel (Micha?)
> 
> References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
> Cc: Ben Widawsky <ben@bwidawsk.net>
> Cc: Micha? Winiarski <michal.winiarski@intel.com>

+Kristian

LGTM.
Acked-by: Micha? Winiarski <michal.winiarski@intel.com>

> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  include/drm/i915_drm.h    |  3 +-
>  intel/intel_bufmgr.c      | 11 ++++++
>  intel/intel_bufmgr.h      |  1 +
>  intel/intel_bufmgr_gem.c  | 88 +++++++++++++++++++++++++++++++++++++----------
>  intel/intel_bufmgr_priv.h | 14 ++++++++
>  5 files changed, 97 insertions(+), 20 deletions(-)
> 
> diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
> index ded43b1..426b25c 100644
> --- a/include/drm/i915_drm.h
> +++ b/include/drm/i915_drm.h
> @@ -680,7 +680,8 @@ struct drm_i915_gem_exec_object2 {
>  #define EXEC_OBJECT_NEEDS_FENCE (1<<0)
>  #define EXEC_OBJECT_NEEDS_GTT	(1<<1)
>  #define EXEC_OBJECT_WRITE	(1<<2)
> -#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
> +#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
> +#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SUPPORTS_48B_ADDRESS<<1)
>  	__u64 flags;
>  
>  	__u64 rsvd1;
> diff --git a/intel/intel_bufmgr.c b/intel/intel_bufmgr.c
> index 14ea9f9..0856e60 100644
> --- a/intel/intel_bufmgr.c
> +++ b/intel/intel_bufmgr.c
> @@ -293,6 +293,17 @@ drm_intel_bo_madvise(drm_intel_bo *bo, int madv)
>  }
>  
>  int
> +drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable)
> +{
> +	if (bo->bufmgr->bo_use_48b_address_range) {
> +		bo->bufmgr->bo_use_48b_address_range(bo, enable);
> +		return 0;
> +	}
> +
> +	return -ENODEV;
> +}
> +
> +int
>  drm_intel_bo_references(drm_intel_bo *bo, drm_intel_bo *target_bo)
>  {
>  	return bo->bufmgr->bo_references(bo, target_bo);
> diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
> index 95eecb8..a14c78f 100644
> --- a/intel/intel_bufmgr.h
> +++ b/intel/intel_bufmgr.h
> @@ -164,6 +164,7 @@ int drm_intel_bo_get_tiling(drm_intel_bo *bo, uint32_t * tiling_mode,
>  int drm_intel_bo_flink(drm_intel_bo *bo, uint32_t * name);
>  int drm_intel_bo_busy(drm_intel_bo *bo);
>  int drm_intel_bo_madvise(drm_intel_bo *bo, int madv);
> +int drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable);
>  
>  int drm_intel_bo_disable_reuse(drm_intel_bo *bo);
>  int drm_intel_bo_is_reusable(drm_intel_bo *bo);
> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> index 2723e21..09d82d2 100644
> --- a/intel/intel_bufmgr_gem.c
> +++ b/intel/intel_bufmgr_gem.c
> @@ -83,6 +83,22 @@
>  #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
>  #define MAX2(A, B) ((A) > (B) ? (A) : (B))
>  
> +/**
> + * upper_32_bits - return bits 32-63 of a number
> + * @n: the number we're accessing
> + *
> + * A basic shift-right of a 64- or 32-bit quantity.  Use this to suppress
> + * the "right shift count >= width of type" warning when that quantity is
> + * 32-bits.
> + */
> +#define upper_32_bits(n) ((__u32)(((n) >> 16) >> 16))
> +
> +/**
> + * lower_32_bits - return bits 0-31 of a number
> + * @n: the number we're accessing
> + */
> +#define lower_32_bits(n) ((__u32)(n))
> +
>  typedef struct _drm_intel_bo_gem drm_intel_bo_gem;
>  
>  struct drm_intel_gem_bo_bucket {
> @@ -237,6 +253,15 @@ struct _drm_intel_bo_gem {
>  	bool is_userptr;
>  
>  	/**
> +	 * Boolean of whether this buffer can be placed in the full 48-bit
> +	 * address range on gen8+.
> +	 *
> +	 * By default, buffers will be keep in a 32-bit range, unless this
> +	 * flag is explicitly set.
> +	 */
> +	bool use_48b_address_range;
> +
> +	/**
>  	 * Size in bytes of this buffer and its relocation descendents.
>  	 *
>  	 * Used to avoid costly tree walking in
> @@ -395,14 +420,16 @@ drm_intel_gem_dump_validation_list(drm_intel_bufmgr_gem *bufmgr_gem)
>  			drm_intel_bo_gem *target_gem =
>  			    (drm_intel_bo_gem *) target_bo;
>  
> -			DBG("%2d: %d (%s)@0x%08llx -> "
> -			    "%d (%s)@0x%08lx + 0x%08x\n",
> +			DBG("%2d: %d (%s)@0x%08x %08x -> "
> +			    "%d (%s)@0x%08x %08x + 0x%08x\n",
>  			    i,
>  			    bo_gem->gem_handle, bo_gem->name,
> -			    (unsigned long long)bo_gem->relocs[j].offset,
> +			    upper_32_bits(bo_gem->relocs[j].offset),
> +			    lower_32_bits(bo_gem->relocs[j].offset),
>  			    target_gem->gem_handle,
>  			    target_gem->name,
> -			    target_bo->offset64,
> +			    upper_32_bits(target_bo->offset64),
> +			    lower_32_bits(target_bo->offset64),
>  			    bo_gem->relocs[j].delta);
>  		}
>  	}
> @@ -468,11 +495,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence)
>  	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bo->bufmgr;
>  	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo;
>  	int index;
> +	int flags = 0;
> +
> +	if (need_fence)
> +		flags |= EXEC_OBJECT_NEEDS_FENCE;
> +	if (bo_gem->use_48b_address_range)
> +		flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
>  
>  	if (bo_gem->validate_index != -1) {
> -		if (need_fence)
> -			bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |=
> -				EXEC_OBJECT_NEEDS_FENCE;
> +		bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |= flags;
>  		return;
>  	}
>  
> @@ -501,13 +532,9 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence)
>  	bufmgr_gem->exec2_objects[index].alignment = bo->align;
>  	bufmgr_gem->exec2_objects[index].offset = 0;
>  	bufmgr_gem->exec_bos[index] = bo;
> -	bufmgr_gem->exec2_objects[index].flags = 0;
> +	bufmgr_gem->exec2_objects[index].flags = flags;
>  	bufmgr_gem->exec2_objects[index].rsvd1 = 0;
>  	bufmgr_gem->exec2_objects[index].rsvd2 = 0;
> -	if (need_fence) {
> -		bufmgr_gem->exec2_objects[index].flags |=
> -			EXEC_OBJECT_NEEDS_FENCE;
> -	}
>  	bufmgr_gem->exec_count++;
>  }
>  
> @@ -780,6 +807,7 @@ retry:
>  	bo_gem->used_as_reloc_target = false;
>  	bo_gem->has_error = false;
>  	bo_gem->reusable = true;
> +	bo_gem->use_48b_address_range = false;
>  
>  	drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, alignment);
>  
> @@ -926,6 +954,7 @@ drm_intel_gem_bo_alloc_userptr(drm_intel_bufmgr *bufmgr,
>  	bo_gem->used_as_reloc_target = false;
>  	bo_gem->has_error = false;
>  	bo_gem->reusable = false;
> +	bo_gem->use_48b_address_range = false;
>  
>  	drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, 0);
>  
> @@ -1081,6 +1110,7 @@ drm_intel_bo_gem_create_from_name(drm_intel_bufmgr *bufmgr,
>  	bo_gem->bo.handle = open_arg.handle;
>  	bo_gem->global_name = handle;
>  	bo_gem->reusable = false;
> +	bo_gem->use_48b_address_range = false;
>  
>  	memclear(get_tiling);
>  	get_tiling.handle = bo_gem->gem_handle;
> @@ -1930,6 +1960,13 @@ do_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset,
>  	return 0;
>  }
>  
> +static void
> +drm_intel_gem_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable)
> +{
> +	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
> +	bo_gem->use_48b_address_range = enable;
> +}
> +
>  static int
>  drm_intel_gem_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset,
>  			    drm_intel_bo *target_bo, uint32_t target_offset,
> @@ -2073,10 +2110,12 @@ drm_intel_update_buffer_offsets(drm_intel_bufmgr_gem *bufmgr_gem)
>  
>  		/* Update the buffer offset */
>  		if (bufmgr_gem->exec_objects[i].offset != bo->offset64) {
> -			DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
> -			    bo_gem->gem_handle, bo_gem->name, bo->offset64,
> -			    (unsigned long long)bufmgr_gem->exec_objects[i].
> -			    offset);
> +			DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x %08x\n",
> +			    bo_gem->gem_handle, bo_gem->name,
> +			    upper_32_bits(bo->offset64),
> +			    lower_32_bits(bo->offset64),
> +			    upper_32_bits(bufmgr_gem->exec_objects[i].offset),
> +			    lower_32_bits(bufmgr_gem->exec_objects[i].offset));
>  			bo->offset64 = bufmgr_gem->exec_objects[i].offset;
>  			bo->offset = bufmgr_gem->exec_objects[i].offset;
>  		}
> @@ -2094,9 +2133,12 @@ drm_intel_update_buffer_offsets2 (drm_intel_bufmgr_gem *bufmgr_gem)
>  
>  		/* Update the buffer offset */
>  		if (bufmgr_gem->exec2_objects[i].offset != bo->offset64) {
> -			DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
> -			    bo_gem->gem_handle, bo_gem->name, bo->offset64,
> -			    (unsigned long long)bufmgr_gem->exec2_objects[i].offset);
> +			DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x %08x\n",
> +			    bo_gem->gem_handle, bo_gem->name,
> +			    upper_32_bits(bo->offset64),
> +			    lower_32_bits(bo->offset64),
> +			    upper_32_bits(bufmgr_gem->exec2_objects[i].offset),
> +			    lower_32_bits(bufmgr_gem->exec2_objects[i].offset));
>  			bo->offset64 = bufmgr_gem->exec2_objects[i].offset;
>  			bo->offset = bufmgr_gem->exec2_objects[i].offset;
>  		}
> @@ -2481,6 +2523,7 @@ drm_intel_bo_gem_create_from_prime(drm_intel_bufmgr *bufmgr, int prime_fd, int s
>  	bo_gem->used_as_reloc_target = false;
>  	bo_gem->has_error = false;
>  	bo_gem->reusable = false;
> +	bo_gem->use_48b_address_range = false;
>  
>  	DRMINITLISTHEAD(&bo_gem->vma_list);
>  	DRMLISTADDTAIL(&bo_gem->name_list, &bufmgr_gem->named);
> @@ -3278,6 +3321,13 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
>  		}
>  	}
>  
> +	if (bufmgr_gem->gen >= 8) {
> +		gp.param = I915_PARAM_HAS_ALIASING_PPGTT;
> +		ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp);
> +		if (ret == 0 && *gp.value == 3)
> +			bufmgr_gem->bufmgr.bo_use_48b_address_range = drm_intel_gem_bo_use_48b_address_range;
> +	}
> +
>  	/* Let's go with one relocation per every 2 dwords (but round down a bit
>  	 * since a power of two will mean an extra page allocation for the reloc
>  	 * buffer).
> diff --git a/intel/intel_bufmgr_priv.h b/intel/intel_bufmgr_priv.h
> index 59ebd18..5c17ffb 100644
> --- a/intel/intel_bufmgr_priv.h
> +++ b/intel/intel_bufmgr_priv.h
> @@ -152,6 +152,20 @@ struct _drm_intel_bufmgr {
>  	void (*destroy) (drm_intel_bufmgr *bufmgr);
>  
>  	/**
> +	 * Indicate if the buffer can be placed anywhere in the full ppgtt
> +	 * address range (2^48).
> +	 *
> +	 * Any resource used with flat/heapless (0x00000000-0xfffff000)
> +	 * General State Heap (GSH) or Intructions State Heap (ISH) must
> +	 * be in a 32-bit range. 48-bit range will only be used when explicitly
> +	 * requested.
> +	 *
> +	 * \param bo Buffer to set the use_48b_address_range flag.
> +	 * \param enable The flag value.
> +	 */
> +	void (*bo_use_48b_address_range) (drm_intel_bo *bo, uint32_t enable);
> +
> +	/**
>  	 * Add relocation entry in reloc_buf, which will be updated with the
>  	 * target buffer's real offset on on command submission.
>  	 *
> -- 
> 2.5.0
>
Michel Thierry Oct. 5, 2015, 2:03 p.m. UTC | #2
On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>> Gen8+ supports 48-bit virtual addresses, but some objects must always be
>> allocated inside the 32-bit address range.
>>
>> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
>> General State Heap (GSH) or Instruction State Heap (ISH) must be in a
>> 32-bit range, because the General State Offset and Instruction State Offset
>> are limited to 32-bits.
>>
>> The i915 driver has been modified to provide a flag to set when the 4GB
>> limit is not necessary in a given bo (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>> 48-bit range will only be used when explicitly requested.
>>
>> Callers to the existing drm_intel_bo_emit_reloc function should set the
>> use_48b_address_range flag beforehand, in order to use full ppgtt range.
>>
>> v2: Make set/clear functions nops on pre-gen8 platforms, and use them
>>      internally in emit_reloc functions (Ben)
>>      s/48BADDRESS/48B_ADDRESS/ (Dave)
>> v3: Keep set/clear functions internal, no-one needs to use them directly.
>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>>      before enabling set/clear function, print full offsets in debug
>>      statements, using port of lower_32_bits and upper_32_bits from linux
>>      kernel (Micha?)
>>
>> References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>> Cc: Ben Widawsky <ben@bwidawsk.net>
>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>
> +Kristian
>
> LGTM.
> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>


Hi Kristian,

More comments on this?
I've resent the mesa patch with the last changes 
(http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).

Micha? has agreed with the interface and will use it for his work.
If mesa doesn't plan to use this for now, it's ok. The kernel changes 
have been done to prevent any regressions when the support 48-bit flag 
is not set.

Thanks,

-Michel

>> ---
>>   include/drm/i915_drm.h    |  3 +-
>>   intel/intel_bufmgr.c      | 11 ++++++
>>   intel/intel_bufmgr.h      |  1 +
>>   intel/intel_bufmgr_gem.c  | 88 +++++++++++++++++++++++++++++++++++++----------
>>   intel/intel_bufmgr_priv.h | 14 ++++++++
>>   5 files changed, 97 insertions(+), 20 deletions(-)
>>
>> diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
>> index ded43b1..426b25c 100644
>> --- a/include/drm/i915_drm.h
>> +++ b/include/drm/i915_drm.h
>> @@ -680,7 +680,8 @@ struct drm_i915_gem_exec_object2 {
>>   #define EXEC_OBJECT_NEEDS_FENCE (1<<0)
>>   #define EXEC_OBJECT_NEEDS_GTT	(1<<1)
>>   #define EXEC_OBJECT_WRITE	(1<<2)
>> -#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
>> +#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
>> +#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SUPPORTS_48B_ADDRESS<<1)
>>   	__u64 flags;
>>
>>   	__u64 rsvd1;
>> diff --git a/intel/intel_bufmgr.c b/intel/intel_bufmgr.c
>> index 14ea9f9..0856e60 100644
>> --- a/intel/intel_bufmgr.c
>> +++ b/intel/intel_bufmgr.c
>> @@ -293,6 +293,17 @@ drm_intel_bo_madvise(drm_intel_bo *bo, int madv)
>>   }
>>
>>   int
>> +drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable)
>> +{
>> +	if (bo->bufmgr->bo_use_48b_address_range) {
>> +		bo->bufmgr->bo_use_48b_address_range(bo, enable);
>> +		return 0;
>> +	}
>> +
>> +	return -ENODEV;
>> +}
>> +
>> +int
>>   drm_intel_bo_references(drm_intel_bo *bo, drm_intel_bo *target_bo)
>>   {
>>   	return bo->bufmgr->bo_references(bo, target_bo);
>> diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
>> index 95eecb8..a14c78f 100644
>> --- a/intel/intel_bufmgr.h
>> +++ b/intel/intel_bufmgr.h
>> @@ -164,6 +164,7 @@ int drm_intel_bo_get_tiling(drm_intel_bo *bo, uint32_t * tiling_mode,
>>   int drm_intel_bo_flink(drm_intel_bo *bo, uint32_t * name);
>>   int drm_intel_bo_busy(drm_intel_bo *bo);
>>   int drm_intel_bo_madvise(drm_intel_bo *bo, int madv);
>> +int drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable);
>>
>>   int drm_intel_bo_disable_reuse(drm_intel_bo *bo);
>>   int drm_intel_bo_is_reusable(drm_intel_bo *bo);
>> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
>> index 2723e21..09d82d2 100644
>> --- a/intel/intel_bufmgr_gem.c
>> +++ b/intel/intel_bufmgr_gem.c
>> @@ -83,6 +83,22 @@
>>   #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
>>   #define MAX2(A, B) ((A) > (B) ? (A) : (B))
>>
>> +/**
>> + * upper_32_bits - return bits 32-63 of a number
>> + * @n: the number we're accessing
>> + *
>> + * A basic shift-right of a 64- or 32-bit quantity.  Use this to suppress
>> + * the "right shift count >= width of type" warning when that quantity is
>> + * 32-bits.
>> + */
>> +#define upper_32_bits(n) ((__u32)(((n) >> 16) >> 16))
>> +
>> +/**
>> + * lower_32_bits - return bits 0-31 of a number
>> + * @n: the number we're accessing
>> + */
>> +#define lower_32_bits(n) ((__u32)(n))
>> +
>>   typedef struct _drm_intel_bo_gem drm_intel_bo_gem;
>>
>>   struct drm_intel_gem_bo_bucket {
>> @@ -237,6 +253,15 @@ struct _drm_intel_bo_gem {
>>   	bool is_userptr;
>>
>>   	/**
>> +	 * Boolean of whether this buffer can be placed in the full 48-bit
>> +	 * address range on gen8+.
>> +	 *
>> +	 * By default, buffers will be keep in a 32-bit range, unless this
>> +	 * flag is explicitly set.
>> +	 */
>> +	bool use_48b_address_range;
>> +
>> +	/**
>>   	 * Size in bytes of this buffer and its relocation descendents.
>>   	 *
>>   	 * Used to avoid costly tree walking in
>> @@ -395,14 +420,16 @@ drm_intel_gem_dump_validation_list(drm_intel_bufmgr_gem *bufmgr_gem)
>>   			drm_intel_bo_gem *target_gem =
>>   			    (drm_intel_bo_gem *) target_bo;
>>
>> -			DBG("%2d: %d (%s)@0x%08llx -> "
>> -			    "%d (%s)@0x%08lx + 0x%08x\n",
>> +			DBG("%2d: %d (%s)@0x%08x %08x -> "
>> +			    "%d (%s)@0x%08x %08x + 0x%08x\n",
>>   			    i,
>>   			    bo_gem->gem_handle, bo_gem->name,
>> -			    (unsigned long long)bo_gem->relocs[j].offset,
>> +			    upper_32_bits(bo_gem->relocs[j].offset),
>> +			    lower_32_bits(bo_gem->relocs[j].offset),
>>   			    target_gem->gem_handle,
>>   			    target_gem->name,
>> -			    target_bo->offset64,
>> +			    upper_32_bits(target_bo->offset64),
>> +			    lower_32_bits(target_bo->offset64),
>>   			    bo_gem->relocs[j].delta);
>>   		}
>>   	}
>> @@ -468,11 +495,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence)
>>   	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bo->bufmgr;
>>   	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo;
>>   	int index;
>> +	int flags = 0;
>> +
>> +	if (need_fence)
>> +		flags |= EXEC_OBJECT_NEEDS_FENCE;
>> +	if (bo_gem->use_48b_address_range)
>> +		flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
>>
>>   	if (bo_gem->validate_index != -1) {
>> -		if (need_fence)
>> -			bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |=
>> -				EXEC_OBJECT_NEEDS_FENCE;
>> +		bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |= flags;
>>   		return;
>>   	}
>>
>> @@ -501,13 +532,9 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence)
>>   	bufmgr_gem->exec2_objects[index].alignment = bo->align;
>>   	bufmgr_gem->exec2_objects[index].offset = 0;
>>   	bufmgr_gem->exec_bos[index] = bo;
>> -	bufmgr_gem->exec2_objects[index].flags = 0;
>> +	bufmgr_gem->exec2_objects[index].flags = flags;
>>   	bufmgr_gem->exec2_objects[index].rsvd1 = 0;
>>   	bufmgr_gem->exec2_objects[index].rsvd2 = 0;
>> -	if (need_fence) {
>> -		bufmgr_gem->exec2_objects[index].flags |=
>> -			EXEC_OBJECT_NEEDS_FENCE;
>> -	}
>>   	bufmgr_gem->exec_count++;
>>   }
>>
>> @@ -780,6 +807,7 @@ retry:
>>   	bo_gem->used_as_reloc_target = false;
>>   	bo_gem->has_error = false;
>>   	bo_gem->reusable = true;
>> +	bo_gem->use_48b_address_range = false;
>>
>>   	drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, alignment);
>>
>> @@ -926,6 +954,7 @@ drm_intel_gem_bo_alloc_userptr(drm_intel_bufmgr *bufmgr,
>>   	bo_gem->used_as_reloc_target = false;
>>   	bo_gem->has_error = false;
>>   	bo_gem->reusable = false;
>> +	bo_gem->use_48b_address_range = false;
>>
>>   	drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, 0);
>>
>> @@ -1081,6 +1110,7 @@ drm_intel_bo_gem_create_from_name(drm_intel_bufmgr *bufmgr,
>>   	bo_gem->bo.handle = open_arg.handle;
>>   	bo_gem->global_name = handle;
>>   	bo_gem->reusable = false;
>> +	bo_gem->use_48b_address_range = false;
>>
>>   	memclear(get_tiling);
>>   	get_tiling.handle = bo_gem->gem_handle;
>> @@ -1930,6 +1960,13 @@ do_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset,
>>   	return 0;
>>   }
>>
>> +static void
>> +drm_intel_gem_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable)
>> +{
>> +	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
>> +	bo_gem->use_48b_address_range = enable;
>> +}
>> +
>>   static int
>>   drm_intel_gem_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset,
>>   			    drm_intel_bo *target_bo, uint32_t target_offset,
>> @@ -2073,10 +2110,12 @@ drm_intel_update_buffer_offsets(drm_intel_bufmgr_gem *bufmgr_gem)
>>
>>   		/* Update the buffer offset */
>>   		if (bufmgr_gem->exec_objects[i].offset != bo->offset64) {
>> -			DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
>> -			    bo_gem->gem_handle, bo_gem->name, bo->offset64,
>> -			    (unsigned long long)bufmgr_gem->exec_objects[i].
>> -			    offset);
>> +			DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x %08x\n",
>> +			    bo_gem->gem_handle, bo_gem->name,
>> +			    upper_32_bits(bo->offset64),
>> +			    lower_32_bits(bo->offset64),
>> +			    upper_32_bits(bufmgr_gem->exec_objects[i].offset),
>> +			    lower_32_bits(bufmgr_gem->exec_objects[i].offset));
>>   			bo->offset64 = bufmgr_gem->exec_objects[i].offset;
>>   			bo->offset = bufmgr_gem->exec_objects[i].offset;
>>   		}
>> @@ -2094,9 +2133,12 @@ drm_intel_update_buffer_offsets2 (drm_intel_bufmgr_gem *bufmgr_gem)
>>
>>   		/* Update the buffer offset */
>>   		if (bufmgr_gem->exec2_objects[i].offset != bo->offset64) {
>> -			DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
>> -			    bo_gem->gem_handle, bo_gem->name, bo->offset64,
>> -			    (unsigned long long)bufmgr_gem->exec2_objects[i].offset);
>> +			DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x %08x\n",
>> +			    bo_gem->gem_handle, bo_gem->name,
>> +			    upper_32_bits(bo->offset64),
>> +			    lower_32_bits(bo->offset64),
>> +			    upper_32_bits(bufmgr_gem->exec2_objects[i].offset),
>> +			    lower_32_bits(bufmgr_gem->exec2_objects[i].offset));
>>   			bo->offset64 = bufmgr_gem->exec2_objects[i].offset;
>>   			bo->offset = bufmgr_gem->exec2_objects[i].offset;
>>   		}
>> @@ -2481,6 +2523,7 @@ drm_intel_bo_gem_create_from_prime(drm_intel_bufmgr *bufmgr, int prime_fd, int s
>>   	bo_gem->used_as_reloc_target = false;
>>   	bo_gem->has_error = false;
>>   	bo_gem->reusable = false;
>> +	bo_gem->use_48b_address_range = false;
>>
>>   	DRMINITLISTHEAD(&bo_gem->vma_list);
>>   	DRMLISTADDTAIL(&bo_gem->name_list, &bufmgr_gem->named);
>> @@ -3278,6 +3321,13 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
>>   		}
>>   	}
>>
>> +	if (bufmgr_gem->gen >= 8) {
>> +		gp.param = I915_PARAM_HAS_ALIASING_PPGTT;
>> +		ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp);
>> +		if (ret == 0 && *gp.value == 3)
>> +			bufmgr_gem->bufmgr.bo_use_48b_address_range = drm_intel_gem_bo_use_48b_address_range;
>> +	}
>> +
>>   	/* Let's go with one relocation per every 2 dwords (but round down a bit
>>   	 * since a power of two will mean an extra page allocation for the reloc
>>   	 * buffer).
>> diff --git a/intel/intel_bufmgr_priv.h b/intel/intel_bufmgr_priv.h
>> index 59ebd18..5c17ffb 100644
>> --- a/intel/intel_bufmgr_priv.h
>> +++ b/intel/intel_bufmgr_priv.h
>> @@ -152,6 +152,20 @@ struct _drm_intel_bufmgr {
>>   	void (*destroy) (drm_intel_bufmgr *bufmgr);
>>
>>   	/**
>> +	 * Indicate if the buffer can be placed anywhere in the full ppgtt
>> +	 * address range (2^48).
>> +	 *
>> +	 * Any resource used with flat/heapless (0x00000000-0xfffff000)
>> +	 * General State Heap (GSH) or Intructions State Heap (ISH) must
>> +	 * be in a 32-bit range. 48-bit range will only be used when explicitly
>> +	 * requested.
>> +	 *
>> +	 * \param bo Buffer to set the use_48b_address_range flag.
>> +	 * \param enable The flag value.
>> +	 */
>> +	void (*bo_use_48b_address_range) (drm_intel_bo *bo, uint32_t enable);
>> +
>> +	/**
>>   	 * Add relocation entry in reloc_buf, which will be updated with the
>>   	 * target buffer's real offset on on command submission.
>>   	 *
>> --
>> 2.5.0
>>
Kristian Hogsberg Oct. 5, 2015, 6:06 p.m. UTC | #3
On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry <michel.thierry@intel.com> wrote:
> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>
>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>
>>> Gen8+ supports 48-bit virtual addresses, but some objects must always be
>>> allocated inside the 32-bit address range.
>>>
>>> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
>>> General State Heap (GSH) or Instruction State Heap (ISH) must be in a
>>> 32-bit range, because the General State Offset and Instruction State
>>> Offset
>>> are limited to 32-bits.
>>>
>>> The i915 driver has been modified to provide a flag to set when the 4GB
>>> limit is not necessary in a given bo (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>> 48-bit range will only be used when explicitly requested.
>>>
>>> Callers to the existing drm_intel_bo_emit_reloc function should set the
>>> use_48b_address_range flag beforehand, in order to use full ppgtt range.
>>>
>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use them
>>>      internally in emit_reloc functions (Ben)
>>>      s/48BADDRESS/48B_ADDRESS/ (Dave)
>>> v3: Keep set/clear functions internal, no-one needs to use them directly.
>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>>>      before enabling set/clear function, print full offsets in debug
>>>      statements, using port of lower_32_bits and upper_32_bits from linux
>>>      kernel (Micha?)
>>>
>>> References:
>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>
>>
>> +Kristian
>>
>> LGTM.
>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>
>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>
>
>
> Hi Kristian,
>
> More comments on this?
> I've resent the mesa patch with the last changes
> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>
> Micha? has agreed with the interface and will use it for his work.
> If mesa doesn't plan to use this for now, it's ok. The kernel changes have
> been done to prevent any regressions when the support 48-bit flag is not
> set.

I didn't get any replies to my last comments on this:

http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html

Basically, I tried explicitly placing buffers above 8G and didn't see
the HW problem described (ie it all worked fine).  I still think
there's some confusion as to what the W/A is about.

Here's my take: the W/A is a SW-only workaround to handle the cases
where a driver uses heapless and 48-bit ppgtt. The problem is that the
heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
anywhere it the 48 bit address space. If that happens it's consideredd
out-of-bounds for the heap and access fails. To prevent this we need
to make sure the bos we address in a heapless fashion are placed below
4g.

For mesa, we only configure general state base as heap-less, which
affects scratch bos. What this boils down to is that we need the 4G
restriction only for the scratch bos set up on 3DSTATE_VS, 3DSTATE_GS
and 3DSTATE_PS (and tesselation stage eventually). Look for the
OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c, gen8_gs_state.c
and gen8_ps_state.c

Kristian

> Thanks,
>
> -Michel
>
>
>>> ---
>>>   include/drm/i915_drm.h    |  3 +-
>>>   intel/intel_bufmgr.c      | 11 ++++++
>>>   intel/intel_bufmgr.h      |  1 +
>>>   intel/intel_bufmgr_gem.c  | 88
>>> +++++++++++++++++++++++++++++++++++++----------
>>>   intel/intel_bufmgr_priv.h | 14 ++++++++
>>>   5 files changed, 97 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
>>> index ded43b1..426b25c 100644
>>> --- a/include/drm/i915_drm.h
>>> +++ b/include/drm/i915_drm.h
>>> @@ -680,7 +680,8 @@ struct drm_i915_gem_exec_object2 {
>>>   #define EXEC_OBJECT_NEEDS_FENCE (1<<0)
>>>   #define EXEC_OBJECT_NEEDS_GTT (1<<1)
>>>   #define EXEC_OBJECT_WRITE     (1<<2)
>>> -#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
>>> +#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
>>> +#define __EXEC_OBJECT_UNKNOWN_FLAGS
>>> -(EXEC_OBJECT_SUPPORTS_48B_ADDRESS<<1)
>>>         __u64 flags;
>>>
>>>         __u64 rsvd1;
>>> diff --git a/intel/intel_bufmgr.c b/intel/intel_bufmgr.c
>>> index 14ea9f9..0856e60 100644
>>> --- a/intel/intel_bufmgr.c
>>> +++ b/intel/intel_bufmgr.c
>>> @@ -293,6 +293,17 @@ drm_intel_bo_madvise(drm_intel_bo *bo, int madv)
>>>   }
>>>
>>>   int
>>> +drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable)
>>> +{
>>> +       if (bo->bufmgr->bo_use_48b_address_range) {
>>> +               bo->bufmgr->bo_use_48b_address_range(bo, enable);
>>> +               return 0;
>>> +       }
>>> +
>>> +       return -ENODEV;
>>> +}
>>> +
>>> +int
>>>   drm_intel_bo_references(drm_intel_bo *bo, drm_intel_bo *target_bo)
>>>   {
>>>         return bo->bufmgr->bo_references(bo, target_bo);
>>> diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
>>> index 95eecb8..a14c78f 100644
>>> --- a/intel/intel_bufmgr.h
>>> +++ b/intel/intel_bufmgr.h
>>> @@ -164,6 +164,7 @@ int drm_intel_bo_get_tiling(drm_intel_bo *bo,
>>> uint32_t * tiling_mode,
>>>   int drm_intel_bo_flink(drm_intel_bo *bo, uint32_t * name);
>>>   int drm_intel_bo_busy(drm_intel_bo *bo);
>>>   int drm_intel_bo_madvise(drm_intel_bo *bo, int madv);
>>> +int drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t
>>> enable);
>>>
>>>   int drm_intel_bo_disable_reuse(drm_intel_bo *bo);
>>>   int drm_intel_bo_is_reusable(drm_intel_bo *bo);
>>> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
>>> index 2723e21..09d82d2 100644
>>> --- a/intel/intel_bufmgr_gem.c
>>> +++ b/intel/intel_bufmgr_gem.c
>>> @@ -83,6 +83,22 @@
>>>   #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
>>>   #define MAX2(A, B) ((A) > (B) ? (A) : (B))
>>>
>>> +/**
>>> + * upper_32_bits - return bits 32-63 of a number
>>> + * @n: the number we're accessing
>>> + *
>>> + * A basic shift-right of a 64- or 32-bit quantity.  Use this to
>>> suppress
>>> + * the "right shift count >= width of type" warning when that quantity
>>> is
>>> + * 32-bits.
>>> + */
>>> +#define upper_32_bits(n) ((__u32)(((n) >> 16) >> 16))
>>> +
>>> +/**
>>> + * lower_32_bits - return bits 0-31 of a number
>>> + * @n: the number we're accessing
>>> + */
>>> +#define lower_32_bits(n) ((__u32)(n))
>>> +
>>>   typedef struct _drm_intel_bo_gem drm_intel_bo_gem;
>>>
>>>   struct drm_intel_gem_bo_bucket {
>>> @@ -237,6 +253,15 @@ struct _drm_intel_bo_gem {
>>>         bool is_userptr;
>>>
>>>         /**
>>> +        * Boolean of whether this buffer can be placed in the full
>>> 48-bit
>>> +        * address range on gen8+.
>>> +        *
>>> +        * By default, buffers will be keep in a 32-bit range, unless
>>> this
>>> +        * flag is explicitly set.
>>> +        */
>>> +       bool use_48b_address_range;
>>> +
>>> +       /**
>>>          * Size in bytes of this buffer and its relocation descendents.
>>>          *
>>>          * Used to avoid costly tree walking in
>>> @@ -395,14 +420,16 @@
>>> drm_intel_gem_dump_validation_list(drm_intel_bufmgr_gem *bufmgr_gem)
>>>                         drm_intel_bo_gem *target_gem =
>>>                             (drm_intel_bo_gem *) target_bo;
>>>
>>> -                       DBG("%2d: %d (%s)@0x%08llx -> "
>>> -                           "%d (%s)@0x%08lx + 0x%08x\n",
>>> +                       DBG("%2d: %d (%s)@0x%08x %08x -> "
>>> +                           "%d (%s)@0x%08x %08x + 0x%08x\n",
>>>                             i,
>>>                             bo_gem->gem_handle, bo_gem->name,
>>> -                           (unsigned long long)bo_gem->relocs[j].offset,
>>> +                           upper_32_bits(bo_gem->relocs[j].offset),
>>> +                           lower_32_bits(bo_gem->relocs[j].offset),
>>>                             target_gem->gem_handle,
>>>                             target_gem->name,
>>> -                           target_bo->offset64,
>>> +                           upper_32_bits(target_bo->offset64),
>>> +                           lower_32_bits(target_bo->offset64),
>>>                             bo_gem->relocs[j].delta);
>>>                 }
>>>         }
>>> @@ -468,11 +495,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo,
>>> int need_fence)
>>>         drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem
>>> *)bo->bufmgr;
>>>         drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo;
>>>         int index;
>>> +       int flags = 0;
>>> +
>>> +       if (need_fence)
>>> +               flags |= EXEC_OBJECT_NEEDS_FENCE;
>>> +       if (bo_gem->use_48b_address_range)
>>> +               flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
>>>
>>>         if (bo_gem->validate_index != -1) {
>>> -               if (need_fence)
>>> -
>>> bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |=
>>> -                               EXEC_OBJECT_NEEDS_FENCE;
>>> +               bufmgr_gem->exec2_objects[bo_gem->validate_index].flags
>>> |= flags;
>>>                 return;
>>>         }
>>>
>>> @@ -501,13 +532,9 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int
>>> need_fence)
>>>         bufmgr_gem->exec2_objects[index].alignment = bo->align;
>>>         bufmgr_gem->exec2_objects[index].offset = 0;
>>>         bufmgr_gem->exec_bos[index] = bo;
>>> -       bufmgr_gem->exec2_objects[index].flags = 0;
>>> +       bufmgr_gem->exec2_objects[index].flags = flags;
>>>         bufmgr_gem->exec2_objects[index].rsvd1 = 0;
>>>         bufmgr_gem->exec2_objects[index].rsvd2 = 0;
>>> -       if (need_fence) {
>>> -               bufmgr_gem->exec2_objects[index].flags |=
>>> -                       EXEC_OBJECT_NEEDS_FENCE;
>>> -       }
>>>         bufmgr_gem->exec_count++;
>>>   }
>>>
>>> @@ -780,6 +807,7 @@ retry:
>>>         bo_gem->used_as_reloc_target = false;
>>>         bo_gem->has_error = false;
>>>         bo_gem->reusable = true;
>>> +       bo_gem->use_48b_address_range = false;
>>>
>>>         drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem,
>>> alignment);
>>>
>>> @@ -926,6 +954,7 @@ drm_intel_gem_bo_alloc_userptr(drm_intel_bufmgr
>>> *bufmgr,
>>>         bo_gem->used_as_reloc_target = false;
>>>         bo_gem->has_error = false;
>>>         bo_gem->reusable = false;
>>> +       bo_gem->use_48b_address_range = false;
>>>
>>>         drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, 0);
>>>
>>> @@ -1081,6 +1110,7 @@ drm_intel_bo_gem_create_from_name(drm_intel_bufmgr
>>> *bufmgr,
>>>         bo_gem->bo.handle = open_arg.handle;
>>>         bo_gem->global_name = handle;
>>>         bo_gem->reusable = false;
>>> +       bo_gem->use_48b_address_range = false;
>>>
>>>         memclear(get_tiling);
>>>         get_tiling.handle = bo_gem->gem_handle;
>>> @@ -1930,6 +1960,13 @@ do_bo_emit_reloc(drm_intel_bo *bo, uint32_t
>>> offset,
>>>         return 0;
>>>   }
>>>
>>> +static void
>>> +drm_intel_gem_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t
>>> enable)
>>> +{
>>> +       drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
>>> +       bo_gem->use_48b_address_range = enable;
>>> +}
>>> +
>>>   static int
>>>   drm_intel_gem_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset,
>>>                             drm_intel_bo *target_bo, uint32_t
>>> target_offset,
>>> @@ -2073,10 +2110,12 @@
>>> drm_intel_update_buffer_offsets(drm_intel_bufmgr_gem *bufmgr_gem)
>>>
>>>                 /* Update the buffer offset */
>>>                 if (bufmgr_gem->exec_objects[i].offset != bo->offset64) {
>>> -                       DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
>>> -                           bo_gem->gem_handle, bo_gem->name,
>>> bo->offset64,
>>> -                           (unsigned long
>>> long)bufmgr_gem->exec_objects[i].
>>> -                           offset);
>>> +                       DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x
>>> %08x\n",
>>> +                           bo_gem->gem_handle, bo_gem->name,
>>> +                           upper_32_bits(bo->offset64),
>>> +                           lower_32_bits(bo->offset64),
>>> +
>>> upper_32_bits(bufmgr_gem->exec_objects[i].offset),
>>> +
>>> lower_32_bits(bufmgr_gem->exec_objects[i].offset));
>>>                         bo->offset64 =
>>> bufmgr_gem->exec_objects[i].offset;
>>>                         bo->offset = bufmgr_gem->exec_objects[i].offset;
>>>                 }
>>> @@ -2094,9 +2133,12 @@ drm_intel_update_buffer_offsets2
>>> (drm_intel_bufmgr_gem *bufmgr_gem)
>>>
>>>                 /* Update the buffer offset */
>>>                 if (bufmgr_gem->exec2_objects[i].offset != bo->offset64)
>>> {
>>> -                       DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
>>> -                           bo_gem->gem_handle, bo_gem->name,
>>> bo->offset64,
>>> -                           (unsigned long
>>> long)bufmgr_gem->exec2_objects[i].offset);
>>> +                       DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x
>>> %08x\n",
>>> +                           bo_gem->gem_handle, bo_gem->name,
>>> +                           upper_32_bits(bo->offset64),
>>> +                           lower_32_bits(bo->offset64),
>>> +
>>> upper_32_bits(bufmgr_gem->exec2_objects[i].offset),
>>> +
>>> lower_32_bits(bufmgr_gem->exec2_objects[i].offset));
>>>                         bo->offset64 =
>>> bufmgr_gem->exec2_objects[i].offset;
>>>                         bo->offset = bufmgr_gem->exec2_objects[i].offset;
>>>                 }
>>> @@ -2481,6 +2523,7 @@ drm_intel_bo_gem_create_from_prime(drm_intel_bufmgr
>>> *bufmgr, int prime_fd, int s
>>>         bo_gem->used_as_reloc_target = false;
>>>         bo_gem->has_error = false;
>>>         bo_gem->reusable = false;
>>> +       bo_gem->use_48b_address_range = false;
>>>
>>>         DRMINITLISTHEAD(&bo_gem->vma_list);
>>>         DRMLISTADDTAIL(&bo_gem->name_list, &bufmgr_gem->named);
>>> @@ -3278,6 +3321,13 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
>>>                 }
>>>         }
>>>
>>> +       if (bufmgr_gem->gen >= 8) {
>>> +               gp.param = I915_PARAM_HAS_ALIASING_PPGTT;
>>> +               ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM,
>>> &gp);
>>> +               if (ret == 0 && *gp.value == 3)
>>> +                       bufmgr_gem->bufmgr.bo_use_48b_address_range =
>>> drm_intel_gem_bo_use_48b_address_range;
>>> +       }
>>> +
>>>         /* Let's go with one relocation per every 2 dwords (but round
>>> down a bit
>>>          * since a power of two will mean an extra page allocation for
>>> the reloc
>>>          * buffer).
>>> diff --git a/intel/intel_bufmgr_priv.h b/intel/intel_bufmgr_priv.h
>>> index 59ebd18..5c17ffb 100644
>>> --- a/intel/intel_bufmgr_priv.h
>>> +++ b/intel/intel_bufmgr_priv.h
>>> @@ -152,6 +152,20 @@ struct _drm_intel_bufmgr {
>>>         void (*destroy) (drm_intel_bufmgr *bufmgr);
>>>
>>>         /**
>>> +        * Indicate if the buffer can be placed anywhere in the full
>>> ppgtt
>>> +        * address range (2^48).
>>> +        *
>>> +        * Any resource used with flat/heapless (0x00000000-0xfffff000)
>>> +        * General State Heap (GSH) or Intructions State Heap (ISH) must
>>> +        * be in a 32-bit range. 48-bit range will only be used when
>>> explicitly
>>> +        * requested.
>>> +        *
>>> +        * \param bo Buffer to set the use_48b_address_range flag.
>>> +        * \param enable The flag value.
>>> +        */
>>> +       void (*bo_use_48b_address_range) (drm_intel_bo *bo, uint32_t
>>> enable);
>>> +
>>> +       /**
>>>          * Add relocation entry in reloc_buf, which will be updated with
>>> the
>>>          * target buffer's real offset on on command submission.
>>>          *
>>> --
>>> 2.5.0
>>>
>
Michel Thierry Oct. 6, 2015, 1:12 p.m. UTC | #4
On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry <michel.thierry@intel.com> wrote:
>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>>
>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>>
>>>> Gen8+ supports 48-bit virtual addresses, but some objects must always be
>>>> allocated inside the 32-bit address range.
>>>>
>>>> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
>>>> General State Heap (GSH) or Instruction State Heap (ISH) must be in a
>>>> 32-bit range, because the General State Offset and Instruction State
>>>> Offset
>>>> are limited to 32-bits.
>>>>
>>>> The i915 driver has been modified to provide a flag to set when the 4GB
>>>> limit is not necessary in a given bo (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>>> 48-bit range will only be used when explicitly requested.
>>>>
>>>> Callers to the existing drm_intel_bo_emit_reloc function should set the
>>>> use_48b_address_range flag beforehand, in order to use full ppgtt range.
>>>>
>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use them
>>>>       internally in emit_reloc functions (Ben)
>>>>       s/48BADDRESS/48B_ADDRESS/ (Dave)
>>>> v3: Keep set/clear functions internal, no-one needs to use them directly.
>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>>>>       before enabling set/clear function, print full offsets in debug
>>>>       statements, using port of lower_32_bits and upper_32_bits from linux
>>>>       kernel (Micha?)
>>>>
>>>> References:
>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>>
>>>
>>> +Kristian
>>>
>>> LGTM.
>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>>
>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>
>>
>>
>> Hi Kristian,
>>
>> More comments on this?
>> I've resent the mesa patch with the last changes
>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>>
>> Micha? has agreed with the interface and will use it for his work.
>> If mesa doesn't plan to use this for now, it's ok. The kernel changes have
>> been done to prevent any regressions when the support 48-bit flag is not
>> set.
>
> I didn't get any replies to my last comments on this:
>
> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
>
> Basically, I tried explicitly placing buffers above 8G and didn't see
> the HW problem described (ie it all worked fine).  I still think
> there's some confusion as to what the W/A is about.
>
> Here's my take: the W/A is a SW-only workaround to handle the cases
> where a driver uses heapless and 48-bit ppgtt. The problem is that the
> heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
> anywhere it the 48 bit address space. If that happens it's consideredd
> out-of-bounds for the heap and access fails. To prevent this we need
> to make sure the bos we address in a heapless fashion are placed below
> 4g.
>
> For mesa, we only configure general state base as heap-less, which
> affects scratch bos. What this boils down to is that we need the 4G
> restriction only for the scratch bos set up on 3DSTATE_VS, 3DSTATE_GS
> and 3DSTATE_PS (and tesselation stage eventually). Look for the
> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c, gen8_gs_state.c
> and gen8_ps_state.c

I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe it 
isn't exclusive to the scratch bos (the error state points to that 
instruction).
Michel Thierry Oct. 13, 2015, 12:16 p.m. UTC | #5
On 10/6/2015 2:12 PM, Michel Thierry wrote:
> On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
>> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry
>> <michel.thierry@intel.com> wrote:
>>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>>>
>>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>>>
>>>>> Gen8+ supports 48-bit virtual addresses, but some objects must
>>>>> always be
>>>>> allocated inside the 32-bit address range.
>>>>>
>>>>> In specific, any resource used with flat/heapless
>>>>> (0x00000000-0xfffff000)
>>>>> General State Heap (GSH) or Instruction State Heap (ISH) must be in a
>>>>> 32-bit range, because the General State Offset and Instruction State
>>>>> Offset
>>>>> are limited to 32-bits.
>>>>>
>>>>> The i915 driver has been modified to provide a flag to set when the
>>>>> 4GB
>>>>> limit is not necessary in a given bo
>>>>> (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>>>> 48-bit range will only be used when explicitly requested.
>>>>>
>>>>> Callers to the existing drm_intel_bo_emit_reloc function should set
>>>>> the
>>>>> use_48b_address_range flag beforehand, in order to use full ppgtt
>>>>> range.
>>>>>
>>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use them
>>>>>       internally in emit_reloc functions (Ben)
>>>>>       s/48BADDRESS/48B_ADDRESS/ (Dave)
>>>>> v3: Keep set/clear functions internal, no-one needs to use them
>>>>> directly.
>>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>>>>>       before enabling set/clear function, print full offsets in debug
>>>>>       statements, using port of lower_32_bits and upper_32_bits
>>>>> from linux
>>>>>       kernel (Micha?)
>>>>>
>>>>> References:
>>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>>>
>>>>
>>>> +Kristian
>>>>
>>>> LGTM.
>>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>>>
>>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>>
>>>
>>>
>>> Hi Kristian,
>>>
>>> More comments on this?
>>> I've resent the mesa patch with the last changes
>>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>>>
>>>
>>> Micha? has agreed with the interface and will use it for his work.
>>> If mesa doesn't plan to use this for now, it's ok. The kernel changes
>>> have
>>> been done to prevent any regressions when the support 48-bit flag is not
>>> set.
>>
>> I didn't get any replies to my last comments on this:
>>
>> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
>>
>> Basically, I tried explicitly placing buffers above 8G and didn't see
>> the HW problem described (ie it all worked fine).  I still think
>> there's some confusion as to what the W/A is about.
>>
>> Here's my take: the W/A is a SW-only workaround to handle the cases
>> where a driver uses heapless and 48-bit ppgtt. The problem is that the
>> heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
>> anywhere it the 48 bit address space. If that happens it's consideredd
>> out-of-bounds for the heap and access fails. To prevent this we need
>> to make sure the bos we address in a heapless fashion are placed below
>> 4g.
>>
>> For mesa, we only configure general state base as heap-less, which
>> affects scratch bos. What this boils down to is that we need the 4G
>> restriction only for the scratch bos set up on 3DSTATE_VS, 3DSTATE_GS
>> and 3DSTATE_PS (and tesselation stage eventually). Look for the
>> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c, gen8_gs_state.c
>> and gen8_ps_state.c
>
> I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe it
> isn't exclusive to the scratch bos (the error state points to that
> instruction).
>
>

Hi,

Anymore inputs about this patch?
AFAIK, if changes are required based on further comments from Kristian, 
these will be in the mesa patch[1], not libdrm. Also, Micha? will use 
this flag in another project.

Thanks,

-Michel

[1]http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html
Emil Velikov Oct. 13, 2015, 2:13 p.m. UTC | #6
On 13 October 2015 at 13:16, Michel Thierry <michel.thierry@intel.com> wrote:
> On 10/6/2015 2:12 PM, Michel Thierry wrote:
>>
>> On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
>>>
>>> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry
>>> <michel.thierry@intel.com> wrote:
>>>>
>>>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>>>>
>>>>>
>>>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>>>>
>>>>>>
>>>>>> Gen8+ supports 48-bit virtual addresses, but some objects must
>>>>>> always be
>>>>>> allocated inside the 32-bit address range.
>>>>>>
>>>>>> In specific, any resource used with flat/heapless
>>>>>> (0x00000000-0xfffff000)
>>>>>> General State Heap (GSH) or Instruction State Heap (ISH) must be in a
>>>>>> 32-bit range, because the General State Offset and Instruction State
>>>>>> Offset
>>>>>> are limited to 32-bits.
>>>>>>
>>>>>> The i915 driver has been modified to provide a flag to set when the
>>>>>> 4GB
>>>>>> limit is not necessary in a given bo
>>>>>> (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>>>>> 48-bit range will only be used when explicitly requested.
>>>>>>
>>>>>> Callers to the existing drm_intel_bo_emit_reloc function should set
>>>>>> the
>>>>>> use_48b_address_range flag beforehand, in order to use full ppgtt
>>>>>> range.
>>>>>>
>>>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use them
>>>>>>       internally in emit_reloc functions (Ben)
>>>>>>       s/48BADDRESS/48B_ADDRESS/ (Dave)
>>>>>> v3: Keep set/clear functions internal, no-one needs to use them
>>>>>> directly.
>>>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>>>>>>       before enabling set/clear function, print full offsets in debug
>>>>>>       statements, using port of lower_32_bits and upper_32_bits
>>>>>> from linux
>>>>>>       kernel (Micha?)
>>>>>>
>>>>>> References:
>>>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>
>>>>>
>>>>>
>>>>> +Kristian
>>>>>
>>>>> LGTM.
>>>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>
>>>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>>>
>>>>
>>>>
>>>>
>>>> Hi Kristian,
>>>>
>>>> More comments on this?
>>>> I've resent the mesa patch with the last changes
>>>>
>>>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>>>>
>>>>
>>>> Micha? has agreed with the interface and will use it for his work.
>>>> If mesa doesn't plan to use this for now, it's ok. The kernel changes
>>>> have
>>>> been done to prevent any regressions when the support 48-bit flag is not
>>>> set.
>>>
>>>
>>> I didn't get any replies to my last comments on this:
>>>
>>> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
>>>
>>> Basically, I tried explicitly placing buffers above 8G and didn't see
>>> the HW problem described (ie it all worked fine).  I still think
>>> there's some confusion as to what the W/A is about.
>>>
>>> Here's my take: the W/A is a SW-only workaround to handle the cases
>>> where a driver uses heapless and 48-bit ppgtt. The problem is that the
>>> heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
>>> anywhere it the 48 bit address space. If that happens it's consideredd
>>> out-of-bounds for the heap and access fails. To prevent this we need
>>> to make sure the bos we address in a heapless fashion are placed below
>>> 4g.
>>>
>>> For mesa, we only configure general state base as heap-less, which
>>> affects scratch bos. What this boils down to is that we need the 4G
>>> restriction only for the scratch bos set up on 3DSTATE_VS, 3DSTATE_GS
>>> and 3DSTATE_PS (and tesselation stage eventually). Look for the
>>> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c, gen8_gs_state.c
>>> and gen8_ps_state.c
>>
>>
>> I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe it
>> isn't exclusive to the scratch bos (the error state points to that
>> instruction).
>>
>>
>
> Hi,
>
> Anymore inputs about this patch?
> AFAIK, if changes are required based on further comments from Kristian,
> these will be in the mesa patch[1], not libdrm. Also, Micha? will use this
> flag in another project.
>
The comment seems quite brief and I'm not sure it fully addresses
Kristian's concern. I'd suspect that providing reference to the HW
documentation (confirming your assumption) might be beneficial.

Regards,
Emil
Michel Thierry Oct. 13, 2015, 2:55 p.m. UTC | #7
On 10/13/2015 3:13 PM, Emil Velikov wrote:
> On 13 October 2015 at 13:16, Michel Thierry <michel.thierry@intel.com> wrote:
>> On 10/6/2015 2:12 PM, Michel Thierry wrote:
>>>
>>> On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
>>>>
>>>> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry
>>>> <michel.thierry@intel.com> wrote:
>>>>>
>>>>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>>>>>
>>>>>>
>>>>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>>>>>
>>>>>>>
>>>>>>> Gen8+ supports 48-bit virtual addresses, but some objects must
>>>>>>> always be
>>>>>>> allocated inside the 32-bit address range.
>>>>>>>
>>>>>>> In specific, any resource used with flat/heapless
>>>>>>> (0x00000000-0xfffff000)
>>>>>>> General State Heap (GSH) or Instruction State Heap (ISH) must be in a
>>>>>>> 32-bit range, because the General State Offset and Instruction State
>>>>>>> Offset
>>>>>>> are limited to 32-bits.
>>>>>>>
>>>>>>> The i915 driver has been modified to provide a flag to set when the
>>>>>>> 4GB
>>>>>>> limit is not necessary in a given bo
>>>>>>> (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>>>>>> 48-bit range will only be used when explicitly requested.
>>>>>>>
>>>>>>> Callers to the existing drm_intel_bo_emit_reloc function should set
>>>>>>> the
>>>>>>> use_48b_address_range flag beforehand, in order to use full ppgtt
>>>>>>> range.
>>>>>>>
>>>>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use them
>>>>>>>        internally in emit_reloc functions (Ben)
>>>>>>>        s/48BADDRESS/48B_ADDRESS/ (Dave)
>>>>>>> v3: Keep set/clear functions internal, no-one needs to use them
>>>>>>> directly.
>>>>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>>>>>>>        before enabling set/clear function, print full offsets in debug
>>>>>>>        statements, using port of lower_32_bits and upper_32_bits
>>>>>>> from linux
>>>>>>>        kernel (Micha?)
>>>>>>>
>>>>>>> References:
>>>>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>>>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>>>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>
>>>>>>
>>>>>>
>>>>>> +Kristian
>>>>>>
>>>>>> LGTM.
>>>>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>
>>>>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> Hi Kristian,
>>>>>
>>>>> More comments on this?
>>>>> I've resent the mesa patch with the last changes
>>>>>
>>>>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>>>>>
>>>>>
>>>>> Micha? has agreed with the interface and will use it for his work.
>>>>> If mesa doesn't plan to use this for now, it's ok. The kernel changes
>>>>> have
>>>>> been done to prevent any regressions when the support 48-bit flag is not
>>>>> set.
>>>>
>>>>
>>>> I didn't get any replies to my last comments on this:
>>>>
>>>> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
>>>>
>>>> Basically, I tried explicitly placing buffers above 8G and didn't see
>>>> the HW problem described (ie it all worked fine).  I still think
>>>> there's some confusion as to what the W/A is about.
>>>>
>>>> Here's my take: the W/A is a SW-only workaround to handle the cases
>>>> where a driver uses heapless and 48-bit ppgtt. The problem is that the
>>>> heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
>>>> anywhere it the 48 bit address space. If that happens it's consideredd
>>>> out-of-bounds for the heap and access fails. To prevent this we need
>>>> to make sure the bos we address in a heapless fashion are placed below
>>>> 4g.
>>>>
>>>> For mesa, we only configure general state base as heap-less, which
>>>> affects scratch bos. What this boils down to is that we need the 4G
>>>> restriction only for the scratch bos set up on 3DSTATE_VS, 3DSTATE_GS
>>>> and 3DSTATE_PS (and tesselation stage eventually). Look for the
>>>> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c, gen8_gs_state.c
>>>> and gen8_ps_state.c
>>>
>>>
>>> I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe it
>>> isn't exclusive to the scratch bos (the error state points to that
>>> instruction).
>>>
>>>
>>
>> Hi,
>>
>> Anymore inputs about this patch?
>> AFAIK, if changes are required based on further comments from Kristian,
>> these will be in the mesa patch[1], not libdrm. Also, Micha? will use this
>> flag in another project.
>>
> The comment seems quite brief and I'm not sure it fully addresses
> Kristian's concern. I'd suspect that providing reference to the HW
> documentation (confirming your assumption) might be beneficial.
>

Sure, attached is the hang I get if I don't set the restriction in 
gen8_misc_state.c and try to use the full 48-bit range 
(i915_error_state_nowa.txt). This is just running gfxbench t-rex.

I see the same hang signature when it is only applied to the scratch bos 
(in gen8_vs_state.c, gen8_gs_state.c and gen8_ps_state.c - 
i915_error_state_scratchbo.txt).

3DSTATE_VIEWPORT_STATE_POINTERS_CC (0x78230000) is defined here:
https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-bdw-vol02a-commandreference-instructions_0.pdf 
(page 256)

Thanks,

-Michel
Kristian Hogsberg Oct. 13, 2015, 9:51 p.m. UTC | #8
On Tue, Oct 13, 2015 at 7:55 AM, Michel Thierry
<michel.thierry@intel.com> wrote:
> On 10/13/2015 3:13 PM, Emil Velikov wrote:
>>
>> On 13 October 2015 at 13:16, Michel Thierry <michel.thierry@intel.com>
>> wrote:
>>>
>>> On 10/6/2015 2:12 PM, Michel Thierry wrote:
>>>>
>>>>
>>>> On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
>>>>>
>>>>>
>>>>> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry
>>>>> <michel.thierry@intel.com> wrote:
>>>>>>
>>>>>>
>>>>>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> Gen8+ supports 48-bit virtual addresses, but some objects must
>>>>>>>> always be
>>>>>>>> allocated inside the 32-bit address range.
>>>>>>>>
>>>>>>>> In specific, any resource used with flat/heapless
>>>>>>>> (0x00000000-0xfffff000)
>>>>>>>> General State Heap (GSH) or Instruction State Heap (ISH) must be in
>>>>>>>> a
>>>>>>>> 32-bit range, because the General State Offset and Instruction State
>>>>>>>> Offset
>>>>>>>> are limited to 32-bits.
>>>>>>>>
>>>>>>>> The i915 driver has been modified to provide a flag to set when the
>>>>>>>> 4GB
>>>>>>>> limit is not necessary in a given bo
>>>>>>>> (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>>>>>>> 48-bit range will only be used when explicitly requested.
>>>>>>>>
>>>>>>>> Callers to the existing drm_intel_bo_emit_reloc function should set
>>>>>>>> the
>>>>>>>> use_48b_address_range flag beforehand, in order to use full ppgtt
>>>>>>>> range.
>>>>>>>>
>>>>>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use
>>>>>>>> them
>>>>>>>>        internally in emit_reloc functions (Ben)
>>>>>>>>        s/48BADDRESS/48B_ADDRESS/ (Dave)
>>>>>>>> v3: Keep set/clear functions internal, no-one needs to use them
>>>>>>>> directly.
>>>>>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>>>>>>>>        before enabling set/clear function, print full offsets in
>>>>>>>> debug
>>>>>>>>        statements, using port of lower_32_bits and upper_32_bits
>>>>>>>> from linux
>>>>>>>>        kernel (Micha?)
>>>>>>>>
>>>>>>>> References:
>>>>>>>>
>>>>>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>>>>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>>>>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> +Kristian
>>>>>>>
>>>>>>> LGTM.
>>>>>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>
>>>>>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> Hi Kristian,
>>>>>>
>>>>>> More comments on this?
>>>>>> I've resent the mesa patch with the last changes
>>>>>>
>>>>>>
>>>>>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>>>>>>
>>>>>>
>>>>>> Micha? has agreed with the interface and will use it for his work.
>>>>>> If mesa doesn't plan to use this for now, it's ok. The kernel changes
>>>>>> have
>>>>>> been done to prevent any regressions when the support 48-bit flag is
>>>>>> not
>>>>>> set.
>>>>>
>>>>>
>>>>>
>>>>> I didn't get any replies to my last comments on this:
>>>>>
>>>>> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
>>>>>
>>>>> Basically, I tried explicitly placing buffers above 8G and didn't see
>>>>> the HW problem described (ie it all worked fine).  I still think
>>>>> there's some confusion as to what the W/A is about.
>>>>>
>>>>> Here's my take: the W/A is a SW-only workaround to handle the cases
>>>>> where a driver uses heapless and 48-bit ppgtt. The problem is that the
>>>>> heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
>>>>> anywhere it the 48 bit address space. If that happens it's consideredd
>>>>> out-of-bounds for the heap and access fails. To prevent this we need
>>>>> to make sure the bos we address in a heapless fashion are placed below
>>>>> 4g.
>>>>>
>>>>> For mesa, we only configure general state base as heap-less, which
>>>>> affects scratch bos. What this boils down to is that we need the 4G
>>>>> restriction only for the scratch bos set up on 3DSTATE_VS, 3DSTATE_GS
>>>>> and 3DSTATE_PS (and tesselation stage eventually). Look for the
>>>>> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c, gen8_gs_state.c
>>>>> and gen8_ps_state.c
>>>>
>>>>
>>>>
>>>> I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe it
>>>> isn't exclusive to the scratch bos (the error state points to that
>>>> instruction).
>>>>
>>>>
>>>
>>> Hi,
>>>
>>> Anymore inputs about this patch?
>>> AFAIK, if changes are required based on further comments from Kristian,
>>> these will be in the mesa patch[1], not libdrm. Also, Micha? will use
>>> this
>>> flag in another project.
>>>
>> The comment seems quite brief and I'm not sure it fully addresses
>> Kristian's concern. I'd suspect that providing reference to the HW
>> documentation (confirming your assumption) might be beneficial.
>>
>
> Sure, attached is the hang I get if I don't set the restriction in
> gen8_misc_state.c and try to use the full 48-bit range
> (i915_error_state_nowa.txt). This is just running gfxbench t-rex.
>
> I see the same hang signature when it is only applied to the scratch bos (in
> gen8_vs_state.c, gen8_gs_state.c and gen8_ps_state.c -
> i915_error_state_scratchbo.txt).
>
> 3DSTATE_VIEWPORT_STATE_POINTERS_CC (0x78230000) is defined here:
> https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-bdw-vol02a-commandreference-instructions_0.pdf
> (page 256)

I applied your mesa and libdrm patches and then backed out the 4G
restriction in the STATE_BASE_ADDRESS relocations.  I was not able to
reproduce any hang with trex or glxgears. I confirmed (using
INTEL_DEBUG=bat) that gem actually placed BOs above the 4G mark and
got this:

 0: 8 @ 0xfffffff90000 (miptree)
 1: 9 @ 0xfffffff78000 (hiz)
 2: 4 @ 0xfffffff77000 (pipe_control workaround)
 3: 5 @ 0xfffffff76000 (program cache)
 4: 12 @ 0x000000000000 (miptree)
 5: 7 @ 0x000000001000 (image)
 6: 10 @ 0xfffffff5a000 (miptree)
 7: 13 @ 0xfffffff59000 (bufferobj)
 8: 11 @ 0xfffffff51000 (bufferobj)
 9: 14 @ 0xfffffff50000 (bufferobj)
10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000020 -> 8
(miptree)@0x0000ffff fff90000 + 0x00000000
10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000040 -> 9
(hiz)@0x0000ffff fff78000 + 0x00000000
10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000098 -> 4
(pipe_control workaround)@0x0000ffff fff77000 + 0x00000000
...

In particular, the mesa batchbuffer is placed at 0xfffffff40000, and
all the STATE_BASE_ADDRESS heaps are in that bo. In other words, all
heap bases are set to 0xfffffff40000.

The offset in 3DSTATE_VIEWPORT_STATE_POINTERS_CC is indeed only 32
bits, but that only affects how far from the dynamic state base
address we can place that. In mesas case, it's always inside the batch
buffer, which is at most 32k, so that's never a problem. If a driver
uses dynamic state in a heapless fashion, then you need to be careful
to place the CC viewport state below 4g.

What I was able  to confirm is that scratch buffer I/O (which we use
for spilling) does break with 48 bit ppgtt. If you run glxgears with
INTEL_DEBUG=spill_fs set, you can see lots of rendering artifacts as
the 4G limit on the general state heap caps attempts to spill and fill
from the vertex shader. Using OUT_RELOC64_INSIDE_4G() when setting up
the scratch BOs in 3DSTATE_VS and 3DSTATE_PS fixes the problem.

Kristian

> Thanks,
>
> -Michel
Daniel Vetter Oct. 14, 2015, 7:19 a.m. UTC | #9
On Tue, Oct 13, 2015 at 02:51:36PM -0700, Kristian Høgsberg wrote:
> On Tue, Oct 13, 2015 at 7:55 AM, Michel Thierry
> <michel.thierry@intel.com> wrote:
> > On 10/13/2015 3:13 PM, Emil Velikov wrote:
> >>
> >> On 13 October 2015 at 13:16, Michel Thierry <michel.thierry@intel.com>
> >> wrote:
> >>>
> >>> On 10/6/2015 2:12 PM, Michel Thierry wrote:
> >>>>
> >>>>
> >>>> On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
> >>>>>
> >>>>>
> >>>>> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry
> >>>>> <michel.thierry@intel.com> wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
> >>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> Gen8+ supports 48-bit virtual addresses, but some objects must
> >>>>>>>> always be
> >>>>>>>> allocated inside the 32-bit address range.
> >>>>>>>>
> >>>>>>>> In specific, any resource used with flat/heapless
> >>>>>>>> (0x00000000-0xfffff000)
> >>>>>>>> General State Heap (GSH) or Instruction State Heap (ISH) must be in
> >>>>>>>> a
> >>>>>>>> 32-bit range, because the General State Offset and Instruction State
> >>>>>>>> Offset
> >>>>>>>> are limited to 32-bits.
> >>>>>>>>
> >>>>>>>> The i915 driver has been modified to provide a flag to set when the
> >>>>>>>> 4GB
> >>>>>>>> limit is not necessary in a given bo
> >>>>>>>> (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
> >>>>>>>> 48-bit range will only be used when explicitly requested.
> >>>>>>>>
> >>>>>>>> Callers to the existing drm_intel_bo_emit_reloc function should set
> >>>>>>>> the
> >>>>>>>> use_48b_address_range flag beforehand, in order to use full ppgtt
> >>>>>>>> range.
> >>>>>>>>
> >>>>>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use
> >>>>>>>> them
> >>>>>>>>        internally in emit_reloc functions (Ben)
> >>>>>>>>        s/48BADDRESS/48B_ADDRESS/ (Dave)
> >>>>>>>> v3: Keep set/clear functions internal, no-one needs to use them
> >>>>>>>> directly.
> >>>>>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
> >>>>>>>>        before enabling set/clear function, print full offsets in
> >>>>>>>> debug
> >>>>>>>>        statements, using port of lower_32_bits and upper_32_bits
> >>>>>>>> from linux
> >>>>>>>>        kernel (Micha?)
> >>>>>>>>
> >>>>>>>> References:
> >>>>>>>>
> >>>>>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
> >>>>>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
> >>>>>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
> >>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> +Kristian
> >>>>>>>
> >>>>>>> LGTM.
> >>>>>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
> >>>>>>>
> >>>>>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> Hi Kristian,
> >>>>>>
> >>>>>> More comments on this?
> >>>>>> I've resent the mesa patch with the last changes
> >>>>>>
> >>>>>>
> >>>>>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
> >>>>>>
> >>>>>>
> >>>>>> Micha? has agreed with the interface and will use it for his work.
> >>>>>> If mesa doesn't plan to use this for now, it's ok. The kernel changes
> >>>>>> have
> >>>>>> been done to prevent any regressions when the support 48-bit flag is
> >>>>>> not
> >>>>>> set.
> >>>>>
> >>>>>
> >>>>>
> >>>>> I didn't get any replies to my last comments on this:
> >>>>>
> >>>>> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
> >>>>>
> >>>>> Basically, I tried explicitly placing buffers above 8G and didn't see
> >>>>> the HW problem described (ie it all worked fine).  I still think
> >>>>> there's some confusion as to what the W/A is about.
> >>>>>
> >>>>> Here's my take: the W/A is a SW-only workaround to handle the cases
> >>>>> where a driver uses heapless and 48-bit ppgtt. The problem is that the
> >>>>> heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
> >>>>> anywhere it the 48 bit address space. If that happens it's consideredd
> >>>>> out-of-bounds for the heap and access fails. To prevent this we need
> >>>>> to make sure the bos we address in a heapless fashion are placed below
> >>>>> 4g.
> >>>>>
> >>>>> For mesa, we only configure general state base as heap-less, which
> >>>>> affects scratch bos. What this boils down to is that we need the 4G
> >>>>> restriction only for the scratch bos set up on 3DSTATE_VS, 3DSTATE_GS
> >>>>> and 3DSTATE_PS (and tesselation stage eventually). Look for the
> >>>>> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c, gen8_gs_state.c
> >>>>> and gen8_ps_state.c
> >>>>
> >>>>
> >>>>
> >>>> I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe it
> >>>> isn't exclusive to the scratch bos (the error state points to that
> >>>> instruction).
> >>>>
> >>>>
> >>>
> >>> Hi,
> >>>
> >>> Anymore inputs about this patch?
> >>> AFAIK, if changes are required based on further comments from Kristian,
> >>> these will be in the mesa patch[1], not libdrm. Also, Micha? will use
> >>> this
> >>> flag in another project.
> >>>
> >> The comment seems quite brief and I'm not sure it fully addresses
> >> Kristian's concern. I'd suspect that providing reference to the HW
> >> documentation (confirming your assumption) might be beneficial.
> >>
> >
> > Sure, attached is the hang I get if I don't set the restriction in
> > gen8_misc_state.c and try to use the full 48-bit range
> > (i915_error_state_nowa.txt). This is just running gfxbench t-rex.
> >
> > I see the same hang signature when it is only applied to the scratch bos (in
> > gen8_vs_state.c, gen8_gs_state.c and gen8_ps_state.c -
> > i915_error_state_scratchbo.txt).
> >
> > 3DSTATE_VIEWPORT_STATE_POINTERS_CC (0x78230000) is defined here:
> > https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-bdw-vol02a-commandreference-instructions_0.pdf
> > (page 256)
> 
> I applied your mesa and libdrm patches and then backed out the 4G
> restriction in the STATE_BASE_ADDRESS relocations.  I was not able to
> reproduce any hang with trex or glxgears. I confirmed (using
> INTEL_DEBUG=bat) that gem actually placed BOs above the 4G mark and
> got this:
> 
>  0: 8 @ 0xfffffff90000 (miptree)
>  1: 9 @ 0xfffffff78000 (hiz)
>  2: 4 @ 0xfffffff77000 (pipe_control workaround)
>  3: 5 @ 0xfffffff76000 (program cache)
>  4: 12 @ 0x000000000000 (miptree)
>  5: 7 @ 0x000000001000 (image)
>  6: 10 @ 0xfffffff5a000 (miptree)
>  7: 13 @ 0xfffffff59000 (bufferobj)
>  8: 11 @ 0xfffffff51000 (bufferobj)
>  9: 14 @ 0xfffffff50000 (bufferobj)
> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000020 -> 8
> (miptree)@0x0000ffff fff90000 + 0x00000000
> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000040 -> 9
> (hiz)@0x0000ffff fff78000 + 0x00000000
> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000098 -> 4
> (pipe_control workaround)@0x0000ffff fff77000 + 0x00000000
> ...
> 
> In particular, the mesa batchbuffer is placed at 0xfffffff40000, and
> all the STATE_BASE_ADDRESS heaps are in that bo. In other words, all
> heap bases are set to 0xfffffff40000.
> 
> The offset in 3DSTATE_VIEWPORT_STATE_POINTERS_CC is indeed only 32
> bits, but that only affects how far from the dynamic state base
> address we can place that. In mesas case, it's always inside the batch
> buffer, which is at most 32k, so that's never a problem. If a driver
> uses dynamic state in a heapless fashion, then you need to be careful
> to place the CC viewport state below 4g.
> 
> What I was able  to confirm is that scratch buffer I/O (which we use
> for spilling) does break with 48 bit ppgtt. If you run glxgears with
> INTEL_DEBUG=spill_fs set, you can see lots of rendering artifacts as
> the 4G limit on the general state heap caps attempts to spill and fill
> from the vertex shader. Using OUT_RELOC64_INSIDE_4G() when setting up
> the scratch BOs in 3DSTATE_VS and 3DSTATE_PS fixes the problem.

Ok, so we do still need the 4G limit bit (i.e. I don't have to revert the
kernel side for lack of open-source userspace), just not at the place we
originally thought. I guess lesson learned is to actually test stuff first
before I pull in the kernel side. Michel, did you not see the spilling
corruptions when testing the mesa patches? Kristian, does this also blow
up with just a piglit run?

Thanks, Daniel
Michel Thierry Oct. 14, 2015, 12:11 p.m. UTC | #10
On 10/14/2015 8:19 AM, Daniel Vetter wrote:
> On Tue, Oct 13, 2015 at 02:51:36PM -0700, Kristian Høgsberg wrote:
>> On Tue, Oct 13, 2015 at 7:55 AM, Michel Thierry
>> <michel.thierry@intel.com> wrote:
>>> On 10/13/2015 3:13 PM, Emil Velikov wrote:
>>>>
>>>> On 13 October 2015 at 13:16, Michel Thierry <michel.thierry@intel.com>
>>>> wrote:
>>>>>
>>>>> On 10/6/2015 2:12 PM, Michel Thierry wrote:
>>>>>>
>>>>>>
>>>>>> On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
>>>>>>>
>>>>>>>
>>>>>>> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry
>>>>>>> <michel.thierry@intel.com> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Gen8+ supports 48-bit virtual addresses, but some objects must
>>>>>>>>>> always be
>>>>>>>>>> allocated inside the 32-bit address range.
>>>>>>>>>>
>>>>>>>>>> In specific, any resource used with flat/heapless
>>>>>>>>>> (0x00000000-0xfffff000)
>>>>>>>>>> General State Heap (GSH) or Instruction State Heap (ISH) must be in
>>>>>>>>>> a
>>>>>>>>>> 32-bit range, because the General State Offset and Instruction State
>>>>>>>>>> Offset
>>>>>>>>>> are limited to 32-bits.
>>>>>>>>>>
>>>>>>>>>> The i915 driver has been modified to provide a flag to set when the
>>>>>>>>>> 4GB
>>>>>>>>>> limit is not necessary in a given bo
>>>>>>>>>> (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>>>>>>>>> 48-bit range will only be used when explicitly requested.
>>>>>>>>>>
>>>>>>>>>> Callers to the existing drm_intel_bo_emit_reloc function should set
>>>>>>>>>> the
>>>>>>>>>> use_48b_address_range flag beforehand, in order to use full ppgtt
>>>>>>>>>> range.
>>>>>>>>>>
>>>>>>>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use
>>>>>>>>>> them
>>>>>>>>>>         internally in emit_reloc functions (Ben)
>>>>>>>>>>         s/48BADDRESS/48B_ADDRESS/ (Dave)
>>>>>>>>>> v3: Keep set/clear functions internal, no-one needs to use them
>>>>>>>>>> directly.
>>>>>>>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>>>>>>>>>>         before enabling set/clear function, print full offsets in
>>>>>>>>>> debug
>>>>>>>>>>         statements, using port of lower_32_bits and upper_32_bits
>>>>>>>>>> from linux
>>>>>>>>>>         kernel (Micha?)
>>>>>>>>>>
>>>>>>>>>> References:
>>>>>>>>>>
>>>>>>>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>>>>>>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>>>>>>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> +Kristian
>>>>>>>>>
>>>>>>>>> LGTM.
>>>>>>>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>>>
>>>>>>>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> Hi Kristian,
>>>>>>>>
>>>>>>>> More comments on this?
>>>>>>>> I've resent the mesa patch with the last changes
>>>>>>>>
>>>>>>>>
>>>>>>>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>>>>>>>>
>>>>>>>>
>>>>>>>> Micha? has agreed with the interface and will use it for his work.
>>>>>>>> If mesa doesn't plan to use this for now, it's ok. The kernel changes
>>>>>>>> have
>>>>>>>> been done to prevent any regressions when the support 48-bit flag is
>>>>>>>> not
>>>>>>>> set.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> I didn't get any replies to my last comments on this:
>>>>>>>
>>>>>>> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
>>>>>>>
>>>>>>> Basically, I tried explicitly placing buffers above 8G and didn't see
>>>>>>> the HW problem described (ie it all worked fine).  I still think
>>>>>>> there's some confusion as to what the W/A is about.
>>>>>>>
>>>>>>> Here's my take: the W/A is a SW-only workaround to handle the cases
>>>>>>> where a driver uses heapless and 48-bit ppgtt. The problem is that the
>>>>>>> heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
>>>>>>> anywhere it the 48 bit address space. If that happens it's consideredd
>>>>>>> out-of-bounds for the heap and access fails. To prevent this we need
>>>>>>> to make sure the bos we address in a heapless fashion are placed below
>>>>>>> 4g.
>>>>>>>
>>>>>>> For mesa, we only configure general state base as heap-less, which
>>>>>>> affects scratch bos. What this boils down to is that we need the 4G
>>>>>>> restriction only for the scratch bos set up on 3DSTATE_VS, 3DSTATE_GS
>>>>>>> and 3DSTATE_PS (and tesselation stage eventually). Look for the
>>>>>>> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c, gen8_gs_state.c
>>>>>>> and gen8_ps_state.c
>>>>>>
>>>>>>
>>>>>>
>>>>>> I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe it
>>>>>> isn't exclusive to the scratch bos (the error state points to that
>>>>>> instruction).
>>>>>>
>>>>>>
>>>>>
>>>>> Hi,
>>>>>
>>>>> Anymore inputs about this patch?
>>>>> AFAIK, if changes are required based on further comments from Kristian,
>>>>> these will be in the mesa patch[1], not libdrm. Also, Micha? will use
>>>>> this
>>>>> flag in another project.
>>>>>
>>>> The comment seems quite brief and I'm not sure it fully addresses
>>>> Kristian's concern. I'd suspect that providing reference to the HW
>>>> documentation (confirming your assumption) might be beneficial.
>>>>
>>>
>>> Sure, attached is the hang I get if I don't set the restriction in
>>> gen8_misc_state.c and try to use the full 48-bit range
>>> (i915_error_state_nowa.txt). This is just running gfxbench t-rex.
>>>
>>> I see the same hang signature when it is only applied to the scratch bos (in
>>> gen8_vs_state.c, gen8_gs_state.c and gen8_ps_state.c -
>>> i915_error_state_scratchbo.txt).
>>>
>>> 3DSTATE_VIEWPORT_STATE_POINTERS_CC (0x78230000) is defined here:
>>> https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-bdw-vol02a-commandreference-instructions_0.pdf
>>> (page 256)
>>
>> I applied your mesa and libdrm patches and then backed out the 4G
>> restriction in the STATE_BASE_ADDRESS relocations.  I was not able to
>> reproduce any hang with trex or glxgears. I confirmed (using
>> INTEL_DEBUG=bat) that gem actually placed BOs above the 4G mark and
>> got this:
>>
>>   0: 8 @ 0xfffffff90000 (miptree)
>>   1: 9 @ 0xfffffff78000 (hiz)
>>   2: 4 @ 0xfffffff77000 (pipe_control workaround)
>>   3: 5 @ 0xfffffff76000 (program cache)
>>   4: 12 @ 0x000000000000 (miptree)
>>   5: 7 @ 0x000000001000 (image)
>>   6: 10 @ 0xfffffff5a000 (miptree)
>>   7: 13 @ 0xfffffff59000 (bufferobj)
>>   8: 11 @ 0xfffffff51000 (bufferobj)
>>   9: 14 @ 0xfffffff50000 (bufferobj)
>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000020 -> 8
>> (miptree)@0x0000ffff fff90000 + 0x00000000
>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000040 -> 9
>> (hiz)@0x0000ffff fff78000 + 0x00000000
>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000098 -> 4
>> (pipe_control workaround)@0x0000ffff fff77000 + 0x00000000
>> ...
>>
>> In particular, the mesa batchbuffer is placed at 0xfffffff40000, and
>> all the STATE_BASE_ADDRESS heaps are in that bo. In other words, all
>> heap bases are set to 0xfffffff40000.
>>
>> The offset in 3DSTATE_VIEWPORT_STATE_POINTERS_CC is indeed only 32
>> bits, but that only affects how far from the dynamic state base
>> address we can place that. In mesas case, it's always inside the batch
>> buffer, which is at most 32k, so that's never a problem. If a driver
>> uses dynamic state in a heapless fashion, then you need to be careful
>> to place the CC viewport state below 4g.
>>
>> What I was able  to confirm is that scratch buffer I/O (which we use
>> for spilling) does break with 48 bit ppgtt. If you run glxgears with
>> INTEL_DEBUG=spill_fs set, you can see lots of rendering artifacts as
>> the 4G limit on the general state heap caps attempts to spill and fill
>> from the vertex shader. Using OUT_RELOC64_INSIDE_4G() when setting up
>> the scratch BOs in 3DSTATE_VS and 3DSTATE_PS fixes the problem.
>
> Ok, so we do still need the 4G limit bit (i.e. I don't have to revert the
> kernel side for lack of open-source userspace), just not at the place we
> originally thought. I guess lesson learned is to actually test stuff first
> before I pull in the kernel side. Michel, did you not see the spilling
> corruptions when testing the mesa patches? Kristian, does this also blow
> up with just a piglit run?
>

There's must be something else we have different in our systems.

Kristian doesn't get the hang, and if I back out the 4G restriction in 
the STATE_BASE_ADDRESS, I don't see any corruption while running
INTEL_DEBUG=spill_fs glxgears.
Kristian Hogsberg Nov. 18, 2015, 10:53 p.m. UTC | #11
On Wed, Oct 14, 2015 at 5:11 AM, Michel Thierry
<michel.thierry@intel.com> wrote:
> On 10/14/2015 8:19 AM, Daniel Vetter wrote:
>>
>> On Tue, Oct 13, 2015 at 02:51:36PM -0700, Kristian Høgsberg wrote:
>>>
>>> On Tue, Oct 13, 2015 at 7:55 AM, Michel Thierry
>>> <michel.thierry@intel.com> wrote:
>>>>
>>>> On 10/13/2015 3:13 PM, Emil Velikov wrote:
>>>>>
>>>>>
>>>>> On 13 October 2015 at 13:16, Michel Thierry <michel.thierry@intel.com>
>>>>> wrote:
>>>>>>
>>>>>>
>>>>>> On 10/6/2015 2:12 PM, Michel Thierry wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry
>>>>>>>> <michel.thierry@intel.com> wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Gen8+ supports 48-bit virtual addresses, but some objects must
>>>>>>>>>>> always be
>>>>>>>>>>> allocated inside the 32-bit address range.
>>>>>>>>>>>
>>>>>>>>>>> In specific, any resource used with flat/heapless
>>>>>>>>>>> (0x00000000-0xfffff000)
>>>>>>>>>>> General State Heap (GSH) or Instruction State Heap (ISH) must be
>>>>>>>>>>> in
>>>>>>>>>>> a
>>>>>>>>>>> 32-bit range, because the General State Offset and Instruction
>>>>>>>>>>> State
>>>>>>>>>>> Offset
>>>>>>>>>>> are limited to 32-bits.
>>>>>>>>>>>
>>>>>>>>>>> The i915 driver has been modified to provide a flag to set when
>>>>>>>>>>> the
>>>>>>>>>>> 4GB
>>>>>>>>>>> limit is not necessary in a given bo
>>>>>>>>>>> (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>>>>>>>>>> 48-bit range will only be used when explicitly requested.
>>>>>>>>>>>
>>>>>>>>>>> Callers to the existing drm_intel_bo_emit_reloc function should
>>>>>>>>>>> set
>>>>>>>>>>> the
>>>>>>>>>>> use_48b_address_range flag beforehand, in order to use full ppgtt
>>>>>>>>>>> range.
>>>>>>>>>>>
>>>>>>>>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use
>>>>>>>>>>> them
>>>>>>>>>>>         internally in emit_reloc functions (Ben)
>>>>>>>>>>>         s/48BADDRESS/48B_ADDRESS/ (Dave)
>>>>>>>>>>> v3: Keep set/clear functions internal, no-one needs to use them
>>>>>>>>>>> directly.
>>>>>>>>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt
>>>>>>>>>>> type
>>>>>>>>>>>         before enabling set/clear function, print full offsets in
>>>>>>>>>>> debug
>>>>>>>>>>>         statements, using port of lower_32_bits and upper_32_bits
>>>>>>>>>>> from linux
>>>>>>>>>>>         kernel (Micha?)
>>>>>>>>>>>
>>>>>>>>>>> References:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>>>>>>>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>>>>>>>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> +Kristian
>>>>>>>>>>
>>>>>>>>>> LGTM.
>>>>>>>>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Hi Kristian,
>>>>>>>>>
>>>>>>>>> More comments on this?
>>>>>>>>> I've resent the mesa patch with the last changes
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Micha? has agreed with the interface and will use it for his work.
>>>>>>>>> If mesa doesn't plan to use this for now, it's ok. The kernel
>>>>>>>>> changes
>>>>>>>>> have
>>>>>>>>> been done to prevent any regressions when the support 48-bit flag
>>>>>>>>> is
>>>>>>>>> not
>>>>>>>>> set.
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> I didn't get any replies to my last comments on this:
>>>>>>>>
>>>>>>>>
>>>>>>>> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
>>>>>>>>
>>>>>>>> Basically, I tried explicitly placing buffers above 8G and didn't
>>>>>>>> see
>>>>>>>> the HW problem described (ie it all worked fine).  I still think
>>>>>>>> there's some confusion as to what the W/A is about.
>>>>>>>>
>>>>>>>> Here's my take: the W/A is a SW-only workaround to handle the cases
>>>>>>>> where a driver uses heapless and 48-bit ppgtt. The problem is that
>>>>>>>> the
>>>>>>>> heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
>>>>>>>> anywhere it the 48 bit address space. If that happens it's
>>>>>>>> consideredd
>>>>>>>> out-of-bounds for the heap and access fails. To prevent this we need
>>>>>>>> to make sure the bos we address in a heapless fashion are placed
>>>>>>>> below
>>>>>>>> 4g.
>>>>>>>>
>>>>>>>> For mesa, we only configure general state base as heap-less, which
>>>>>>>> affects scratch bos. What this boils down to is that we need the 4G
>>>>>>>> restriction only for the scratch bos set up on 3DSTATE_VS,
>>>>>>>> 3DSTATE_GS
>>>>>>>> and 3DSTATE_PS (and tesselation stage eventually). Look for the
>>>>>>>> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c,
>>>>>>>> gen8_gs_state.c
>>>>>>>> and gen8_ps_state.c
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe it
>>>>>>> isn't exclusive to the scratch bos (the error state points to that
>>>>>>> instruction).
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> Hi,
>>>>>>
>>>>>> Anymore inputs about this patch?
>>>>>> AFAIK, if changes are required based on further comments from
>>>>>> Kristian,
>>>>>> these will be in the mesa patch[1], not libdrm. Also, Micha? will use
>>>>>> this
>>>>>> flag in another project.
>>>>>>
>>>>> The comment seems quite brief and I'm not sure it fully addresses
>>>>> Kristian's concern. I'd suspect that providing reference to the HW
>>>>> documentation (confirming your assumption) might be beneficial.
>>>>>
>>>>
>>>> Sure, attached is the hang I get if I don't set the restriction in
>>>> gen8_misc_state.c and try to use the full 48-bit range
>>>> (i915_error_state_nowa.txt). This is just running gfxbench t-rex.
>>>>
>>>> I see the same hang signature when it is only applied to the scratch bos
>>>> (in
>>>> gen8_vs_state.c, gen8_gs_state.c and gen8_ps_state.c -
>>>> i915_error_state_scratchbo.txt).
>>>>
>>>> 3DSTATE_VIEWPORT_STATE_POINTERS_CC (0x78230000) is defined here:
>>>>
>>>> https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-bdw-vol02a-commandreference-instructions_0.pdf
>>>> (page 256)
>>>
>>>
>>> I applied your mesa and libdrm patches and then backed out the 4G
>>> restriction in the STATE_BASE_ADDRESS relocations.  I was not able to
>>> reproduce any hang with trex or glxgears. I confirmed (using
>>> INTEL_DEBUG=bat) that gem actually placed BOs above the 4G mark and
>>> got this:
>>>
>>>   0: 8 @ 0xfffffff90000 (miptree)
>>>   1: 9 @ 0xfffffff78000 (hiz)
>>>   2: 4 @ 0xfffffff77000 (pipe_control workaround)
>>>   3: 5 @ 0xfffffff76000 (program cache)
>>>   4: 12 @ 0x000000000000 (miptree)
>>>   5: 7 @ 0x000000001000 (image)
>>>   6: 10 @ 0xfffffff5a000 (miptree)
>>>   7: 13 @ 0xfffffff59000 (bufferobj)
>>>   8: 11 @ 0xfffffff51000 (bufferobj)
>>>   9: 14 @ 0xfffffff50000 (bufferobj)
>>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000020 -> 8
>>> (miptree)@0x0000ffff fff90000 + 0x00000000
>>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000040 -> 9
>>> (hiz)@0x0000ffff fff78000 + 0x00000000
>>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000098 -> 4
>>> (pipe_control workaround)@0x0000ffff fff77000 + 0x00000000
>>> ...
>>>
>>> In particular, the mesa batchbuffer is placed at 0xfffffff40000, and
>>> all the STATE_BASE_ADDRESS heaps are in that bo. In other words, all
>>> heap bases are set to 0xfffffff40000.
>>>
>>> The offset in 3DSTATE_VIEWPORT_STATE_POINTERS_CC is indeed only 32
>>> bits, but that only affects how far from the dynamic state base
>>> address we can place that. In mesas case, it's always inside the batch
>>> buffer, which is at most 32k, so that's never a problem. If a driver
>>> uses dynamic state in a heapless fashion, then you need to be careful
>>> to place the CC viewport state below 4g.
>>>
>>> What I was able  to confirm is that scratch buffer I/O (which we use
>>> for spilling) does break with 48 bit ppgtt. If you run glxgears with
>>> INTEL_DEBUG=spill_fs set, you can see lots of rendering artifacts as
>>> the 4G limit on the general state heap caps attempts to spill and fill
>>> from the vertex shader. Using OUT_RELOC64_INSIDE_4G() when setting up
>>> the scratch BOs in 3DSTATE_VS and 3DSTATE_PS fixes the problem.
>>
>>
>> Ok, so we do still need the 4G limit bit (i.e. I don't have to revert the
>> kernel side for lack of open-source userspace), just not at the place we
>> originally thought. I guess lesson learned is to actually test stuff first
>> before I pull in the kernel side. Michel, did you not see the spilling
>> corruptions when testing the mesa patches? Kristian, does this also blow
>> up with just a piglit run?
>>
>
> There's must be something else we have different in our systems.
>
> Kristian doesn't get the hang, and if I back out the 4G restriction in the
> STATE_BASE_ADDRESS, I don't see any corruption while running
> INTEL_DEBUG=spill_fs glxgears.

I don't want to block this, in fact, I very much want to see this move
forward. However, I can't ack patches that 1) fix a gpu hang I can't
reproduce and I don't agree should be a problem (the
STATE_BASE_ADDRESS probllem) and 2) don't fix the issue that I see and
reproduced on my BDW (the spilling problem).

I'll try a full piglit run tomorrow with both my mesa patch and
Michels and see what happens. However, this isn't the kind of problem
that occurs once in a blue moon or for some corner case piglit test.
If Michels understanding of the hw and the workaround is correct,
nothing should work without his patch.

To move forward on this, unless something new comes up when running
piglit, I can take ownership of making mesa work one way or the other.
The kernel and libdrm work is fine and most of Michels mesa patch is
good. If mesa breaks down the road because we need to limit the
STATE_BASE_ADDRESS relocs, I'll own the problem then.

Kristian
Michel Thierry Dec. 4, 2015, 2:24 p.m. UTC | #12
On 11/18/2015 10:53 PM, Kristian Høgsberg wrote:
> On Wed, Oct 14, 2015 at 5:11 AM, Michel Thierry
> <michel.thierry@intel.com> wrote:
>> On 10/14/2015 8:19 AM, Daniel Vetter wrote:
>>>
>>> On Tue, Oct 13, 2015 at 02:51:36PM -0700, Kristian Høgsberg wrote:
>>>>
>>>> On Tue, Oct 13, 2015 at 7:55 AM, Michel Thierry
>>>> <michel.thierry@intel.com> wrote:
>>>>>
>>>>> On 10/13/2015 3:13 PM, Emil Velikov wrote:
>>>>>>
>>>>>>
>>>>>> On 13 October 2015 at 13:16, Michel Thierry <michel.thierry@intel.com>
>>>>>> wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 10/6/2015 2:12 PM, Michel Thierry wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry
>>>>>>>>> <michel.thierry@intel.com> wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Gen8+ supports 48-bit virtual addresses, but some objects must
>>>>>>>>>>>> always be
>>>>>>>>>>>> allocated inside the 32-bit address range.
>>>>>>>>>>>>
>>>>>>>>>>>> In specific, any resource used with flat/heapless
>>>>>>>>>>>> (0x00000000-0xfffff000)
>>>>>>>>>>>> General State Heap (GSH) or Instruction State Heap (ISH) must be
>>>>>>>>>>>> in
>>>>>>>>>>>> a
>>>>>>>>>>>> 32-bit range, because the General State Offset and Instruction
>>>>>>>>>>>> State
>>>>>>>>>>>> Offset
>>>>>>>>>>>> are limited to 32-bits.
>>>>>>>>>>>>
>>>>>>>>>>>> The i915 driver has been modified to provide a flag to set when
>>>>>>>>>>>> the
>>>>>>>>>>>> 4GB
>>>>>>>>>>>> limit is not necessary in a given bo
>>>>>>>>>>>> (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>>>>>>>>>>> 48-bit range will only be used when explicitly requested.
>>>>>>>>>>>>
>>>>>>>>>>>> Callers to the existing drm_intel_bo_emit_reloc function should
>>>>>>>>>>>> set
>>>>>>>>>>>> the
>>>>>>>>>>>> use_48b_address_range flag beforehand, in order to use full ppgtt
>>>>>>>>>>>> range.
>>>>>>>>>>>>
>>>>>>>>>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use
>>>>>>>>>>>> them
>>>>>>>>>>>>          internally in emit_reloc functions (Ben)
>>>>>>>>>>>>          s/48BADDRESS/48B_ADDRESS/ (Dave)
>>>>>>>>>>>> v3: Keep set/clear functions internal, no-one needs to use them
>>>>>>>>>>>> directly.
>>>>>>>>>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt
>>>>>>>>>>>> type
>>>>>>>>>>>>          before enabling set/clear function, print full offsets in
>>>>>>>>>>>> debug
>>>>>>>>>>>>          statements, using port of lower_32_bits and upper_32_bits
>>>>>>>>>>>> from linux
>>>>>>>>>>>>          kernel (Micha?)
>>>>>>>>>>>>
>>>>>>>>>>>> References:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>>>>>>>>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>>>>>>>>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> +Kristian
>>>>>>>>>>>
>>>>>>>>>>> LGTM.
>>>>>>>>>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Hi Kristian,
>>>>>>>>>>
>>>>>>>>>> More comments on this?
>>>>>>>>>> I've resent the mesa patch with the last changes
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Micha? has agreed with the interface and will use it for his work.
>>>>>>>>>> If mesa doesn't plan to use this for now, it's ok. The kernel
>>>>>>>>>> changes
>>>>>>>>>> have
>>>>>>>>>> been done to prevent any regressions when the support 48-bit flag
>>>>>>>>>> is
>>>>>>>>>> not
>>>>>>>>>> set.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I didn't get any replies to my last comments on this:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
>>>>>>>>>
>>>>>>>>> Basically, I tried explicitly placing buffers above 8G and didn't
>>>>>>>>> see
>>>>>>>>> the HW problem described (ie it all worked fine).  I still think
>>>>>>>>> there's some confusion as to what the W/A is about.
>>>>>>>>>
>>>>>>>>> Here's my take: the W/A is a SW-only workaround to handle the cases
>>>>>>>>> where a driver uses heapless and 48-bit ppgtt. The problem is that
>>>>>>>>> the
>>>>>>>>> heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
>>>>>>>>> anywhere it the 48 bit address space. If that happens it's
>>>>>>>>> consideredd
>>>>>>>>> out-of-bounds for the heap and access fails. To prevent this we need
>>>>>>>>> to make sure the bos we address in a heapless fashion are placed
>>>>>>>>> below
>>>>>>>>> 4g.
>>>>>>>>>
>>>>>>>>> For mesa, we only configure general state base as heap-less, which
>>>>>>>>> affects scratch bos. What this boils down to is that we need the 4G
>>>>>>>>> restriction only for the scratch bos set up on 3DSTATE_VS,
>>>>>>>>> 3DSTATE_GS
>>>>>>>>> and 3DSTATE_PS (and tesselation stage eventually). Look for the
>>>>>>>>> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c,
>>>>>>>>> gen8_gs_state.c
>>>>>>>>> and gen8_ps_state.c
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe it
>>>>>>>> isn't exclusive to the scratch bos (the error state points to that
>>>>>>>> instruction).
>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>> Hi,
>>>>>>>
>>>>>>> Anymore inputs about this patch?
>>>>>>> AFAIK, if changes are required based on further comments from
>>>>>>> Kristian,
>>>>>>> these will be in the mesa patch[1], not libdrm. Also, Micha? will use
>>>>>>> this
>>>>>>> flag in another project.
>>>>>>>
>>>>>> The comment seems quite brief and I'm not sure it fully addresses
>>>>>> Kristian's concern. I'd suspect that providing reference to the HW
>>>>>> documentation (confirming your assumption) might be beneficial.
>>>>>>
>>>>>
>>>>> Sure, attached is the hang I get if I don't set the restriction in
>>>>> gen8_misc_state.c and try to use the full 48-bit range
>>>>> (i915_error_state_nowa.txt). This is just running gfxbench t-rex.
>>>>>
>>>>> I see the same hang signature when it is only applied to the scratch bos
>>>>> (in
>>>>> gen8_vs_state.c, gen8_gs_state.c and gen8_ps_state.c -
>>>>> i915_error_state_scratchbo.txt).
>>>>>
>>>>> 3DSTATE_VIEWPORT_STATE_POINTERS_CC (0x78230000) is defined here:
>>>>>
>>>>> https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-bdw-vol02a-commandreference-instructions_0.pdf
>>>>> (page 256)
>>>>
>>>>
>>>> I applied your mesa and libdrm patches and then backed out the 4G
>>>> restriction in the STATE_BASE_ADDRESS relocations.  I was not able to
>>>> reproduce any hang with trex or glxgears. I confirmed (using
>>>> INTEL_DEBUG=bat) that gem actually placed BOs above the 4G mark and
>>>> got this:
>>>>
>>>>    0: 8 @ 0xfffffff90000 (miptree)
>>>>    1: 9 @ 0xfffffff78000 (hiz)
>>>>    2: 4 @ 0xfffffff77000 (pipe_control workaround)
>>>>    3: 5 @ 0xfffffff76000 (program cache)
>>>>    4: 12 @ 0x000000000000 (miptree)
>>>>    5: 7 @ 0x000000001000 (image)
>>>>    6: 10 @ 0xfffffff5a000 (miptree)
>>>>    7: 13 @ 0xfffffff59000 (bufferobj)
>>>>    8: 11 @ 0xfffffff51000 (bufferobj)
>>>>    9: 14 @ 0xfffffff50000 (bufferobj)
>>>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000020 -> 8
>>>> (miptree)@0x0000ffff fff90000 + 0x00000000
>>>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000040 -> 9
>>>> (hiz)@0x0000ffff fff78000 + 0x00000000
>>>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000098 -> 4
>>>> (pipe_control workaround)@0x0000ffff fff77000 + 0x00000000
>>>> ...
>>>>
>>>> In particular, the mesa batchbuffer is placed at 0xfffffff40000, and
>>>> all the STATE_BASE_ADDRESS heaps are in that bo. In other words, all
>>>> heap bases are set to 0xfffffff40000.
>>>>
>>>> The offset in 3DSTATE_VIEWPORT_STATE_POINTERS_CC is indeed only 32
>>>> bits, but that only affects how far from the dynamic state base
>>>> address we can place that. In mesas case, it's always inside the batch
>>>> buffer, which is at most 32k, so that's never a problem. If a driver
>>>> uses dynamic state in a heapless fashion, then you need to be careful
>>>> to place the CC viewport state below 4g.
>>>>
>>>> What I was able  to confirm is that scratch buffer I/O (which we use
>>>> for spilling) does break with 48 bit ppgtt. If you run glxgears with
>>>> INTEL_DEBUG=spill_fs set, you can see lots of rendering artifacts as
>>>> the 4G limit on the general state heap caps attempts to spill and fill
>>>> from the vertex shader. Using OUT_RELOC64_INSIDE_4G() when setting up
>>>> the scratch BOs in 3DSTATE_VS and 3DSTATE_PS fixes the problem.
>>>
>>>
>>> Ok, so we do still need the 4G limit bit (i.e. I don't have to revert the
>>> kernel side for lack of open-source userspace), just not at the place we
>>> originally thought. I guess lesson learned is to actually test stuff first
>>> before I pull in the kernel side. Michel, did you not see the spilling
>>> corruptions when testing the mesa patches? Kristian, does this also blow
>>> up with just a piglit run?
>>>
>>
>> There's must be something else we have different in our systems.
>>
>> Kristian doesn't get the hang, and if I back out the 4G restriction in the
>> STATE_BASE_ADDRESS, I don't see any corruption while running
>> INTEL_DEBUG=spill_fs glxgears.
>
> I don't want to block this, in fact, I very much want to see this move
> forward. However, I can't ack patches that 1) fix a gpu hang I can't
> reproduce and I don't agree should be a problem (the
> STATE_BASE_ADDRESS probllem) and 2) don't fix the issue that I see and
> reproduced on my BDW (the spilling problem).
>
> I'll try a full piglit run tomorrow with both my mesa patch and
> Michels and see what happens. However, this isn't the kind of problem
> that occurs once in a blue moon or for some corner case piglit test.
> If Michels understanding of the hw and the workaround is correct,
> nothing should work without his patch.
>
> To move forward on this, unless something new comes up when running
> piglit, I can take ownership of making mesa work one way or the other.
> The kernel and libdrm work is fine and most of Michels mesa patch is
> good. If mesa breaks down the road because we need to limit the
> STATE_BASE_ADDRESS relocs, I'll own the problem then.
>
> Kristian
>

Thanks Kristian,

Let me know if you need something from i915 or libdrm side.
Kristian Hogsberg Dec. 10, 2015, 7:40 p.m. UTC | #13
On Fri, Dec 4, 2015 at 6:24 AM, Michel Thierry <michel.thierry@intel.com> wrote:
> On 11/18/2015 10:53 PM, Kristian Høgsberg wrote:
>>
>> On Wed, Oct 14, 2015 at 5:11 AM, Michel Thierry
>> <michel.thierry@intel.com> wrote:
>>>
>>> On 10/14/2015 8:19 AM, Daniel Vetter wrote:
>>>>
>>>>
>>>> On Tue, Oct 13, 2015 at 02:51:36PM -0700, Kristian Høgsberg wrote:
>>>>>
>>>>>
>>>>> On Tue, Oct 13, 2015 at 7:55 AM, Michel Thierry
>>>>> <michel.thierry@intel.com> wrote:
>>>>>>
>>>>>>
>>>>>> On 10/13/2015 3:13 PM, Emil Velikov wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 13 October 2015 at 13:16, Michel Thierry
>>>>>>> <michel.thierry@intel.com>
>>>>>>> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 10/6/2015 2:12 PM, Michel Thierry wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 10/5/2015 7:06 PM, Kristian Høgsberg wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry
>>>>>>>>>> <michel.thierry@intel.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On 9/14/2015 2:54 PM, Micha? Winiarski wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Gen8+ supports 48-bit virtual addresses, but some objects must
>>>>>>>>>>>>> always be
>>>>>>>>>>>>> allocated inside the 32-bit address range.
>>>>>>>>>>>>>
>>>>>>>>>>>>> In specific, any resource used with flat/heapless
>>>>>>>>>>>>> (0x00000000-0xfffff000)
>>>>>>>>>>>>> General State Heap (GSH) or Instruction State Heap (ISH) must
>>>>>>>>>>>>> be
>>>>>>>>>>>>> in
>>>>>>>>>>>>> a
>>>>>>>>>>>>> 32-bit range, because the General State Offset and Instruction
>>>>>>>>>>>>> State
>>>>>>>>>>>>> Offset
>>>>>>>>>>>>> are limited to 32-bits.
>>>>>>>>>>>>>
>>>>>>>>>>>>> The i915 driver has been modified to provide a flag to set when
>>>>>>>>>>>>> the
>>>>>>>>>>>>> 4GB
>>>>>>>>>>>>> limit is not necessary in a given bo
>>>>>>>>>>>>> (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>>>>>>>>>>>> 48-bit range will only be used when explicitly requested.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Callers to the existing drm_intel_bo_emit_reloc function should
>>>>>>>>>>>>> set
>>>>>>>>>>>>> the
>>>>>>>>>>>>> use_48b_address_range flag beforehand, in order to use full
>>>>>>>>>>>>> ppgtt
>>>>>>>>>>>>> range.
>>>>>>>>>>>>>
>>>>>>>>>>>>> v2: Make set/clear functions nops on pre-gen8 platforms, and
>>>>>>>>>>>>> use
>>>>>>>>>>>>> them
>>>>>>>>>>>>>          internally in emit_reloc functions (Ben)
>>>>>>>>>>>>>          s/48BADDRESS/48B_ADDRESS/ (Dave)
>>>>>>>>>>>>> v3: Keep set/clear functions internal, no-one needs to use them
>>>>>>>>>>>>> directly.
>>>>>>>>>>>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt
>>>>>>>>>>>>> type
>>>>>>>>>>>>>          before enabling set/clear function, print full offsets
>>>>>>>>>>>>> in
>>>>>>>>>>>>> debug
>>>>>>>>>>>>>          statements, using port of lower_32_bits and
>>>>>>>>>>>>> upper_32_bits
>>>>>>>>>>>>> from linux
>>>>>>>>>>>>>          kernel (Micha?)
>>>>>>>>>>>>>
>>>>>>>>>>>>> References:
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>>>>>>>>>>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>>>>>>>>>>>> Cc: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> +Kristian
>>>>>>>>>>>>
>>>>>>>>>>>> LGTM.
>>>>>>>>>>>> Acked-by: Micha? Winiarski <michal.winiarski@intel.com>
>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Hi Kristian,
>>>>>>>>>>>
>>>>>>>>>>> More comments on this?
>>>>>>>>>>> I've resent the mesa patch with the last changes
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Micha? has agreed with the interface and will use it for his
>>>>>>>>>>> work.
>>>>>>>>>>> If mesa doesn't plan to use this for now, it's ok. The kernel
>>>>>>>>>>> changes
>>>>>>>>>>> have
>>>>>>>>>>> been done to prevent any regressions when the support 48-bit flag
>>>>>>>>>>> is
>>>>>>>>>>> not
>>>>>>>>>>> set.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I didn't get any replies to my last comments on this:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html
>>>>>>>>>>
>>>>>>>>>> Basically, I tried explicitly placing buffers above 8G and didn't
>>>>>>>>>> see
>>>>>>>>>> the HW problem described (ie it all worked fine).  I still think
>>>>>>>>>> there's some confusion as to what the W/A is about.
>>>>>>>>>>
>>>>>>>>>> Here's my take: the W/A is a SW-only workaround to handle the
>>>>>>>>>> cases
>>>>>>>>>> where a driver uses heapless and 48-bit ppgtt. The problem is that
>>>>>>>>>> the
>>>>>>>>>> heaps are limited to 4G but with 48bit ppgtt a buffer can be
>>>>>>>>>> placed
>>>>>>>>>> anywhere it the 48 bit address space. If that happens it's
>>>>>>>>>> consideredd
>>>>>>>>>> out-of-bounds for the heap and access fails. To prevent this we
>>>>>>>>>> need
>>>>>>>>>> to make sure the bos we address in a heapless fashion are placed
>>>>>>>>>> below
>>>>>>>>>> 4g.
>>>>>>>>>>
>>>>>>>>>> For mesa, we only configure general state base as heap-less, which
>>>>>>>>>> affects scratch bos. What this boils down to is that we need the
>>>>>>>>>> 4G
>>>>>>>>>> restriction only for the scratch bos set up on 3DSTATE_VS,
>>>>>>>>>> 3DSTATE_GS
>>>>>>>>>> and 3DSTATE_PS (and tesselation stage eventually). Look for the
>>>>>>>>>> OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c,
>>>>>>>>>> gen8_gs_state.c
>>>>>>>>>> and gen8_ps_state.c
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I think it also affects _3DSTATE_VIEWPORT_STATE_POINTERS_CC, maybe
>>>>>>>>> it
>>>>>>>>> isn't exclusive to the scratch bos (the error state points to that
>>>>>>>>> instruction).
>>>>>>>>>
>>>>>>>>>
>>>>>>>>
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> Anymore inputs about this patch?
>>>>>>>> AFAIK, if changes are required based on further comments from
>>>>>>>> Kristian,
>>>>>>>> these will be in the mesa patch[1], not libdrm. Also, Micha? will
>>>>>>>> use
>>>>>>>> this
>>>>>>>> flag in another project.
>>>>>>>>
>>>>>>> The comment seems quite brief and I'm not sure it fully addresses
>>>>>>> Kristian's concern. I'd suspect that providing reference to the HW
>>>>>>> documentation (confirming your assumption) might be beneficial.
>>>>>>>
>>>>>>
>>>>>> Sure, attached is the hang I get if I don't set the restriction in
>>>>>> gen8_misc_state.c and try to use the full 48-bit range
>>>>>> (i915_error_state_nowa.txt). This is just running gfxbench t-rex.
>>>>>>
>>>>>> I see the same hang signature when it is only applied to the scratch
>>>>>> bos
>>>>>> (in
>>>>>> gen8_vs_state.c, gen8_gs_state.c and gen8_ps_state.c -
>>>>>> i915_error_state_scratchbo.txt).
>>>>>>
>>>>>> 3DSTATE_VIEWPORT_STATE_POINTERS_CC (0x78230000) is defined here:
>>>>>>
>>>>>>
>>>>>> https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-bdw-vol02a-commandreference-instructions_0.pdf
>>>>>> (page 256)
>>>>>
>>>>>
>>>>>
>>>>> I applied your mesa and libdrm patches and then backed out the 4G
>>>>> restriction in the STATE_BASE_ADDRESS relocations.  I was not able to
>>>>> reproduce any hang with trex or glxgears. I confirmed (using
>>>>> INTEL_DEBUG=bat) that gem actually placed BOs above the 4G mark and
>>>>> got this:
>>>>>
>>>>>    0: 8 @ 0xfffffff90000 (miptree)
>>>>>    1: 9 @ 0xfffffff78000 (hiz)
>>>>>    2: 4 @ 0xfffffff77000 (pipe_control workaround)
>>>>>    3: 5 @ 0xfffffff76000 (program cache)
>>>>>    4: 12 @ 0x000000000000 (miptree)
>>>>>    5: 7 @ 0x000000001000 (image)
>>>>>    6: 10 @ 0xfffffff5a000 (miptree)
>>>>>    7: 13 @ 0xfffffff59000 (bufferobj)
>>>>>    8: 11 @ 0xfffffff51000 (bufferobj)
>>>>>    9: 14 @ 0xfffffff50000 (bufferobj)
>>>>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000020 -> 8
>>>>> (miptree)@0x0000ffff fff90000 + 0x00000000
>>>>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000040 -> 9
>>>>> (hiz)@0x0000ffff fff78000 + 0x00000000
>>>>> 10: 15 @ 0xfffffff40000 (batchbuffer)@0x00000000 00000098 -> 4
>>>>> (pipe_control workaround)@0x0000ffff fff77000 + 0x00000000
>>>>> ...
>>>>>
>>>>> In particular, the mesa batchbuffer is placed at 0xfffffff40000, and
>>>>> all the STATE_BASE_ADDRESS heaps are in that bo. In other words, all
>>>>> heap bases are set to 0xfffffff40000.
>>>>>
>>>>> The offset in 3DSTATE_VIEWPORT_STATE_POINTERS_CC is indeed only 32
>>>>> bits, but that only affects how far from the dynamic state base
>>>>> address we can place that. In mesas case, it's always inside the batch
>>>>> buffer, which is at most 32k, so that's never a problem. If a driver
>>>>> uses dynamic state in a heapless fashion, then you need to be careful
>>>>> to place the CC viewport state below 4g.
>>>>>
>>>>> What I was able  to confirm is that scratch buffer I/O (which we use
>>>>> for spilling) does break with 48 bit ppgtt. If you run glxgears with
>>>>> INTEL_DEBUG=spill_fs set, you can see lots of rendering artifacts as
>>>>> the 4G limit on the general state heap caps attempts to spill and fill
>>>>> from the vertex shader. Using OUT_RELOC64_INSIDE_4G() when setting up
>>>>> the scratch BOs in 3DSTATE_VS and 3DSTATE_PS fixes the problem.
>>>>
>>>>
>>>>
>>>> Ok, so we do still need the 4G limit bit (i.e. I don't have to revert
>>>> the
>>>> kernel side for lack of open-source userspace), just not at the place we
>>>> originally thought. I guess lesson learned is to actually test stuff
>>>> first
>>>> before I pull in the kernel side. Michel, did you not see the spilling
>>>> corruptions when testing the mesa patches? Kristian, does this also blow
>>>> up with just a piglit run?
>>>>
>>>
>>> There's must be something else we have different in our systems.
>>>
>>> Kristian doesn't get the hang, and if I back out the 4G restriction in
>>> the
>>> STATE_BASE_ADDRESS, I don't see any corruption while running
>>> INTEL_DEBUG=spill_fs glxgears.
>>
>>
>> I don't want to block this, in fact, I very much want to see this move
>> forward. However, I can't ack patches that 1) fix a gpu hang I can't
>> reproduce and I don't agree should be a problem (the
>> STATE_BASE_ADDRESS probllem) and 2) don't fix the issue that I see and
>> reproduced on my BDW (the spilling problem).
>>
>> I'll try a full piglit run tomorrow with both my mesa patch and
>> Michels and see what happens. However, this isn't the kind of problem
>> that occurs once in a blue moon or for some corner case piglit test.
>> If Michels understanding of the hw and the workaround is correct,
>> nothing should work without his patch.
>>
>> To move forward on this, unless something new comes up when running
>> piglit, I can take ownership of making mesa work one way or the other.
>> The kernel and libdrm work is fine and most of Michels mesa patch is
>> good. If mesa breaks down the road because we need to limit the
>> STATE_BASE_ADDRESS relocs, I'll own the problem then.
>>
>> Kristian
>>
>
> Thanks Kristian,
>
> Let me know if you need something from i915 or libdrm side.

I've reviewed and pushed these two patches.  I'll send out the mesa
patch for review in a bit and once that lands we can do a libdrm
release.

Kristian
diff mbox

Patch

diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index ded43b1..426b25c 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -680,7 +680,8 @@  struct drm_i915_gem_exec_object2 {
 #define EXEC_OBJECT_NEEDS_FENCE (1<<0)
 #define EXEC_OBJECT_NEEDS_GTT	(1<<1)
 #define EXEC_OBJECT_WRITE	(1<<2)
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
+#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SUPPORTS_48B_ADDRESS<<1)
 	__u64 flags;
 
 	__u64 rsvd1;
diff --git a/intel/intel_bufmgr.c b/intel/intel_bufmgr.c
index 14ea9f9..0856e60 100644
--- a/intel/intel_bufmgr.c
+++ b/intel/intel_bufmgr.c
@@ -293,6 +293,17 @@  drm_intel_bo_madvise(drm_intel_bo *bo, int madv)
 }
 
 int
+drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable)
+{
+	if (bo->bufmgr->bo_use_48b_address_range) {
+		bo->bufmgr->bo_use_48b_address_range(bo, enable);
+		return 0;
+	}
+
+	return -ENODEV;
+}
+
+int
 drm_intel_bo_references(drm_intel_bo *bo, drm_intel_bo *target_bo)
 {
 	return bo->bufmgr->bo_references(bo, target_bo);
diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index 95eecb8..a14c78f 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -164,6 +164,7 @@  int drm_intel_bo_get_tiling(drm_intel_bo *bo, uint32_t * tiling_mode,
 int drm_intel_bo_flink(drm_intel_bo *bo, uint32_t * name);
 int drm_intel_bo_busy(drm_intel_bo *bo);
 int drm_intel_bo_madvise(drm_intel_bo *bo, int madv);
+int drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable);
 
 int drm_intel_bo_disable_reuse(drm_intel_bo *bo);
 int drm_intel_bo_is_reusable(drm_intel_bo *bo);
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 2723e21..09d82d2 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -83,6 +83,22 @@ 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define MAX2(A, B) ((A) > (B) ? (A) : (B))
 
+/**
+ * upper_32_bits - return bits 32-63 of a number
+ * @n: the number we're accessing
+ *
+ * A basic shift-right of a 64- or 32-bit quantity.  Use this to suppress
+ * the "right shift count >= width of type" warning when that quantity is
+ * 32-bits.
+ */
+#define upper_32_bits(n) ((__u32)(((n) >> 16) >> 16))
+
+/**
+ * lower_32_bits - return bits 0-31 of a number
+ * @n: the number we're accessing
+ */
+#define lower_32_bits(n) ((__u32)(n))
+
 typedef struct _drm_intel_bo_gem drm_intel_bo_gem;
 
 struct drm_intel_gem_bo_bucket {
@@ -237,6 +253,15 @@  struct _drm_intel_bo_gem {
 	bool is_userptr;
 
 	/**
+	 * Boolean of whether this buffer can be placed in the full 48-bit
+	 * address range on gen8+.
+	 *
+	 * By default, buffers will be keep in a 32-bit range, unless this
+	 * flag is explicitly set.
+	 */
+	bool use_48b_address_range;
+
+	/**
 	 * Size in bytes of this buffer and its relocation descendents.
 	 *
 	 * Used to avoid costly tree walking in
@@ -395,14 +420,16 @@  drm_intel_gem_dump_validation_list(drm_intel_bufmgr_gem *bufmgr_gem)
 			drm_intel_bo_gem *target_gem =
 			    (drm_intel_bo_gem *) target_bo;
 
-			DBG("%2d: %d (%s)@0x%08llx -> "
-			    "%d (%s)@0x%08lx + 0x%08x\n",
+			DBG("%2d: %d (%s)@0x%08x %08x -> "
+			    "%d (%s)@0x%08x %08x + 0x%08x\n",
 			    i,
 			    bo_gem->gem_handle, bo_gem->name,
-			    (unsigned long long)bo_gem->relocs[j].offset,
+			    upper_32_bits(bo_gem->relocs[j].offset),
+			    lower_32_bits(bo_gem->relocs[j].offset),
 			    target_gem->gem_handle,
 			    target_gem->name,
-			    target_bo->offset64,
+			    upper_32_bits(target_bo->offset64),
+			    lower_32_bits(target_bo->offset64),
 			    bo_gem->relocs[j].delta);
 		}
 	}
@@ -468,11 +495,15 @@  drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence)
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bo->bufmgr;
 	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo;
 	int index;
+	int flags = 0;
+
+	if (need_fence)
+		flags |= EXEC_OBJECT_NEEDS_FENCE;
+	if (bo_gem->use_48b_address_range)
+		flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
 
 	if (bo_gem->validate_index != -1) {
-		if (need_fence)
-			bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |=
-				EXEC_OBJECT_NEEDS_FENCE;
+		bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |= flags;
 		return;
 	}
 
@@ -501,13 +532,9 @@  drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence)
 	bufmgr_gem->exec2_objects[index].alignment = bo->align;
 	bufmgr_gem->exec2_objects[index].offset = 0;
 	bufmgr_gem->exec_bos[index] = bo;
-	bufmgr_gem->exec2_objects[index].flags = 0;
+	bufmgr_gem->exec2_objects[index].flags = flags;
 	bufmgr_gem->exec2_objects[index].rsvd1 = 0;
 	bufmgr_gem->exec2_objects[index].rsvd2 = 0;
-	if (need_fence) {
-		bufmgr_gem->exec2_objects[index].flags |=
-			EXEC_OBJECT_NEEDS_FENCE;
-	}
 	bufmgr_gem->exec_count++;
 }
 
@@ -780,6 +807,7 @@  retry:
 	bo_gem->used_as_reloc_target = false;
 	bo_gem->has_error = false;
 	bo_gem->reusable = true;
+	bo_gem->use_48b_address_range = false;
 
 	drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, alignment);
 
@@ -926,6 +954,7 @@  drm_intel_gem_bo_alloc_userptr(drm_intel_bufmgr *bufmgr,
 	bo_gem->used_as_reloc_target = false;
 	bo_gem->has_error = false;
 	bo_gem->reusable = false;
+	bo_gem->use_48b_address_range = false;
 
 	drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, 0);
 
@@ -1081,6 +1110,7 @@  drm_intel_bo_gem_create_from_name(drm_intel_bufmgr *bufmgr,
 	bo_gem->bo.handle = open_arg.handle;
 	bo_gem->global_name = handle;
 	bo_gem->reusable = false;
+	bo_gem->use_48b_address_range = false;
 
 	memclear(get_tiling);
 	get_tiling.handle = bo_gem->gem_handle;
@@ -1930,6 +1960,13 @@  do_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset,
 	return 0;
 }
 
+static void
+drm_intel_gem_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable)
+{
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	bo_gem->use_48b_address_range = enable;
+}
+
 static int
 drm_intel_gem_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset,
 			    drm_intel_bo *target_bo, uint32_t target_offset,
@@ -2073,10 +2110,12 @@  drm_intel_update_buffer_offsets(drm_intel_bufmgr_gem *bufmgr_gem)
 
 		/* Update the buffer offset */
 		if (bufmgr_gem->exec_objects[i].offset != bo->offset64) {
-			DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
-			    bo_gem->gem_handle, bo_gem->name, bo->offset64,
-			    (unsigned long long)bufmgr_gem->exec_objects[i].
-			    offset);
+			DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x %08x\n",
+			    bo_gem->gem_handle, bo_gem->name,
+			    upper_32_bits(bo->offset64),
+			    lower_32_bits(bo->offset64),
+			    upper_32_bits(bufmgr_gem->exec_objects[i].offset),
+			    lower_32_bits(bufmgr_gem->exec_objects[i].offset));
 			bo->offset64 = bufmgr_gem->exec_objects[i].offset;
 			bo->offset = bufmgr_gem->exec_objects[i].offset;
 		}
@@ -2094,9 +2133,12 @@  drm_intel_update_buffer_offsets2 (drm_intel_bufmgr_gem *bufmgr_gem)
 
 		/* Update the buffer offset */
 		if (bufmgr_gem->exec2_objects[i].offset != bo->offset64) {
-			DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
-			    bo_gem->gem_handle, bo_gem->name, bo->offset64,
-			    (unsigned long long)bufmgr_gem->exec2_objects[i].offset);
+			DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x %08x\n",
+			    bo_gem->gem_handle, bo_gem->name,
+			    upper_32_bits(bo->offset64),
+			    lower_32_bits(bo->offset64),
+			    upper_32_bits(bufmgr_gem->exec2_objects[i].offset),
+			    lower_32_bits(bufmgr_gem->exec2_objects[i].offset));
 			bo->offset64 = bufmgr_gem->exec2_objects[i].offset;
 			bo->offset = bufmgr_gem->exec2_objects[i].offset;
 		}
@@ -2481,6 +2523,7 @@  drm_intel_bo_gem_create_from_prime(drm_intel_bufmgr *bufmgr, int prime_fd, int s
 	bo_gem->used_as_reloc_target = false;
 	bo_gem->has_error = false;
 	bo_gem->reusable = false;
+	bo_gem->use_48b_address_range = false;
 
 	DRMINITLISTHEAD(&bo_gem->vma_list);
 	DRMLISTADDTAIL(&bo_gem->name_list, &bufmgr_gem->named);
@@ -3278,6 +3321,13 @@  drm_intel_bufmgr_gem_init(int fd, int batch_size)
 		}
 	}
 
+	if (bufmgr_gem->gen >= 8) {
+		gp.param = I915_PARAM_HAS_ALIASING_PPGTT;
+		ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+		if (ret == 0 && *gp.value == 3)
+			bufmgr_gem->bufmgr.bo_use_48b_address_range = drm_intel_gem_bo_use_48b_address_range;
+	}
+
 	/* Let's go with one relocation per every 2 dwords (but round down a bit
 	 * since a power of two will mean an extra page allocation for the reloc
 	 * buffer).
diff --git a/intel/intel_bufmgr_priv.h b/intel/intel_bufmgr_priv.h
index 59ebd18..5c17ffb 100644
--- a/intel/intel_bufmgr_priv.h
+++ b/intel/intel_bufmgr_priv.h
@@ -152,6 +152,20 @@  struct _drm_intel_bufmgr {
 	void (*destroy) (drm_intel_bufmgr *bufmgr);
 
 	/**
+	 * Indicate if the buffer can be placed anywhere in the full ppgtt
+	 * address range (2^48).
+	 *
+	 * Any resource used with flat/heapless (0x00000000-0xfffff000)
+	 * General State Heap (GSH) or Intructions State Heap (ISH) must
+	 * be in a 32-bit range. 48-bit range will only be used when explicitly
+	 * requested.
+	 *
+	 * \param bo Buffer to set the use_48b_address_range flag.
+	 * \param enable The flag value.
+	 */
+	void (*bo_use_48b_address_range) (drm_intel_bo *bo, uint32_t enable);
+
+	/**
 	 * Add relocation entry in reloc_buf, which will be updated with the
 	 * target buffer's real offset on on command submission.
 	 *