[mesa,v3] i965/gen8+: bo in state base address must be in 32-bit address range
diff mbox

Message ID 1438940723-23084-4-git-send-email-michel.thierry@intel.com
State New
Headers show

Commit Message

Michel Thierry Aug. 7, 2015, 9:45 a.m. UTC
Gen8+ supports 48-bit virtual addresses, but some objects must always be
allocated inside the 32-bit address range.

In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
General State Heap or Intruction State Heap must be in a 32-bit range
(GSH / ISH), because the General State Offset and Instruction State Offset
are limited to 32-bits.

Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and
the bo can be in the full address space.

This commit introduces a dependency of libdrm 2.4.63, which introduces the
drm_intel_bo_emit_reloc_48bit function.

v2: s/48baddress/48b_address/,
    Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset
    is needed (Ben)
v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation
    needs the 32-bit workaround (Chris)

References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
Cc: Ben Widawsky <ben@bwidawsk.net>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 configure.ac                                  |  2 +-
 src/mesa/drivers/dri/i965/gen8_misc_state.c   | 19 +++++++++++--------
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++----
 src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++--
 4 files changed, 36 insertions(+), 15 deletions(-)

Comments

Matt Turner Aug. 7, 2015, 6:33 p.m. UTC | #1
On Fri, Aug 7, 2015 at 2:45 AM, Michel Thierry <michel.thierry@intel.com> wrote:
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index 54081a1..ca90784 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -409,11 +409,23 @@ bool
>  intel_batchbuffer_emit_reloc64(struct brw_context *brw,

This patch needs to be rebased on commit 09348c12f (committed more
than 3 weeks ago).
Ilia Mirkin Aug. 7, 2015, 7:57 p.m. UTC | #2
On Fri, Aug 7, 2015 at 5:45 AM, Michel Thierry <michel.thierry@intel.com> wrote:
> Gen8+ supports 48-bit virtual addresses, but some objects must always be
> allocated inside the 32-bit address range.
>
> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
> General State Heap or Intruction State Heap must be in a 32-bit range
> (GSH / ISH), because the General State Offset and Instruction State Offset
> are limited to 32-bits.
>
> Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and
> the bo can be in the full address space.
>
> This commit introduces a dependency of libdrm 2.4.63, which introduces the
> drm_intel_bo_emit_reloc_48bit function.
>
> v2: s/48baddress/48b_address/,
>     Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset
>     is needed (Ben)
> v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation
>     needs the 32-bit workaround (Chris)
>
> References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
> Cc: Ben Widawsky <ben@bwidawsk.net>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  configure.ac                                  |  2 +-
>  src/mesa/drivers/dri/i965/gen8_misc_state.c   | 19 +++++++++++--------
>  src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++----
>  src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++--
>  4 files changed, 36 insertions(+), 15 deletions(-)
>
> diff --git a/configure.ac b/configure.ac
> index af61aa2..c92ca44 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION])
>  dnl Versions for external dependencies
>  LIBDRM_REQUIRED=2.4.38
>  LIBDRM_RADEON_REQUIRED=2.4.56
> -LIBDRM_INTEL_REQUIRED=2.4.60
> +LIBDRM_INTEL_REQUIRED=2.4.63

There is no such version. I think you need a release before you can
commit this. Otherwise you'll cause pain for a whole lot of people.

>  LIBDRM_NVVIEUX_REQUIRED=2.4.33
>  LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
>  LIBDRM_FREEDRENO_REQUIRED=2.4.57
> diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c
> index b20038e..73eba06 100644
> --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c
> @@ -28,6 +28,10 @@
>
>  /**
>   * Define the base addresses which some state is referenced from.
> + *
> + * Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State
> + * Offset and Instruction State Offset are limited to 32-bits by hardware,
> + * and must be located in the first 4GBs (32-bit offset).
>   */
>  void gen8_upload_state_base_address(struct brw_context *brw)
>  {
> @@ -41,19 +45,18 @@ void gen8_upload_state_base_address(struct brw_context *brw)
>     OUT_BATCH(0);
>     OUT_BATCH(mocs_wb << 16);
>     /* Surface state base address: */
> -   OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
> -               mocs_wb << 4 | 1);
> +   OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
> +                         mocs_wb << 4 | 1);
>     /* Dynamic state base address: */
> -   OUT_RELOC64(brw->batch.bo,
> -               I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
> -               mocs_wb << 4 | 1);
> +   OUT_RELOC64_INSIDE_4G(brw->batch.bo,
> +                         I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
> +                         mocs_wb << 4 | 1);
>     /* Indirect object base address: MEDIA_OBJECT data */
>     OUT_BATCH(mocs_wb << 4 | 1);
>     OUT_BATCH(0);
>     /* Instruction base address: shader kernels (incl. SIP) */
> -   OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
> -               mocs_wb << 4 | 1);
> -
> +   OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
> +                         mocs_wb << 4 | 1);
>     /* General state buffer size */
>     OUT_BATCH(0xfffff001);
>     /* Dynamic state buffer size */
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index 54081a1..ca90784 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -409,11 +409,23 @@ bool
>  intel_batchbuffer_emit_reloc64(struct brw_context *brw,
>                                 drm_intel_bo *buffer,
>                                 uint32_t read_domains, uint32_t write_domain,
> -                              uint32_t delta)
> +                               uint32_t delta,
> +                               bool support_48bit_offset)
>  {
> -   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
> -                                     buffer, delta,
> -                                     read_domains, write_domain);
> +   int ret;
> +
> +   /* Not all buffers can be allocated outside the first 4GB, and
> +    * offset must be limited to 32-bits.
> +    */
> +   if (support_48bit_offset)
> +      drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used,
> +                                    buffer, delta,
> +                                    read_domains, write_domain);
> +   else
> +      drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
> +                              buffer, delta,
> +                              read_domains, write_domain);
> +
>     assert(ret == 0);
>     (void) ret;
>
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> index ef8a6ff..de5023b 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> @@ -62,7 +62,8 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
>                                      drm_intel_bo *buffer,
>                                      uint32_t read_domains,
>                                      uint32_t write_domain,
> -                                    uint32_t offset);
> +                                    uint32_t offset,
> +                                    bool support_48bit_offset);
>  static inline uint32_t float_as_int(float f)
>  {
>     union {
> @@ -167,7 +168,12 @@ intel_batchbuffer_advance(struct brw_context *brw)
>
>  /* Handle 48-bit address relocations for Gen8+ */
>  #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
> -   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta);        \
> +   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 1);     \
> +} while (0)
> +
> +/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset */
> +#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do { \
> +   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 0);     \
>  } while (0)
>
>  #define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
> --
> 2.5.0
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
Emil Velikov Aug. 7, 2015, 9:28 p.m. UTC | #3
On 7 August 2015 at 20:57, Ilia Mirkin <imirkin@alum.mit.edu> wrote:
> On Fri, Aug 7, 2015 at 5:45 AM, Michel Thierry <michel.thierry@intel.com> wrote:
>> Gen8+ supports 48-bit virtual addresses, but some objects must always be
>> allocated inside the 32-bit address range.
>>
>> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
>> General State Heap or Intruction State Heap must be in a 32-bit range
>> (GSH / ISH), because the General State Offset and Instruction State Offset
>> are limited to 32-bits.
>>
>> Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and
>> the bo can be in the full address space.
>>
>> This commit introduces a dependency of libdrm 2.4.63, which introduces the
>> drm_intel_bo_emit_reloc_48bit function.
>>
>> v2: s/48baddress/48b_address/,
>>     Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset
>>     is needed (Ben)
>> v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation
>>     needs the 32-bit workaround (Chris)
>>
>> References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>> Cc: Ben Widawsky <ben@bwidawsk.net>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>> ---
>>  configure.ac                                  |  2 +-
>>  src/mesa/drivers/dri/i965/gen8_misc_state.c   | 19 +++++++++++--------
>>  src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++----
>>  src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++--
>>  4 files changed, 36 insertions(+), 15 deletions(-)
>>
>> diff --git a/configure.ac b/configure.ac
>> index af61aa2..c92ca44 100644
>> --- a/configure.ac
>> +++ b/configure.ac
>> @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION])
>>  dnl Versions for external dependencies
>>  LIBDRM_REQUIRED=2.4.38
>>  LIBDRM_RADEON_REQUIRED=2.4.56
>> -LIBDRM_INTEL_REQUIRED=2.4.60
>> +LIBDRM_INTEL_REQUIRED=2.4.63
>
> There is no such version. I think you need a release before you can
> commit this. Otherwise you'll cause pain for a whole lot of people.
>
Afaics Michel explicitly covered the way things will/should be merged
in the cover letter.

-Emil
Kristian Hogsberg Aug. 7, 2015, 10:46 p.m. UTC | #4
On Fri, Aug 7, 2015 at 2:45 AM, Michel Thierry <michel.thierry@intel.com> wrote:
> Gen8+ supports 48-bit virtual addresses, but some objects must always be
> allocated inside the 32-bit address range.
>
> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
> General State Heap or Intruction State Heap must be in a 32-bit range
> (GSH / ISH), because the General State Offset and Instruction State Offset
> are limited to 32-bits.

This doesn't look right. From the workaround text, it doesn't sound
like this is a HW problem, it only affects GMM. In mesa, we don't use
"heapless" (which I assume means setting the base to 0 and max range)
for instructions, dynamic state or surface state. Surface state and
dynamic state is always in our batch bo which is limited to 8k.
Provided STATE_BASE_ADDRESS works correctly in the HW, the batch bo
can be places anywhere in the aperture. Offsets to dynamic or surface
state is relative to the beginning of the batch bo and will always be
less than 4g. Instructions are in their own bo (brw->cache.bo), but
again, we point instruction state base to it and all our shader stage
setup code references the instructions relative to the beginning of
the instruction bo.

Kristian

> Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and
> the bo can be in the full address space.
>
> This commit introduces a dependency of libdrm 2.4.63, which introduces the
> drm_intel_bo_emit_reloc_48bit function.
>
> v2: s/48baddress/48b_address/,
>     Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset
>     is needed (Ben)
> v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation
>     needs the 32-bit workaround (Chris)
>
> References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
> Cc: Ben Widawsky <ben@bwidawsk.net>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  configure.ac                                  |  2 +-
>  src/mesa/drivers/dri/i965/gen8_misc_state.c   | 19 +++++++++++--------
>  src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++----
>  src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++--
>  4 files changed, 36 insertions(+), 15 deletions(-)
>
> diff --git a/configure.ac b/configure.ac
> index af61aa2..c92ca44 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION])
>  dnl Versions for external dependencies
>  LIBDRM_REQUIRED=2.4.38
>  LIBDRM_RADEON_REQUIRED=2.4.56
> -LIBDRM_INTEL_REQUIRED=2.4.60
> +LIBDRM_INTEL_REQUIRED=2.4.63
>  LIBDRM_NVVIEUX_REQUIRED=2.4.33
>  LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
>  LIBDRM_FREEDRENO_REQUIRED=2.4.57
> diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c
> index b20038e..73eba06 100644
> --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c
> @@ -28,6 +28,10 @@
>
>  /**
>   * Define the base addresses which some state is referenced from.
> + *
> + * Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State
> + * Offset and Instruction State Offset are limited to 32-bits by hardware,
> + * and must be located in the first 4GBs (32-bit offset).
>   */
>  void gen8_upload_state_base_address(struct brw_context *brw)
>  {
> @@ -41,19 +45,18 @@ void gen8_upload_state_base_address(struct brw_context *brw)
>     OUT_BATCH(0);
>     OUT_BATCH(mocs_wb << 16);
>     /* Surface state base address: */
> -   OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
> -               mocs_wb << 4 | 1);
> +   OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
> +                         mocs_wb << 4 | 1);
>     /* Dynamic state base address: */
> -   OUT_RELOC64(brw->batch.bo,
> -               I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
> -               mocs_wb << 4 | 1);
> +   OUT_RELOC64_INSIDE_4G(brw->batch.bo,
> +                         I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
> +                         mocs_wb << 4 | 1);
>     /* Indirect object base address: MEDIA_OBJECT data */
>     OUT_BATCH(mocs_wb << 4 | 1);
>     OUT_BATCH(0);
>     /* Instruction base address: shader kernels (incl. SIP) */
> -   OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
> -               mocs_wb << 4 | 1);
> -
> +   OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
> +                         mocs_wb << 4 | 1);
>     /* General state buffer size */
>     OUT_BATCH(0xfffff001);
>     /* Dynamic state buffer size */
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index 54081a1..ca90784 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -409,11 +409,23 @@ bool
>  intel_batchbuffer_emit_reloc64(struct brw_context *brw,
>                                 drm_intel_bo *buffer,
>                                 uint32_t read_domains, uint32_t write_domain,
> -                              uint32_t delta)
> +                               uint32_t delta,
> +                               bool support_48bit_offset)
>  {
> -   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
> -                                     buffer, delta,
> -                                     read_domains, write_domain);
> +   int ret;
> +
> +   /* Not all buffers can be allocated outside the first 4GB, and
> +    * offset must be limited to 32-bits.
> +    */
> +   if (support_48bit_offset)
> +      drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used,
> +                                    buffer, delta,
> +                                    read_domains, write_domain);
> +   else
> +      drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
> +                              buffer, delta,
> +                              read_domains, write_domain);
> +
>     assert(ret == 0);
>     (void) ret;
>
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> index ef8a6ff..de5023b 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> @@ -62,7 +62,8 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
>                                      drm_intel_bo *buffer,
>                                      uint32_t read_domains,
>                                      uint32_t write_domain,
> -                                    uint32_t offset);
> +                                    uint32_t offset,
> +                                    bool support_48bit_offset);
>  static inline uint32_t float_as_int(float f)
>  {
>     union {
> @@ -167,7 +168,12 @@ intel_batchbuffer_advance(struct brw_context *brw)
>
>  /* Handle 48-bit address relocations for Gen8+ */
>  #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
> -   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta);        \
> +   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 1);     \
> +} while (0)
> +
> +/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset */
> +#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do { \
> +   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 0);     \
>  } while (0)
>
>  #define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
> --
> 2.5.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Michel Thierry Aug. 10, 2015, 9:21 a.m. UTC | #5
Hi,

Thanks for the comments,

On 8/7/2015 11:46 PM, Kristian Høgsberg wrote:
> On Fri, Aug 7, 2015 at 2:45 AM, Michel Thierry <michel.thierry@intel.com> wrote:
>> Gen8+ supports 48-bit virtual addresses, but some objects must always be
>> allocated inside the 32-bit address range.
>>
>> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
>> General State Heap or Intruction State Heap must be in a 32-bit range
>> (GSH / ISH), because the General State Offset and Instruction State Offset
>> are limited to 32-bits.
>
> This doesn't look right. From the workaround text, it doesn't sound
> like this is a HW problem, it only affects GMM. In mesa, we don't use
> "heapless" (which I assume means setting the base to 0 and max range)

It is a hw problem, specifically in the state base address command. 
General State Base Address and Instruction Base Address shouldn't be > 4GB.

> for instructions, dynamic state or surface state. Surface state and
> dynamic state is always in our batch bo which is limited to 8k.
> Provided STATE_BASE_ADDRESS works correctly in the HW, the batch bo
> can be places anywhere in the aperture. Offsets to dynamic or surface
> state is relative to the beginning of the batch bo and will always be
> less than 4g. Instructions are in their own bo (brw->cache.bo), but
> again, we point instruction state base to it and all our shader stage
> setup code references the instructions relative to the beginning of
> the instruction bo.

I've seen the issue when the driver allocates mapped objects from bottom 
to top and the other bo's from top to bottom (gpu hangs). So I say this 
wa is needed.

-Michel
>
> Kristian
>
>> Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and
>> the bo can be in the full address space.
>>
>> This commit introduces a dependency of libdrm 2.4.63, which introduces the
>> drm_intel_bo_emit_reloc_48bit function.
>>
>> v2: s/48baddress/48b_address/,
>>      Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset
>>      is needed (Ben)
>> v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation
>>      needs the 32-bit workaround (Chris)
>>
>> References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>> Cc: Ben Widawsky <ben@bwidawsk.net>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>> ---
>>   configure.ac                                  |  2 +-
>>   src/mesa/drivers/dri/i965/gen8_misc_state.c   | 19 +++++++++++--------
>>   src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++----
>>   src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++--
>>   4 files changed, 36 insertions(+), 15 deletions(-)
>>
>> diff --git a/configure.ac b/configure.ac
>> index af61aa2..c92ca44 100644
>> --- a/configure.ac
>> +++ b/configure.ac
>> @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION])
>>   dnl Versions for external dependencies
>>   LIBDRM_REQUIRED=2.4.38
>>   LIBDRM_RADEON_REQUIRED=2.4.56
>> -LIBDRM_INTEL_REQUIRED=2.4.60
>> +LIBDRM_INTEL_REQUIRED=2.4.63
>>   LIBDRM_NVVIEUX_REQUIRED=2.4.33
>>   LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
>>   LIBDRM_FREEDRENO_REQUIRED=2.4.57
>> diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c
>> index b20038e..73eba06 100644
>> --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c
>> +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c
>> @@ -28,6 +28,10 @@
>>
>>   /**
>>    * Define the base addresses which some state is referenced from.
>> + *
>> + * Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State
>> + * Offset and Instruction State Offset are limited to 32-bits by hardware,
>> + * and must be located in the first 4GBs (32-bit offset).
>>    */
>>   void gen8_upload_state_base_address(struct brw_context *brw)
>>   {
>> @@ -41,19 +45,18 @@ void gen8_upload_state_base_address(struct brw_context *brw)
>>      OUT_BATCH(0);
>>      OUT_BATCH(mocs_wb << 16);
>>      /* Surface state base address: */
>> -   OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
>> -               mocs_wb << 4 | 1);
>> +   OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
>> +                         mocs_wb << 4 | 1);
>>      /* Dynamic state base address: */
>> -   OUT_RELOC64(brw->batch.bo,
>> -               I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
>> -               mocs_wb << 4 | 1);
>> +   OUT_RELOC64_INSIDE_4G(brw->batch.bo,
>> +                         I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
>> +                         mocs_wb << 4 | 1);
>>      /* Indirect object base address: MEDIA_OBJECT data */
>>      OUT_BATCH(mocs_wb << 4 | 1);
>>      OUT_BATCH(0);
>>      /* Instruction base address: shader kernels (incl. SIP) */
>> -   OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> -               mocs_wb << 4 | 1);
>> -
>> +   OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> +                         mocs_wb << 4 | 1);
>>      /* General state buffer size */
>>      OUT_BATCH(0xfffff001);
>>      /* Dynamic state buffer size */
>> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>> index 54081a1..ca90784 100644
>> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>> @@ -409,11 +409,23 @@ bool
>>   intel_batchbuffer_emit_reloc64(struct brw_context *brw,
>>                                  drm_intel_bo *buffer,
>>                                  uint32_t read_domains, uint32_t write_domain,
>> -                              uint32_t delta)
>> +                               uint32_t delta,
>> +                               bool support_48bit_offset)
>>   {
>> -   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
>> -                                     buffer, delta,
>> -                                     read_domains, write_domain);
>> +   int ret;
>> +
>> +   /* Not all buffers can be allocated outside the first 4GB, and
>> +    * offset must be limited to 32-bits.
>> +    */
>> +   if (support_48bit_offset)
>> +      drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used,
>> +                                    buffer, delta,
>> +                                    read_domains, write_domain);
>> +   else
>> +      drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
>> +                              buffer, delta,
>> +                              read_domains, write_domain);
>> +
>>      assert(ret == 0);
>>      (void) ret;
>>
>> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>> index ef8a6ff..de5023b 100644
>> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>> @@ -62,7 +62,8 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
>>                                       drm_intel_bo *buffer,
>>                                       uint32_t read_domains,
>>                                       uint32_t write_domain,
>> -                                    uint32_t offset);
>> +                                    uint32_t offset,
>> +                                    bool support_48bit_offset);
>>   static inline uint32_t float_as_int(float f)
>>   {
>>      union {
>> @@ -167,7 +168,12 @@ intel_batchbuffer_advance(struct brw_context *brw)
>>
>>   /* Handle 48-bit address relocations for Gen8+ */
>>   #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
>> -   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta);        \
>> +   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 1);     \
>> +} while (0)
>> +
>> +/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset */
>> +#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do { \
>> +   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 0);     \
>>   } while (0)
>>
>>   #define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
>> --
>> 2.5.0
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Kristian Hogsberg Aug. 10, 2015, 11:17 p.m. UTC | #6
On Mon, Aug 10, 2015 at 2:21 AM, Michel Thierry
<michel.thierry@intel.com> wrote:
> Hi,
>
> Thanks for the comments,
>
> On 8/7/2015 11:46 PM, Kristian Høgsberg wrote:
>>
>> On Fri, Aug 7, 2015 at 2:45 AM, Michel Thierry <michel.thierry@intel.com>
>> wrote:
>>>
>>> Gen8+ supports 48-bit virtual addresses, but some objects must always be
>>> allocated inside the 32-bit address range.
>>>
>>> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
>>> General State Heap or Intruction State Heap must be in a 32-bit range
>>> (GSH / ISH), because the General State Offset and Instruction State
>>> Offset
>>> are limited to 32-bits.
>>
>>
>> This doesn't look right. From the workaround text, it doesn't sound
>> like this is a HW problem, it only affects GMM. In mesa, we don't use
>> "heapless" (which I assume means setting the base to 0 and max range)
>
>
> It is a hw problem, specifically in the state base address command. General
> State Base Address and Instruction Base Address shouldn't be > 4GB.
>
>> for instructions, dynamic state or surface state. Surface state and
>> dynamic state is always in our batch bo which is limited to 8k.
>> Provided STATE_BASE_ADDRESS works correctly in the HW, the batch bo
>> can be places anywhere in the aperture. Offsets to dynamic or surface
>> state is relative to the beginning of the batch bo and will always be
>> less than 4g. Instructions are in their own bo (brw->cache.bo), but
>> again, we point instruction state base to it and all our shader stage
>> setup code references the instructions relative to the beginning of
>> the instruction bo.
>
>
> I've seen the issue when the driver allocates mapped objects from bottom to
> top and the other bo's from top to bottom (gpu hangs). So I say this wa is
> needed.

mesa does use heapless for the scratch buffers, and on  BDW+ that's
the only address type that's relative to general state base address.
So we need the workaround there, whether or not there's a HW
limitation/bug.

I don't think there is a HW problem though. The driver I'm working on
can use 48 bit ppgtt and Chris Wilson's softpin ioctl for userspace BO
placement. I enabled that and made the userspace allocator place all
BOs above 8g, and it all stilll worked. In particular, the
instructions were above 8g and referenced relative to instruction base
address. If the instruction base was limited to 4g, this would have
hung the GPU.

The fact that the HW limits the size of the heaps to 4g can be
restriction when we move to full 48 ppgtt, but for how mesa uses
STATE_BASE_ADDRESS, it only affects general state base address.

Kristiain

> -Michel
>
>>
>> Kristian
>>
>>> Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary,
>>> and
>>> the bo can be in the full address space.
>>>
>>> This commit introduces a dependency of libdrm 2.4.63, which introduces
>>> the
>>> drm_intel_bo_emit_reloc_48bit function.
>>>
>>> v2: s/48baddress/48b_address/,
>>>      Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address
>>> offset
>>>      is needed (Ben)
>>> v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit
>>> relocation
>>>      needs the 32-bit workaround (Chris)
>>>
>>> References:
>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>> Cc: Ben Widawsky <ben@bwidawsk.net>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>> ---
>>>   configure.ac                                  |  2 +-
>>>   src/mesa/drivers/dri/i965/gen8_misc_state.c   | 19 +++++++++++--------
>>>   src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++----
>>>   src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++--
>>>   4 files changed, 36 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/configure.ac b/configure.ac
>>> index af61aa2..c92ca44 100644
>>> --- a/configure.ac
>>> +++ b/configure.ac
>>> @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION])
>>>   dnl Versions for external dependencies
>>>   LIBDRM_REQUIRED=2.4.38
>>>   LIBDRM_RADEON_REQUIRED=2.4.56
>>> -LIBDRM_INTEL_REQUIRED=2.4.60
>>> +LIBDRM_INTEL_REQUIRED=2.4.63
>>>   LIBDRM_NVVIEUX_REQUIRED=2.4.33
>>>   LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
>>>   LIBDRM_FREEDRENO_REQUIRED=2.4.57
>>> diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c
>>> b/src/mesa/drivers/dri/i965/gen8_misc_state.c
>>> index b20038e..73eba06 100644
>>> --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c
>>> +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c
>>> @@ -28,6 +28,10 @@
>>>
>>>   /**
>>>    * Define the base addresses which some state is referenced from.
>>> + *
>>> + * Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State
>>> + * Offset and Instruction State Offset are limited to 32-bits by
>>> hardware,
>>> + * and must be located in the first 4GBs (32-bit offset).
>>>    */
>>>   void gen8_upload_state_base_address(struct brw_context *brw)
>>>   {
>>> @@ -41,19 +45,18 @@ void gen8_upload_state_base_address(struct
>>> brw_context *brw)
>>>      OUT_BATCH(0);
>>>      OUT_BATCH(mocs_wb << 16);
>>>      /* Surface state base address: */
>>> -   OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
>>> -               mocs_wb << 4 | 1);
>>> +   OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
>>> +                         mocs_wb << 4 | 1);
>>>      /* Dynamic state base address: */
>>> -   OUT_RELOC64(brw->batch.bo,
>>> -               I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
>>> -               mocs_wb << 4 | 1);
>>> +   OUT_RELOC64_INSIDE_4G(brw->batch.bo,
>>> +                         I915_GEM_DOMAIN_RENDER |
>>> I915_GEM_DOMAIN_INSTRUCTION, 0,
>>> +                         mocs_wb << 4 | 1);
>>>      /* Indirect object base address: MEDIA_OBJECT data */
>>>      OUT_BATCH(mocs_wb << 4 | 1);
>>>      OUT_BATCH(0);
>>>      /* Instruction base address: shader kernels (incl. SIP) */
>>> -   OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>>> -               mocs_wb << 4 | 1);
>>> -
>>> +   OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>>> +                         mocs_wb << 4 | 1);
>>>      /* General state buffer size */
>>>      OUT_BATCH(0xfffff001);
>>>      /* Dynamic state buffer size */
>>> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>>> b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>>> index 54081a1..ca90784 100644
>>> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>>> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>>> @@ -409,11 +409,23 @@ bool
>>>   intel_batchbuffer_emit_reloc64(struct brw_context *brw,
>>>                                  drm_intel_bo *buffer,
>>>                                  uint32_t read_domains, uint32_t
>>> write_domain,
>>> -                              uint32_t delta)
>>> +                               uint32_t delta,
>>> +                               bool support_48bit_offset)
>>>   {
>>> -   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
>>> -                                     buffer, delta,
>>> -                                     read_domains, write_domain);
>>> +   int ret;
>>> +
>>> +   /* Not all buffers can be allocated outside the first 4GB, and
>>> +    * offset must be limited to 32-bits.
>>> +    */
>>> +   if (support_48bit_offset)
>>> +      drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used,
>>> +                                    buffer, delta,
>>> +                                    read_domains, write_domain);
>>> +   else
>>> +      drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
>>> +                              buffer, delta,
>>> +                              read_domains, write_domain);
>>> +
>>>      assert(ret == 0);
>>>      (void) ret;
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>>> b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>>> index ef8a6ff..de5023b 100644
>>> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>>> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>>> @@ -62,7 +62,8 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context
>>> *brw,
>>>                                       drm_intel_bo *buffer,
>>>                                       uint32_t read_domains,
>>>                                       uint32_t write_domain,
>>> -                                    uint32_t offset);
>>> +                                    uint32_t offset,
>>> +                                    bool support_48bit_offset);
>>>   static inline uint32_t float_as_int(float f)
>>>   {
>>>      union {
>>> @@ -167,7 +168,12 @@ intel_batchbuffer_advance(struct brw_context *brw)
>>>
>>>   /* Handle 48-bit address relocations for Gen8+ */
>>>   #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
>>> -   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain,
>>> delta);        \
>>> +   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain,
>>> delta, 1);     \
>>> +} while (0)
>>> +
>>> +/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset
>>> */
>>> +#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do
>>> { \
>>> +   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain,
>>> delta, 0);     \
>>>   } while (0)
>>>
>>>   #define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
>>> --
>>> 2.5.0
>>>
>>> _______________________________________________
>>> Intel-gfx mailing list
>>> Intel-gfx@lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Patch
diff mbox

diff --git a/configure.ac b/configure.ac
index af61aa2..c92ca44 100644
--- a/configure.ac
+++ b/configure.ac
@@ -68,7 +68,7 @@  AC_SUBST([OSMESA_VERSION])
 dnl Versions for external dependencies
 LIBDRM_REQUIRED=2.4.38
 LIBDRM_RADEON_REQUIRED=2.4.56
-LIBDRM_INTEL_REQUIRED=2.4.60
+LIBDRM_INTEL_REQUIRED=2.4.63
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
 LIBDRM_FREEDRENO_REQUIRED=2.4.57
diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c
index b20038e..73eba06 100644
--- a/src/mesa/drivers/dri/i965/gen8_misc_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c
@@ -28,6 +28,10 @@ 
 
 /**
  * Define the base addresses which some state is referenced from.
+ *
+ * Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State
+ * Offset and Instruction State Offset are limited to 32-bits by hardware,
+ * and must be located in the first 4GBs (32-bit offset).
  */
 void gen8_upload_state_base_address(struct brw_context *brw)
 {
@@ -41,19 +45,18 @@  void gen8_upload_state_base_address(struct brw_context *brw)
    OUT_BATCH(0);
    OUT_BATCH(mocs_wb << 16);
    /* Surface state base address: */
-   OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
-               mocs_wb << 4 | 1);
+   OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
+                         mocs_wb << 4 | 1);
    /* Dynamic state base address: */
-   OUT_RELOC64(brw->batch.bo,
-               I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
-               mocs_wb << 4 | 1);
+   OUT_RELOC64_INSIDE_4G(brw->batch.bo,
+                         I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
+                         mocs_wb << 4 | 1);
    /* Indirect object base address: MEDIA_OBJECT data */
    OUT_BATCH(mocs_wb << 4 | 1);
    OUT_BATCH(0);
    /* Instruction base address: shader kernels (incl. SIP) */
-   OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
-               mocs_wb << 4 | 1);
-
+   OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+                         mocs_wb << 4 | 1);
    /* General state buffer size */
    OUT_BATCH(0xfffff001);
    /* Dynamic state buffer size */
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 54081a1..ca90784 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -409,11 +409,23 @@  bool
 intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                                drm_intel_bo *buffer,
                                uint32_t read_domains, uint32_t write_domain,
-			       uint32_t delta)
+                               uint32_t delta,
+                               bool support_48bit_offset)
 {
-   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
-                                     buffer, delta,
-                                     read_domains, write_domain);
+   int ret;
+
+   /* Not all buffers can be allocated outside the first 4GB, and
+    * offset must be limited to 32-bits.
+    */
+   if (support_48bit_offset)
+      drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used,
+                                    buffer, delta,
+                                    read_domains, write_domain);
+   else
+      drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+                              buffer, delta,
+                              read_domains, write_domain);
+
    assert(ret == 0);
    (void) ret;
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index ef8a6ff..de5023b 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -62,7 +62,8 @@  bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                                     drm_intel_bo *buffer,
                                     uint32_t read_domains,
                                     uint32_t write_domain,
-                                    uint32_t offset);
+                                    uint32_t offset,
+                                    bool support_48bit_offset);
 static inline uint32_t float_as_int(float f)
 {
    union {
@@ -167,7 +168,12 @@  intel_batchbuffer_advance(struct brw_context *brw)
 
 /* Handle 48-bit address relocations for Gen8+ */
 #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
-   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta);	\
+   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 1);	\
+} while (0)
+
+/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset */
+#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do { \
+   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 0);	\
 } while (0)
 
 #define ADVANCE_BATCH() intel_batchbuffer_advance(brw);