diff mbox series

[v2,4/4] drm/i915/: Re-work clflush_write32

Message ID 20220128221020.188253-5-michael.cheng@intel.com (mailing list archive)
State New, archived
Headers show
Series Use drm_clflush* instead of clflush | expand

Commit Message

Michael Cheng Jan. 28, 2022, 10:10 p.m. UTC
Use drm_clflush_virt_range instead of clflushopt and remove the memory
barrier, since drm_clflush_virt_range takes care of that.

Signed-off-by: Michael Cheng <michael.cheng@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

Comments

Bowman, Casey G Jan. 29, 2022, 7:24 a.m. UTC | #1
> -----Original Message-----
> From: Cheng, Michael <michael.cheng@intel.com>
> Sent: Friday, January 28, 2022 2:10 PM
> To: intel-gfx@lists.freedesktop.org
> Cc: Cheng, Michael <michael.cheng@intel.com>; Bowman, Casey G
> <casey.g.bowman@intel.com>; De Marchi, Lucas
> <lucas.demarchi@intel.com>; Boyer, Wayne <wayne.boyer@intel.com>;
> ville.syrjala@linux.intel.com; Kuoppala, Mika <mika.kuoppala@intel.com>;
> Auld, Matthew <matthew.auld@intel.com>
> Subject: [PATCH v2 4/4] drm/i915/: Re-work clflush_write32
> 
> Use drm_clflush_virt_range instead of clflushopt and remove the memory
> barrier, since drm_clflush_virt_range takes care of that.
> 
> Signed-off-by: Michael Cheng <michael.cheng@intel.com>

Reviewed-by: Casey Bowman <casey.g.bowman@intel.com>

> ---
>  drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 +++-----
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 498b458fd784..0854276ff7ba 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -1332,10 +1332,8 @@ static void *reloc_vaddr(struct i915_vma *vma,
> static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)  {
>  	if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
> -		if (flushes & CLFLUSH_BEFORE) {
> -			clflushopt(addr);
> -			mb();
> -		}
> +		if (flushes & CLFLUSH_BEFORE)
> +			drm_clflush_virt_range(addr, sizeof(addr));
> 
>  		*addr = value;
> 
> @@ -1347,7 +1345,7 @@ static void clflush_write32(u32 *addr, u32 value,
> unsigned int flushes)
>  		 * to ensure ordering of clflush wrt to the system.
>  		 */
>  		if (flushes & CLFLUSH_AFTER)
> -			clflushopt(addr);
> +			drm_clflush_virt_range(addr, sizeof(addr));
>  	} else
>  		*addr = value;
>  }
> --
> 2.25.1
Tvrtko Ursulin Jan. 31, 2022, 2:55 p.m. UTC | #2
On 28/01/2022 22:10, Michael Cheng wrote:
> Use drm_clflush_virt_range instead of clflushopt and remove the memory
> barrier, since drm_clflush_virt_range takes care of that.
> 
> Signed-off-by: Michael Cheng <michael.cheng@intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 +++-----
>   1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 498b458fd784..0854276ff7ba 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -1332,10 +1332,8 @@ static void *reloc_vaddr(struct i915_vma *vma,
>   static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
>   {
>   	if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
> -		if (flushes & CLFLUSH_BEFORE) {
> -			clflushopt(addr);
> -			mb();
> -		}
> +		if (flushes & CLFLUSH_BEFORE)
> +			drm_clflush_virt_range(addr, sizeof(addr));
>   
>   		*addr = value;
>   
> @@ -1347,7 +1345,7 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
>   		 * to ensure ordering of clflush wrt to the system.
>   		 */
>   		if (flushes & CLFLUSH_AFTER)
> -			clflushopt(addr);
> +			drm_clflush_virt_range(addr, sizeof(addr));
>   	} else
>   		*addr = value;
>   }

Slightly annoying thing here (maybe in some other patches from the series as well) is that the change adds a function call to x86 only code path, because relocations are not supported on discrete as per:

static in
eb_validate_vma(...)
         /* Relocations are disallowed for all platforms after TGL-LP.  This
          * also covers all platforms with local memory.
          */

         if (entry->relocation_count &&
             GRAPHICS_VER(eb->i915) >= 12 && !IS_TIGERLAKE(eb->i915))
                 return -EINVAL;

How acceptable would be, for the whole series, to introduce a static inline i915 cluflush wrapper and so be able to avoid functions calls on x86? Is this something that has been discussed and discounted already?

Regards,

Tvrtko

P.S. Hmm I am now reminded of my really old per platform build patches. With them you would be able to compile out large portions of the driver when building for ARM. Probably like a 3rd if my memory serves me right.
Michael Cheng Jan. 31, 2022, 5:02 p.m. UTC | #3
Hey Tvrtko,

Are you saying when adding drm_clflush_virt_range(addr, sizeof(addr), 
this function forces an x86 code path only? If that is the case, 
drm_clflush_virt_range(addr, sizeof(addr) currently has ifdefs that 
seperate out x86 and powerpc, so we can add an ifdef for arm in the near 
future when needed.

On 2022-01-31 6:55 a.m., Tvrtko Ursulin wrote:
> On 28/01/2022 22:10, Michael Cheng wrote:
>> Use drm_clflush_virt_range instead of clflushopt and remove the memory
>> barrier, since drm_clflush_virt_range takes care of that.
>>
>> Signed-off-by: Michael Cheng <michael.cheng@intel.com>
>> ---
>>   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 +++-----
>>   1 file changed, 3 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
>> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> index 498b458fd784..0854276ff7ba 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>> @@ -1332,10 +1332,8 @@ static void *reloc_vaddr(struct i915_vma *vma,
>>   static void clflush_write32(u32 *addr, u32 value, unsigned int 
>> flushes)
>>   {
>>       if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
>> -        if (flushes & CLFLUSH_BEFORE) {
>> -            clflushopt(addr);
>> -            mb();
>> -        }
>> +        if (flushes & CLFLUSH_BEFORE)
>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>             *addr = value;
>>   @@ -1347,7 +1345,7 @@ static void clflush_write32(u32 *addr, u32 
>> value, unsigned int flushes)
>>            * to ensure ordering of clflush wrt to the system.
>>            */
>>           if (flushes & CLFLUSH_AFTER)
>> -            clflushopt(addr);
>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>       } else
>>           *addr = value;
>>   }
>
> Slightly annoying thing here (maybe in some other patches from the 
> series as well) is that the change adds a function call to x86 only 
> code path, because relocations are not supported on discrete as per:
>
> static in
> eb_validate_vma(...)
>         /* Relocations are disallowed for all platforms after TGL-LP.  
> This
>          * also covers all platforms with local memory.
>          */
>
>         if (entry->relocation_count &&
>             GRAPHICS_VER(eb->i915) >= 12 && !IS_TIGERLAKE(eb->i915))
>                 return -EINVAL;
>
> How acceptable would be, for the whole series, to introduce a static 
> inline i915 cluflush wrapper and so be able to avoid functions calls 
> on x86? Is this something that has been discussed and discounted already?
>
> Regards,
>
> Tvrtko
>
> P.S. Hmm I am now reminded of my really old per platform build 
> patches. With them you would be able to compile out large portions of 
> the driver when building for ARM. Probably like a 3rd if my memory 
> serves me right.
Tvrtko Ursulin Feb. 1, 2022, 9:25 a.m. UTC | #4
On 31/01/2022 17:02, Michael Cheng wrote:
> Hey Tvrtko,
> 
> Are you saying when adding drm_clflush_virt_range(addr, sizeof(addr), 
> this function forces an x86 code path only? If that is the case, 
> drm_clflush_virt_range(addr, sizeof(addr) currently has ifdefs that 
> seperate out x86 and powerpc, so we can add an ifdef for arm in the near 
> future when needed.

No, I was noticing that the change you are making in this patch, while 
it indeed fixes a build failure, it is a code path which does not get 
executed on Arm at all.

So what effectively happens is a single assembly instruction gets 
replaced with a function call on all integrated GPUs up to and including 
Tigerlake.

That was the slightly annoying part I was referring to and asking 
whether it was discussed before.

Sadly I don't think there is a super nice solution apart from 
duplicating drm_clflush_virt_range as for example i915_clflush_range and 
having it static inline. That would allow the integrated GPU code path 
to remain of the same performance profile, while solving the Arm 
problem. However it would be code duplication so might be frowned upon.

I'd be tempted to go that route but it is something which needs a bit of 
discussion if that hasn't happened already.

Regards,

Tvrtko

> On 2022-01-31 6:55 a.m., Tvrtko Ursulin wrote:
>> On 28/01/2022 22:10, Michael Cheng wrote:
>>> Use drm_clflush_virt_range instead of clflushopt and remove the memory
>>> barrier, since drm_clflush_virt_range takes care of that.
>>>
>>> Signed-off-by: Michael Cheng <michael.cheng@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 +++-----
>>>   1 file changed, 3 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
>>> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> index 498b458fd784..0854276ff7ba 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> @@ -1332,10 +1332,8 @@ static void *reloc_vaddr(struct i915_vma *vma,
>>>   static void clflush_write32(u32 *addr, u32 value, unsigned int 
>>> flushes)
>>>   {
>>>       if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
>>> -        if (flushes & CLFLUSH_BEFORE) {
>>> -            clflushopt(addr);
>>> -            mb();
>>> -        }
>>> +        if (flushes & CLFLUSH_BEFORE)
>>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>>             *addr = value;
>>>   @@ -1347,7 +1345,7 @@ static void clflush_write32(u32 *addr, u32 
>>> value, unsigned int flushes)
>>>            * to ensure ordering of clflush wrt to the system.
>>>            */
>>>           if (flushes & CLFLUSH_AFTER)
>>> -            clflushopt(addr);
>>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>>       } else
>>>           *addr = value;
>>>   }
>>
>> Slightly annoying thing here (maybe in some other patches from the 
>> series as well) is that the change adds a function call to x86 only 
>> code path, because relocations are not supported on discrete as per:
>>
>> static in
>> eb_validate_vma(...)
>>         /* Relocations are disallowed for all platforms after TGL-LP. 
>> This
>>          * also covers all platforms with local memory.
>>          */
>>
>>         if (entry->relocation_count &&
>>             GRAPHICS_VER(eb->i915) >= 12 && !IS_TIGERLAKE(eb->i915))
>>                 return -EINVAL;
>>
>> How acceptable would be, for the whole series, to introduce a static 
>> inline i915 cluflush wrapper and so be able to avoid functions calls 
>> on x86? Is this something that has been discussed and discounted already?
>>
>> Regards,
>>
>> Tvrtko
>>
>> P.S. Hmm I am now reminded of my really old per platform build 
>> patches. With them you would be able to compile out large portions of 
>> the driver when building for ARM. Probably like a 3rd if my memory 
>> serves me right.
Michael Cheng Feb. 1, 2022, 3:41 p.m. UTC | #5
Ah, thanks for the clarification! While discussion goes on about the 
route you suggested, could we land these patches (after addressing the 
reviews) to unblock compiling i915 on arm?

On 2022-02-01 1:25 a.m., Tvrtko Ursulin wrote:
>
> On 31/01/2022 17:02, Michael Cheng wrote:
>> Hey Tvrtko,
>>
>> Are you saying when adding drm_clflush_virt_range(addr, sizeof(addr), 
>> this function forces an x86 code path only? If that is the case, 
>> drm_clflush_virt_range(addr, sizeof(addr) currently has ifdefs that 
>> seperate out x86 and powerpc, so we can add an ifdef for arm in the 
>> near future when needed.
>
> No, I was noticing that the change you are making in this patch, while 
> it indeed fixes a build failure, it is a code path which does not get 
> executed on Arm at all.
>
> So what effectively happens is a single assembly instruction gets 
> replaced with a function call on all integrated GPUs up to and 
> including Tigerlake.
>
> That was the slightly annoying part I was referring to and asking 
> whether it was discussed before.
>
> Sadly I don't think there is a super nice solution apart from 
> duplicating drm_clflush_virt_range as for example i915_clflush_range 
> and having it static inline. That would allow the integrated GPU code 
> path to remain of the same performance profile, while solving the Arm 
> problem. However it would be code duplication so might be frowned upon.
>
> I'd be tempted to go that route but it is something which needs a bit 
> of discussion if that hasn't happened already.
>
> Regards,
>
> Tvrtko
>
>> On 2022-01-31 6:55 a.m., Tvrtko Ursulin wrote:
>>> On 28/01/2022 22:10, Michael Cheng wrote:
>>>> Use drm_clflush_virt_range instead of clflushopt and remove the memory
>>>> barrier, since drm_clflush_virt_range takes care of that.
>>>>
>>>> Signed-off-by: Michael Cheng <michael.cheng@intel.com>
>>>> ---
>>>>   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 +++-----
>>>>   1 file changed, 3 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
>>>> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>> index 498b458fd784..0854276ff7ba 100644
>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>> @@ -1332,10 +1332,8 @@ static void *reloc_vaddr(struct i915_vma *vma,
>>>>   static void clflush_write32(u32 *addr, u32 value, unsigned int 
>>>> flushes)
>>>>   {
>>>>       if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
>>>> -        if (flushes & CLFLUSH_BEFORE) {
>>>> -            clflushopt(addr);
>>>> -            mb();
>>>> -        }
>>>> +        if (flushes & CLFLUSH_BEFORE)
>>>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>>>             *addr = value;
>>>>   @@ -1347,7 +1345,7 @@ static void clflush_write32(u32 *addr, u32 
>>>> value, unsigned int flushes)
>>>>            * to ensure ordering of clflush wrt to the system.
>>>>            */
>>>>           if (flushes & CLFLUSH_AFTER)
>>>> -            clflushopt(addr);
>>>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>>>       } else
>>>>           *addr = value;
>>>>   }
>>>
>>> Slightly annoying thing here (maybe in some other patches from the 
>>> series as well) is that the change adds a function call to x86 only 
>>> code path, because relocations are not supported on discrete as per:
>>>
>>> static in
>>> eb_validate_vma(...)
>>>         /* Relocations are disallowed for all platforms after 
>>> TGL-LP. This
>>>          * also covers all platforms with local memory.
>>>          */
>>>
>>>         if (entry->relocation_count &&
>>>             GRAPHICS_VER(eb->i915) >= 12 && !IS_TIGERLAKE(eb->i915))
>>>                 return -EINVAL;
>>>
>>> How acceptable would be, for the whole series, to introduce a static 
>>> inline i915 cluflush wrapper and so be able to avoid functions calls 
>>> on x86? Is this something that has been discussed and discounted 
>>> already?
>>>
>>> Regards,
>>>
>>> Tvrtko
>>>
>>> P.S. Hmm I am now reminded of my really old per platform build 
>>> patches. With them you would be able to compile out large portions 
>>> of the driver when building for ARM. Probably like a 3rd if my 
>>> memory serves me right.
Tvrtko Ursulin Feb. 1, 2022, 4:32 p.m. UTC | #6
On 01/02/2022 15:41, Michael Cheng wrote:
> Ah, thanks for the clarification! While discussion goes on about the 
> route you suggested, could we land these patches (after addressing the 
> reviews) to unblock compiling i915 on arm?

I am 60-40 to no, since follow up can be hard. I'd prefer a little bit 
of discussion before merging.

Also, what will be the Arm implementation of drm_clflush_virt_range? 
Noob question - why is i915 the only driver calling it? Do other GPUs 
never need to flush CPU cache?

Regards,

Tvrtko

> On 2022-02-01 1:25 a.m., Tvrtko Ursulin wrote:
>>
>> On 31/01/2022 17:02, Michael Cheng wrote:
>>> Hey Tvrtko,
>>>
>>> Are you saying when adding drm_clflush_virt_range(addr, sizeof(addr), 
>>> this function forces an x86 code path only? If that is the case, 
>>> drm_clflush_virt_range(addr, sizeof(addr) currently has ifdefs that 
>>> seperate out x86 and powerpc, so we can add an ifdef for arm in the 
>>> near future when needed.
>>
>> No, I was noticing that the change you are making in this patch, while 
>> it indeed fixes a build failure, it is a code path which does not get 
>> executed on Arm at all.
>>
>> So what effectively happens is a single assembly instruction gets 
>> replaced with a function call on all integrated GPUs up to and 
>> including Tigerlake.
>>
>> That was the slightly annoying part I was referring to and asking 
>> whether it was discussed before.
>>
>> Sadly I don't think there is a super nice solution apart from 
>> duplicating drm_clflush_virt_range as for example i915_clflush_range 
>> and having it static inline. That would allow the integrated GPU code 
>> path to remain of the same performance profile, while solving the Arm 
>> problem. However it would be code duplication so might be frowned upon.
>>
>> I'd be tempted to go that route but it is something which needs a bit 
>> of discussion if that hasn't happened already.
>>
>> Regards,
>>
>> Tvrtko
>>
>>> On 2022-01-31 6:55 a.m., Tvrtko Ursulin wrote:
>>>> On 28/01/2022 22:10, Michael Cheng wrote:
>>>>> Use drm_clflush_virt_range instead of clflushopt and remove the memory
>>>>> barrier, since drm_clflush_virt_range takes care of that.
>>>>>
>>>>> Signed-off-by: Michael Cheng <michael.cheng@intel.com>
>>>>> ---
>>>>>   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 +++-----
>>>>>   1 file changed, 3 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
>>>>> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>> index 498b458fd784..0854276ff7ba 100644
>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>> @@ -1332,10 +1332,8 @@ static void *reloc_vaddr(struct i915_vma *vma,
>>>>>   static void clflush_write32(u32 *addr, u32 value, unsigned int 
>>>>> flushes)
>>>>>   {
>>>>>       if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
>>>>> -        if (flushes & CLFLUSH_BEFORE) {
>>>>> -            clflushopt(addr);
>>>>> -            mb();
>>>>> -        }
>>>>> +        if (flushes & CLFLUSH_BEFORE)
>>>>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>>>>             *addr = value;
>>>>>   @@ -1347,7 +1345,7 @@ static void clflush_write32(u32 *addr, u32 
>>>>> value, unsigned int flushes)
>>>>>            * to ensure ordering of clflush wrt to the system.
>>>>>            */
>>>>>           if (flushes & CLFLUSH_AFTER)
>>>>> -            clflushopt(addr);
>>>>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>>>>       } else
>>>>>           *addr = value;
>>>>>   }
>>>>
>>>> Slightly annoying thing here (maybe in some other patches from the 
>>>> series as well) is that the change adds a function call to x86 only 
>>>> code path, because relocations are not supported on discrete as per:
>>>>
>>>> static in
>>>> eb_validate_vma(...)
>>>>         /* Relocations are disallowed for all platforms after 
>>>> TGL-LP. This
>>>>          * also covers all platforms with local memory.
>>>>          */
>>>>
>>>>         if (entry->relocation_count &&
>>>>             GRAPHICS_VER(eb->i915) >= 12 && !IS_TIGERLAKE(eb->i915))
>>>>                 return -EINVAL;
>>>>
>>>> How acceptable would be, for the whole series, to introduce a static 
>>>> inline i915 cluflush wrapper and so be able to avoid functions calls 
>>>> on x86? Is this something that has been discussed and discounted 
>>>> already?
>>>>
>>>> Regards,
>>>>
>>>> Tvrtko
>>>>
>>>> P.S. Hmm I am now reminded of my really old per platform build 
>>>> patches. With them you would be able to compile out large portions 
>>>> of the driver when building for ARM. Probably like a 3rd if my 
>>>> memory serves me right.
Michael Cheng Feb. 2, 2022, 4:35 p.m. UTC | #7
As far as I know, we haven't gotten to the arm implementation yet, since 
we are trying to get i915 compile for arm without using random ifdefs 
and dummy functions.

"Noob question - why is i915 the only driver calling it? Do other GPUs 
never need to flush CPU cache?"

Unfortunately I don't have enough expertise to comfortable answer this 
question. Maybe someone else can chime in here? Lucas? Matt?

On 2022-02-01 8:32 a.m., Tvrtko Ursulin wrote:
>
> On 01/02/2022 15:41, Michael Cheng wrote:
>> Ah, thanks for the clarification! While discussion goes on about the 
>> route you suggested, could we land these patches (after addressing 
>> the reviews) to unblock compiling i915 on arm?
>
> I am 60-40 to no, since follow up can be hard. I'd prefer a little bit 
> of discussion before merging.
>
> Also, what will be the Arm implementation of drm_clflush_virt_range? 
> Noob question - why is i915 the only driver calling it? Do other GPUs 
> never need to flush CPU cache?
>
> Regards,
>
> Tvrtko
>
>> On 2022-02-01 1:25 a.m., Tvrtko Ursulin wrote:
>>>
>>> On 31/01/2022 17:02, Michael Cheng wrote:
>>>> Hey Tvrtko,
>>>>
>>>> Are you saying when adding drm_clflush_virt_range(addr, 
>>>> sizeof(addr), this function forces an x86 code path only? If that 
>>>> is the case, drm_clflush_virt_range(addr, sizeof(addr) currently 
>>>> has ifdefs that seperate out x86 and powerpc, so we can add an 
>>>> ifdef for arm in the near future when needed.
>>>
>>> No, I was noticing that the change you are making in this patch, 
>>> while it indeed fixes a build failure, it is a code path which does 
>>> not get executed on Arm at all.
>>>
>>> So what effectively happens is a single assembly instruction gets 
>>> replaced with a function call on all integrated GPUs up to and 
>>> including Tigerlake.
>>>
>>> That was the slightly annoying part I was referring to and asking 
>>> whether it was discussed before.
>>>
>>> Sadly I don't think there is a super nice solution apart from 
>>> duplicating drm_clflush_virt_range as for example i915_clflush_range 
>>> and having it static inline. That would allow the integrated GPU 
>>> code path to remain of the same performance profile, while solving 
>>> the Arm problem. However it would be code duplication so might be 
>>> frowned upon.
>>>
>>> I'd be tempted to go that route but it is something which needs a 
>>> bit of discussion if that hasn't happened already.
>>>
>>> Regards,
>>>
>>> Tvrtko
>>>
>>>> On 2022-01-31 6:55 a.m., Tvrtko Ursulin wrote:
>>>>> On 28/01/2022 22:10, Michael Cheng wrote:
>>>>>> Use drm_clflush_virt_range instead of clflushopt and remove the 
>>>>>> memory
>>>>>> barrier, since drm_clflush_virt_range takes care of that.
>>>>>>
>>>>>> Signed-off-by: Michael Cheng <michael.cheng@intel.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 +++-----
>>>>>>   1 file changed, 3 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
>>>>>> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>>> index 498b458fd784..0854276ff7ba 100644
>>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>>>>> @@ -1332,10 +1332,8 @@ static void *reloc_vaddr(struct i915_vma 
>>>>>> *vma,
>>>>>>   static void clflush_write32(u32 *addr, u32 value, unsigned int 
>>>>>> flushes)
>>>>>>   {
>>>>>>       if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
>>>>>> -        if (flushes & CLFLUSH_BEFORE) {
>>>>>> -            clflushopt(addr);
>>>>>> -            mb();
>>>>>> -        }
>>>>>> +        if (flushes & CLFLUSH_BEFORE)
>>>>>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>>>>>             *addr = value;
>>>>>>   @@ -1347,7 +1345,7 @@ static void clflush_write32(u32 *addr, 
>>>>>> u32 value, unsigned int flushes)
>>>>>>            * to ensure ordering of clflush wrt to the system.
>>>>>>            */
>>>>>>           if (flushes & CLFLUSH_AFTER)
>>>>>> -            clflushopt(addr);
>>>>>> +            drm_clflush_virt_range(addr, sizeof(addr));
>>>>>>       } else
>>>>>>           *addr = value;
>>>>>>   }
>>>>>
>>>>> Slightly annoying thing here (maybe in some other patches from the 
>>>>> series as well) is that the change adds a function call to x86 
>>>>> only code path, because relocations are not supported on discrete 
>>>>> as per:
>>>>>
>>>>> static in
>>>>> eb_validate_vma(...)
>>>>>         /* Relocations are disallowed for all platforms after 
>>>>> TGL-LP. This
>>>>>          * also covers all platforms with local memory.
>>>>>          */
>>>>>
>>>>>         if (entry->relocation_count &&
>>>>>             GRAPHICS_VER(eb->i915) >= 12 && !IS_TIGERLAKE(eb->i915))
>>>>>                 return -EINVAL;
>>>>>
>>>>> How acceptable would be, for the whole series, to introduce a 
>>>>> static inline i915 cluflush wrapper and so be able to avoid 
>>>>> functions calls on x86? Is this something that has been discussed 
>>>>> and discounted already?
>>>>>
>>>>> Regards,
>>>>>
>>>>> Tvrtko
>>>>>
>>>>> P.S. Hmm I am now reminded of my really old per platform build 
>>>>> patches. With them you would be able to compile out large portions 
>>>>> of the driver when building for ARM. Probably like a 3rd if my 
>>>>> memory serves me right.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 498b458fd784..0854276ff7ba 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1332,10 +1332,8 @@  static void *reloc_vaddr(struct i915_vma *vma,
 static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 {
 	if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
-		if (flushes & CLFLUSH_BEFORE) {
-			clflushopt(addr);
-			mb();
-		}
+		if (flushes & CLFLUSH_BEFORE)
+			drm_clflush_virt_range(addr, sizeof(addr));
 
 		*addr = value;
 
@@ -1347,7 +1345,7 @@  static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 		 * to ensure ordering of clflush wrt to the system.
 		 */
 		if (flushes & CLFLUSH_AFTER)
-			clflushopt(addr);
+			drm_clflush_virt_range(addr, sizeof(addr));
 	} else
 		*addr = value;
 }