diff mbox series

[v5,3/4] drm/xe/migrate: Clear CCS when clearing bo on xe2

Message ID 20240704081841.30212-3-nirmoy.das@intel.com (mailing list archive)
State New, archived
Headers show
Series [v5,1/4] drm/ttm: Add a flag to allow drivers to skip clear-on-free | expand

Commit Message

Nirmoy Das July 4, 2024, 8:18 a.m. UTC
Clearing a bo with uncompressed PTEs also triggers CCS clearing on
Xe2, so skip emit_copy_ccs() on Xe2 when clearing a bo.

v2: When clearing a BO, the CCS clear happens with all commands as long
    as the PTEs are uncompressed.

Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
---
 drivers/gpu/drm/xe/xe_migrate.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

Comments

Nirmoy Das July 10, 2024, 9:14 a.m. UTC | #1
On 7/4/2024 10:18 AM, Nirmoy Das wrote:
> Clearing bo with uncompress PTE will trigger a CCS clearing as well
> for XE2, so skip emit_copy_ccs() when on xe2 when clearing bo.
>
> v2: When clearing BO, CCS clear happens with all command as long
>      as PTEs are uncompress.
>
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> Cc: Matthew Auld <matthew.auld@intel.com>
> Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_migrate.c | 7 +++++--
>   1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index e0a3f6921572..cc8beed2bf8e 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -1061,7 +1061,8 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
>   		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
>   			xe_res_next(&src_it, clear_L0);
>   		else
> -			emit_pte(m, bb, clear_L0_pt, clear_vram, clear_ccs,
> +			/* Use uncompressed pte so clear happens in the real memory. */
> +			emit_pte(m, bb, clear_L0_pt, clear_vram, false,
>   				 &src_it, clear_L0, dst);
>   
>   		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> @@ -1070,7 +1071,9 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
>   		if (clear_bo_data)
>   			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);
>   
> -		if (xe_device_has_flat_ccs(xe)) {
> +		/* Clearing BO with uncompress PTE will clear CCS metadata as well on XE2 */
> +		if (xe_device_has_flat_ccs(xe) && clear_ccs &&
> +		    !(clear_bo_data && GRAPHICS_VERx100(gt_to_xe(gt)) >= 2000)) {

Looking into Akshata's recent patch made me realize that I missed 
reducing batch_size when this condition is not met.


>   			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
>   				      m->cleared_mem_ofs, false, clear_L0);
>   			flush_flags = MI_FLUSH_DW_CCS;
Matthew Auld July 18, 2024, 4:40 p.m. UTC | #2
On 04/07/2024 09:18, Nirmoy Das wrote:
> Clearing bo with uncompress PTE will trigger a CCS clearing as well
> for XE2, so skip emit_copy_ccs() when on xe2 when clearing bo.
> 
> v2: When clearing BO, CCS clear happens with all command as long
>      as PTEs are uncompress.
> 
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> Cc: Matthew Auld <matthew.auld@intel.com>
> Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_migrate.c | 7 +++++--
>   1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index e0a3f6921572..cc8beed2bf8e 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -1061,7 +1061,8 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
>   		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
>   			xe_res_next(&src_it, clear_L0);
>   		else
> -			emit_pte(m, bb, clear_L0_pt, clear_vram, clear_ccs,
> +			/* Use uncompressed pte so clear happens in the real memory. */
> +			emit_pte(m, bb, clear_L0_pt, clear_vram, false,
>   				 &src_it, clear_L0, dst);

I assume this uses coherency 1way+ mode for that pat index? We could 
potentially use coh_none instead, for the case where bo.cpu_caching != 
wb. In theory that should be faster, but could be ignored for now.

>   
>   		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> @@ -1070,7 +1071,9 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
>   		if (clear_bo_data)
>   			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);
>   
> -		if (xe_device_has_flat_ccs(xe)) {
> +		/* Clearing BO with uncompress PTE will clear CCS metadata as well on XE2 */
> +		if (xe_device_has_flat_ccs(xe) && clear_ccs &&
> +		    !(clear_bo_data && GRAPHICS_VERx100(gt_to_xe(gt)) >= 2000)) {
>   			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
>   				      m->cleared_mem_ofs, false, clear_L0);
>   			flush_flags = MI_FLUSH_DW_CCS;
Nirmoy Das July 18, 2024, 6:33 p.m. UTC | #3
On 7/18/2024 6:40 PM, Matthew Auld wrote:
> On 04/07/2024 09:18, Nirmoy Das wrote:
>> Clearing bo with uncompress PTE will trigger a CCS clearing as well
>> for XE2, so skip emit_copy_ccs() when on xe2 when clearing bo.
>>
>> v2: When clearing BO, CCS clear happens with all command as long
>>      as PTEs are uncompress.
>>
>> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
>> Cc: Matthew Auld <matthew.auld@intel.com>
>> Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
>> Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_migrate.c | 7 +++++--
>>   1 file changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c 
>> b/drivers/gpu/drm/xe/xe_migrate.c
>> index e0a3f6921572..cc8beed2bf8e 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -1061,7 +1061,8 @@ struct dma_fence *xe_migrate_clear(struct 
>> xe_migrate *m,
>>           if (clear_vram && xe_migrate_allow_identity(clear_L0, 
>> &src_it))
>>               xe_res_next(&src_it, clear_L0);
>>           else
>> -            emit_pte(m, bb, clear_L0_pt, clear_vram, clear_ccs,
>> +            /* Use uncompressed pte so clear happens in the real 
>> memory. */
>> +            emit_pte(m, bb, clear_L0_pt, clear_vram, false,
>>                    &src_it, clear_L0, dst);
>
> I assume this uses coherency 1way+ mode for that pat index? 


When compression is false we use XE_CACHE_WB, which is 2-way coherent.

> We could potentially use coh_none instead, for the case where 
> bo.cpu_caching != wb. In theory that should be faster, but could be 
> ignored for now.

Yes, coh_none should be faster. I can try that out later on.


Thanks,

Nirmoy


>
>>             bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>> @@ -1070,7 +1071,9 @@ struct dma_fence *xe_migrate_clear(struct 
>> xe_migrate *m,
>>           if (clear_bo_data)
>>               emit_clear(gt, bb, clear_L0_ofs, clear_L0, 
>> XE_PAGE_SIZE, clear_vram);
>>   -        if (xe_device_has_flat_ccs(xe)) {
>> +        /* Clearing BO with uncompress PTE will clear CCS metadata 
>> as well on XE2 */
>> +        if (xe_device_has_flat_ccs(xe) && clear_ccs &&
>> +            !(clear_bo_data && GRAPHICS_VERx100(gt_to_xe(gt)) >= 
>> 2000)) {
>>               emit_copy_ccs(gt, bb, clear_L0_ofs, true,
>>                         m->cleared_mem_ofs, false, clear_L0);
>>               flush_flags = MI_FLUSH_DW_CCS;
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index e0a3f6921572..cc8beed2bf8e 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -1061,7 +1061,8 @@  struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
 			xe_res_next(&src_it, clear_L0);
 		else
-			emit_pte(m, bb, clear_L0_pt, clear_vram, clear_ccs,
+			/* Use uncompressed pte so clear happens in the real memory. */
+			emit_pte(m, bb, clear_L0_pt, clear_vram, false,
 				 &src_it, clear_L0, dst);
 
 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
@@ -1070,7 +1071,9 @@  struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		if (clear_bo_data)
 			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);
 
-		if (xe_device_has_flat_ccs(xe)) {
+		/* Clearing BO with uncompress PTE will clear CCS metadata as well on XE2 */
+		if (xe_device_has_flat_ccs(xe) && clear_ccs &&
+		    !(clear_bo_data && GRAPHICS_VERx100(gt_to_xe(gt)) >= 2000)) {
 			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
 				      m->cleared_mem_ofs, false, clear_L0);
 			flush_flags = MI_FLUSH_DW_CCS;