[1/2] drm/xe/lnl: Only do gpu sys page clear for non-pooled BOs

Message ID 20240821095035.29083-1-nirmoy.das@intel.com (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/xe/lnl: Only do gpu sys page clear for non-pooled BOs | expand

Commit Message

Nirmoy Das Aug. 21, 2024, 9:50 a.m. UTC
Currently XE lacks a clear-on-free implementation, so using
TTM_TT_FLAG_CLEARED_ON_FREE is invalid. Remove the usage of
TTM_TT_FLAG_CLEARED_ON_FREE and limit gpu system page clearing
to WB-cached BOs, which are not pooled, so there is no need to
return zeroed pages to a pool.

Without the patch:
api_overhead_benchmark_l0 --testFilter=UsmMemoryAllocation:
UsmMemoryAllocation(api=l0 type=Host size=4KB) 79.439 us
UsmMemoryAllocation(api=l0 type=Host size=1GB) 98677.75 us
Perf tool top 5 entries:
11.16%  api_overhead_be  [kernel.kallsyms] [k] __pageblock_pfn_to_page
7.85%  api_overhead_be  [kernel.kallsyms] [k] cpa_flush
7.59%  api_overhead_be  [kernel.kallsyms] [k] find_next_iomem_res
7.24%  api_overhead_be  [kernel.kallsyms] [k] pages_are_mergeable
5.53%  api_overhead_be  [kernel.kallsyms] [k] lookup_address_in_pgd_attr

With the patch:
UsmMemoryAllocation(api=l0 type=Host size=4KB) 78.164 us
UsmMemoryAllocation(api=l0 type=Host size=1GB) 98880.39 us
Perf tool top 5 entries:
25.40%  api_overhead_be  [kernel.kallsyms] [k] clear_page_erms
9.89%  api_overhead_be  [kernel.kallsyms] [k] pages_are_mergeable
4.64%  api_overhead_be  [kernel.kallsyms] [k] cpa_flush
4.04%  api_overhead_be  [kernel.kallsyms] [k] find_next_iomem_res
3.96%  api_overhead_be  [kernel.kallsyms] [k] mod_find

This is still better than the base case, where there was no
page-clearing offload.

Cc: Christian König <christian.koenig@amd.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
---
 drivers/gpu/drm/xe/xe_bo.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)
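For orientation before the review thread: the functional change boils down to one gating condition in xe_ttm_tt_create() plus a driver-private bool replacing the TTM page flag. Below is a minimal, compilable user-space sketch of that gating decision; the enum and function names are hypothetical stand-ins (the real inputs are ttm_bo->type, xe->mem.gpu_page_clear_sys and bo->cpu_caching).

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the driver state feeding the decision. */
enum bo_type { BO_TYPE_DEVICE, BO_TYPE_KERNEL };
enum cpu_caching { CACHING_WB, CACHING_WC };

/*
 * Mirrors the condition added in xe_ttm_tt_create(): gpu clearing is
 * enabled only for userspace-visible (device) BOs with WB caching,
 * which per the commit message are not pooled, so no non-zeroed page
 * can ever be handed back to a pool.
 */
static bool wants_gpu_clear(enum bo_type type, bool gpu_page_clear_sys,
			    enum cpu_caching caching)
{
	return type == BO_TYPE_DEVICE && gpu_page_clear_sys &&
	       caching == CACHING_WB;
}

int main(void)
{
	printf("WB device BO: %d\n",
	       wants_gpu_clear(BO_TYPE_DEVICE, true, CACHING_WB)); /* 1 */
	printf("WC device BO: %d\n",
	       wants_gpu_clear(BO_TYPE_DEVICE, true, CACHING_WC)); /* 0 */
	return 0;
}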

Comments

Thomas Hellström Aug. 28, 2024, 8:09 a.m. UTC | #1
On Wed, 2024-08-21 at 11:50 +0200, Nirmoy Das wrote:
> [commit message snipped]

Nirmoy, is it possible to split this up into a revert and then the
added functionality, so that we could quickly revert the security
issue and then add a separate patch that is an optimization?

/Thomas

Thomas Hellström Aug. 28, 2024, 8:12 a.m. UTC | #2
On Wed, 2024-08-28 at 10:09 +0200, Thomas Hellström wrote:
> On Wed, 2024-08-21 at 11:50 +0200, Nirmoy Das wrote:
> > [commit message snipped]
> 
> Nirmoy, is it possible to split this up into a revert and then the
> added functionality, so that we could quickly revert the security
> issue and then add a separate patch that is an optimization?

Wait, I see now on the list there's already a revert patch. Let me
check what's actually pending review.

/Thomas


Nirmoy Das Aug. 28, 2024, 8:19 a.m. UTC | #3
On 8/28/2024 10:12 AM, Thomas Hellström wrote:
> On Wed, 2024-08-28 at 10:09 +0200, Thomas Hellström wrote:
>> On Wed, 2024-08-21 at 11:50 +0200, Nirmoy Das wrote:
>>> [commit message snipped]
>> Nirmoy, is it possible to split this up into a revert and then the
>> added functionality, so that we could quickly revert the security
>> issue and then add a separate patch that is an optimization?
> Wait, I see now on the list there's already a revert patch. Let me
> check what's actually pending review.


Yes, I sent out https://patchwork.freedesktop.org/series/137855/
yesterday, which should be easier to review and merge. I will look into
the page zero offload later on, because for LNL we don't need to clear
CCS for WB, and pool-backed BOs don't have much alloc latency.


Regards,

Nirmoy

Nirmoy Das Aug. 28, 2024, 9:08 a.m. UTC | #4
Hi Thomas,

On 8/28/2024 10:09 AM, Thomas Hellström wrote:
> On Wed, 2024-08-21 at 11:50 +0200, Nirmoy Das wrote:
>> [commit message snipped]
> Nirmoy, is it possible to split this up into a revert and then the
> added functionality, so that we could quickly revert the security
> issue and then add a separate patch that is an optimization?


Sent out a new series that cleanly reverts both patches, so it can be
merged soon.

Regards,

Nirmoy

Patch

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 6ed0e1955215..a18408d5d185 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -283,6 +283,7 @@  struct xe_ttm_tt {
 	struct device *dev;
 	struct sg_table sgt;
 	struct sg_table *sg;
+	bool clear_system_pages;
 };
 
 static int xe_tt_map_sg(struct ttm_tt *tt)
@@ -397,12 +398,17 @@  static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo,
 	}
 
 	/*
-	 * If the device can support gpu clear system pages then set proper ttm
+	 * If the device supports gpu clearing of system pages then set the
 	 * flag. Zeroed pages are only required for ttm_bo_type_device so
 	 * unwanted data is not leaked to userspace.
+	 *
+	 * XE currently does clear-on-alloc, so the gpu clear will only work
+	 * on non-pooled (DRM_XE_GEM_CPU_CACHING_WB) BOs; otherwise the global
+	 * pool would get poisoned with non-zeroed pages.
 	 */
-	if (ttm_bo->type == ttm_bo_type_device && xe->mem.gpu_page_clear_sys)
-		page_flags |= TTM_TT_FLAG_CLEARED_ON_FREE;
+	if (ttm_bo->type == ttm_bo_type_device && xe->mem.gpu_page_clear_sys &&
+	    bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)
+		tt->clear_system_pages = true;
 
 	err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, extra_pages);
 	if (err) {
@@ -416,8 +422,11 @@  static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo,
 static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt,
 			      struct ttm_operation_ctx *ctx)
 {
+	struct xe_ttm_tt *xe_tt;
 	int err;
 
+	xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
+
 	/*
 	 * dma-bufs are not populated with pages, and the dma-
 	 * addresses are set up when moved to XE_PL_TT.
@@ -426,7 +435,7 @@  static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt,
 		return 0;
 
 	/* Clear TTM_TT_FLAG_ZERO_ALLOC when GPU is set to clear system pages */
-	if (tt->page_flags & TTM_TT_FLAG_CLEARED_ON_FREE)
+	if (xe_tt->clear_system_pages)
 		tt->page_flags &= ~TTM_TT_FLAG_ZERO_ALLOC;
 
 	err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx);
@@ -664,6 +673,7 @@  static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	struct ttm_resource *old_mem = ttm_bo->resource;
 	u32 old_mem_type = old_mem ? old_mem->mem_type : XE_PL_SYSTEM;
 	struct ttm_tt *ttm = ttm_bo->ttm;
+	struct xe_ttm_tt *xe_tt = container_of(ttm, struct xe_ttm_tt, ttm);
 	struct xe_migrate *migrate = NULL;
 	struct dma_fence *fence;
 	bool move_lacks_source;
@@ -671,15 +681,16 @@  static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	bool needs_clear;
 	bool handle_system_ccs = (!IS_DGFX(xe) && xe_bo_needs_ccs_pages(bo) &&
 				  ttm && ttm_tt_is_populated(ttm)) ? true : false;
-	bool clear_system_pages;
+	bool clear_system_pages = false;
 	int ret = 0;
 
 	/*
 	 * Clear TTM_TT_FLAG_CLEARED_ON_FREE on bo creation path when
 	 * moving to system as the bo doesn't have dma_mapping.
 	 */
-	if (!old_mem && ttm && !ttm_tt_is_populated(ttm))
-		ttm->page_flags &= ~TTM_TT_FLAG_CLEARED_ON_FREE;
+	if (!old_mem && ttm && !ttm_tt_is_populated(ttm) &&
+	    xe_tt->clear_system_pages)
+		xe_tt->clear_system_pages = false;
 
 	/* Bo creation path, moving to system or TT. */
 	if ((!old_mem && ttm) && !handle_system_ccs) {
@@ -703,7 +714,7 @@  static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	move_lacks_source = handle_system_ccs ? (!bo->ccs_cleared)  :
 						(!mem_type_is_vram(old_mem_type) && !tt_has_data);
 
-	clear_system_pages = ttm && (ttm->page_flags & TTM_TT_FLAG_CLEARED_ON_FREE);
+	clear_system_pages = ttm && xe_tt->clear_system_pages;
 	needs_clear = (ttm && ttm->page_flags & TTM_TT_FLAG_ZERO_ALLOC) ||
 		(!ttm && ttm_bo->type == ttm_bo_type_device) ||
 		clear_system_pages;
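
A closing note on the container_of() idiom the patch relies on in xe_ttm_tt_populate() and xe_bo_move() to reach the new clear_system_pages flag from a plain struct ttm_tt pointer: because struct xe_ttm_tt embeds struct ttm_tt, the enclosing object can be recovered from the member's address. A self-contained user-space sketch with made-up stand-in types (not the kernel's definitions):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* User-space equivalent of the kernel's container_of(): recover the
 * address of the enclosing struct from the address of one member. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ttm_tt_like {			/* stand-in for struct ttm_tt */
	unsigned int page_flags;
};

struct xe_ttm_tt_like {			/* stand-in for struct xe_ttm_tt */
	struct ttm_tt_like ttm;		/* embedded base object */
	bool clear_system_pages;	/* the flag added by this patch */
};

int main(void)
{
	struct xe_ttm_tt_like xe_tt = { .clear_system_pages = true };
	struct ttm_tt_like *tt = &xe_tt.ttm;	/* what TTM passes around */

	/* Same recovery step as in xe_ttm_tt_populate()/xe_bo_move(). */
	struct xe_ttm_tt_like *back =
		container_of(tt, struct xe_ttm_tt_like, ttm);

	printf("clear_system_pages = %d\n", back->clear_system_pages); /* 1 */
	return 0;
}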