diff mbox series

drm/ttm: Merge hugepage attr changes in ttm_dma_page_put.

Message ID 20180725202950.51216-1-basni@chromium.org (mailing list archive)
State New, archived
Headers show
Series drm/ttm: Merge hugepage attr changes in ttm_dma_page_put. | expand

Commit Message

Bas Nieuwenhuizen July 25, 2018, 8:29 p.m. UTC
Every set_pages_array_wb call resulted in cross-core
interrupts and TLB flushes. Merge more of them for
less overhead.

This reduces the time needed to free a 1.6 GiB GTT WC
buffer as part of Vulkan CTS from  ~2 sec to < 0.25 sec.
(Allocation still takes more than 2 sec though)

Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>
---
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

Comments

Huang Rui July 26, 2018, 4:02 a.m. UTC | #1
On Wed, Jul 25, 2018 at 10:29:50PM +0200, Bas Nieuwenhuizen wrote:
> Every set_pages_array_wb call resulted in cross-core
> interrupts and TLB flushes. Merge more of them for
> less overhead.
> 
> This reduces the time needed to free a 1.6 GiB GTT WC
> buffer as part of Vulkan CTS from  ~2 sec to < 0.25 sec.
> (Allocation still takes more than 2 sec though)
> 
> Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

Reviewed-by: Huang Rui <ray.huang@amd.com>

> ---
>  drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
>  1 file changed, 24 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> index 4c659405a008a..9440ba0a55116 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> @@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
>  #endif
>  	return 0;
>  }
> +
> +static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
> +{
> +#if IS_ENABLED(CONFIG_AGP)
> +        unsigned long i;
> +
> +        for (i = 0; i < numpages; i++)
> +                unmap_page_from_agp(p + i);
> +#endif
> +	return 0;
> +}
> +
> +#else /* for !CONFIG_X86 */
> +
> +static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
> +{
> +	return set_memory_wb((unsigned long)page_address(p), numpages);
> +}
> +
>  #endif /* for !CONFIG_X86 */
>  
>  static int ttm_set_pages_caching(struct dma_pool *pool,
> @@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
>  static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
>  {
>  	struct page *page = d_page->p;
> -	unsigned i, num_pages;
> +	unsigned num_pages;
>  	int ret;
>  
>  	/* Don't set WB on WB page pool. */
>  	if (!(pool->type & IS_CACHED)) {
>  		num_pages = pool->size / PAGE_SIZE;
> -		for (i = 0; i < num_pages; ++i, ++page) {
> -			ret = set_pages_array_wb(&page, 1);
> -			if (ret) {
> -				pr_err("%s: Failed to set %d pages to wb!\n",
> -				       pool->dev_name, 1);
> -			}
> +		ret = ttm_set_page_range_wb(page, num_pages);
> +		if (ret) {
> +			pr_err("%s: Failed to set %d pages to wb!\n",
> +			       pool->dev_name, num_pages);
>  		}
>  	}
>  
> -- 
> 2.18.0.233.g985f88cf7e-goog
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Zhang, Jerry(Junwei) July 26, 2018, 5:52 a.m. UTC | #2
On 07/26/2018 04:29 AM, Bas Nieuwenhuizen wrote:
> Every set_pages_array_wb call resulted in cross-core
> interrupts and TLB flushes. Merge more of them for
> less overhead.
>
> This reduces the time needed to free a 1.6 GiB GTT WC
> buffer as part of Vulkan CTS from  ~2 sec to < 0.25 sec.
> (Allocation still takes more than 2 sec though)
>
> Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>
> ---
>   drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
>   1 file changed, 24 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> index 4c659405a008a..9440ba0a55116 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> @@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
>   #endif
>   	return 0;
>   }
> +
> +static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
> +{
> +#if IS_ENABLED(CONFIG_AGP)
> +        unsigned long i;
> +
> +        for (i = 0; i < numpages; i++)
> +                unmap_page_from_agp(p + i);
> +#endif
> +	return 0;
> +}
> +
> +#else /* for !CONFIG_X86 */
> +
> +static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
> +{
> +	return set_memory_wb((unsigned long)page_address(p), numpages);
> +}
> +
>   #endif /* for !CONFIG_X86 */
>
>   static int ttm_set_pages_caching(struct dma_pool *pool,
> @@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
>   static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
>   {
>   	struct page *page = d_page->p;
> -	unsigned i, num_pages;
> +	unsigned num_pages;
>   	int ret;
>
>   	/* Don't set WB on WB page pool. */
>   	if (!(pool->type & IS_CACHED)) {
>   		num_pages = pool->size / PAGE_SIZE;
> -		for (i = 0; i < num_pages; ++i, ++page) {
> -			ret = set_pages_array_wb(&page, 1);
> -			if (ret) {
> -				pr_err("%s: Failed to set %d pages to wb!\n",
> -				       pool->dev_name, 1);
> -			}
> +		ret = ttm_set_page_range_wb(page, num_pages);

For AGP enabled, set_pages_array_wc() could work like that by passing "num_pages" instead of "1"
In X86 case, we may use set_pages_array_wb() in arch/x86/mm/pageattr.c.

so, does it work as below?

ret = set_pages_array_wb(page, num_pages);

Jerry

> +		if (ret) {
> +			pr_err("%s: Failed to set %d pages to wb!\n",
> +			       pool->dev_name, num_pages);
>   		}
>   	}
>
>
Christian König July 26, 2018, 6:37 a.m. UTC | #3
Am 25.07.2018 um 22:29 schrieb Bas Nieuwenhuizen:
> Every set_pages_array_wb call resulted in cross-core
> interrupts and TLB flushes. Merge more of them for
> less overhead.
>
> This reduces the time needed to free a 1.6 GiB GTT WC
> buffer as part of Vulkan CTS from  ~2 sec to < 0.25 sec.
> (Allocation still takes more than 2 sec though)

Yeah, I was already wondering when I originally implemented this if 
there isn't a better approach.

This needs a bit of cleanup I think, e.g. use set_pages_wb() instead of 
set_memory_wb() and we should move the non-x86 abstraction into a common 
header for both ttm_page_alloc_dma.c and ttm_page_alloc.c.

Bas, do you want to tackle this or should I just take a look?

Christian.

>
> Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>
> ---
>   drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
>   1 file changed, 24 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> index 4c659405a008a..9440ba0a55116 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> @@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
>   #endif
>   	return 0;
>   }
> +
> +static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
> +{
> +#if IS_ENABLED(CONFIG_AGP)
> +        unsigned long i;
> +
> +        for (i = 0; i < numpages; i++)
> +                unmap_page_from_agp(p + i);
> +#endif
> +	return 0;
> +}
> +
> +#else /* for !CONFIG_X86 */
> +
> +static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
> +{
> +	return set_memory_wb((unsigned long)page_address(p), numpages);
> +}
> +
>   #endif /* for !CONFIG_X86 */
>   
>   static int ttm_set_pages_caching(struct dma_pool *pool,
> @@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
>   static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
>   {
>   	struct page *page = d_page->p;
> -	unsigned i, num_pages;
> +	unsigned num_pages;
>   	int ret;
>   
>   	/* Don't set WB on WB page pool. */
>   	if (!(pool->type & IS_CACHED)) {
>   		num_pages = pool->size / PAGE_SIZE;
> -		for (i = 0; i < num_pages; ++i, ++page) {
> -			ret = set_pages_array_wb(&page, 1);
> -			if (ret) {
> -				pr_err("%s: Failed to set %d pages to wb!\n",
> -				       pool->dev_name, 1);
> -			}
> +		ret = ttm_set_page_range_wb(page, num_pages);
> +		if (ret) {
> +			pr_err("%s: Failed to set %d pages to wb!\n",
> +			       pool->dev_name, num_pages);
>   		}
>   	}
>
Huang Rui July 26, 2018, 7:02 a.m. UTC | #4
On Thu, Jul 26, 2018 at 08:37:45AM +0200, Christian König wrote:
> Am 25.07.2018 um 22:29 schrieb Bas Nieuwenhuizen:
> >Every set_pages_array_wb call resulted in cross-core
> >interrupts and TLB flushes. Merge more of them for
> >less overhead.
> >
> >This reduces the time needed to free a 1.6 GiB GTT WC
> >buffer as part of Vulkan CTS from  ~2 sec to < 0.25 sec.
> >(Allocation still takes more than 2 sec though)
> 
> Yeah, I was already wondering when I originally implemented this if
> there isn't a better approach.
> 
> This needs a bit of cleanup I think, e.g. use set_pages_wb() instead
> of set_memory_wb() and we should move the non-x86 abstraction into a
> common header for both ttm_page_alloc_dma.c and ttm_page_alloc.c.
> 

Agree, at first glance I almost got it wrong by misreading "#ifndef
CONFIG_X86". So it makes sense to move the non-x86 definition into another
header.

Thanks,
Ray

> Bas, do you want to tackle this or should I just take a look?
> 
> Christian.
> 
> >
> >Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>
> >---
> >  drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
> >  1 file changed, 24 insertions(+), 7 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> >index 4c659405a008a..9440ba0a55116 100644
> >--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> >+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> >@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
> >  #endif
> >  	return 0;
> >  }
> >+
> >+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
> >+{
> >+#if IS_ENABLED(CONFIG_AGP)
> >+        unsigned long i;
> >+
> >+        for (i = 0; i < numpages; i++)
> >+                unmap_page_from_agp(p + i);
> >+#endif
> >+	return 0;
> >+}
> >+
> >+#else /* for !CONFIG_X86 */
> >+
> >+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
> >+{
> >+	return set_memory_wb((unsigned long)page_address(p), numpages);
> >+}
> >+
> >  #endif /* for !CONFIG_X86 */
> >  static int ttm_set_pages_caching(struct dma_pool *pool,
> >@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
> >  static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
> >  {
> >  	struct page *page = d_page->p;
> >-	unsigned i, num_pages;
> >+	unsigned num_pages;
> >  	int ret;
> >  	/* Don't set WB on WB page pool. */
> >  	if (!(pool->type & IS_CACHED)) {
> >  		num_pages = pool->size / PAGE_SIZE;
> >-		for (i = 0; i < num_pages; ++i, ++page) {
> >-			ret = set_pages_array_wb(&page, 1);
> >-			if (ret) {
> >-				pr_err("%s: Failed to set %d pages to wb!\n",
> >-				       pool->dev_name, 1);
> >-			}
> >+		ret = ttm_set_page_range_wb(page, num_pages);
> >+		if (ret) {
> >+			pr_err("%s: Failed to set %d pages to wb!\n",
> >+			       pool->dev_name, num_pages);
> >  		}
> >  	}
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Bas Nieuwenhuizen July 26, 2018, 9:06 a.m. UTC | #5
On Thu, Jul 26, 2018 at 7:52 AM, Zhang, Jerry (Junwei)
<Jerry.Zhang@amd.com> wrote:
> On 07/26/2018 04:29 AM, Bas Nieuwenhuizen wrote:
>>
>> Every set_pages_array_wb call resulted in cross-core
>> interrupts and TLB flushes. Merge more of them for
>> less overhead.
>>
>> This reduces the time needed to free a 1.6 GiB GTT WC
>> buffer as part of Vulkan CTS from  ~2 sec to < 0.25 sec.
>> (Allocation still takes more than 2 sec though)
>>
>> Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>
>> ---
>>   drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
>>   1 file changed, 24 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
>> b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
>> index 4c659405a008a..9440ba0a55116 100644
>> --- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
>> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
>> @@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages,
>> int addrinarray)
>>   #endif
>>         return 0;
>>   }
>> +
>> +static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
>> +{
>> +#if IS_ENABLED(CONFIG_AGP)
>> +        unsigned long i;
>> +
>> +        for (i = 0; i < numpages; i++)
>> +                unmap_page_from_agp(p + i);
>> +#endif
>> +       return 0;
>> +}
>> +
>> +#else /* for !CONFIG_X86 */
>> +
>> +static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
>> +{
>> +       return set_memory_wb((unsigned long)page_address(p), numpages);
>> +}
>> +
>>   #endif /* for !CONFIG_X86 */
>>
>>   static int ttm_set_pages_caching(struct dma_pool *pool,
>> @@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct
>> dma_pool *pool,
>>   static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page
>> *d_page)
>>   {
>>         struct page *page = d_page->p;
>> -       unsigned i, num_pages;
>> +       unsigned num_pages;
>>         int ret;
>>
>>         /* Don't set WB on WB page pool. */
>>         if (!(pool->type & IS_CACHED)) {
>>                 num_pages = pool->size / PAGE_SIZE;
>> -               for (i = 0; i < num_pages; ++i, ++page) {
>> -                       ret = set_pages_array_wb(&page, 1);
>> -                       if (ret) {
>> -                               pr_err("%s: Failed to set %d pages to
>> wb!\n",
>> -                                      pool->dev_name, 1);
>> -                       }
>> +               ret = ttm_set_page_range_wb(page, num_pages);
>
>
> For AGP enabled, set_pages_array_wc() could work like that by passing
> "num_pages" instead of "1"
> In X86 case, we may use set_pages_array_wb() in arch/x86/mm/pageattr.c.
>
> so, does it work as below?
>
> ret = set_pages_array_wb(page, num_pages);

No that would not work. Note that we have an array of page structs,
while set_pages_array_wb() wants an array of pointers to page structs.
We could allocate a temporary array and write the pointers but that
seems unnecessarily inefficient to me, and probably also does not
achieve a reduction in code.

>
> Jerry
>
>
>> +               if (ret) {
>> +                       pr_err("%s: Failed to set %d pages to wb!\n",
>> +                              pool->dev_name, num_pages);
>>                 }
>>         }
>>
>>
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@  static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
 	return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+	return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+	return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@  static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
 	struct page *page = d_page->p;
-	unsigned i, num_pages;
+	unsigned num_pages;
 	int ret;
 
 	/* Don't set WB on WB page pool. */
 	if (!(pool->type & IS_CACHED)) {
 		num_pages = pool->size / PAGE_SIZE;
-		for (i = 0; i < num_pages; ++i, ++page) {
-			ret = set_pages_array_wb(&page, 1);
-			if (ret) {
-				pr_err("%s: Failed to set %d pages to wb!\n",
-				       pool->dev_name, 1);
-			}
+		ret = ttm_set_page_range_wb(page, num_pages);
+		if (ret) {
+			pr_err("%s: Failed to set %d pages to wb!\n",
+			       pool->dev_name, num_pages);
 		}
 	}