diff mbox

[5/6] drm/radeon: validate relocations in the order determined by userspace

Message ID 1393255246-8296-6-git-send-email-maraeo@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Marek Olšák Feb. 24, 2014, 3:20 p.m. UTC
From: Marek Olšák <marek.olsak@amd.com>

Userspace should set the first 4 bits of drm_radeon_cs_reloc::flags to
a number from 0 to 15. The higher the number, the higher the priority,
which means a buffer with a higher number will be validated sooner.

The old behavior is preserved: Buffers used for write are prioritized over
read-only buffers if the userspace doesn't set the number.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 drivers/gpu/drm/radeon/radeon.h        |  2 +-
 drivers/gpu/drm/radeon/radeon_cs.c     | 53 ++++++++++++++++++++++++++++++++--
 drivers/gpu/drm/radeon/radeon_object.c | 10 -------
 drivers/gpu/drm/radeon/radeon_object.h |  2 --
 4 files changed, 51 insertions(+), 16 deletions(-)

Comments

Christian König Feb. 24, 2014, 4:27 p.m. UTC | #1
Am 24.02.2014 16:20, schrieb Marek Olšák:
> From: Marek Olšák <marek.olsak@amd.com>
>
> Userspace should set the first 4 bits of drm_radeon_cs_reloc::flags to
> a number from 0 to 15. The higher the number, the higher the priority,
> which means a buffer with a higher number will be validated sooner.

Assuming that we only have 32 different priorities it would probably be 
better to add the buffers to 32 different lists while evaluating the 
priorities in radeon_cs_parser_relocs and then concatenate all 32 lists 
at the end instead of sorting them.

>
> The old behavior is preserved: Buffers used for write are prioritized over
> read-only buffers if the userspace doesn't set the number.
>
> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
> ---
>   drivers/gpu/drm/radeon/radeon.h        |  2 +-
>   drivers/gpu/drm/radeon/radeon_cs.c     | 53 ++++++++++++++++++++++++++++++++--
>   drivers/gpu/drm/radeon/radeon_object.c | 10 -------
>   drivers/gpu/drm/radeon/radeon_object.h |  2 --
>   4 files changed, 51 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index d37a57a..f7a3174 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -481,7 +481,7 @@ struct radeon_bo_list {
>   	struct ttm_validate_buffer tv;
>   	struct radeon_bo	*bo;
>   	uint64_t		gpu_offset;
> -	bool			written;
> +	unsigned		priority;
>   	unsigned		domain;
>   	unsigned		alt_domain;
>   	u32			tiling_flags;
> diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
> index d49a3f7..1ba1a48 100644
> --- a/drivers/gpu/drm/radeon/radeon_cs.c
> +++ b/drivers/gpu/drm/radeon/radeon_cs.c
> @@ -31,6 +31,41 @@
>   #include "radeon.h"
>   #include "radeon_trace.h"
>   
> +/* This is a variation of the bucket sort with O(n) time complexity.
> + * The relocations are sorted from the highest to the lowest priority. */
> +static void sort_relocs_for_validation(struct list_head *list)
> +{
> +	struct list_head bucket[17], *it, *tmp;
> +	unsigned i, priority;
> +
> +	for (i = 0; i < 17; i++)
> +		INIT_LIST_HEAD(&bucket[i]);
> +
> +	/* Move the elements into buckets. An i-th bucket only contains
> +	 * elements with priorities i*2 and i*2+1. Odd numbers are added
> +	 * at the head of a bucket and even numbers are added at the tail,
> +	 * therefore all buckets are always sorted. */
> +	list_for_each_safe(it, tmp, list) {
> +		priority = list_entry(it, struct radeon_bo_list,
> +				      tv.head)->priority;
> +		i = priority / 2;
> +		i = min(i, 16u) ;
> +
> +		if (priority % 2 == 1) {
> +			list_move(it, &bucket[i]);
> +		} else {
> +			list_move_tail(it, &bucket[i]);
> +		}
> +	}
> +
> +	INIT_LIST_HEAD(list);
> +
> +	/* connect the sorted buckets */
> +	for (i = 0; i < 17; i++) {
> +		list_splice(&bucket[i], list);
> +	}
> +}
> +
>   static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
>   {
>   	struct drm_device *ddev = p->rdev->ddev;
> @@ -80,7 +115,15 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
>   		p->relocs_ptr[i] = &p->relocs[i];
>   		p->relocs[i].robj = gem_to_radeon_bo(p->relocs[i].gobj);
>   		p->relocs[i].lobj.bo = p->relocs[i].robj;
> -		p->relocs[i].lobj.written = !!r->write_domain;
> +
> +		/* The userspace buffer priorities are from 0 to 15. A higher
> +		 * number means the buffer is more important.
> +		 * Also, the buffers used for write have a higher priority than
> +		 * the buffers used for read only, which doubles the range
> +		 * to 0 to 31. Numbers 32 and 33 are reserved for the kernel
> +		 * driver.
> +		 */
> +		p->relocs[i].lobj.priority = (r->flags & 0xf) * 2 + !!r->write_domain;
>   
>   		/* the first reloc of an UVD job is the msg and that must be in
>   		   VRAM, also but everything into VRAM on AGP cards to avoid
> @@ -94,6 +137,8 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
>   			p->relocs[i].lobj.alt_domain =
>   				RADEON_GEM_DOMAIN_VRAM;
>   
> +			/* prioritize this over any other relocation */
> +			p->relocs[i].lobj.priority = 32;
>   		} else {
>   			uint32_t domain = r->write_domain ?
>   				r->write_domain : r->read_domains;
> @@ -107,9 +152,11 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
>   		p->relocs[i].lobj.tv.bo = &p->relocs[i].robj->tbo;
>   		p->relocs[i].handle = r->handle;
>   
> -		radeon_bo_list_add_object(&p->relocs[i].lobj,
> -					  &p->validated);
> +		list_add(&p->relocs[i].lobj.tv.head, &p->validated);
>   	}
> +
> +	sort_relocs_for_validation(&p->validated);
> +
>   	return radeon_bo_list_validate(&p->ticket, &p->validated, p->ring);
>   }
>   
> diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
> index d676ee2..19042ae 100644
> --- a/drivers/gpu/drm/radeon/radeon_object.c
> +++ b/drivers/gpu/drm/radeon/radeon_object.c
> @@ -368,16 +368,6 @@ void radeon_bo_fini(struct radeon_device *rdev)
>   	arch_phys_wc_del(rdev->mc.vram_mtrr);
>   }
>   
> -void radeon_bo_list_add_object(struct radeon_bo_list *lobj,
> -				struct list_head *head)
> -{
> -	if (lobj->written) {
> -		list_add(&lobj->tv.head, head);
> -	} else {
> -		list_add_tail(&lobj->tv.head, head);
> -	}
> -}
> -
>   int radeon_bo_list_validate(struct ww_acquire_ctx *ticket,
>   			    struct list_head *head, int ring)
>   {
> diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h
> index a9a8c11..6c3ca9e 100644
> --- a/drivers/gpu/drm/radeon/radeon_object.h
> +++ b/drivers/gpu/drm/radeon/radeon_object.h
> @@ -138,8 +138,6 @@ extern int radeon_bo_evict_vram(struct radeon_device *rdev);
>   extern void radeon_bo_force_delete(struct radeon_device *rdev);
>   extern int radeon_bo_init(struct radeon_device *rdev);
>   extern void radeon_bo_fini(struct radeon_device *rdev);
> -extern void radeon_bo_list_add_object(struct radeon_bo_list *lobj,
> -				struct list_head *head);
>   extern int radeon_bo_list_validate(struct ww_acquire_ctx *ticket,
>   				   struct list_head *head, int ring);
>   extern int radeon_bo_fbdev_mmap(struct radeon_bo *bo,
diff mbox

Patch

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index d37a57a..f7a3174 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -481,7 +481,7 @@  struct radeon_bo_list {
 	struct ttm_validate_buffer tv;
 	struct radeon_bo	*bo;
 	uint64_t		gpu_offset;
-	bool			written;
+	unsigned		priority;
 	unsigned		domain;
 	unsigned		alt_domain;
 	u32			tiling_flags;
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index d49a3f7..1ba1a48 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -31,6 +31,41 @@ 
 #include "radeon.h"
 #include "radeon_trace.h"
 
+/* This is a variation of the bucket sort with O(n) time complexity.
+ * The relocations are sorted from the highest to the lowest priority. */
+static void sort_relocs_for_validation(struct list_head *list)
+{
+	struct list_head bucket[17], *it, *tmp;
+	unsigned i, priority;
+
+	for (i = 0; i < 17; i++)
+		INIT_LIST_HEAD(&bucket[i]);
+
+	/* Move the elements into buckets. An i-th bucket only contains
+	 * elements with priorities i*2 and i*2+1. Odd numbers are added
+	 * at the head of a bucket and even numbers are added at the tail,
+	 * therefore all buckets are always sorted. */
+	list_for_each_safe(it, tmp, list) {
+		priority = list_entry(it, struct radeon_bo_list,
+				      tv.head)->priority;
+		i = priority / 2;
+		i = min(i, 16u) ;
+
+		if (priority % 2 == 1) {
+			list_move(it, &bucket[i]);
+		} else {
+			list_move_tail(it, &bucket[i]);
+		}
+	}
+
+	INIT_LIST_HEAD(list);
+
+	/* connect the sorted buckets */
+	for (i = 0; i < 17; i++) {
+		list_splice(&bucket[i], list);
+	}
+}
+
 static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
 {
 	struct drm_device *ddev = p->rdev->ddev;
@@ -80,7 +115,15 @@  static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
 		p->relocs_ptr[i] = &p->relocs[i];
 		p->relocs[i].robj = gem_to_radeon_bo(p->relocs[i].gobj);
 		p->relocs[i].lobj.bo = p->relocs[i].robj;
-		p->relocs[i].lobj.written = !!r->write_domain;
+
+		/* The userspace buffer priorities are from 0 to 15. A higher
+		 * number means the buffer is more important.
+		 * Also, the buffers used for write have a higher priority than
+		 * the buffers used for read only, which doubles the range
+		 * to 0 to 31. Numbers 32 and 33 are reserved for the kernel
+		 * driver.
+		 */
+		p->relocs[i].lobj.priority = (r->flags & 0xf) * 2 + !!r->write_domain;
 
 		/* the first reloc of an UVD job is the msg and that must be in
 		   VRAM, also but everything into VRAM on AGP cards to avoid
@@ -94,6 +137,8 @@  static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
 			p->relocs[i].lobj.alt_domain =
 				RADEON_GEM_DOMAIN_VRAM;
 
+			/* prioritize this over any other relocation */
+			p->relocs[i].lobj.priority = 32;
 		} else {
 			uint32_t domain = r->write_domain ?
 				r->write_domain : r->read_domains;
@@ -107,9 +152,11 @@  static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
 		p->relocs[i].lobj.tv.bo = &p->relocs[i].robj->tbo;
 		p->relocs[i].handle = r->handle;
 
-		radeon_bo_list_add_object(&p->relocs[i].lobj,
-					  &p->validated);
+		list_add(&p->relocs[i].lobj.tv.head, &p->validated);
 	}
+
+	sort_relocs_for_validation(&p->validated);
+
 	return radeon_bo_list_validate(&p->ticket, &p->validated, p->ring);
 }
 
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index d676ee2..19042ae 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -368,16 +368,6 @@  void radeon_bo_fini(struct radeon_device *rdev)
 	arch_phys_wc_del(rdev->mc.vram_mtrr);
 }
 
-void radeon_bo_list_add_object(struct radeon_bo_list *lobj,
-				struct list_head *head)
-{
-	if (lobj->written) {
-		list_add(&lobj->tv.head, head);
-	} else {
-		list_add_tail(&lobj->tv.head, head);
-	}
-}
-
 int radeon_bo_list_validate(struct ww_acquire_ctx *ticket,
 			    struct list_head *head, int ring)
 {
diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h
index a9a8c11..6c3ca9e 100644
--- a/drivers/gpu/drm/radeon/radeon_object.h
+++ b/drivers/gpu/drm/radeon/radeon_object.h
@@ -138,8 +138,6 @@  extern int radeon_bo_evict_vram(struct radeon_device *rdev);
 extern void radeon_bo_force_delete(struct radeon_device *rdev);
 extern int radeon_bo_init(struct radeon_device *rdev);
 extern void radeon_bo_fini(struct radeon_device *rdev);
-extern void radeon_bo_list_add_object(struct radeon_bo_list *lobj,
-				struct list_head *head);
 extern int radeon_bo_list_validate(struct ww_acquire_ctx *ticket,
 				   struct list_head *head, int ring);
 extern int radeon_bo_fbdev_mmap(struct radeon_bo *bo,