vc4: Add an ioctl for labeling GEM BOs for summary stats
diff mbox

Message ID 20170622205054.31472-1-eric@anholt.net
State New
Headers show

Commit Message

Eric Anholt June 22, 2017, 8:50 p.m. UTC
This has proven immensely useful for debugging memory leaks and
overallocation (which is a rather serious concern on the platform,
given that we typically run at about 256MB of CMA out of up to 1GB
total memory, with framebuffers that are about 8MB ecah).

The state of the art without this is to dump debug logs from every GL
application, guess as to kernel allocations based on bo_stats, and try
to merge that all together into a global picture of memory allocation
state.  With this, you can add a couple of calls to the debug build of
the 3D driver and get a pretty detailed view of GPU memory usage from
/debug/dri/0/bo_stats (or when we debug print to dmesg on allocation
failure).

The Mesa side currently labels at the gallium resource level (so you
see that a 1920x20 pixmap has been created, presumably for the window
system panel), but we could extend that to be even more useful with
glObjectLabel() names being sent all the way down to the kernel.

(partial) example of sorted debugfs output with Mesa labeling all
resources:

               kernel BO cache:  16392kb BOs (3)
       tiling shadow 1920x1080:   8160kb BOs (1)
       resource 1920x1080@32/0:   8160kb BOs (1)
scanout resource 1920x1080@32/0:   8100kb BOs (1)
                        kernel:   8100kb BOs (1)

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_bo.c        | 255 ++++++++++++++++++++++++++++--------
 drivers/gpu/drm/vc4/vc4_drv.c       |   8 +-
 drivers/gpu/drm/vc4/vc4_drv.h       |  39 +++++-
 drivers/gpu/drm/vc4/vc4_gem.c       |   2 +-
 drivers/gpu/drm/vc4/vc4_render_cl.c |   2 +-
 drivers/gpu/drm/vc4/vc4_v3d.c       |   3 +-
 include/uapi/drm/vc4_drm.h          |  11 ++
 7 files changed, 254 insertions(+), 66 deletions(-)

Comments

Eric Anholt July 25, 2017, 4:28 p.m. UTC | #1
Eric Anholt <eric@anholt.net> writes:

> This has proven immensely useful for debugging memory leaks and
> overallocation (which is a rather serious concern on the platform,
> given that we typically run at about 256MB of CMA out of up to 1GB
> total memory, with framebuffers that are about 8MB ecah).

I'm still interested in merging this patch if someone can ack.
Chris Wilson July 25, 2017, 5:09 p.m. UTC | #2
Quoting Eric Anholt (2017-06-22 21:50:54)
> This has proven immensely useful for debugging memory leaks and
> overallocation (which is a rather serious concern on the platform,
> given that we typically run at about 256MB of CMA out of up to 1GB
> total memory, with framebuffers that are about 8MB ecah).
> 
> The state of the art without this is to dump debug logs from every GL
> application, guess as to kernel allocations based on bo_stats, and try
> to merge that all together into a global picture of memory allocation
> state.  With this, you can add a couple of calls to the debug build of
> the 3D driver and get a pretty detailed view of GPU memory usage from
> /debug/dri/0/bo_stats (or when we debug print to dmesg on allocation
> failure).
> 
> The Mesa side currently labels at the gallium resource level (so you
> see that a 1920x20 pixmap has been created, presumably for the window
> system panel), but we could extend that to be even more useful with
> glObjectLabel() names being sent all the way down to the kernel.
> 
> (partial) example of sorted debugfs output with Mesa labeling all
> resources:
> 
>                kernel BO cache:  16392kb BOs (3)
>        tiling shadow 1920x1080:   8160kb BOs (1)
>        resource 1920x1080@32/0:   8160kb BOs (1)
> scanout resource 1920x1080@32/0:   8100kb BOs (1)
>                         kernel:   8100kb BOs (1)
> 
> Signed-off-by: Eric Anholt <eric@anholt.net>
> ---

>  static void vc4_bo_stats_dump(struct vc4_dev *vc4)
>  {

Now unused?

> -       DRM_INFO("num bos allocated: %d\n",
> -                vc4->bo_stats.num_allocated);
> -       DRM_INFO("size bos allocated: %dkb\n",
> -                vc4->bo_stats.size_allocated / 1024);
> -       DRM_INFO("num bos used: %d\n",
> -                vc4->bo_stats.num_allocated - vc4->bo_stats.num_cached);
> -       DRM_INFO("size bos used: %dkb\n",
> -                (vc4->bo_stats.size_allocated -
> -                 vc4->bo_stats.size_cached) / 1024);
> -       DRM_INFO("num bos cached: %d\n",
> -                vc4->bo_stats.num_cached);
> -       DRM_INFO("size bos cached: %dkb\n",
> -                vc4->bo_stats.size_cached / 1024);
> +       int i;
> +
> +       for (i = 0; i < vc4->num_labels; i++) {
> +               if (!vc4->bo_labels[i].num_allocated)
> +                       continue;
> +
> +               DRM_INFO("%30s: %6dkb BOs (%d)\n",
> +                        vc4->bo_labels[i].name,
> +                        vc4->bo_labels[i].size_allocated / 1024,
> +                        vc4->bo_labels[i].num_allocated);
> +       }
>  }
>  
>  #ifdef CONFIG_DEBUG_FS

> +/* Must be called with bo_lock held. */
> +static void vc4_bo_set_label(struct drm_gem_object *gem_obj, int label)
> +{
> +       struct vc4_bo *bo = to_vc4_bo(gem_obj);
> +       struct vc4_dev *vc4 = to_vc4_dev(gem_obj->dev);

lockdep_assert_held(&vc4->bo_lock);

> +
> +       vc4->bo_labels[label].num_allocated++;
> +       vc4->bo_labels[label].size_allocated += gem_obj->size;

This gets called with label=-1 on destroy.

> +       vc4->bo_labels[bo->label].num_allocated--;
> +       vc4->bo_labels[bo->label].size_allocated -= gem_obj->size;

Ok, preassigned to TYPE_KERNEL on creation.

> +       if (vc4->bo_labels[bo->label].num_allocated == 0 &&
> +           is_user_label(bo->label)) {
> +               /* Free user BO label slots on last unreference. */
> +               kfree(vc4->bo_labels[bo->label].name);
> +               vc4->bo_labels[bo->label].name = NULL;
> +       }

This seems dubious. As a user I would expect the label I created to last
until I closed the fd. Otherwise if I ever close all bo of one type,
wait a few seconds for the cache to be reaped, then reallocate a new bo,
someone else may have assigned my label a new name.

If that's guarded against, a comment would help.

> +
> +       bo->label = label;
> +}

> +int vc4_label_bo_ioctl(struct drm_device *dev, void *data,
> +                      struct drm_file *file_priv)
> +{
> +       struct vc4_dev *vc4 = to_vc4_dev(dev);
> +       struct drm_vc4_label_bo *args = data;
> +       char *name;
> +       struct drm_gem_object *gem_obj;
> +       int ret = 0, label;
> +
> +       name = kmalloc(args->len + 1, GFP_KERNEL);
> +       if (!name)
> +               return -ENOMEM;
> +
> +       if (copy_from_user(name, (void __user *)(uintptr_t)args->name,
> +                          args->len)) {
> +               kfree(name);
> +               return -EFAULT;
> +       }
> +       name[args->len] = 0;

if (!args->len)
	return -EINVAL;

name = strndup_user(u64_to_user_ptr(args->name), args->len + 1);
if (IS_ERR(name))
	return PTR_ERR(name);
-Chris
Eric Anholt July 25, 2017, 6:27 p.m. UTC | #3
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Eric Anholt (2017-06-22 21:50:54)
>> This has proven immensely useful for debugging memory leaks and
>> overallocation (which is a rather serious concern on the platform,
>> given that we typically run at about 256MB of CMA out of up to 1GB
>> total memory, with framebuffers that are about 8MB ecah).
>> 
>> The state of the art without this is to dump debug logs from every GL
>> application, guess as to kernel allocations based on bo_stats, and try
>> to merge that all together into a global picture of memory allocation
>> state.  With this, you can add a couple of calls to the debug build of
>> the 3D driver and get a pretty detailed view of GPU memory usage from
>> /debug/dri/0/bo_stats (or when we debug print to dmesg on allocation
>> failure).
>> 
>> The Mesa side currently labels at the gallium resource level (so you
>> see that a 1920x20 pixmap has been created, presumably for the window
>> system panel), but we could extend that to be even more useful with
>> glObjectLabel() names being sent all the way down to the kernel.
>> 
>> (partial) example of sorted debugfs output with Mesa labeling all
>> resources:
>> 
>>                kernel BO cache:  16392kb BOs (3)
>>        tiling shadow 1920x1080:   8160kb BOs (1)
>>        resource 1920x1080@32/0:   8160kb BOs (1)
>> scanout resource 1920x1080@32/0:   8100kb BOs (1)
>>                         kernel:   8100kb BOs (1)
>> 
>> Signed-off-by: Eric Anholt <eric@anholt.net>
>> ---
>
>>  static void vc4_bo_stats_dump(struct vc4_dev *vc4)
>>  {
>
> Now unused?

Still used from the splat when CMA allocation fails.  It's less
catastrophic than it used to be, but still bad, so we're dumping to
dmesg for now.

>> -       DRM_INFO("num bos allocated: %d\n",
>> -                vc4->bo_stats.num_allocated);
>> -       DRM_INFO("size bos allocated: %dkb\n",
>> -                vc4->bo_stats.size_allocated / 1024);
>> -       DRM_INFO("num bos used: %d\n",
>> -                vc4->bo_stats.num_allocated - vc4->bo_stats.num_cached);
>> -       DRM_INFO("size bos used: %dkb\n",
>> -                (vc4->bo_stats.size_allocated -
>> -                 vc4->bo_stats.size_cached) / 1024);
>> -       DRM_INFO("num bos cached: %d\n",
>> -                vc4->bo_stats.num_cached);
>> -       DRM_INFO("size bos cached: %dkb\n",
>> -                vc4->bo_stats.size_cached / 1024);
>> +       int i;
>> +
>> +       for (i = 0; i < vc4->num_labels; i++) {
>> +               if (!vc4->bo_labels[i].num_allocated)
>> +                       continue;
>> +
>> +               DRM_INFO("%30s: %6dkb BOs (%d)\n",
>> +                        vc4->bo_labels[i].name,
>> +                        vc4->bo_labels[i].size_allocated / 1024,
>> +                        vc4->bo_labels[i].num_allocated);
>> +       }
>>  }
>>  
>>  #ifdef CONFIG_DEBUG_FS
>
>> +/* Must be called with bo_lock held. */
>> +static void vc4_bo_set_label(struct drm_gem_object *gem_obj, int label)
>> +{
>> +       struct vc4_bo *bo = to_vc4_bo(gem_obj);
>> +       struct vc4_dev *vc4 = to_vc4_dev(gem_obj->dev);
>
> lockdep_assert_held(&vc4->bo_lock);

Nice.  I've converted the other instances of this comment, too.

>> +
>> +       vc4->bo_labels[label].num_allocated++;
>> +       vc4->bo_labels[label].size_allocated += gem_obj->size;
>
> This gets called with label=-1 on destroy.

Oh, good catch, thanks.  This code got reworked a couple of times and I
lost that check.

>> +       vc4->bo_labels[bo->label].num_allocated--;
>> +       vc4->bo_labels[bo->label].size_allocated -= gem_obj->size;
>
> Ok, preassigned to TYPE_KERNEL on creation.
>
>> +       if (vc4->bo_labels[bo->label].num_allocated == 0 &&
>> +           is_user_label(bo->label)) {
>> +               /* Free user BO label slots on last unreference. */
>> +               kfree(vc4->bo_labels[bo->label].name);
>> +               vc4->bo_labels[bo->label].name = NULL;
>> +       }
>
> This seems dubious. As a user I would expect the label I created to last
> until I closed the fd. Otherwise if I ever close all bo of one type,
> wait a few seconds for the cache to be reaped, then reallocate a new bo,
> someone else may have assigned my label a new name.
>
> If that's guarded against, a comment would help.

Userspace can't see the label index, though, and can only set string
names on BOs.  New text:

		/* Free user BO label slots on last unreference.
		 * Slots are just where we track the stats for a given
		 * name, and once a name is unused we can reuse that
		 * slot.
		 */

>> +
>> +       bo->label = label;
>> +}
>
>> +int vc4_label_bo_ioctl(struct drm_device *dev, void *data,
>> +                      struct drm_file *file_priv)
>> +{
>> +       struct vc4_dev *vc4 = to_vc4_dev(dev);
>> +       struct drm_vc4_label_bo *args = data;
>> +       char *name;
>> +       struct drm_gem_object *gem_obj;
>> +       int ret = 0, label;
>> +
>> +       name = kmalloc(args->len + 1, GFP_KERNEL);
>> +       if (!name)
>> +               return -ENOMEM;
>> +
>> +       if (copy_from_user(name, (void __user *)(uintptr_t)args->name,
>> +                          args->len)) {
>> +               kfree(name);
>> +               return -EFAULT;
>> +       }
>> +       name[args->len] = 0;
>
> if (!args->len)
> 	return -EINVAL;
>
> name = strndup_user(u64_to_user_ptr(args->name), args->len + 1);
> if (IS_ERR(name))
> 	return PTR_ERR(name);
> -Chris

Oh, nice.  I've converted the code I was copying from to use
u64_to_user_ptr(), too.
Chris Wilson July 28, 2017, 6:53 p.m. UTC | #4
Quoting Eric Anholt (2017-07-25 19:27:25)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Eric Anholt (2017-06-22 21:50:54)
> >> This has proven immensely useful for debugging memory leaks and
> >> overallocation (which is a rather serious concern on the platform,
> >> given that we typically run at about 256MB of CMA out of up to 1GB
> >> total memory, with framebuffers that are about 8MB ecah).
> >> 
> >> The state of the art without this is to dump debug logs from every GL
> >> application, guess as to kernel allocations based on bo_stats, and try
> >> to merge that all together into a global picture of memory allocation
> >> state.  With this, you can add a couple of calls to the debug build of
> >> the 3D driver and get a pretty detailed view of GPU memory usage from
> >> /debug/dri/0/bo_stats (or when we debug print to dmesg on allocation
> >> failure).
> >> 
> >> The Mesa side currently labels at the gallium resource level (so you
> >> see that a 1920x20 pixmap has been created, presumably for the window
> >> system panel), but we could extend that to be even more useful with
> >> glObjectLabel() names being sent all the way down to the kernel.
> >> 
> >> (partial) example of sorted debugfs output with Mesa labeling all
> >> resources:
> >> 
> >>                kernel BO cache:  16392kb BOs (3)
> >>        tiling shadow 1920x1080:   8160kb BOs (1)
> >>        resource 1920x1080@32/0:   8160kb BOs (1)
> >> scanout resource 1920x1080@32/0:   8100kb BOs (1)
> >>                         kernel:   8100kb BOs (1)
> >> 
> >> Signed-off-by: Eric Anholt <eric@anholt.net>
> >> ---
> >
> >>  static void vc4_bo_stats_dump(struct vc4_dev *vc4)
> >>  {
> >
> > Now unused?
> 
> Still used from the splat when CMA allocation fails.  It's less
> catastrophic than it used to be, but still bad, so we're dumping to
> dmesg for now.
> 
> >> -       DRM_INFO("num bos allocated: %d\n",
> >> -                vc4->bo_stats.num_allocated);
> >> -       DRM_INFO("size bos allocated: %dkb\n",
> >> -                vc4->bo_stats.size_allocated / 1024);
> >> -       DRM_INFO("num bos used: %d\n",
> >> -                vc4->bo_stats.num_allocated - vc4->bo_stats.num_cached);
> >> -       DRM_INFO("size bos used: %dkb\n",
> >> -                (vc4->bo_stats.size_allocated -
> >> -                 vc4->bo_stats.size_cached) / 1024);
> >> -       DRM_INFO("num bos cached: %d\n",
> >> -                vc4->bo_stats.num_cached);
> >> -       DRM_INFO("size bos cached: %dkb\n",
> >> -                vc4->bo_stats.size_cached / 1024);
> >> +       int i;
> >> +
> >> +       for (i = 0; i < vc4->num_labels; i++) {
> >> +               if (!vc4->bo_labels[i].num_allocated)
> >> +                       continue;
> >> +
> >> +               DRM_INFO("%30s: %6dkb BOs (%d)\n",
> >> +                        vc4->bo_labels[i].name,
> >> +                        vc4->bo_labels[i].size_allocated / 1024,
> >> +                        vc4->bo_labels[i].num_allocated);
> >> +       }
> >>  }
> >>  
> >>  #ifdef CONFIG_DEBUG_FS
> >
> >> +/* Must be called with bo_lock held. */
> >> +static void vc4_bo_set_label(struct drm_gem_object *gem_obj, int label)
> >> +{
> >> +       struct vc4_bo *bo = to_vc4_bo(gem_obj);
> >> +       struct vc4_dev *vc4 = to_vc4_dev(gem_obj->dev);
> >
> > lockdep_assert_held(&vc4->bo_lock);
> 
> Nice.  I've converted the other instances of this comment, too.
> 
> >> +
> >> +       vc4->bo_labels[label].num_allocated++;
> >> +       vc4->bo_labels[label].size_allocated += gem_obj->size;
> >
> > This gets called with label=-1 on destroy.
> 
> Oh, good catch, thanks.  This code got reworked a couple of times and I
> lost that check.
> 
> >> +       vc4->bo_labels[bo->label].num_allocated--;
> >> +       vc4->bo_labels[bo->label].size_allocated -= gem_obj->size;
> >
> > Ok, preassigned to TYPE_KERNEL on creation.
> >
> >> +       if (vc4->bo_labels[bo->label].num_allocated == 0 &&
> >> +           is_user_label(bo->label)) {
> >> +               /* Free user BO label slots on last unreference. */
> >> +               kfree(vc4->bo_labels[bo->label].name);
> >> +               vc4->bo_labels[bo->label].name = NULL;
> >> +       }
> >
> > This seems dubious. As a user I would expect the label I created to last
> > until I closed the fd. Otherwise if I ever close all bo of one type,
> > wait a few seconds for the cache to be reaped, then reallocate a new bo,
> > someone else may have assigned my label a new name.
> >
> > If that's guarded against, a comment would help.
> 
> Userspace can't see the label index, though, and can only set string
> names on BOs.  New text:
> 
>                 /* Free user BO label slots on last unreference.
>                  * Slots are just where we track the stats for a given
>                  * name, and once a name is unused we can reuse that
>                  * slot.
>                  */

Ah. My mistake was in thinking that you passed the name_id to the create
ioctl, but names are assigned to the bo themselves, and so the slot does
only exist whilst the bo exist.

BOs are assigned to slots, and not slots to BOs.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

Patch
diff mbox

diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index 487f96412d35..64147f1fcca6 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -24,21 +24,35 @@ 
 #include "vc4_drv.h"
 #include "uapi/drm/vc4_drm.h"
 
+static const char * const bo_type_names[] = {
+	"kernel",
+	"V3D",
+	"V3D shader",
+	"dumb",
+	"binner",
+	"RCL",
+	"BCL",
+	"kernel BO cache",
+};
+
+static bool is_user_label(int label)
+{
+	return label >= VC4_BO_TYPE_COUNT;
+}
+
 static void vc4_bo_stats_dump(struct vc4_dev *vc4)
 {
-	DRM_INFO("num bos allocated: %d\n",
-		 vc4->bo_stats.num_allocated);
-	DRM_INFO("size bos allocated: %dkb\n",
-		 vc4->bo_stats.size_allocated / 1024);
-	DRM_INFO("num bos used: %d\n",
-		 vc4->bo_stats.num_allocated - vc4->bo_stats.num_cached);
-	DRM_INFO("size bos used: %dkb\n",
-		 (vc4->bo_stats.size_allocated -
-		  vc4->bo_stats.size_cached) / 1024);
-	DRM_INFO("num bos cached: %d\n",
-		 vc4->bo_stats.num_cached);
-	DRM_INFO("size bos cached: %dkb\n",
-		 vc4->bo_stats.size_cached / 1024);
+	int i;
+
+	for (i = 0; i < vc4->num_labels; i++) {
+		if (!vc4->bo_labels[i].num_allocated)
+			continue;
+
+		DRM_INFO("%30s: %6dkb BOs (%d)\n",
+			 vc4->bo_labels[i].name,
+			 vc4->bo_labels[i].size_allocated / 1024,
+			 vc4->bo_labels[i].num_allocated);
+	}
 }
 
 #ifdef CONFIG_DEBUG_FS
@@ -47,30 +61,96 @@  int vc4_bo_stats_debugfs(struct seq_file *m, void *unused)
 	struct drm_info_node *node = (struct drm_info_node *)m->private;
 	struct drm_device *dev = node->minor->dev;
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
-	struct vc4_bo_stats stats;
+	int i;
 
-	/* Take a snapshot of the current stats with the lock held. */
 	mutex_lock(&vc4->bo_lock);
-	stats = vc4->bo_stats;
+	for (i = 0; i < vc4->num_labels; i++) {
+		if (!vc4->bo_labels[i].num_allocated)
+			continue;
+
+		seq_printf(m, "%30s: %6dkb BOs (%d)\n",
+			   vc4->bo_labels[i].name,
+			   vc4->bo_labels[i].size_allocated / 1024,
+			   vc4->bo_labels[i].num_allocated);
+	}
 	mutex_unlock(&vc4->bo_lock);
 
-	seq_printf(m, "num bos allocated: %d\n",
-		   stats.num_allocated);
-	seq_printf(m, "size bos allocated: %dkb\n",
-		   stats.size_allocated / 1024);
-	seq_printf(m, "num bos used: %d\n",
-		   stats.num_allocated - stats.num_cached);
-	seq_printf(m, "size bos used: %dkb\n",
-		   (stats.size_allocated - stats.size_cached) / 1024);
-	seq_printf(m, "num bos cached: %d\n",
-		   stats.num_cached);
-	seq_printf(m, "size bos cached: %dkb\n",
-		   stats.size_cached / 1024);
-
 	return 0;
 }
 #endif
 
+/* Takes ownership of *name and returns the appropriate slot for it in
+ * the bo_labels[] array, extending it as necessary.
+ *
+ * This is inefficient and could use a hash table instead of walking
+ * an array and strcmp()ing.  However, the assumption is that user
+ * labeling will be infrequent (scanout buffers and other long-lived
+ * objects, or debug driver builds), so we can live with it for now.
+ */
+static int vc4_get_user_label(struct vc4_dev *vc4, const char *name)
+{
+	int i;
+	int free_slot = -1;
+
+	for (i = 0; i < vc4->num_labels; i++) {
+		if (!vc4->bo_labels[i].name) {
+			free_slot = i;
+		} else if (strcmp(vc4->bo_labels[i].name, name) == 0) {
+			kfree(name);
+			return i;
+		}
+	}
+
+	if (free_slot != -1) {
+		WARN_ON(vc4->bo_labels[free_slot].num_allocated != 0);
+		vc4->bo_labels[free_slot].name = name;
+		return free_slot;
+	} else {
+		u32 new_label_count = vc4->num_labels + 1;
+		struct vc4_label *new_labels =
+			krealloc(vc4->bo_labels,
+				 new_label_count * sizeof(*new_labels),
+				 GFP_KERNEL);
+
+		if (!new_labels) {
+			kfree(name);
+			return -1;
+		}
+
+		free_slot = vc4->num_labels;
+		vc4->bo_labels = new_labels;
+		vc4->num_labels = new_label_count;
+
+		vc4->bo_labels[free_slot].name = name;
+		vc4->bo_labels[free_slot].num_allocated = 0;
+		vc4->bo_labels[free_slot].size_allocated = 0;
+
+		return free_slot;
+	}
+}
+
+/* Must be called with bo_lock held. */
+static void vc4_bo_set_label(struct drm_gem_object *gem_obj, int label)
+{
+	struct vc4_bo *bo = to_vc4_bo(gem_obj);
+	struct vc4_dev *vc4 = to_vc4_dev(gem_obj->dev);
+
+	vc4->bo_labels[label].num_allocated++;
+	vc4->bo_labels[label].size_allocated += gem_obj->size;
+
+	vc4->bo_labels[bo->label].num_allocated--;
+	vc4->bo_labels[bo->label].size_allocated -= gem_obj->size;
+
+	if (vc4->bo_labels[bo->label].num_allocated == 0 &&
+	    is_user_label(bo->label)) {
+		/* Free user BO label slots on last unreference. */
+		kfree(vc4->bo_labels[bo->label].name);
+		vc4->bo_labels[bo->label].name = NULL;
+	}
+
+	bo->label = label;
+}
+
 static uint32_t bo_page_index(size_t size)
 {
 	return (size / PAGE_SIZE) - 1;
@@ -80,7 +160,8 @@  static uint32_t bo_page_index(size_t size)
 static void vc4_bo_destroy(struct vc4_bo *bo)
 {
 	struct drm_gem_object *obj = &bo->base.base;
-	struct vc4_dev *vc4 = to_vc4_dev(obj->dev);
+
+	vc4_bo_set_label(obj, -1);
 
 	if (bo->validated_shader) {
 		kfree(bo->validated_shader->texture_samples);
@@ -88,9 +169,6 @@  static void vc4_bo_destroy(struct vc4_bo *bo)
 		bo->validated_shader = NULL;
 	}
 
-	vc4->bo_stats.num_allocated--;
-	vc4->bo_stats.size_allocated -= obj->size;
-
 	reservation_object_fini(&bo->_resv);
 
 	drm_gem_cma_free_object(obj);
@@ -99,12 +177,6 @@  static void vc4_bo_destroy(struct vc4_bo *bo)
 /* Must be called with bo_lock held. */
 static void vc4_bo_remove_from_cache(struct vc4_bo *bo)
 {
-	struct drm_gem_object *obj = &bo->base.base;
-	struct vc4_dev *vc4 = to_vc4_dev(obj->dev);
-
-	vc4->bo_stats.num_cached--;
-	vc4->bo_stats.size_cached -= obj->size;
-
 	list_del(&bo->unref_head);
 	list_del(&bo->size_head);
 }
@@ -165,7 +237,8 @@  static void vc4_bo_cache_purge(struct drm_device *dev)
 }
 
 static struct vc4_bo *vc4_bo_get_from_cache(struct drm_device *dev,
-					    uint32_t size)
+					    uint32_t size,
+					    enum vc4_kernel_bo_type type)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	uint32_t page_index = bo_page_index(size);
@@ -186,6 +259,8 @@  static struct vc4_bo *vc4_bo_get_from_cache(struct drm_device *dev,
 	kref_init(&bo->base.base.refcount);
 
 out:
+	if (bo)
+		vc4_bo_set_label(&bo->base.base, type);
 	mutex_unlock(&vc4->bo_lock);
 	return bo;
 }
@@ -208,8 +283,9 @@  struct drm_gem_object *vc4_create_object(struct drm_device *dev, size_t size)
 		return ERR_PTR(-ENOMEM);
 
 	mutex_lock(&vc4->bo_lock);
-	vc4->bo_stats.num_allocated++;
-	vc4->bo_stats.size_allocated += size;
+	bo->label = VC4_BO_TYPE_KERNEL;
+	vc4->bo_labels[VC4_BO_TYPE_KERNEL].num_allocated++;
+	vc4->bo_labels[VC4_BO_TYPE_KERNEL].size_allocated += size;
 	mutex_unlock(&vc4->bo_lock);
 	bo->resv = &bo->_resv;
 	reservation_object_init(bo->resv);
@@ -218,7 +294,7 @@  struct drm_gem_object *vc4_create_object(struct drm_device *dev, size_t size)
 }
 
 struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t unaligned_size,
-			     bool allow_unzeroed)
+			     bool allow_unzeroed, enum vc4_kernel_bo_type type)
 {
 	size_t size = roundup(unaligned_size, PAGE_SIZE);
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
@@ -229,7 +305,7 @@  struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t unaligned_size,
 		return ERR_PTR(-EINVAL);
 
 	/* First, try to get a vc4_bo from the kernel BO cache. */
-	bo = vc4_bo_get_from_cache(dev, size);
+	bo = vc4_bo_get_from_cache(dev, size, type);
 	if (bo) {
 		if (!allow_unzeroed)
 			memset(bo->base.vaddr, 0, bo->base.base.size);
@@ -251,7 +327,13 @@  struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t unaligned_size,
 			return ERR_PTR(-ENOMEM);
 		}
 	}
-	return to_vc4_bo(&cma_obj->base);
+	bo = to_vc4_bo(&cma_obj->base);
+
+	mutex_lock(&vc4->bo_lock);
+	vc4_bo_set_label(&cma_obj->base, type);
+	mutex_unlock(&vc4->bo_lock);
+
+	return bo;
 }
 
 int vc4_dumb_create(struct drm_file *file_priv,
@@ -268,7 +350,7 @@  int vc4_dumb_create(struct drm_file *file_priv,
 	if (args->size < args->pitch * args->height)
 		args->size = args->pitch * args->height;
 
-	bo = vc4_bo_create(dev, args->size, false);
+	bo = vc4_bo_create(dev, args->size, false, VC4_BO_TYPE_DUMB);
 	if (IS_ERR(bo))
 		return PTR_ERR(bo);
 
@@ -348,8 +430,7 @@  void vc4_free_object(struct drm_gem_object *gem_bo)
 	list_add(&bo->size_head, cache_list);
 	list_add(&bo->unref_head, &vc4->bo_cache.time_list);
 
-	vc4->bo_stats.num_cached++;
-	vc4->bo_stats.size_cached += gem_bo->size;
+	vc4_bo_set_label(&bo->base.base, VC4_BO_TYPE_KERNEL_CACHE);
 
 	vc4_bo_cache_free_old(dev);
 
@@ -483,7 +564,7 @@  int vc4_create_bo_ioctl(struct drm_device *dev, void *data,
 	 * We can't allocate from the BO cache, because the BOs don't
 	 * get zeroed, and that might leak data between users.
 	 */
-	bo = vc4_bo_create(dev, args->size, false);
+	bo = vc4_bo_create(dev, args->size, false, VC4_BO_TYPE_V3D);
 	if (IS_ERR(bo))
 		return PTR_ERR(bo);
 
@@ -536,7 +617,7 @@  vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	bo = vc4_bo_create(dev, args->size, true);
+	bo = vc4_bo_create(dev, args->size, true, VC4_BO_TYPE_V3D_SHADER);
 	if (IS_ERR(bo))
 		return PTR_ERR(bo);
 
@@ -651,9 +732,24 @@  int vc4_get_tiling_ioctl(struct drm_device *dev, void *data,
 	return 0;
 }
 
-void vc4_bo_cache_init(struct drm_device *dev)
+int vc4_bo_cache_init(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	int i;
+
+	/* Create the initial set of BO labels that the kernel will
+	 * use.  This lets us avoid a bunch of string reallocation in
+	 * the kernel's draw and BO allocation paths.
+	 */
+	vc4->bo_labels = kcalloc(VC4_BO_TYPE_COUNT, sizeof(*vc4->bo_labels),
+				 GFP_KERNEL);
+	if (!vc4->bo_labels)
+		return -ENOMEM;
+	vc4->num_labels = VC4_BO_TYPE_COUNT;
+
+	BUILD_BUG_ON(ARRAY_SIZE(bo_type_names) != VC4_BO_TYPE_COUNT);
+	for (i = 0; i < VC4_BO_TYPE_COUNT; i++)
+		vc4->bo_labels[i].name = bo_type_names[i];
 
 	mutex_init(&vc4->bo_lock);
 
@@ -663,19 +759,70 @@  void vc4_bo_cache_init(struct drm_device *dev)
 	setup_timer(&vc4->bo_cache.time_timer,
 		    vc4_bo_cache_time_timer,
 		    (unsigned long)dev);
+
+	return 0;
 }
 
 void vc4_bo_cache_destroy(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	int i;
 
 	del_timer(&vc4->bo_cache.time_timer);
 	cancel_work_sync(&vc4->bo_cache.time_work);
 
 	vc4_bo_cache_purge(dev);
 
-	if (vc4->bo_stats.num_allocated) {
-		DRM_ERROR("Destroying BO cache while BOs still allocated:\n");
-		vc4_bo_stats_dump(vc4);
+	for (i = 0; i < vc4->num_labels; i++) {
+		if (vc4->bo_labels[i].num_allocated) {
+			DRM_ERROR("Destroying BO cache with %d %s "
+				  "BOs still allocated\n",
+				  vc4->bo_labels[i].num_allocated,
+				  vc4->bo_labels[i].name);
+		}
+
+		if (is_user_label(i))
+			kfree(vc4->bo_labels[i].name);
+	}
+	kfree(vc4->bo_labels);
+}
+
+int vc4_label_bo_ioctl(struct drm_device *dev, void *data,
+		       struct drm_file *file_priv)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct drm_vc4_label_bo *args = data;
+	char *name;
+	struct drm_gem_object *gem_obj;
+	int ret = 0, label;
+
+	name = kmalloc(args->len + 1, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	if (copy_from_user(name, (void __user *)(uintptr_t)args->name,
+			   args->len)) {
+		kfree(name);
+		return -EFAULT;
+	}
+	name[args->len] = 0;
+
+	gem_obj = drm_gem_object_lookup(file_priv, args->handle);
+	if (!gem_obj) {
+		DRM_ERROR("Failed to look up GEM BO %d\n", args->handle);
+		kfree(name);
+		return -ENOENT;
 	}
+
+	mutex_lock(&vc4->bo_lock);
+	label = vc4_get_user_label(vc4, name);
+	if (label != -1)
+		vc4_bo_set_label(gem_obj, label);
+	else
+		ret = -ENOMEM;
+	mutex_unlock(&vc4->bo_lock);
+
+	drm_gem_object_unreference_unlocked(gem_obj);
+
+	return ret;
 }
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index c6b487c3d2b7..75c1f50a7b5d 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -140,6 +140,7 @@  static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(VC4_GET_PARAM, vc4_get_param_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(VC4_SET_TILING, vc4_set_tiling_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver vc4_drm_driver = {
@@ -257,7 +258,9 @@  static int vc4_drm_bind(struct device *dev)
 	vc4->dev = drm;
 	drm->dev_private = vc4;
 
-	vc4_bo_cache_init(drm);
+	ret = vc4_bo_cache_init(drm);
+	if (ret)
+		goto dev_unref;
 
 	drm_mode_config_init(drm);
 
@@ -281,8 +284,9 @@  static int vc4_drm_bind(struct device *dev)
 	component_unbind_all(dev, drm);
 gem_destroy:
 	vc4_gem_destroy(drm);
-	drm_dev_unref(drm);
 	vc4_bo_cache_destroy(drm);
+dev_unref:
+	drm_dev_unref(drm);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 1047953216a8..87f2d8e5c134 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -11,6 +11,24 @@ 
 #include <drm/drm_encoder.h>
 #include <drm/drm_gem_cma_helper.h>
 
+/* Don't forget to update vc4_bo.c: bo_type_names[] when adding to
+ * this.
+ */
+enum vc4_kernel_bo_type {
+	/* Any kernel allocation (gem_create_object hook) before it
+	 * gets another type set.
+	 */
+	VC4_BO_TYPE_KERNEL,
+	VC4_BO_TYPE_V3D,
+	VC4_BO_TYPE_V3D_SHADER,
+	VC4_BO_TYPE_DUMB,
+	VC4_BO_TYPE_BIN,
+	VC4_BO_TYPE_RCL,
+	VC4_BO_TYPE_BCL,
+	VC4_BO_TYPE_KERNEL_CACHE,
+	VC4_BO_TYPE_COUNT
+};
+
 struct vc4_dev {
 	struct drm_device *dev;
 
@@ -46,14 +64,14 @@  struct vc4_dev {
 		struct timer_list time_timer;
 	} bo_cache;
 
-	struct vc4_bo_stats {
+	u32 num_labels;
+	struct vc4_label {
+		const char *name;
 		u32 num_allocated;
 		u32 size_allocated;
-		u32 num_cached;
-		u32 size_cached;
-	} bo_stats;
+	} *bo_labels;
 
-	/* Protects bo_cache and the BO stats. */
+	/* Protects bo_cache and bo_labels. */
 	struct mutex bo_lock;
 
 	uint64_t dma_fence_context;
@@ -169,6 +187,11 @@  struct vc4_bo {
 	/* normally (resv == &_resv) except for imported bo's */
 	struct reservation_object *resv;
 	struct reservation_object _resv;
+
+	/* One of enum vc4_kernel_bo_type, or VC4_BO_TYPE_COUNT + i
+	 * for user-allocated labels.
+	 */
+	int label;
 };
 
 static inline struct vc4_bo *
@@ -460,7 +483,7 @@  struct vc4_validated_shader_info {
 struct drm_gem_object *vc4_create_object(struct drm_device *dev, size_t size);
 void vc4_free_object(struct drm_gem_object *gem_obj);
 struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t size,
-			     bool from_cache);
+			     bool from_cache, enum vc4_kernel_bo_type type);
 int vc4_dumb_create(struct drm_file *file_priv,
 		    struct drm_device *dev,
 		    struct drm_mode_create_dumb *args);
@@ -478,6 +501,8 @@  int vc4_get_tiling_ioctl(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 int vc4_get_hang_state_ioctl(struct drm_device *dev, void *data,
 			     struct drm_file *file_priv);
+int vc4_label_bo_ioctl(struct drm_device *dev, void *data,
+		       struct drm_file *file_priv);
 int vc4_mmap(struct file *filp, struct vm_area_struct *vma);
 struct reservation_object *vc4_prime_res_obj(struct drm_gem_object *obj);
 int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma);
@@ -485,7 +510,7 @@  struct drm_gem_object *vc4_prime_import_sg_table(struct drm_device *dev,
 						 struct dma_buf_attachment *attach,
 						 struct sg_table *sgt);
 void *vc4_prime_vmap(struct drm_gem_object *obj);
-void vc4_bo_cache_init(struct drm_device *dev);
+int vc4_bo_cache_init(struct drm_device *dev);
 void vc4_bo_cache_destroy(struct drm_device *dev);
 int vc4_bo_stats_debugfs(struct seq_file *m, void *arg);
 
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index d5b821ad06af..e75787d60e57 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -775,7 +775,7 @@  vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
 		goto fail;
 	}
 
-	bo = vc4_bo_create(dev, exec_size, true);
+	bo = vc4_bo_create(dev, exec_size, true, VC4_BO_TYPE_BCL);
 	if (IS_ERR(bo)) {
 		DRM_ERROR("Couldn't allocate BO for binning\n");
 		ret = PTR_ERR(bo);
diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
index 5dc19429d4ae..4a8051532f00 100644
--- a/drivers/gpu/drm/vc4/vc4_render_cl.c
+++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
@@ -320,7 +320,7 @@  static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 
 	size += xtiles * ytiles * loop_body_size;
 
-	setup->rcl = &vc4_bo_create(dev, size, true)->base;
+	setup->rcl = &vc4_bo_create(dev, size, true, VC4_BO_TYPE_RCL)->base;
 	if (IS_ERR(setup->rcl))
 		return PTR_ERR(setup->rcl);
 	list_add_tail(&to_vc4_bo(&setup->rcl->base)->unref_head,
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 8c723da71f66..622cd43840b8 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -236,7 +236,8 @@  vc4_allocate_bin_bo(struct drm_device *drm)
 	INIT_LIST_HEAD(&list);
 
 	while (true) {
-		struct vc4_bo *bo = vc4_bo_create(drm, size, true);
+		struct vc4_bo *bo = vc4_bo_create(drm, size, true,
+						  VC4_BO_TYPE_BIN);
 
 		if (IS_ERR(bo)) {
 			ret = PTR_ERR(bo);
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index 6ac4c5c014cb..551628e571f9 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -40,6 +40,7 @@  extern "C" {
 #define DRM_VC4_GET_PARAM                         0x07
 #define DRM_VC4_SET_TILING                        0x08
 #define DRM_VC4_GET_TILING                        0x09
+#define DRM_VC4_LABEL_BO                          0x0a
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -51,6 +52,7 @@  extern "C" {
 #define DRM_IOCTL_VC4_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_PARAM, struct drm_vc4_get_param)
 #define DRM_IOCTL_VC4_SET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SET_TILING, struct drm_vc4_set_tiling)
 #define DRM_IOCTL_VC4_GET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
+#define DRM_IOCTL_VC4_LABEL_BO            DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_LABEL_BO, struct drm_vc4_label_bo)
 
 struct drm_vc4_submit_rcl_surface {
 	__u32 hindex; /* Handle index, or ~0 if not present. */
@@ -311,6 +313,15 @@  struct drm_vc4_set_tiling {
 	__u64 modifier;
 };
 
+/**
+ * struct drm_vc4_label_bo - Attach a name to a BO for debug purposes.
+ */
+struct drm_vc4_label_bo {
+	__u32 handle;
+	__u32 len;
+	__u64 name;
+};
+
 #if defined(__cplusplus)
 }
 #endif