[2/3] drm/panfrost: Expose HW counters to userspace

Message ID 20190404152051.17996-3-boris.brezillon@collabora.com (mailing list archive)
State New, archived
Series drm/panfrost: Expose HW counters to userspace

Commit Message

Boris Brezillon April 4, 2019, 3:20 p.m. UTC
Add the necessary infrastructure to expose GPU counters to userspace.
This takes the form of 4 new ioctls to:

- query the available counters
- create/destroy a performance monitor
- retrieve its values

The drm_panfrost_submit struct is extended to pass a list of perfmons
to attach to a job, which means perfmons will only track changes caused
by the jobs they are attached to.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panfrost/Makefile           |   3 +-
 drivers/gpu/drm/panfrost/panfrost_device.c  |   8 +
 drivers/gpu/drm/panfrost/panfrost_device.h  |  11 +
 drivers/gpu/drm/panfrost/panfrost_drv.c     |  22 +-
 drivers/gpu/drm/panfrost/panfrost_gpu.c     |  43 +-
 drivers/gpu/drm/panfrost/panfrost_job.c     |  24 +
 drivers/gpu/drm/panfrost/panfrost_job.h     |   4 +
 drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 954 ++++++++++++++++++++
 drivers/gpu/drm/panfrost/panfrost_perfcnt.h |  54 ++
 drivers/gpu/drm/panfrost/panfrost_regs.h    |  19 +
 include/uapi/drm/panfrost_drm.h             | 122 +++
 11 files changed, 1260 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h

Comments

Alyssa Rosenzweig April 4, 2019, 3:41 p.m. UTC | #1
> +	/*
> +	 * In v4 HW we have one tiler per core group, with the number
> +	 * of core groups being equal to the number of L2 caches. Other
> +	 * HW versions just have one tiler and the number of L2 caches
> +	 * can be extracted from the mem_features field.
> +	 */

Normalizing layout in kernel sounds like a good idea to me, nice one :)
(kbase does not and userspace is messy as a result)

> +/*
> + * Returns true if the 2 jobs have exactly the same perfcnt context, false
> + * otherwise.
> + */
> +static bool panfrost_perfcnt_job_ctx_cmp(struct panfrost_perfcnt_job_ctx *a,
> +					 struct panfrost_perfcnt_job_ctx *b)
> +{
> +	unsigned int i, j;
> +
> +	if (a->perfmon_count != b->perfmon_count)
> +		return false;
> +
> +	for (i = 0; i < a->perfmon_count; i++) {
> +		for (j = 0; j < b->perfmon_count; j++) {
> +			if (a->perfmons[i] == b->perfmons[j])
> +				break;
> +		}
> +
> +		if (j == b->perfmon_count)
> +			return false;
> +	}
> +

Would using memcmp() be cleaner here?

> +	if (panfrost_model_cmp(pfdev, 0x1000) >= 0)

What does 0x1000 refer to here? I'm assuming maybe Bifrost, but it's not
obvious... probably better to have a #define somewhere and use that (or
an enum equivalently).

> +	/*
> +	 * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
> +	 * counters.
> +	 */

What on earth is PRLAM-8186? :)

Actually, wait, I can answer that -- old kbase versions had an errata
list:

        /* Write of PRFCNT_CONFIG_MODE_MANUAL to PRFCNT_CONFIG causes a instrumentation dump if
           PRFCNT_TILER_EN is enabled */
        BASE_HW_ISSUE_8186,

So that's why. If people want, I'm considering moving these errata
descriptions back into the kernel where possible, since otherwise code
like this is opaque.
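
For 8186 that could look like this in panfrost_issues.h (a sketch,
assuming the existing HW_ISSUE_* enum):

        /* Write of PRFCNT_CONFIG_MODE_MANUAL to PRFCNT_CONFIG causes an
         * instrumentation dump if PRFCNT_TILER_EN is enabled. */
        HW_ISSUE_8186,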

> +		unsigned int nl2c, ncores;
> +
> +		/*
> +		 * TODO: define a macro to extract the number of l2 caches from
> +		 * mem_features.
> +		 */
> +		nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
> +
> +		/*
> +		 * The ARM driver is grouping cores per core group and then
> +		 * only using the number of cores in group 0 to calculate the
> +		 * size. Not sure why this is done like that, but I guess
> +		 * shader_present will only show cores in the first group
> +		 * anyway.
> +		 */
> +		ncores = hweight64(pfdev->features.shader_present);
> +

Deja vu. Was this copypaste maybe?
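
As for the TODO in the quoted hunk, a one-line helper would do
(untested sketch, hypothetical name):

        /* Hypothetical helper: number of L2 slices, from bits [11:8]
         * of MEM_FEATURES. */
        #define MEM_FEATURES_NUM_L2_SLICES(x) \
                ((((x) >> 8) & GENMASK(3, 0)) + 1)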

> +		  (panfrost_model_cmp(pfdev, 0x1000) >= 0 ?

There's that pesky 0x1000 again.

> @@ -55,6 +63,15 @@ struct drm_panfrost_submit {
>  
>  	/** A combination of PANFROST_JD_REQ_* */
>  	__u32 requirements;
> +
> +	/** Pointer to a u32 array of perfmons that should be attached to the job. */
> +	__u64 perfmon_handles;
> +
> +	/** Number of perfmon handles passed in (size is that times 4). */
> +	__u32 perfmon_handle_count;
> +
> +	/** Unused field, should be set to 0. */
> +	__u32 padding;

Bleep blorp. If we're modifying _submit, we'll need to be swift about
merging this ahead of the main code to make sure we don't break the
UABI. Although I guess if we're just adding fields at the end, that's a
nonissue.

> +struct drm_panfrost_block_perfcounters {
> +	/*
> +	 * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
> +	 * instances for a given block type.
> +	 * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the instances the
> +	 * user wants to monitor.
> +	 * Note: the bitmap might be sparse.
> +	 */
> +	__u64 instances;
> +
> +	/*
> +	 * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
> +	 * counters attached to a specific block type.
> +	 * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the counters the user
> +	 * wants to monitor.
> +	 * Note: the bitmap might be sparse.
> +	 */
> +	__u64 counters;
> +};

I don't understand this. Aren't there more than 64 counters?

> +struct drm_panfrost_get_perfcnt_layout {
> +	struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
> +};

--Oh. It's per-block. Got it.
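
So from userspace, discovering what can be monitored would look
something like this (a sketch using libdrm's drmIoctl(), error
handling omitted):

        struct drm_panfrost_get_perfcnt_layout layout = {0};
        unsigned int i;

        drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, &layout);
        for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
                printf("block %u: instances %#llx counters %#llx\n", i,
                       (unsigned long long)layout.counters[i].instances,
                       (unsigned long long)layout.counters[i].counters);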

> + * Used to create a performance monitor. Each perfmonance monitor is assigned an

Typo.

---

Overall, this looks really great! Thank you! :)
Boris Brezillon April 4, 2019, 6:17 p.m. UTC | #2
On Thu, 4 Apr 2019 08:41:29 -0700
Alyssa Rosenzweig <alyssa@rosenzweig.io> wrote:


> > +/*
> > + * Returns true if the 2 jobs have exactly the same perfcnt context, false
> > + * otherwise.
> > + */
> > +static bool panfrost_perfcnt_job_ctx_cmp(struct panfrost_perfcnt_job_ctx *a,
> > +					 struct panfrost_perfcnt_job_ctx *b)
> > +{
> > +	unsigned int i, j;
> > +
> > +	if (a->perfmon_count != b->perfmon_count)
> > +		return false;
> > +
> > +	for (i = 0; i < a->perfmon_count; i++) {
> > +		for (j = 0; j < b->perfmon_count; j++) {
> > +			if (a->perfmons[i] == b->perfmons[j])
> > +				break;
> > +		}
> > +
> > +		if (j == b->perfmon_count)
> > +			return false;
> > +	}
> > +  
> 
> Would using memcmp() be cleaner here?

memcmp() does not account for the case where 2 jobs contain exactly the
same perfmons but in a different order. This being said, it's rather
unlikely to happen, so maybe we can accept the perf penalty for that
case.
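
If we really wanted memcmp(), we'd have to sort both arrays by pointer
value first. Untested sketch (note it mutates the arrays, which may
not be acceptable once a ctx is shared):

        #include <linux/sort.h>

        static int perfmon_ptr_cmp(const void *a, const void *b)
        {
                const struct panfrost_perfmon *pa = *(struct panfrost_perfmon **)a;
                const struct panfrost_perfmon *pb = *(struct panfrost_perfmon **)b;

                return (pa > pb) - (pa < pb);
        }

        /* Sort both arrays, then a flat compare is order-insensitive. */
        sort(a->perfmons, a->perfmon_count, sizeof(*a->perfmons),
             perfmon_ptr_cmp, NULL);
        sort(b->perfmons, b->perfmon_count, sizeof(*b->perfmons),
             perfmon_ptr_cmp, NULL);
        same = !memcmp(a->perfmons, b->perfmons,
                       a->perfmon_count * sizeof(*a->perfmons));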

> 
> > +	if (panfrost_model_cmp(pfdev, 0x1000) >= 0)  
> 
> What does 0x1000 refer to here? I'm assuming maybe Bifrost, but it's not
> obvious... probably better to have a #define somewhere and use that (or
> an enum equivalently).

Yes, all numbers above 0xfff are bifrost GPUs. I'll add a macro.
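
Something like this, say (untested, exact name TBD):

        /* GPU IDs 0x1000 and above are Bifrost. */
        #define panfrost_is_bifrost(pfdev) \
                (panfrost_model_cmp(pfdev, 0x1000) >= 0)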

> 
> > +	/*
> > +	 * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
> > +	 * counters.
> > +	 */  
> 
> What on earth is PRLAM-8186? :)
> 
> Actually, wait, I can answer that -- old kbase versions had an errata
> list:
> 
>         /* Write of PRFCNT_CONFIG_MODE_MANUAL to PRFCNT_CONFIG causes a instrumentation dump if
>            PRFCNT_TILER_EN is enabled */
>         BASE_HW_ISSUE_8186,
> 
> So that's why. If people want, I'm considering moving these errata
> descriptions back into the kernel where possible, since otherwise code
> like this is opaque.

Will copy the errata.

> 
> > +		unsigned int nl2c, ncores;
> > +
> > +		/*
> > +		 * TODO: define a macro to extract the number of l2 caches from
> > +		 * mem_features.
> > +		 */
> > +		nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
> > +
> > +		/*
> > +		 * The ARM driver is grouping cores per core group and then
> > +		 * only using the number of cores in group 0 to calculate the
> > +		 * size. Not sure why this is done like that, but I guess
> > +		 * shader_present will only show cores in the first group
> > +		 * anyway.
> > +		 */
> > +		ncores = hweight64(pfdev->features.shader_present);
> > +  
> 
> Deja vu. Was this copypaste maybe?

Actually, that one is from me, hence the 'not sure why' part :).

> 
> > +		  (panfrost_model_cmp(pfdev, 0x1000) >= 0 ?  
> 
> There's that pesky 0x1000 again.
> 
> > @@ -55,6 +63,15 @@ struct drm_panfrost_submit {
> >  
> >  	/** A combination of PANFROST_JD_REQ_* */
> >  	__u32 requirements;
> > +
> > +	/** Pointer to a u32 array of perfmons that should be attached to the job. */
> > +	__u64 perfmon_handles;
> > +
> > +	/** Number of perfmon handles passed in (size is that times 4). */
> > +	__u32 perfmon_handle_count;
> > +
> > +	/** Unused field, should be set to 0. */
> > +	__u32 padding;  
> 
> Bleep blorp. If we're modifying _submit, we'll need to be swift about
> merging this ahead of the main code to make sure we don't break the
> UABI. Although I guess if we're just adding fields at the end, that's a
> nonissue.

Others are using the same "if data passed is smaller than expected
size, unassigned fields are zeroed". That allows us to extend a struct
without breaking the ABI as long as zero is a valid value and does not
change the behavior compared to when the field was not present.

This is the case here: perfmon_handle_count = 0 means no perfmon
attached to the job, so the driver is acting like it previously was.

No need to get that part merged in the initial patch series IMO.
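
For reference, new userspace attaching a perfmon to a job would look
like this (a sketch, error handling omitted; perfmon_id comes from
DRM_IOCTL_PANFROST_CREATE_PERFMON):

        __u32 handles[] = { perfmon_id };
        struct drm_panfrost_submit submit = {0};

        /* jc, bo_handles, requirements, ... filled in as before. */
        submit.perfmon_handles = (__u64)(uintptr_t)handles;
        submit.perfmon_handle_count = 1; /* 0 == no perfmon, old behavior */
        drmIoctl(fd, DRM_IOCTL_PANFROST_SUBMIT, &submit);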

> 
> > +struct drm_panfrost_block_perfcounters {
> > +	/*
> > +	 * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
> > +	 * instances for a given block type.
> > +	 * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the instances the
> > +	 * user wants to monitor.
> > +	 * Note: the bitmap might be sparse.
> > +	 */
> > +	__u64 instances;
> > +
> > +	/*
> > +	 * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
> > +	 * counters attached to a specific block type.
> > +	 * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the counters the user
> > +	 * wants to monitor.
> > +	 * Note: the bitmap might be sparse.
> > +	 */
> > +	__u64 counters;
> > +};  
> 
> I don't understand this. Aren't there more than 64 counters?
> 
> > +struct drm_panfrost_get_perfcnt_layout {
> > +	struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
> > +};  
> 
> --Oh. It's per-block. Got it.
> 
> > + * Used to create a performance monitor. Each perfmonance monitor is assigned an  
> 
> Typo.

Will fix.

> 
> ---
> 
> Overall, this looks really great! Thank you! :)

Thanks a lot for your reviews. That was pretty damn fast!
Alyssa Rosenzweig April 4, 2019, 10:40 p.m. UTC | #3
> memcmp() does not account for the case where 2 jobs contain exactly the
> same perfmons but in a different order. This being said, it's rather
> unlikely to happen, so maybe we can accept the perf penalty for that
> case.

If you say so!

> Yes, all numbers above 0xfff are bifrost GPUs. I'll add a macro.

Thank you.

> Will copy the errata.

I was planning to include annotations for the whole file, but that's low
prio.

> Actually, that one is from me, hence the 'not sure why' part :).

Alright...

> Others are using the same "if data passed is smaller than expected
> size, unassigned fields are zeroed". That allows us to extend a struct
> without breaking the ABI as long as zero is a valid value and does not
> change the behavior compared to when the field was not present.
> 
> This is the case here: perfmon_handle_count = 0 means no perfmon
> attached to the job, so the driver is acting like it previously was.
> 
> No need to get that part merged in the initial patch series IMO.

+1

> Thanks a lot for your reviews. That was pretty damn fast!

Thanks a lot for your code! \ o /
Eric Anholt April 5, 2019, 3:36 p.m. UTC | #4
Alyssa Rosenzweig <alyssa@rosenzweig.io> writes:
>> @@ -55,6 +63,15 @@ struct drm_panfrost_submit {
>>  
>>  	/** A combination of PANFROST_JD_REQ_* */
>>  	__u32 requirements;
>> +
>> +	/** Pointer to a u32 array of perfmons that should be attached to the job. */
>> +	__u64 perfmon_handles;
>> +
>> +	/** Number of perfmon handles passed in (size is that times 4). */
>> +	__u32 perfmon_handle_count;
>> +
>> +	/** Unused field, should be set to 0. */
>> +	__u32 padding;
>
> Bleep blorp. If we're modifying _submit, we'll need to be swift about
> merging this ahead of the main code to make sure we don't break the
> UABI. Although I guess if we're just adding fields at the end, that's a
> nonissue.

You can extend ioctl structs safely.  When older userspace passes theirs
in, it has the shorter length encoded in the cmd. The kernel allocates
the newest version's space, copies in the shorter struct, and
zero-extends the rest.
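
Roughly, the core of drm_ioctl() does this (a paraphrased sketch, not
the exact upstream code):

        u32 usize = _IOC_SIZE(cmd);  /* size encoded by (old) userspace */
        u32 ksize = sizeof(struct drm_panfrost_submit); /* current size */
        char kdata[512] = { 0 };     /* big enough for any ioctl struct */

        if (copy_from_user(kdata, (void __user *)arg, min(usize, ksize)))
                return -EFAULT;
        /* Bytes [usize, ksize) stay zero, so new fields read as 0. */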
Alyssa Rosenzweig April 5, 2019, 4:17 p.m. UTC | #5
> You can extend ioctl structs safely.  When older userspace passes theirs
> in, it has the shorter length encoded in the cmd. The kernel allocates
> the newest version's space, copies in the shorter struct, and
> zero-extends the rest.

Understood, thank you!

Patch

diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile
index d07e0971b687..31cfb9d25682 100644
--- a/drivers/gpu/drm/panfrost/Makefile
+++ b/drivers/gpu/drm/panfrost/Makefile
@@ -6,6 +6,7 @@  panfrost-y := \
 	panfrost_gem.o \
 	panfrost_gpu.o \
 	panfrost_job.o \
-	panfrost_mmu.o
+	panfrost_mmu.o \
+	panfrost_perfcnt.o
 
 obj-$(CONFIG_DRM_PANFROST) += panfrost.o
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index 148b5caa2322..f6a87bfa486b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -13,6 +13,7 @@ 
 #include "panfrost_gpu.h"
 #include "panfrost_job.h"
 #include "panfrost_mmu.h"
+#include "panfrost_perfcnt.h"
 
 static int panfrost_reset_init(struct panfrost_device *pfdev)
 {
@@ -147,7 +148,13 @@  int panfrost_device_init(struct panfrost_device *pfdev)
 	pm_runtime_mark_last_busy(pfdev->dev);
 	pm_runtime_put_autosuspend(pfdev->dev);
 
+	err = panfrost_perfcnt_init(pfdev);
+	if (err)
+		goto err_out5;
+
 	return 0;
+err_out5:
+	panfrost_job_fini(pfdev);
 err_out4:
 	panfrost_mmu_fini(pfdev);
 err_out3:
@@ -163,6 +170,7 @@  int panfrost_device_init(struct panfrost_device *pfdev)
 
 void panfrost_device_fini(struct panfrost_device *pfdev)
 {
+	panfrost_perfcnt_fini(pfdev);
 	panfrost_job_fini(pfdev);
 	panfrost_mmu_fini(pfdev);
 	panfrost_gpu_fini(pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index a821b50a14c3..f7c4e9e55f1b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -9,11 +9,13 @@ 
 #include <drm/drm_device.h>
 #include <drm/drm_mm.h>
 #include <drm/gpu_scheduler.h>
+#include <drm/panfrost_drm.h>
 
 struct panfrost_device;
 struct panfrost_mmu;
 struct panfrost_job_slot;
 struct panfrost_job;
+struct panfrost_perfcnt;
 
 #define NUM_JOB_SLOTS 3
 
@@ -45,6 +47,8 @@  struct panfrost_features {
 
 	unsigned long hw_features[64 / BITS_PER_LONG];
 	unsigned long hw_issues[64 / BITS_PER_LONG];
+
+	struct drm_panfrost_block_perfcounters perfcnt_layout[PANFROST_NUM_BLOCKS];
 };
 
 struct panfrost_device {
@@ -70,6 +74,8 @@  struct panfrost_device {
 	struct panfrost_job *jobs[NUM_JOB_SLOTS];
 	struct list_head scheduled_jobs;
 
+	struct panfrost_perfcnt *perfcnt;
+
 	struct mutex sched_lock;
 };
 
@@ -77,6 +83,11 @@  struct panfrost_file_priv {
 	struct panfrost_device *pfdev;
 
 	struct drm_sched_entity sched_entity[NUM_JOB_SLOTS];
+
+	struct {
+		struct idr idr;
+		struct mutex lock;
+	} perfmon;
 };
 
 static inline struct panfrost_device *to_panfrost_device(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 8cffb70a3548..e5375b31627f 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -19,6 +19,7 @@ 
 #include "panfrost_mmu.h"
 #include "panfrost_job.h"
 #include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
 
 static int panfrost_ioctl_get_param(struct drm_device *ddev, void *data, struct drm_file *file)
 {
@@ -219,6 +220,10 @@  static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
 	if (ret)
 		goto fail;
 
+	ret = panfrost_perfcnt_create_job_ctx(job, file, args);
+	if (ret)
+		goto fail;
+
 	ret = panfrost_job_push(job);
 	if (ret)
 		goto fail;
@@ -313,6 +318,7 @@  panfrost_open(struct drm_device *dev, struct drm_file *file)
 {
 	struct panfrost_device *pfdev = dev->dev_private;
 	struct panfrost_file_priv *panfrost_priv;
+	int ret;
 
 	panfrost_priv = kzalloc(sizeof(*panfrost_priv), GFP_KERNEL);
 	if (!panfrost_priv)
@@ -321,7 +327,16 @@  panfrost_open(struct drm_device *dev, struct drm_file *file)
 	panfrost_priv->pfdev = pfdev;
 	file->driver_priv = panfrost_priv;
 
-	return panfrost_job_open(panfrost_priv);
+	ret = panfrost_job_open(panfrost_priv);
+	if (ret)
+		goto err_free_priv;
+
+	panfrost_perfcnt_open(panfrost_priv);
+	return 0;
+
+err_free_priv:
+	kfree(panfrost_priv);
+	return ret;
 }
 
 static void
@@ -329,6 +344,7 @@  panfrost_postclose(struct drm_device *dev, struct drm_file *file)
 {
 	struct panfrost_file_priv *panfrost_priv = file->driver_priv;
 
+	panfrost_perfcnt_close(panfrost_priv);
 	panfrost_job_close(panfrost_priv);
 
 	kfree(panfrost_priv);
@@ -348,6 +364,10 @@  static const struct drm_ioctl_desc panfrost_drm_driver_ioctls[] = {
 	PANFROST_IOCTL(MMAP_BO,		mmap_bo,	DRM_RENDER_ALLOW),
 	PANFROST_IOCTL(GET_PARAM,	get_param,	DRM_RENDER_ALLOW),
 	PANFROST_IOCTL(GET_BO_OFFSET,	get_bo_offset,	DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(GET_PERFCNT_LAYOUT, get_perfcnt_layout, DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(CREATE_PERFMON,	create_perfmon,	DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(DESTROY_PERFMON,	destroy_perfmon, DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(GET_PERFMON_VALUES, get_perfmon_values, DRM_RENDER_ALLOW),
 };
 
 DEFINE_DRM_GEM_SHMEM_FOPS(panfrost_drm_driver_fops);
diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c
index d46d36170e18..c28a31c547cc 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gpu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c
@@ -13,6 +13,7 @@ 
 #include "panfrost_features.h"
 #include "panfrost_issues.h"
 #include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
 #include "panfrost_regs.h"
 
 static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
@@ -42,6 +43,12 @@  static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
 		done = true;
 	}
 
+	if (state & GPU_IRQ_PERFCNT_SAMPLE_COMPLETED)
+		panfrost_perfcnt_sample_done(pfdev);
+
+	if (state & GPU_IRQ_CLEAN_CACHES_COMPLETED)
+		panfrost_perfcnt_clean_cache_done(pfdev);
+
 	gpu_write(pfdev, GPU_INT_CLEAR, state);
 
 	return IRQ_HANDLED;
@@ -152,14 +159,16 @@  struct panfrost_model {
 		u32 revision;
 		u64 issues;
 	} revs[MAX_HW_REVS];
+	u64 perfcnt[PANFROST_NUM_BLOCKS];
 };
 
 #define GPU_MODEL(_name, _id, ...) \
-{\
+{								\
 	.name = __stringify(_name),				\
 	.id = _id,						\
 	.features = hw_features_##_name,			\
 	.issues = hw_issues_##_name,				\
+	.perfcnt = hw_perfcnt_##_name,				\
 	.revs = { __VA_ARGS__ },				\
 }
 
@@ -198,13 +207,17 @@  static const struct panfrost_model gpu_models[] = {
 
 static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
 {
+	struct drm_panfrost_block_perfcounters *perfcnt_layout;
 	u32 gpu_id, num_js, major, minor, status, rev;
 	const char *name = "unknown";
 	u64 hw_feat = 0;
-	u64 hw_issues = hw_issues_all;
+	u64 hw_issues = hw_issues_all, mask;
 	const struct panfrost_model *model;
+	unsigned int num;
 	int i;
 
+	perfcnt_layout = pfdev->features.perfcnt_layout;
+
 	pfdev->features.l2_features = gpu_read(pfdev, GPU_L2_FEATURES);
 	pfdev->features.core_features = gpu_read(pfdev, GPU_CORE_FEATURES);
 	pfdev->features.tiler_features = gpu_read(pfdev, GPU_TILER_FEATURES);
@@ -272,9 +285,35 @@  static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
 		if (best >= 0)
 			hw_issues |= model->revs[best].issues;
 
+		for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+			perfcnt_layout[i].counters = model->perfcnt[i];
+
 		break;
 	}
 
+	/* Only one Job Manager. */
+	perfcnt_layout[PANFROST_JM_BLOCK].instances = BIT(0);
+	perfcnt_layout[PANFROST_SHADER_BLOCK].instances =
+						pfdev->features.shader_present;
+
+	/*
+	 * In v4 HW we have one tiler per core group, with the number
+	 * of core groups being equal to the number of L2 caches. Other
+	 * HW versions just have one tiler and the number of L2 caches
+	 * can be extracted from the mem_features field.
+	 */
+	if (hw_feat & HW_FEATURE_V4) {
+		num = hweight64(pfdev->features.l2_present);
+		mask = GENMASK(num - 1, 0);
+		perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+		perfcnt_layout[PANFROST_TILER_BLOCK].instances = mask;
+	} else {
+		perfcnt_layout[PANFROST_TILER_BLOCK].instances = BIT(0);
+		num = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+		mask = GENMASK(num - 1, 0);
+		perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+	}
+
 	bitmap_from_u64(pfdev->features.hw_features, hw_feat);
 	bitmap_from_u64(pfdev->features.hw_issues, hw_issues);
 
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 8d570c3f15d0..c2be61a9ebff 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -15,6 +15,7 @@ 
 #include "panfrost_features.h"
 #include "panfrost_issues.h"
 #include "panfrost_gem.h"
+#include "panfrost_perfcnt.h"
 #include "panfrost_regs.h"
 #include "panfrost_gpu.h"
 #include "panfrost_mmu.h"
@@ -153,6 +154,7 @@  static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
 		goto end;
 
 	spin_lock_irqsave(&pfdev->hwaccess_lock, flags);
+	panfrost_perfcnt_run_job(job);
 
 	job_write(pfdev, JS_HEAD_NEXT_LO(js), jc_head & 0xFFFFFFFF);
 	job_write(pfdev, JS_HEAD_NEXT_HI(js), jc_head >> 32);
@@ -233,6 +235,12 @@  int panfrost_job_push(struct panfrost_job *job)
 		goto unlock;
 	}
 
+	ret = panfrost_perfcnt_push_job(job);
+	if (ret) {
+		mutex_unlock(&pfdev->sched_lock);
+		goto unlock;
+	}
+
 	job->render_done_fence = dma_fence_get(&job->base.s_fence->finished);
 
 	kref_get(&job->refcount); /* put by scheduler job completion */
@@ -272,6 +280,9 @@  static void panfrost_job_cleanup(struct kref *ref)
 
 	for (i = 0; i < job->bo_count; i++)
 		drm_gem_object_put_unlocked(job->bos[i]);
+
+	panfrost_perfcnt_clean_job_ctx(job);
+
 	kvfree(job->bos);
 
 	kfree(job);
@@ -316,6 +327,13 @@  static struct dma_fence *panfrost_job_dependency(struct drm_sched_job *sched_job
 		}
 	}
 
+	/* Return the perfmon wait fence if any. */
+	if (job->perfcnt_fence) {
+		fence = job->perfcnt_fence;
+		job->perfcnt_fence = NULL;
+		return fence;
+	}
+
 	return NULL;
 }
 
@@ -399,6 +417,11 @@  static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 	/* restart scheduler after GPU is usable again */
 	for (i = 0; i < NUM_JOB_SLOTS; i++)
 		drm_sched_start(&pfdev->js->queue[i].sched, true);
+
+	/* For now, just say we're done. No reset and retry. */
+//	job_write(pfdev, JS_COMMAND(js), JS_COMMAND_HARD_STOP);
+	dma_fence_signal(job->done_fence);
+	panfrost_perfcnt_finish_job(job, true);
 }
 
 static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -442,6 +465,7 @@  static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
 
 		if (status & JOB_INT_MASK_DONE(j)) {
 			dma_fence_signal(pfdev->jobs[j]->done_fence);
+			panfrost_perfcnt_finish_job(pfdev->jobs[j], false);
 		}
 
 		status &= ~mask;
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.h b/drivers/gpu/drm/panfrost/panfrost_job.h
index 62454128a792..18646cc5eebb 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.h
+++ b/drivers/gpu/drm/panfrost/panfrost_job.h
@@ -37,6 +37,10 @@  struct panfrost_job {
 
 	/* Fence to be signaled by drm-sched once its done with the job */
 	struct dma_fence *render_done_fence;
+
+	/* Perfcnt context */
+	struct panfrost_perfcnt_job_ctx *perfcnt_ctx;
+	struct dma_fence *perfcnt_fence;
 };
 
 int panfrost_job_init(struct panfrost_device *pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
new file mode 100644
index 000000000000..4491f153ad48
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
@@ -0,0 +1,954 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 Collabora Ltd */
+
+#include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/panfrost_drm.h>
+#include <linux/iopoll.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "panfrost_device.h"
+#include "panfrost_features.h"
+#include "panfrost_gem.h"
+#include "panfrost_issues.h"
+#include "panfrost_job.h"
+#include "panfrost_mmu.h"
+#include "panfrost_regs.h"
+
+#define COUNTERS_PER_BLOCK		64
+#define BYTES_PER_COUNTER		4
+#define BLOCKS_PER_COREGROUP		8
+#define V4_SHADERS_PER_COREGROUP	4
+
+struct panfrost_perfcnt_job_ctx {
+	refcount_t refcount;
+	struct panfrost_device *pfdev;
+	struct dma_fence *wait_fence;
+	struct dma_fence *done_fence;
+	struct panfrost_perfmon **perfmons;
+	u32 perfmon_count;
+};
+
+struct panfrost_perfcnt {
+	struct work_struct dumpwork;
+	u64 fence_context;
+	u64 emit_seqno;
+	spinlock_t fence_lock;
+	struct mutex cfg_lock;
+	u32 cur_cfg[PANFROST_NUM_BLOCKS];
+	struct panfrost_gem_object *bo;
+	void *buf;
+	spinlock_t ctx_lock;
+	struct panfrost_perfcnt_job_ctx *last_ctx;
+	struct panfrost_perfcnt_job_ctx *dump_ctx;
+};
+
+struct panfrost_perfcnt_fence {
+	struct dma_fence base;
+	struct drm_device *dev;
+	u64 seqno;
+};
+
+struct panfrost_perfmon {
+	refcount_t refcnt;
+	atomic_t busycnt;
+	struct wait_queue_head wq;
+	struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+	u32 *values[PANFROST_NUM_BLOCKS];
+};
+
+static inline struct panfrost_perfcnt_fence *
+to_panfrost_perfcnt_fence(struct dma_fence *fence)
+{
+	return container_of(fence, struct panfrost_perfcnt_fence, base);
+}
+
+static const char *
+panfrost_perfcnt_fence_get_driver_name(struct dma_fence *fence)
+{
+	return "panfrost";
+}
+
+static const char *
+panfrost_perfcnt_fence_get_timeline_name(struct dma_fence *fence)
+{
+	return "panfrost-perfcnt";
+}
+
+static const struct dma_fence_ops panfrost_perfcnt_fence_ops = {
+	.get_driver_name = panfrost_perfcnt_fence_get_driver_name,
+	.get_timeline_name = panfrost_perfcnt_fence_get_timeline_name,
+};
+
+static struct dma_fence *
+panfrost_perfcnt_fence_create(struct panfrost_device *pfdev)
+{
+	struct panfrost_perfcnt_fence *fence;
+
+	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+	if (!fence)
+		return ERR_PTR(-ENOMEM);
+
+	fence->dev = pfdev->ddev;
+	fence->seqno = ++pfdev->perfcnt->emit_seqno;
+	dma_fence_init(&fence->base, &panfrost_perfcnt_fence_ops,
+		       &pfdev->perfcnt->fence_lock,
+		       pfdev->perfcnt->fence_context, fence->seqno);
+
+	return &fence->base;
+}
+
+static void panfrost_perfmon_get(struct panfrost_perfmon *perfmon)
+{
+	if (perfmon)
+		refcount_inc(&perfmon->refcnt);
+}
+
+static void panfrost_perfmon_put(struct panfrost_perfmon *perfmon)
+{
+	if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) {
+		unsigned int i;
+
+		for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+			kfree(perfmon->values[i]);
+
+		kfree(perfmon);
+	}
+}
+
+static struct panfrost_perfmon *
+panfrost_perfcnt_find_perfmon(struct panfrost_file_priv *pfile, int id)
+{
+	struct panfrost_perfmon *perfmon;
+
+	mutex_lock(&pfile->perfmon.lock);
+	perfmon = idr_find(&pfile->perfmon.idr, id);
+	panfrost_perfmon_get(perfmon);
+	mutex_unlock(&pfile->perfmon.lock);
+
+	return perfmon;
+}
+
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile)
+{
+	mutex_init(&pfile->perfmon.lock);
+	idr_init(&pfile->perfmon.idr);
+}
+
+static int panfrost_perfcnt_idr_del(int id, void *elem, void *data)
+{
+	struct panfrost_perfmon *perfmon = elem;
+
+	panfrost_perfmon_put(perfmon);
+
+	return 0;
+}
+
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile)
+{
+	mutex_lock(&pfile->perfmon.lock);
+	idr_for_each(&pfile->perfmon.idr, panfrost_perfcnt_idr_del, NULL);
+	idr_destroy(&pfile->perfmon.idr);
+	mutex_unlock(&pfile->perfmon.lock);
+}
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+				      struct drm_file *file_priv)
+{
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct panfrost_device *pfdev = pfile->pfdev;
+	struct drm_panfrost_get_perfcnt_layout *layout = data;
+
+	memcpy(layout->counters, pfdev->features.perfcnt_layout,
+	       sizeof(layout->counters));
+
+	return 0;
+}
+
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+				  struct drm_file *file_priv)
+{
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct panfrost_device *pfdev = pfile->pfdev;
+	struct drm_panfrost_create_perfmon *req = data;
+	struct drm_panfrost_block_perfcounters *layout;
+	struct panfrost_perfmon *perfmon;
+	unsigned int i;
+	int ret;
+
+	if (req->padding)
+		return -EINVAL;
+
+	perfmon = kzalloc(sizeof(*perfmon), GFP_KERNEL);
+	if (!perfmon)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	layout = pfdev->features.perfcnt_layout;
+	for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+		unsigned int ncounters;
+
+		/* Make sure the request matches the available counters. */
+		if (~layout[i].instances & req->counters[i].instances ||
+		    ~layout[i].counters & req->counters[i].counters)
+			goto err_free_perfmon;
+
+		ncounters = hweight64(req->counters[i].instances) *
+			    hweight64(req->counters[i].counters);
+		if (!ncounters)
+			continue;
+
+		perfmon->counters[i] = req->counters[i];
+		perfmon->values[i] = kcalloc(ncounters, sizeof(u32), GFP_KERNEL);
+		if (!perfmon->values[i])
+			goto err_free_perfmon;
+	}
+
+	refcount_set(&perfmon->refcnt, 1);
+	init_waitqueue_head(&perfmon->wq);
+
+	mutex_lock(&pfile->perfmon.lock);
+	ret = idr_alloc(&pfile->perfmon.idr, perfmon, 1, U32_MAX, GFP_KERNEL);
+	mutex_unlock(&pfile->perfmon.lock);
+
+	if (ret < 0)
+		goto err_free_perfmon;
+
+	req->id = ret;
+	return 0;
+
+err_free_perfmon:
+	for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+		kfree(perfmon->values[i]);
+
+	kfree(perfmon);
+	return ret;
+}
+
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+				   struct drm_file *file_priv)
+{
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct drm_panfrost_destroy_perfmon *req = data;
+	struct panfrost_perfmon *perfmon;
+
+	mutex_lock(&pfile->perfmon.lock);
+	perfmon = idr_remove(&pfile->perfmon.idr, req->id);
+	mutex_unlock(&pfile->perfmon.lock);
+
+	if (!perfmon)
+		return -EINVAL;
+
+	panfrost_perfmon_put(perfmon);
+	return 0;
+}
+
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+				      struct drm_file *file_priv)
+{
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct drm_panfrost_get_perfmon_values *req = data;
+	struct panfrost_perfmon *perfmon;
+	unsigned int i;
+	int ret = 0;
+
+	mutex_lock(&pfile->perfmon.lock);
+	perfmon = idr_find(&pfile->perfmon.idr, req->id);
+	panfrost_perfmon_get(perfmon);
+	mutex_unlock(&pfile->perfmon.lock);
+
+	if (!perfmon)
+		return -EINVAL;
+
+	if (!(req->flags & DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT))
+		ret = wait_event_interruptible(perfmon->wq,
+					       !atomic_read(&perfmon->busycnt));
+	else if (atomic_read(&perfmon->busycnt))
+		ret = -EBUSY;
+
+	if (ret)
+		goto out;
+
+	for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+		unsigned int ncounters;
+
+		ncounters = hweight64(perfmon->counters[i].instances) *
+			    hweight64(perfmon->counters[i].counters);
+		if (!ncounters)
+			continue;
+
+		if (copy_to_user(u64_to_user_ptr(req->values_ptrs[i]),
+				 perfmon->values[i],
+				 ncounters * sizeof(u32))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (req->flags & DRM_PANFROST_GET_PERFMON_VALS_RESET)
+			memset(perfmon->values[i], 0, ncounters * sizeof(u32));
+	}
+
+out:
+	panfrost_perfmon_put(perfmon);
+	return ret;
+}
+
+/*
+ * Returns true if the 2 jobs have exactly the same perfcnt context, false
+ * otherwise.
+ */
+static bool panfrost_perfcnt_job_ctx_cmp(struct panfrost_perfcnt_job_ctx *a,
+					 struct panfrost_perfcnt_job_ctx *b)
+{
+	unsigned int i, j;
+
+	if (a->perfmon_count != b->perfmon_count)
+		return false;
+
+	for (i = 0; i < a->perfmon_count; i++) {
+		for (j = 0; j < b->perfmon_count; j++) {
+			if (a->perfmons[i] == b->perfmons[j])
+				break;
+		}
+
+		if (j == b->perfmon_count)
+			return false;
+	}
+
+	return true;
+}
+
+static u32 counters_u64_to_u32(u64 in)
+{
+	unsigned int i;
+	u32 out = 0;
+
+	for (i = 0; i < 64; i += 4) {
+		if (GENMASK(i + 3, i) & in)
+			out |= BIT(i / 4);
+	}
+
+	return out;
+}
+
+void panfrost_perfcnt_run_job(struct panfrost_job *job)
+{
+	struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+	struct panfrost_device *pfdev = job->pfdev;
+	u32 perfcnt_en[PANFROST_NUM_BLOCKS] = { };
+	bool disable_perfcnt = true, config_changed = false;
+	unsigned int i, j;
+	u64 gpuva;
+	u32 cfg;
+
+	mutex_lock(&pfdev->perfcnt->cfg_lock);
+	for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+		for (j = 0; j < ctx->perfmon_count; j++) {
+			u64 counters = ctx->perfmons[j]->counters[i].counters;
+
+			perfcnt_en[i] |= counters_u64_to_u32(counters);
+		}
+
+		if (perfcnt_en[i])
+			disable_perfcnt = false;
+
+		if (perfcnt_en[i] != pfdev->perfcnt->cur_cfg[i]) {
+			pfdev->perfcnt->cur_cfg[i] = perfcnt_en[i];
+			config_changed = true;
+		}
+	}
+	mutex_unlock(&pfdev->perfcnt->cfg_lock);
+
+	if (!config_changed)
+		return;
+
+	/*
+	 * Always use address space 0 for now.
+	 * FIXME: this needs to be updated when we start using different
+	 * address space.
+	 */
+	cfg = GPU_PERFCNT_CFG_AS(0);
+	if (panfrost_model_cmp(pfdev, 0x1000) >= 0)
+		cfg |= GPU_PERFCNT_CFG_SETSEL(1);
+
+	gpu_write(pfdev, GPU_PERFCNT_CFG,
+		  cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
+
+	if (disable_perfcnt)
+		return;
+
+	gpu_write(pfdev, GPU_PRFCNT_JM_EN, perfcnt_en[PANFROST_JM_BLOCK]);
+	gpu_write(pfdev, GPU_PRFCNT_SHADER_EN,
+		  perfcnt_en[PANFROST_SHADER_BLOCK]);
+	gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN,
+		  perfcnt_en[PANFROST_MMU_L2_BLOCK]);
+	gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;
+	gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
+	gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
+
+	/*
+	 * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
+	 * counters.
+	 */
+	if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+		gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+	else
+		gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+			  perfcnt_en[PANFROST_TILER_BLOCK]);
+
+	gpu_write(pfdev, GPU_PERFCNT_CFG,
+		  cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));
+
+	if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+		gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+			  perfcnt_en[PANFROST_TILER_BLOCK]);
+}
+
+static void
+panfrost_perfcnt_release_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+	unsigned int i;
+
+	WARN_ON(refcount_read(&ctx->refcount));
+	for (i = 0; i < ctx->perfmon_count; i++) {
+		if (atomic_dec_and_test(&ctx->perfmons[i]->busycnt))
+			wake_up(&ctx->perfmons[i]->wq);
+		panfrost_perfmon_put(ctx->perfmons[i]);
+	}
+
+	dma_fence_put(ctx->wait_fence);
+	dma_fence_put(ctx->done_fence);
+	kfree(ctx->perfmons);
+	kfree(ctx);
+}
+
+static void panfrost_perfcnt_put_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+	if (!IS_ERR_OR_NULL(ctx) && refcount_dec_and_test(&ctx->refcount))
+		panfrost_perfcnt_release_job_ctx(ctx);
+}
+
+struct panfrost_perfcnt_job_ctx *
+panfrost_perfcnt_get_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+	if (ctx)
+		refcount_inc(&ctx->refcount);
+
+	return ctx;
+}
+
+static void panfrost_perfcnt_dump_done(struct panfrost_perfcnt_job_ctx *ctx)
+{
+	struct panfrost_device *pfdev;
+	unsigned long flags;
+
+	pfdev = ctx->pfdev;
+	spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+	pfdev->perfcnt->dump_ctx = NULL;
+	if (pfdev->perfcnt->last_ctx == ctx)
+		pfdev->perfcnt->last_ctx = NULL;
+	spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+
+	dma_fence_signal(ctx->done_fence);
+	panfrost_perfcnt_release_job_ctx(ctx);
+}
+
+static void
+panfrost_perfcnt_get_counter_vals(struct panfrost_device *pfdev,
+				  enum drm_panfrost_block_id block,
+				  unsigned int instance, u32 *vals)
+{
+	u64 shader_present = pfdev->features.shader_present;
+	unsigned int bufoffs, shaderid, shadernum;
+
+	if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+		unsigned int ncoregroups;
+
+		ncoregroups = hweight64(pfdev->features.l2_present);
+
+		switch (block) {
+		case PANFROST_SHADER_BLOCK:
+			for (shaderid = 0, shadernum = 0; shaderid < 64;
+			     shaderid++) {
+				if (!(BIT_ULL(shaderid) & shader_present))
+					continue;
+
+				if (shadernum == instance)
+					break;
+
+				shadernum++;
+			}
+
+			if (WARN_ON(shaderid == 64))
+				return;
+
+			/* 4 shaders per core group. */
+			bufoffs = ((shaderid / V4_SHADERS_PER_COREGROUP) *
+				   2048) +
+				  ((shaderid % V4_SHADERS_PER_COREGROUP) *
+				   256);
+			break;
+
+		case PANFROST_TILER_BLOCK:
+			if (WARN_ON(instance >= ncoregroups))
+				return;
+
+			bufoffs = (instance * 2048) + 1024;
+			break;
+		case PANFROST_MMU_L2_BLOCK:
+			if (WARN_ON(instance >= ncoregroups))
+				return;
+
+			bufoffs = (instance * 2048) + 1280;
+			break;
+		case PANFROST_JM_BLOCK:
+			if (WARN_ON(instance))
+				return;
+			bufoffs = 1792;
+			break;
+		default:
+			WARN_ON(1);
+			return;
+		}
+	} else {
+		unsigned int nl2c, ncores;
+
+		/*
+		 * TODO: define a macro to extract the number of l2 caches from
+		 * mem_features.
+		 */
+		nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+		/*
+		 * The ARM driver is grouping cores per core group and then
+		 * only using the number of cores in group 0 to calculate the
+		 * size. Not sure why this is done like that, but I guess
+		 * shader_present will only show cores in the first group
+		 * anyway.
+		 */
+		ncores = hweight64(pfdev->features.shader_present);
+
+		switch (block) {
+		case PANFROST_SHADER_BLOCK:
+			for (shaderid = 0, shadernum = 0; shaderid < 64;
+			     shaderid++) {
+				if (!(BIT_ULL(shaderid) & shader_present))
+					continue;
+
+				if (shadernum == instance)
+					break;
+
+				shadernum++;
+			}
+
+			if (WARN_ON(shaderid == 64))
+				return;
+
+			/* 4 shaders per core group. */
+			bufoffs = 512 + ((nl2c + shaderid) * 256);
+			break;
+
+		case PANFROST_TILER_BLOCK:
+			if (WARN_ON(instance))
+				return;
+
+			bufoffs = 256;
+			break;
+		case PANFROST_MMU_L2_BLOCK:
+			if (WARN_ON(instance >= nl2c))
+				return;
+
+			bufoffs = 512 + (instance * 256);
+			break;
+		case PANFROST_JM_BLOCK:
+			if (WARN_ON(instance))
+				return;
+			bufoffs = 0;
+			break;
+		default:
+			WARN_ON(1);
+			return;
+		}
+	}
+
+	memcpy(vals, pfdev->perfcnt->buf + bufoffs, 256);
+}
+
+static void
+panfrost_perfmon_upd_counter_vals(struct panfrost_perfmon *perfmon,
+				  enum drm_panfrost_block_id block,
+				  unsigned int instance, u32 *invals)
+{
+	u32 *outvals = perfmon->values[block];
+	unsigned int inidx, outidx;
+
+	if (WARN_ON(instance >= hweight64(perfmon->counters[block].instances)))
+		return;
+
+	if (!(perfmon->counters[block].instances & BIT_ULL(instance)))
+		return;
+
+	outvals += instance * hweight64(perfmon->counters[block].counters);
+	for (inidx = 0, outidx = 0; inidx < 64; inidx++) {
+		if (!(perfmon->counters[block].counters & BIT_ULL(inidx)))
+			continue;
+
+		if (U32_MAX - outvals[outidx] < invals[inidx])
+			outvals[outidx] = U32_MAX;
+		else
+			outvals[outidx] += invals[inidx];
+		outidx++;
+	}
+}
+
+static void panfrost_perfcnt_dump_work(struct work_struct *w)
+{
+	struct panfrost_perfcnt *perfcnt = container_of(w,
+						struct panfrost_perfcnt,
+						dumpwork);
+	struct panfrost_perfcnt_job_ctx *ctx = perfcnt->dump_ctx;
+	unsigned int block, instance, pmonidx, num;
+
+	if (!ctx)
+		return;
+
+	for (block = 0; block < PANFROST_NUM_BLOCKS; block++) {
+		struct panfrost_perfmon *perfmon;
+		u32 vals[COUNTERS_PER_BLOCK];
+		u64 instances = 0;
+
+		for (pmonidx = 0; pmonidx < ctx->perfmon_count; pmonidx++) {
+			perfmon = ctx->perfmons[pmonidx];
+			instances |= perfmon->counters[block].instances;
+		}
+
+		for (instance = 0, num = 0; instance < 64; instance++) {
+			if (!(instances & BIT_ULL(instance)))
+				continue;
+
+			panfrost_perfcnt_get_counter_vals(ctx->pfdev, block,
+							  instance, vals);
+
+			for (pmonidx = 0; pmonidx < ctx->perfmon_count;
+			     pmonidx++) {
+				perfmon = ctx->perfmons[pmonidx];
+				panfrost_perfmon_upd_counter_vals(perfmon,
+								  block,
+								  num,
+								  vals);
+			}
+			num++;
+		}
+	}
+
+	panfrost_perfcnt_dump_done(ctx);
+}
+
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev)
+{
+	schedule_work(&pfdev->perfcnt->dumpwork);
+}
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev)
+{
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES);
+}
+
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job)
+{
+	return panfrost_perfcnt_put_job_ctx(job->perfcnt_ctx);
+}
+
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+				    struct drm_file *file_priv,
+				    struct drm_panfrost_submit *args)
+{
+	struct panfrost_device *pfdev = job->pfdev;
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct panfrost_perfcnt_job_ctx *ctx;
+	unsigned int i, j;
+	u32 *handles;
+	int ret;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->pfdev = pfdev;
+	refcount_set(&ctx->refcount, 1);
+
+	ctx->perfmon_count = args->perfmon_handle_count;
+	if (!ctx->perfmon_count) {
+		job->perfcnt_ctx = ctx;
+		return 0;
+	}
+
+	handles = kcalloc(ctx->perfmon_count, sizeof(u32), GFP_KERNEL);
+	if (!handles) {
+		ret = -ENOMEM;
+		goto err_put_ctx;
+	}
+
+	if (copy_from_user(handles,
+			   u64_to_user_ptr(args->perfmon_handles),
+			   ctx->perfmon_count * sizeof(u32))) {
+		ret = -EFAULT;
+		DRM_DEBUG("Failed to copy in perfmon handles\n");
+		goto err_free_handles;
+	}
+
+	/* Make sure each perfmon only appears once. */
+	for (i = 0; i < ctx->perfmon_count - 1; i++) {
+		for (j = i + 1; j < ctx->perfmon_count; j++) {
+			if (handles[i] == handles[j]) {
+				ret = -EINVAL;
+				goto err_free_handles;
+			}
+		}
+	}
+
+	ctx->perfmons = kcalloc(ctx->perfmon_count, sizeof(*ctx->perfmons),
+				GFP_KERNEL);
+	if (!ctx->perfmons) {
+		ret = -ENOMEM;
+		goto err_free_handles;
+	}
+
+	for (i = 0; i < ctx->perfmon_count; i++) {
+		ctx->perfmons[i] = panfrost_perfcnt_find_perfmon(pfile,
+								 handles[i]);
+		if (!ctx->perfmons[i]) {
+			ret = -EINVAL;
+			goto err_free_handles;
+		}
+		atomic_inc(&ctx->perfmons[i]->busycnt);
+	}
+
+	job->perfcnt_ctx = ctx;
+	kfree(handles);
+	return 0;
+
+err_free_handles:
+	kfree(handles);
+
+err_put_ctx:
+	panfrost_perfcnt_put_job_ctx(ctx);
+	return ret;
+}
+
+void panfrost_perfcnt_finish_job(struct panfrost_job *job, bool skip_dump)
+{
+	struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+
+	if (WARN_ON(!ctx))
+		return;
+
+	job->perfcnt_ctx = NULL;
+	if (!refcount_dec_and_test(&ctx->refcount))
+		return;
+
+	if (!ctx->perfmon_count || skip_dump) {
+		panfrost_perfcnt_dump_done(ctx);
+		return;
+	}
+
+	ctx->pfdev->perfcnt->dump_ctx = ctx;
+	gpu_write(ctx->pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
+}
+
+static bool panfrost_perfcnt_try_reuse_last_job_ctx(struct panfrost_job *job)
+{
+	struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+	struct panfrost_device *pfdev = job->pfdev;
+	unsigned int i;
+
+	new_ctx = job->perfcnt_ctx;
+	prev_ctx = pfdev->perfcnt->last_ctx;
+	if (!prev_ctx)
+		return false;
+
+	if (!refcount_inc_not_zero(&prev_ctx->refcount))
+		return false;
+
+	if (!panfrost_perfcnt_job_ctx_cmp(prev_ctx, new_ctx)) {
+		refcount_dec(&prev_ctx->refcount);
+		return false;
+	}
+
+	/*
+	 * Make sure we increment busycnt, as panfrost_perfcnt_put_job_ctx()
+	 * will decrement it.
+	 */
+	for (i = 0; i < prev_ctx->perfmon_count; i++)
+		atomic_inc(&prev_ctx->perfmons[i]->busycnt);
+
+	panfrost_perfcnt_put_job_ctx(new_ctx);
+	job->perfcnt_ctx = prev_ctx;
+	job->perfcnt_fence = dma_fence_get(prev_ctx->wait_fence);
+	return true;
+}
+
+int panfrost_perfcnt_push_job(struct panfrost_job *job)
+{
+	struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+	struct panfrost_device *pfdev = job->pfdev;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+	new_ctx = job->perfcnt_ctx;
+	prev_ctx = pfdev->perfcnt->last_ctx;
+	/*
+	 * In order to keep things relatively fast even when HW counters are
+	 * enabled we try to avoid having to dump perfcounters at the end of
+	 * each job (which implies making other jobs wait for this dump to
+	 * finish) when that's possible.
+	 * This is only acceptable if all queued jobs share the same perfctx,
+	 * that is, they have the same list of perfmons attached to them. In
+	 * this
+	 * condition we are guaranteed that nothing will increment the counters
+	 * behind our back.
+	 */
+	if (panfrost_perfcnt_try_reuse_last_job_ctx(job))
+		goto out;
+
+	new_ctx->done_fence = panfrost_perfcnt_fence_create(pfdev);
+	if (IS_ERR(new_ctx->done_fence)) {
+		ret = PTR_ERR(new_ctx->done_fence);
+		goto out;
+	}
+
+	/*
+	 * The previous job has a different perfmon ctx, so we must wait for it
+	 * to be done dumping the counters before we can schedule this new job,
+	 * otherwise we might corrupt the counter values.
+	 */
+	if (prev_ctx)
+		new_ctx->wait_fence = dma_fence_get(prev_ctx->done_fence);
+
+	job->perfcnt_fence = dma_fence_get(new_ctx->wait_fence);
+	pfdev->perfcnt->last_ctx = new_ctx;
+
+out:
+	spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+	return ret;
+}
+
+int panfrost_perfcnt_init(struct panfrost_device *pfdev)
+{
+	struct panfrost_perfcnt *perfcnt;
+	struct drm_gem_shmem_object *bo;
+	size_t size;
+	u32 status;
+	int ret;
+
+	if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+		unsigned int ncoregroups;
+
+		ncoregroups = hweight64(pfdev->features.l2_present);
+		size = ncoregroups * BLOCKS_PER_COREGROUP *
+		       COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+	} else {
+		unsigned int nl2c, ncores;
+
+		/*
+		 * TODO: define a macro to extract the number of l2 caches from
+		 * mem_features.
+		 */
+		nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+		/*
+		 * The ARM driver is grouping cores per core group and then
+		 * only using the number of cores in group 0 to calculate the
+		 * size. Not sure why this is done like that, but I guess
+		 * shader_present will only show cores in the first group
+		 * anyway.
+		 */
+		ncores = hweight64(pfdev->features.shader_present);
+
+		/*
+		 * There's always one JM and one Tiler block, hence the '+ 2'
+		 * here.
+		 */
+		size = (nl2c + ncores + 2) *
+		       COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+	}
+
+	perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);
+	if (!perfcnt)
+		return -ENOMEM;
+
+	bo = drm_gem_shmem_create(pfdev->ddev, size);
+	if (IS_ERR(bo))
+		return PTR_ERR(bo);
+
+	perfcnt->bo = to_panfrost_bo(&bo->base);
+
+	/*
+	 * We always use the same buffer, so let's map it once and keep it
+	 * mapped until the driver is unloaded. This might be a problem if
+	 * we start using different AS and the perfcnt BO is not mapped at
+	 * the same GPU virtual address.
+	 */
+	ret = panfrost_mmu_map(perfcnt->bo);
+	if (ret)
+		goto err_put_bo;
+
+	/* Disable everything. */
+	gpu_write(pfdev, GPU_PERFCNT_CFG,
+		  GPU_PERFCNT_CFG_AS(0) |
+		  GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF) |
+		  (panfrost_model_cmp(pfdev, 0x1000) >= 0 ?
+		   GPU_PERFCNT_CFG_SETSEL(1) : 0));
+	gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+
+	perfcnt->buf = drm_gem_vmap(&bo->base);
+	if (IS_ERR(perfcnt->buf)) {
+		ret = PTR_ERR(perfcnt->buf);
+		goto err_put_bo;
+	}
+
+	INIT_WORK(&perfcnt->dumpwork, panfrost_perfcnt_dump_work);
+	mutex_init(&perfcnt->cfg_lock);
+	spin_lock_init(&perfcnt->fence_lock);
+	spin_lock_init(&perfcnt->ctx_lock);
+	perfcnt->fence_context = dma_fence_context_alloc(1);
+	pfdev->perfcnt = perfcnt;
+
+	/*
+	 * Invalidate the cache and clear the counters to start from a fresh
+	 * state.
+	 */
+	gpu_write(pfdev, GPU_INT_MASK, 0);
+	gpu_write(pfdev, GPU_INT_CLEAR, GPU_IRQ_CLEAN_CACHES_COMPLETED);
+
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_CLEAR);
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_INV_CACHES);
+	ret = readl_relaxed_poll_timeout(pfdev->iomem + GPU_INT_RAWSTAT,
+					 status,
+					 status &
+					 GPU_IRQ_CLEAN_CACHES_COMPLETED,
+					 100, 10000);
+	if (ret)
+		goto err_gem_vunmap;
+
+	gpu_write(pfdev, GPU_INT_MASK, GPU_IRQ_MASK_ALL);
+
+	return 0;
+
+err_gem_vunmap:
+	drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+
+err_put_bo:
+	drm_gem_object_put_unlocked(&bo->base);
+	return ret;
+}
+
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev)
+{
+	drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+	drm_gem_object_put_unlocked(&pfdev->perfcnt->bo->base.base);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.h b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
new file mode 100644
index 000000000000..7cbfeb072aa1
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
@@ -0,0 +1,54 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2019 Collabora Ltd */
+#ifndef __PANFROST_PERFCNT_H__
+#define __PANFROST_PERFCNT_H__
+
+#include <linux/bitops.h>
+
+struct panfrost_perfcnt_job_ctx;
+
+#define PERFCNT(_shader, _tiler, _mmu_l2, _jm)		\
+	{ _shader, _tiler, _mmu_l2, _jm }
+#define NO_PERFCNT      PERFCNT(0, 0, 0, 0)
+
+/* FIXME: Declare counters for all models */
+#define hw_perfcnt_t600	NO_PERFCNT
+#define hw_perfcnt_t620	NO_PERFCNT
+#define hw_perfcnt_t720	NO_PERFCNT
+#define hw_perfcnt_t760	NO_PERFCNT
+#define hw_perfcnt_t820	NO_PERFCNT
+#define hw_perfcnt_t830	NO_PERFCNT
+#define hw_perfcnt_t860	NO_PERFCNT
+#define hw_perfcnt_t880	NO_PERFCNT
+#define hw_perfcnt_g76	NO_PERFCNT
+#define hw_perfcnt_g71	NO_PERFCNT
+#define hw_perfcnt_g72	NO_PERFCNT
+#define hw_perfcnt_g51	NO_PERFCNT
+#define hw_perfcnt_g52	NO_PERFCNT
+#define hw_perfcnt_g31	NO_PERFCNT
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev);
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev);
+int panfrost_perfcnt_push_job(struct panfrost_job *job);
+void panfrost_perfcnt_run_job(struct panfrost_job *job);
+void panfrost_perfcnt_finish_job(struct panfrost_job *job,
+				 bool skip_dump);
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job);
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+				    struct drm_file *file_priv,
+				    struct drm_panfrost_submit *args);
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile);
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile);
+int panfrost_perfcnt_init(struct panfrost_device *pfdev);
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev);
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+				      struct drm_file *file_priv);
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+				  struct drm_file *file_priv);
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+				   struct drm_file *file_priv);
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+				      struct drm_file *file_priv);
+
+#endif
diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h
index 42d08860fd76..ea38ac60581c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_regs.h
+++ b/drivers/gpu/drm/panfrost/panfrost_regs.h
@@ -44,12 +44,31 @@ 
 	 GPU_IRQ_MULTIPLE_FAULT)
 #define GPU_CMD				0x30
 #define   GPU_CMD_SOFT_RESET		0x01
+#define   GPU_CMD_PERFCNT_CLEAR		0x03
+#define   GPU_CMD_PERFCNT_SAMPLE	0x04
+#define   GPU_CMD_CLEAN_CACHES		0x07
+#define   GPU_CMD_CLEAN_INV_CACHES	0x08
 #define GPU_STATUS			0x34
+#define   GPU_STATUS_PRFCNT_ACTIVE	BIT(2)
 #define GPU_LATEST_FLUSH_ID		0x38
 #define GPU_FAULT_STATUS		0x3C
 #define GPU_FAULT_ADDRESS_LO		0x40
 #define GPU_FAULT_ADDRESS_HI		0x44
 
+#define GPU_PERFCNT_BASE_LO		0x60
+#define GPU_PERFCNT_BASE_HI		0x64
+#define GPU_PERFCNT_CFG			0x68
+#define   GPU_PERFCNT_CFG_MODE(x)	(x)
+#define   GPU_PERFCNT_CFG_MODE_OFF	0
+#define   GPU_PERFCNT_CFG_MODE_MANUAL	1
+#define   GPU_PERFCNT_CFG_MODE_TILE	2
+#define   GPU_PERFCNT_CFG_AS(x)		((x) << 4)
+#define   GPU_PERFCNT_CFG_SETSEL(x)	((x) << 8)
+#define GPU_PRFCNT_JM_EN		0x6c
+#define GPU_PRFCNT_SHADER_EN		0x70
+#define GPU_PRFCNT_TILER_EN		0x74
+#define GPU_PRFCNT_MMU_L2_EN		0x7c
+
 #define GPU_THREAD_MAX_THREADS		0x0A0	/* (RO) Maximum number of threads per core */
 #define GPU_THREAD_MAX_WORKGROUP_SIZE	0x0A4	/* (RO) Maximum workgroup size */
 #define GPU_THREAD_MAX_BARRIER_SIZE	0x0A8	/* (RO) Maximum threads waiting at a barrier */
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
index 508b9621d9db..e09b35bf6035 100644
--- a/include/uapi/drm/panfrost_drm.h
+++ b/include/uapi/drm/panfrost_drm.h
@@ -18,6 +18,10 @@  extern "C" {
 #define DRM_PANFROST_MMAP_BO			0x03
 #define DRM_PANFROST_GET_PARAM			0x04
 #define DRM_PANFROST_GET_BO_OFFSET		0x05
+#define DRM_PANFROST_GET_PERFCNT_LAYOUT		0x06
+#define DRM_PANFROST_CREATE_PERFMON		0x07
+#define DRM_PANFROST_DESTROY_PERFMON		0x08
+#define DRM_PANFROST_GET_PERFMON_VALUES		0x09
 
 #define DRM_IOCTL_PANFROST_SUBMIT		DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_SUBMIT, struct drm_panfrost_submit)
 #define DRM_IOCTL_PANFROST_WAIT_BO		DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_WAIT_BO, struct drm_panfrost_wait_bo)
@@ -25,6 +29,10 @@  extern "C" {
 #define DRM_IOCTL_PANFROST_MMAP_BO		DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_MMAP_BO, struct drm_panfrost_mmap_bo)
 #define DRM_IOCTL_PANFROST_GET_PARAM		DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PARAM, struct drm_panfrost_get_param)
 #define DRM_IOCTL_PANFROST_GET_BO_OFFSET	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_BO_OFFSET, struct drm_panfrost_get_bo_offset)
+#define DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFCNT_LAYOUT, struct drm_panfrost_get_perfcnt_layout)
+#define DRM_IOCTL_PANFROST_CREATE_PERFMON	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_CREATE_PERFMON, struct drm_panfrost_create_perfmon)
+#define DRM_IOCTL_PANFROST_DESTROY_PERFMON	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_DESTROY_PERFMON, struct drm_panfrost_destroy_perfmon)
+#define DRM_IOCTL_PANFROST_GET_PERFMON_VALUES	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFMON_VALUES, struct drm_panfrost_get_perfmon_values)
 
 #define PANFROST_JD_REQ_FS (1 << 0)
 /**
@@ -55,6 +63,15 @@  struct drm_panfrost_submit {
 
 	/** A combination of PANFROST_JD_REQ_* */
 	__u32 requirements;
+
+	/** Pointer to a u32 array of perfmons that should be attached to the job. */
+	__u64 perfmon_handles;
+
+	/** Number of perfmon handles passed in (size is that times 4). */
+	__u32 perfmon_handle_count;
+
+	/** Unused field, should be set to 0. */
+	__u32 padding;
 };
 
 /**
@@ -133,6 +150,111 @@  struct drm_panfrost_get_bo_offset {
 	__u64 offset;
 };
 
+/**
+ * Panfrost HW block ids used to group HW counters. There might be several
+ * shader, tiler and MMU/L2 blocks in a given GPU. How many of them are
+ * available is exposed through the instances field of
+ * drm_panfrost_block_perfcounters.
+ */
+enum drm_panfrost_block_id {
+	PANFROST_SHADER_BLOCK,
+	PANFROST_TILER_BLOCK,
+	PANFROST_MMU_L2_BLOCK,
+	PANFROST_JM_BLOCK,
+	PANFROST_NUM_BLOCKS,
+};
+
+struct drm_panfrost_block_perfcounters {
+	/*
+	 * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+	 * instances for a given block type.
+	 * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the instances the
+	 * user wants to monitor.
+	 * Note: the bitmap might be sparse.
+	 */
+	__u64 instances;
+
+	/*
+	 * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+	 * counters attached to a specific block type.
+	 * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the counters the user
+	 * wants to monitor.
+	 * Note: the bitmap might be sparse.
+	 */
+	__u64 counters;
+};
+
+/**
+ * Used to retrieve available HW counters.
+ */
+struct drm_panfrost_get_perfcnt_layout {
+	struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+};
+
+/**
+ * Used to create a performance monitor. Each perfmonance monitor is assigned an
+ * ID that can later be passed when submitting a job to capture hardware counter
+ * values (and thus count things related to this specific job).
+ * Performance monitors are attached to the GPU file descriptor and IDs are
+ * unique within this context, not across all GPU users.
+ * This implies that
+ * - perfmons are automatically released when the FD is closed
+ * - perfmons can't be shared across GPU context
+ */
+struct drm_panfrost_create_perfmon {
+	/* Input Fields. */
+	/* List all HW counters this performance monitor should track. */
+	struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+
+	/* Output fields. */
+	/* ID of the newly created perfmon. */
+	__u32 id;
+
+	/* Padding: must be set to 0. */
+	__u32 padding;
+};
+
+/**
+ * Destroy an existing performance monitor.
+ */
+struct drm_panfrost_destroy_perfmon {
+	/*
+	 * ID of the perfmon to destroy (the one returned by
+	 * DRM_IOCTL_PANFROST_CREATE_PERFMON)
+	 */
+	__u32 id;
+};
+
+/*
+ * Don't wait when trying to get perfmon values. If the perfmon is still active
+ * (still attached to a queued or running job), EBUSY is returned.
+ */
+#define DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT		0x1
+
+/* Reset all perfmon values to zero after reading them. */
+#define DRM_PANFROST_GET_PERFMON_VALS_RESET		0x2
+
+/**
+ * Used to query values collected by a performance monitor.
+ */
+struct drm_panfrost_get_perfmon_values {
+	/* ID of the perfmon to query value on. */
+	__u32 id;
+
+	/* See DRM_PANFROST_GET_PERFMON_VALS_XXX flags */
+	__u32 flags;
+
+	/*
+	 * An array of u32 userspace pointers where counter values will be
+	 * copied to.
+	 * The array sizes depend on the counters/instances activated at
+	 * perfmon creation time: hweight64(instances) * hweight64(counters).
+	 * Note that some entries in values_ptrs[] might be NULL if no counters
+	 * on a specific block were activated.
+	 */
+	__u64 values_ptrs[PANFROST_NUM_BLOCKS];
+};
+
 #if defined(__cplusplus)
 }
 #endif