[3/5] perf: Add pmu get/put

Message ID 20241008183501.1354695-4-lucas.demarchi@intel.com (mailing list archive)
State New, archived
Series perf: Fix pmu for drivers with bind/unbind

Commit Message

Lucas De Marchi Oct. 8, 2024, 6:34 p.m. UTC
If a pmu is unregistered while there's an active event, perf will still
access the pmu via event->pmu, even after the event is destroyed. This
makes it difficult for drivers like i915 that can be unbound from the
HW.

	BUG: KASAN: use-after-free in exclusive_event_destroy+0xd8/0xf0
	Read of size 4 at addr ffff88816e2bb63c by task perf/7748

i915 tries to cope with it by installing an event->destroy hook, but
that is not sufficient: if the pmu is released by the driver, it will
still crash since event->pmu is still used.

Moreover, even with that use-after-free fixed by adjusting the order in
_free_event() or delaying the free in the driver, the kernel still
oopses when closing the event fd of an unregistered pmu: the percpu
variables allocated in perf_pmu_register() have already been released by
then. One such crash is:

	BUG: KASAN: user-memory-access in _raw_spin_lock_irqsave+0x88/0x100
	Write of size 4 at addr 00000000ffffffff by task perf/727

	CPU: 0 UID: 0 PID: 727 Comm: perf Not tainted 6.12.0-rc1-DEMARCHI-dxnf+ #9
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022
	Call Trace:
	 <TASK>
	 dump_stack_lvl+0x5f/0x90
	 print_report+0x4d3/0x50a
	 ? __pfx__raw_spin_lock_irqsave+0x10/0x10
	 ? kasan_addr_to_slab+0xd/0xb0
	 kasan_report+0xe2/0x170
	 ? _raw_spin_lock_irqsave+0x88/0x100
	 ? _raw_spin_lock_irqsave+0x88/0x100
	 kasan_check_range+0x125/0x230
	 __kasan_check_write+0x14/0x30
	 _raw_spin_lock_irqsave+0x88/0x100
	 ? __pfx__raw_spin_lock_irqsave+0x10/0x10
	 _atomic_dec_and_raw_lock_irqsave+0x89/0x110
	 ? __kasan_check_write+0x14/0x30
	 put_pmu_ctx+0x98/0x330

The fix here is to provide a set of get/put hooks that drivers can
implement to piggyback perf's pmu lifecycle onto the driver's instance
lifecycle.  With this, perf_pmu_unregister() can be called by the
driver, which is then responsible for freeing the resources.

Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 include/linux/perf_event.h | 12 ++++++++++++
 kernel/events/core.c       | 37 ++++++++++++++++++++++++++++++++-----
 2 files changed, 44 insertions(+), 5 deletions(-)
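
For illustration - the diff body itself is not shown on this archive
page - the review discussion below implies the new hooks are plain
reference callbacks on struct pmu, called by the core next to
try_module_get()/module_put(). A hypothetical driver wiring, with the
hook names and signatures assumed rather than taken from the patch,
might look like:

	#include <linux/module.h>
	#include <linux/perf_event.h>

	/* Hypothetical sketch: the .get/.put names and signatures are
	 * assumed from the discussion below, not copied from the patch. */
	static void my_pmu_get(struct pmu *pmu)
	{
		/* pin driver data that must outlive a device unbind */
	}

	static void my_pmu_put(struct pmu *pmu)
	{
		/* drop that pin; the last put may free the container */
	}

	static struct pmu my_pmu = {
		.module	= THIS_MODULE,
		.get	= my_pmu_get,
		.put	= my_pmu_put,
	};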

Comments

kernel test robot Oct. 9, 2024, 10:44 a.m. UTC | #1
Hi Lucas,

kernel test robot noticed the following build warnings:

[auto build test WARNING on perf-tools-next/perf-tools-next]
[cannot apply to tip/perf/core perf-tools/perf-tools acme/perf/core linus/master v6.12-rc2 next-20241008]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Lucas-De-Marchi/perf-Add-dummy-pmu-module/20241009-023728
base:   https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git perf-tools-next
patch link:    https://lore.kernel.org/r/20241008183501.1354695-4-lucas.demarchi%40intel.com
patch subject: [PATCH 3/5] perf: Add pmu get/put
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20241009/202410091848.aRUoRWWD-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241009/202410091848.aRUoRWWD-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410091848.aRUoRWWD-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> kernel/events/core.c:5235:17: warning: unused variable 'module' [-Wunused-variable]
    5235 |         struct module *module;
         |                        ^~~~~~
   1 warning generated.


vim +/module +5235 kernel/events/core.c

  5232	
  5233	static void _free_event(struct perf_event *event)
  5234	{
> 5235		struct module *module;
  5236	
  5237		irq_work_sync(&event->pending_irq);
  5238		irq_work_sync(&event->pending_disable_irq);
  5239		perf_pending_task_sync(event);
  5240	
  5241		unaccount_event(event);
  5242	
  5243		security_perf_event_free(event);
  5244	
  5245		if (event->rb) {
  5246			/*
  5247			 * Can happen when we close an event with re-directed output.
  5248			 *
  5249			 * Since we have a 0 refcount, perf_mmap_close() will skip
  5250			 * over us; possibly making our ring_buffer_put() the last.
  5251			 */
  5252			mutex_lock(&event->mmap_mutex);
  5253			ring_buffer_attach(event, NULL);
  5254			mutex_unlock(&event->mmap_mutex);
  5255		}
  5256	
  5257		if (is_cgroup_event(event))
  5258			perf_detach_cgroup(event);
  5259	
  5260		if (!event->parent) {
  5261			if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
  5262				put_callchain_buffers();
  5263		}
  5264	
  5265		perf_event_free_bpf_prog(event);
  5266		perf_addr_filters_splice(event, NULL);
  5267		kfree(event->addr_filter_ranges);
  5268	
  5269		if (event->destroy)
  5270			event->destroy(event);
  5271	
  5272		/*
  5273		 * Must be after ->destroy(), due to uprobe_perf_close() using
  5274		 * hw.target.
  5275		 */
  5276		if (event->hw.target)
  5277			put_task_struct(event->hw.target);
  5278	
  5279		if (event->pmu_ctx)
  5280			put_pmu_ctx(event->pmu_ctx);
  5281	
  5282		/*
  5283		 * perf_event_free_task() relies on put_ctx() being 'last', in particular
  5284		 * all task references must be cleaned up.
  5285		 */
  5286		if (event->ctx)
  5287			put_ctx(event->ctx);
  5288	
  5289		exclusive_event_destroy(event);
  5290	
  5291		pmu_module_put(&event->pmu);
  5292	
  5293		call_rcu(&event->rcu_head, free_event_rcu);
  5294	}
  5295
Peter Zijlstra Oct. 14, 2024, 5:32 p.m. UTC | #2
On Tue, Oct 08, 2024 at 01:34:59PM -0500, Lucas De Marchi wrote:
> If a pmu is unregistered while there's an active event, perf will still
> access the pmu via event->pmu, even after the event is destroyed. This
> makes it difficult for drivers like i915 that can be unbound from the
> HW.
> 
> 	BUG: KASAN: use-after-free in exclusive_event_destroy+0xd8/0xf0
> 	Read of size 4 at addr ffff88816e2bb63c by task perf/7748
> 
> i915 tries to cope with it by installing an event->destroy hook, but
> that is not sufficient: if the pmu is released by the driver, it will
> still crash since event->pmu is still used.
> 
> Moreover, even with that use-after-free fixed by adjusting the order in
> _free_event() or delaying the free in the driver, the kernel still
> oopses when closing the event fd of an unregistered pmu: the percpu
> variables allocated in perf_pmu_register() have already been released by
> then. One such crash is:
> 
> 	BUG: KASAN: user-memory-access in _raw_spin_lock_irqsave+0x88/0x100
> 	Write of size 4 at addr 00000000ffffffff by task perf/727
> 
> 	CPU: 0 UID: 0 PID: 727 Comm: perf Not tainted 6.12.0-rc1-DEMARCHI-dxnf+ #9
> 	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022
> 	Call Trace:
> 	 <TASK>
> 	 dump_stack_lvl+0x5f/0x90
> 	 print_report+0x4d3/0x50a
> 	 ? __pfx__raw_spin_lock_irqsave+0x10/0x10
> 	 ? kasan_addr_to_slab+0xd/0xb0
> 	 kasan_report+0xe2/0x170
> 	 ? _raw_spin_lock_irqsave+0x88/0x100
> 	 ? _raw_spin_lock_irqsave+0x88/0x100
> 	 kasan_check_range+0x125/0x230
> 	 __kasan_check_write+0x14/0x30
> 	 _raw_spin_lock_irqsave+0x88/0x100
> 	 ? __pfx__raw_spin_lock_irqsave+0x10/0x10
> 	 _atomic_dec_and_raw_lock_irqsave+0x89/0x110
> 	 ? __kasan_check_write+0x14/0x30
> 	 put_pmu_ctx+0x98/0x330
> 
> The fix here is to provide a set of get/put hooks that drivers can
> implement to piggyback perf's pmu lifecycle onto the driver's instance
> lifecycle.  With this, perf_pmu_unregister() can be called by the
> driver, which is then responsible for freeing the resources.

I'm confused.. probably because I still don't have any clue about
drivers and the above isn't really telling me much either.

I don't see how you get rid of the try_module_get() we do per event;
without that you can't unload the module.

And I don't see how you think it is safe to free a pmu while there are
still events around.

Nor do I really see what these new get/put methods do. I see you call
->put() where we do module_put(), and ->get() near try_module_get(), but
how is that helping?
Lucas De Marchi Oct. 14, 2024, 6:20 p.m. UTC | #3
On Mon, Oct 14, 2024 at 07:32:46PM +0200, Peter Zijlstra wrote:
>On Tue, Oct 08, 2024 at 01:34:59PM -0500, Lucas De Marchi wrote:
>> If a pmu is unregistered while there's an active event, perf will still
>> access the pmu via event->pmu, even after the event is destroyed. This
>> makes it difficult for drivers like i915 that can be unbound from the
>> HW.
>>
>> 	BUG: KASAN: use-after-free in exclusive_event_destroy+0xd8/0xf0
>> 	Read of size 4 at addr ffff88816e2bb63c by task perf/7748
>>
>> i915 tries to cope with it by installing an event->destroy hook, but
>> that is not sufficient: if the pmu is released by the driver, it will
>> still crash since event->pmu is still used.
>>
>> Moreover, even with that use-after-free fixed by adjusting the order in
>> _free_event() or delaying the free in the driver, the kernel still
>> oopses when closing the event fd of an unregistered pmu: the percpu
>> variables allocated in perf_pmu_register() have already been released by
>> then. One such crash is:
>>
>> 	BUG: KASAN: user-memory-access in _raw_spin_lock_irqsave+0x88/0x100
>> 	Write of size 4 at addr 00000000ffffffff by task perf/727
>>
>> 	CPU: 0 UID: 0 PID: 727 Comm: perf Not tainted 6.12.0-rc1-DEMARCHI-dxnf+ #9
>> 	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022
>> 	Call Trace:
>> 	 <TASK>
>> 	 dump_stack_lvl+0x5f/0x90
>> 	 print_report+0x4d3/0x50a
>> 	 ? __pfx__raw_spin_lock_irqsave+0x10/0x10
>> 	 ? kasan_addr_to_slab+0xd/0xb0
>> 	 kasan_report+0xe2/0x170
>> 	 ? _raw_spin_lock_irqsave+0x88/0x100
>> 	 ? _raw_spin_lock_irqsave+0x88/0x100
>> 	 kasan_check_range+0x125/0x230
>> 	 __kasan_check_write+0x14/0x30
>> 	 _raw_spin_lock_irqsave+0x88/0x100
>> 	 ? __pfx__raw_spin_lock_irqsave+0x10/0x10
>> 	 _atomic_dec_and_raw_lock_irqsave+0x89/0x110
>> 	 ? __kasan_check_write+0x14/0x30
>> 	 put_pmu_ctx+0x98/0x330
>>
>> The fix here is to provide a set of get/put hooks that drivers can
>> implement to piggyback perf's pmu lifecycle onto the driver's instance
>> lifecycle.  With this, perf_pmu_unregister() can be called by the
>> driver, which is then responsible for freeing the resources.
>
>I'm confused.. probably because I still don't have any clue about
>drivers and the above isn't really telling me much either.
>
>I don't see how you get rid of the try_module_get() we do per event;
>without that you can't unload the module.

I don't get rid of the try_module_get(). They serve different purposes.
Having a reference to the module prevents the _module_ going away (and
hence the function pointers we call into from perf). It doesn't prevent
the module unbinding from the HW.  A module may have N instances if it's
bound to N devices.

This can be done today to unbind the HW (integrated graphics) from the
i915 module:

	# echo 0000:00:02.0 > /sys/bus/pci/drivers/i915/unbind

The ref taken by these new get()/put() is there to prevent the data
from going away - the driver can use it to take a ref on something that
will survive the unbind.
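
As a sketch of what that could look like - hypothetical code, not
something from this series - the driver can back the pmu with a
refcounted container and have the hooks pin it:

	#include <linux/kref.h>
	#include <linux/perf_event.h>
	#include <linux/slab.h>

	/* hypothetical: a driver-owned container whose lifetime is
	 * decoupled from the bound device; ->get()/->put() pin it so it
	 * survives the unbind */
	struct my_pmu {
		struct pmu base;
		struct kref ref;
	};

	static void my_pmu_release(struct kref *kref)
	{
		kfree(container_of(kref, struct my_pmu, ref));
	}

	static void my_pmu_get(struct pmu *pmu)
	{
		kref_get(&container_of(pmu, struct my_pmu, base)->ref);
	}

	static void my_pmu_put(struct pmu *pmu)
	{
		kref_put(&container_of(pmu, struct my_pmu, base)->ref,
			 my_pmu_release);
	}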

>
>And I don't see how you think it is safe to free a pmu while there are
>still events around.

so, we don't actually free it - the pmu is unregistered but the
`struct pmu` and (possibly) its container are still around after unregister.
When the get/put are used, the driver can keep the data around, which is
then free'd when the last reference is put.

>
>Nor do I really see what these new get/put methods do. I see you call
>->put() where we do module_put(), and ->get() near try_module_get(), but
>how is that helping?

Maybe the specific patches for i915 can help? Patch series:
https://lore.kernel.org/intel-gfx/20241011225430.1219345-1-lucas.demarchi@intel.com/

Important patches here are patches 2 and 3:

- Subject: [PATCH 2/8] drm/i915/pmu: Let resource survive unbind

   Allow the final kfree() to happen at a different time, not
   necessarily together with the call to perf_pmu_unregister().
   Here it uses drmm_add_action() to easily tie the cleanup to the last
   drm ref going away.

- Subject: [PATCH 3/8] drm/i915/pmu: Fix crash due to use-after-free

   This implements the get()/put() so we get/put a reference to the drm
   dev.

These 2 patches for i915 are the equivalent of patch 4 in this series
for the dummy_pmu.
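
Very roughly - paraphrasing the shape rather than copying the actual
patches, and with pmu_to_i915() being a made-up helper - those two do:

	#include <drm/drm_drv.h>
	#include <drm/drm_managed.h>
	#include <linux/slab.h>

	/* patch 2/8: drmm_add_action() runs the callback when the last
	 * drm_device reference is dropped, so the pmu container can
	 * outlive the unbind instead of dying in perf_pmu_unregister() */
	static void i915_pmu_free(struct drm_device *dev, void *arg)
	{
		kfree(arg);
	}

	static int i915_pmu_setup_cleanup(struct drm_i915_private *i915,
					  struct i915_pmu *pmu)
	{
		return drmm_add_action(&i915->drm, i915_pmu_free, pmu);
	}

	/* patch 3/8: the new hooks just pin/unpin the drm device */
	static void i915_pmu_get(struct pmu *base)
	{
		drm_dev_get(&pmu_to_i915(base)->drm);
	}

	static void i915_pmu_put(struct pmu *base)
	{
		drm_dev_put(&pmu_to_i915(base)->drm);
	}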

Lucas De Marchi
Peter Zijlstra Oct. 14, 2024, 7:25 p.m. UTC | #4
On Mon, Oct 14, 2024 at 01:20:34PM -0500, Lucas De Marchi wrote:
> On Mon, Oct 14, 2024 at 07:32:46PM +0200, Peter Zijlstra wrote:

> > I'm confused.. probably because I still don't have any clue about
> > drivers and the above isn't really telling me much either.
> > 
> > I don't see how you get rid of the try_module_get() we do per event;
> > without that you can't unload the module.
> 
> I don't get rid of the try_module_get(). They serve different purposes.
> Having a reference to the module prevents the _module_ going away (and
> hence the function pointers we call into from perf). It doesn't prevent
> the module unbinding from the HW.  A module may have N instances if it's
> bound to N devices.
> 
> This can be done today to unbind the HW (integrated graphics) from the
> i915 module:
> 
> 	# echo 0000:00:02.0 > /sys/bus/pci/drivers/i915/unbind
> 
> The ref taken by these new get()/put() is there to prevent the data
> from going away - the driver can use it to take a ref on something that
> will survive the unbind.

OK, for some reason I seemed to remember that you wanted to be able to
unload the module too.

> > And I don't see how you think it is safe to free a pmu while there are
> > still events around.
> 
> so, we don't actually free it - the pmu is unregistered but the
> `struct pmu` and (possibly) its container are still around after unregister.
> When the get/put are used, the driver can keep the data around, which is
> then free'd when the last reference is put.

Aaaaah, okay. So the implementation knows to nop out all device
interaction when it gets unbound, but the events and pmu data stick
around until they're naturally killed off?

Ah, reading the below patches that is indeed what i915 does. pmu->closed
makes this so.
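
(Paraphrasing the i915 pattern from memory - every entry point bails
out early once the flag is set:)

	static int i915_pmu_event_init(struct perf_event *event)
	{
		struct i915_pmu *pmu =
			container_of(event->pmu, struct i915_pmu, base);

		if (pmu->closed)
			return -ENODEV;

		/* ... normal initialization continues ... */
		return 0;
	}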

The dummy thing you posted in this thread does perf_event_disable() on
all previously created events, and this is not sound. Userspace can do
PERF_EVENT_IOC_ENABLE on them and then things will go sideways fast.
And I was afraid i915 was doing the same.

> - Subject: [PATCH 3/8] drm/i915/pmu: Fix crash due to use-after-free

So reading that Changelog, you would like a replacement for pmu->closed
as well.

I suppose, one way to go about doing this is to have
perf_pmu_unregister() replace a bunch of methods. Notably you have
pmu->closed in:

  - event_init()
  - read()
  - start()
  - stop()
  - add()

Having perf_pmu_unregister() overwrite those function pointers with
something akin to your pmu->closed would go a long way, right? It would
require using READ_ONCE() for calling the methods, which would make
things a little ugly :/
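
Something like this at every call site - illustrative only:

	/* illustrative only: snapshot the method pointer with READ_ONCE()
	 * so a concurrent overwrite by perf_pmu_unregister() is observed
	 * as a whole, never torn */
	void (*start)(struct perf_event *event, int flags) =
		READ_ONCE(event->pmu->start);

	start(event, PERF_EF_RELOAD);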

But I also think we want to force all the events into STATE_ERROR, and
I'm not immediately sure how best to go about doing that. Adding better
return value handling to ->add() is trivial enough, and perhaps calling
perf_pmu_resched() is sufficient to cycle everything.

Let me ponder that a little bit.
Lucas De Marchi Oct. 14, 2024, 8:12 p.m. UTC | #5
On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:
>On Mon, Oct 14, 2024 at 01:20:34PM -0500, Lucas De Marchi wrote:
>> On Mon, Oct 14, 2024 at 07:32:46PM +0200, Peter Zijlstra wrote:
>
>> > I'm confused.. probably because I still don't have any clue about
>> > drivers and the above isn't really telling me much either.
>> >
>> > I don't see how you get rid of the try_module_get() we do per event;
>> > without that you can't unload the module.
>>
>> I don't get rid of the try_module_get(). They serve different purposes.
>> Having a reference to the module prevents the _module_ going away (and
>> hence the function pointers we call into from perf). It doesn't prevent
>> the module unbinding from the HW.  A module may have N instances if it's
>> bound to N devices.
>>
>> This can be done today to unbind the HW (integrated graphics) from the
>> i915 module:
>>
>> 	# echo 0000:00:02.0 > /sys/bus/pci/drivers/i915/unbind
>>
>> The ref taken by these new get()/put() is there to prevent the data
>> from going away - the driver can use it to take a ref on something that
>> will survive the unbind.
>
>OK, for some reason I seemed to remember that you wanted to be able to
>unload the module too.

humn... maybe because in the first version we were talking about
short-circuiting all the function calls. If done by the driver, it'd
allow us to remove the ugly `if (pmu->closed)`, and if done by perf
itself, it would allow us to drop the module ref since we would
guarantee we are not calling into the module anymore.

But I think that's orthogonal to what we really care about: once the HW
vanishes underneath us, stop accessing it and unregister the PMU in a
way that if the HW shows up again we can still register it and nothing
explodes.

>
>> > And I don't see how you think it is safe to free a pmu while there are
>> > still events around.
>>
>> so, we don't actually free it - the pmu is unregistered but the
>> `struct pmu` and (possibly) its container are still around after unregister.
>> When the get/put are used, the driver can keep the data around, which is
>> then free'd when the last reference is put.
>
>Aaaaah, okay. So the implementation knows to nop out all device
>interaction when it gets unbound, but the events and pmu data stick
>around until they're naturally killed off?
>
>Ah, reading the below patches that is indeed what i915 does. pmu->closed
>makes this so.

yes. Without these patches it doesn't work well though, particularly
because

a) even if we protected the methods with the pmu->closed check, the data
is freed when we call perf_pmu_unregister(), making the whole pmu
pointer invalid
b) kernel/events/core.c still accesses the pmu after calling
event->destroy() - we can't really hook into that to destroy the pmu
like is done today.

>
>The dummy thing you posted in this thread does perf_event_disable() on

that's optional and I think we could live without it. The main issue is
completely crashing the kernel if we do:

	# perf stat -e i915/rc6-residency/ -I1000 &
	# echo 0000:00:02.0 > /sys/bus/pci/drivers/i915/unbind

... which these patches are fixing regardless of calling
perf_event_disable().

>all previously created events, and this is not sound. Userspace can do
>PERF_EVENT_IOC_ENABLE on them and then things will go sideways fast.
>And I was afraid i915 was doing the same.

For the i915 series, that would be "[PATCH 8/8] drm/i915/pmu: Release
open events when unregistering". See release_active_events()

I just wanted a smoke signal/hint for userspace that "something
went wrong" with this counter.

>
>> - Subject: [PATCH 3/8] drm/i915/pmu: Fix crash due to use-after-free
>
>So reading that Changelog, you would like a replacement for pmu->closed
>as well.
>
>I suppose, one way to go about doing this is to have
>perf_pmu_unregister() replace a bunch of methods. Notably you have
>pmu->closed in:
>
>  - event_init()
>  - read()
>  - start()
>  - stop()
>  - add()
>
>Having perf_pmu_unregister() overwrite those function pointers with
>something akin to your pmu->closed would go a long way, right? It would

it would help if we want to unload the module, to make it work for
other drivers without having to add a similar flag, and to make those
drivers less ugly with those checks. However, grepping the kernel for
calls to perf_pmu_register(), it seems there are only 3 candidates, all
in drm: i915, amdgpu and xe (xe is a pending patch series waiting on the
resolution of this issue).  There is
drivers/powercap/intel_rapl_common.c with its `bool registered` flag,
but that is basically doing multiple register/unregister cycles to
replace the pmu rather than working with HW that can be removed.

>require using READ_ONCE() for calling the methods, which would make
>things a little ugly :/
>
>But I also think we want to force all the events into STATE_ERROR, and

yeah...  When I was looking for the smoke signal I mentioned, I wanted
something to move the event into that state, but couldn't find one. The
best I found was to disable it.

>I'm not immediately sure how best to go about doing that. Adding better
>return value handling to ->add() is trivial enough, and perhaps calling
>perf_pmu_resched() is sufficient to cycle everything.
>
>Let me ponder that a little bit.

thanks for the help

Lucas De Marchi

>
Peter Zijlstra Oct. 16, 2024, 12:03 p.m. UTC | #6
On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:

> Let me ponder that a little bit.

So I did the thing on top of the get/put thing that would allow you to
get rid of the ->closed thing, and before I was finished I already hated
all of it :-(

The thing is, if you're going to the effort of adding get/put and
putting the responsibility on the implementation, then the ->closed
thing is only a little extra ask.

So... I wondered, how hard it would be for perf_pmu_unregister() to do
what you want.

Time passed, hacks were done...

and while what I have is still riddled with holes (more work is
definitely required), it does pass your dummy_pmu test script.

I'll poke at this a little longer. Afaict it should be possible to make
this good enough for what you want. Just needs more holes filled.

---
 include/linux/perf_event.h |  13 ++-
 kernel/events/core.c       | 260 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 228 insertions(+), 45 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..20995d33d27e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -318,6 +318,9 @@ struct perf_output_handle;
 struct pmu {
 	struct list_head		entry;
 
+	spinlock_t			events_lock;
+	struct list_head		events;
+
 	struct module			*module;
 	struct device			*dev;
 	struct device			*parent;
@@ -612,9 +615,10 @@ struct perf_addr_filter_range {
  * enum perf_event_state - the states of an event:
  */
 enum perf_event_state {
-	PERF_EVENT_STATE_DEAD		= -4,
-	PERF_EVENT_STATE_EXIT		= -3,
-	PERF_EVENT_STATE_ERROR		= -2,
+	PERF_EVENT_STATE_DEAD		= -5,
+	PERF_EVENT_STATE_REVOKED	= -4, /* pmu gone, must not touch */
+	PERF_EVENT_STATE_EXIT		= -3, /* task died, still inherit */
+	PERF_EVENT_STATE_ERROR		= -2, /* scheduling error, can enable */
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	=  0,
 	PERF_EVENT_STATE_ACTIVE		=  1,
@@ -853,6 +857,7 @@ struct perf_event {
 	void *security;
 #endif
 	struct list_head		sb_list;
+	struct list_head		pmu_list;
 
 	/*
 	 * Certain events gets forwarded to another pmu internally by over-
@@ -1103,7 +1108,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
 extern void perf_event_itrace_started(struct perf_event *event);
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
 
 extern void __perf_event_task_sched_in(struct task_struct *prev,
 				       struct task_struct *task);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index cdd09769e6c5..e66827367a97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2406,7 +2406,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
 
 #define DETACH_GROUP	0x01UL
 #define DETACH_CHILD	0x02UL
-#define DETACH_DEAD	0x04UL
+#define DETACH_EXIT	0x04UL
+#define DETACH_REVOKE	0x08UL
+#define DETACH_DEAD	0x10UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2421,6 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
 			   void *info)
 {
 	struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+	enum perf_event_state state = PERF_EVENT_STATE_OFF;
 	unsigned long flags = (unsigned long)info;
 
 	ctx_time_update(cpuctx, ctx);
@@ -2429,16 +2432,22 @@ __perf_remove_from_context(struct perf_event *event,
 	 * Ensure event_sched_out() switches to OFF, at the very least
 	 * this avoids raising perf_pending_task() at this time.
 	 */
-	if (flags & DETACH_DEAD)
+	if (flags & DETACH_EXIT)
+		state = PERF_EVENT_STATE_EXIT;
+	if (flags & DETACH_REVOKE)
+		state = PERF_EVENT_STATE_REVOKED;
+	if (flags & DETACH_DEAD) {
 		event->pending_disable = 1;
+		state = PERF_EVENT_STATE_DEAD;
+	}
 	event_sched_out(event, ctx);
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
 	if (flags & DETACH_CHILD)
 		perf_child_detach(event);
 	list_del_event(event, ctx);
-	if (flags & DETACH_DEAD)
-		event->state = PERF_EVENT_STATE_DEAD;
+
+	event->state = state;
 
 	if (!pmu_ctx->nr_events) {
 		pmu_ctx->rotate_necessary = 0;
@@ -4508,7 +4517,8 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 static void perf_remove_from_owner(struct perf_event *event);
 static void perf_event_exit_event(struct perf_event *event,
-				  struct perf_event_context *ctx);
+				  struct perf_event_context *ctx,
+				  bool revoke);
 
 /*
  * Removes all events from the current task that have been marked
@@ -4535,7 +4545,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
 
 		modified = true;
 
-		perf_event_exit_event(event, ctx);
+		perf_event_exit_event(event, ctx, false);
 	}
 
 	raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5121,6 +5131,7 @@ static bool is_sb_event(struct perf_event *event)
 	    attr->context_switch || attr->text_poke ||
 	    attr->bpf_event)
 		return true;
+
 	return false;
 }
 
@@ -5321,6 +5332,8 @@ static void perf_pending_task_sync(struct perf_event *event)
 
 static void _free_event(struct perf_event *event)
 {
+	struct pmu *pmu = event->pmu;
+
 	irq_work_sync(&event->pending_irq);
 	irq_work_sync(&event->pending_disable_irq);
 	perf_pending_task_sync(event);
@@ -5330,6 +5343,7 @@ static void _free_event(struct perf_event *event)
 	security_perf_event_free(event);
 
 	if (event->rb) {
+		WARN_ON_ONCE(!pmu);
 		/*
 		 * Can happen when we close an event with re-directed output.
 		 *
@@ -5349,12 +5363,16 @@ static void _free_event(struct perf_event *event)
 			put_callchain_buffers();
 	}
 
-	perf_event_free_bpf_prog(event);
-	perf_addr_filters_splice(event, NULL);
-	kfree(event->addr_filter_ranges);
+	if (pmu) {
+		perf_event_free_bpf_prog(event);
+		perf_addr_filters_splice(event, NULL);
+		kfree(event->addr_filter_ranges);
+	}
 
-	if (event->destroy)
+	if (event->destroy) {
+		WARN_ON_ONCE(!pmu);
 		event->destroy(event);
+	}
 
 	/*
 	 * Must be after ->destroy(), due to uprobe_perf_close() using
@@ -5363,8 +5381,10 @@ static void _free_event(struct perf_event *event)
 	if (event->hw.target)
 		put_task_struct(event->hw.target);
 
-	if (event->pmu_ctx)
+	if (event->pmu_ctx) {
+		WARN_ON_ONCE(!pmu);
 		put_pmu_ctx(event->pmu_ctx);
+	}
 
 	/*
 	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
@@ -5373,8 +5393,14 @@ static void _free_event(struct perf_event *event)
 	if (event->ctx)
 		put_ctx(event->ctx);
 
-	exclusive_event_destroy(event);
-	module_put(event->pmu->module);
+	if (pmu) {
+		exclusive_event_destroy(event);
+		module_put(pmu->module);
+		scoped_guard(spinlock, &pmu->events_lock) {
+			list_del(&event->pmu_list);
+			wake_up_var(pmu);
+		}
+	}
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
@@ -5492,7 +5518,11 @@ int perf_event_release_kernel(struct perf_event *event)
 	 * Thus this guarantees that we will in fact observe and kill _ALL_
 	 * child events.
 	 */
-	perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+	if (event->state > PERF_EVENT_STATE_REVOKED) {
+		perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+	} else {
+		event->state = PERF_EVENT_STATE_DEAD;
+	}
 
 	perf_event_ctx_unlock(event, ctx);
 
@@ -5803,7 +5833,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count)
 	 * error state (i.e. because it was pinned but it couldn't be
 	 * scheduled on to the CPU at some point).
 	 */
-	if (event->state == PERF_EVENT_STATE_ERROR)
+	if (event->state <= PERF_EVENT_STATE_ERROR)
 		return 0;
 
 	if (count < event->read_size)
@@ -5842,8 +5872,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
 	struct perf_buffer *rb;
 	__poll_t events = EPOLLHUP;
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return EPOLLERR;
+
 	poll_wait(file, &event->waitq, wait);
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return EPOLLERR;
+
 	if (is_event_hup(event))
 		return events;
 
@@ -6013,7 +6049,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 }
 
 static int perf_event_set_output(struct perf_event *event,
-				 struct perf_event *output_event);
+				 struct perf_event *output_event, bool force);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			  struct perf_event_attr *attr);
@@ -6023,6 +6059,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 	void (*func)(struct perf_event *);
 	u32 flags = arg;
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return -ENODEV;
+
 	switch (cmd) {
 	case PERF_EVENT_IOC_ENABLE:
 		func = _perf_event_enable;
@@ -6065,10 +6104,10 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 			if (ret)
 				return ret;
 			output_event = fd_file(output)->private_data;
-			ret = perf_event_set_output(event, output_event);
+			ret = perf_event_set_output(event, output_event, false);
 			fdput(output);
 		} else {
-			ret = perf_event_set_output(event, NULL);
+			ret = perf_event_set_output(event, NULL, false);
 		}
 		return ret;
 	}
@@ -6472,6 +6511,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	unsigned long size = perf_data_size(rb);
 	bool detach_rest = false;
 
+	/* FIXIES vs perf_pmu_unregister() */
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event, vma->vm_mm);
 
@@ -6591,7 +6631,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long vma_size;
 	unsigned long nr_pages;
 	long user_extra = 0, extra = 0;
-	int ret = 0, flags = 0;
+	int ret, flags = 0;
+
+	ret = security_perf_event_read(event);
+	if (ret)
+		return ret;
+
+	/* XXX needs event->mmap_mutex? */
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return -ENODEV;
 
 	/*
 	 * Don't allow mmap() of inherited per-task counters. This would
@@ -6604,10 +6652,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	ret = security_perf_event_read(event);
-	if (ret)
-		return ret;
-
 	vma_size = vma->vm_end - vma->vm_start;
 
 	if (vma->vm_pgoff == 0) {
@@ -6810,6 +6854,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
 	struct perf_event *event = filp->private_data;
 	int retval;
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return -ENODEV;
+
 	inode_lock(inode);
 	retval = fasync_helper(fd, filp, on, &event->fasync);
 	inode_unlock(inode);
@@ -11513,6 +11560,7 @@ static int perf_event_idx_default(struct perf_event *event)
 
 static void free_pmu_context(struct pmu *pmu)
 {
+	free_percpu(pmu->pmu_disable_count);
 	free_percpu(pmu->cpu_pmu_context);
 }
 
@@ -11753,6 +11801,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 	if (type >= 0)
 		max = type;
 
+	// XXX broken vs perf_init_event(), this publishes before @pmu is finalized
 	ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
 	if (ret < 0)
 		goto free_pdc;
@@ -11809,8 +11858,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 	if (!pmu->event_idx)
 		pmu->event_idx = perf_event_idx_default;
 
-	list_add_rcu(&pmu->entry, &pmus);
 	atomic_set(&pmu->exclusive_cnt, 0);
+	INIT_LIST_HEAD(&pmu->events);
+	spin_lock_init(&pmu->events_lock);
+
+	/*
+	 * Publish the pmu after it is fully initialized.
+	 */
+	list_add_rcu(&pmu->entry, &pmus);
 	ret = 0;
 unlock:
 	mutex_unlock(&pmus_lock);
@@ -11832,10 +11887,110 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_register);
 
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+			       struct perf_event_context *ctx)
 {
+	/*
+	 * De-schedule the event and mark it EXIT_PMU.
+	 */
+	perf_event_exit_event(event, ctx, true);
+
+	/*
+	 * All _free_event() bits that rely on event->pmu:
+	 */
+	perf_event_set_output(event, NULL, true);
+
+	perf_event_free_bpf_prog(event);
+	perf_addr_filters_splice(event, NULL);
+	kfree(event->addr_filter_ranges);
+
+	if (event->destroy) {
+		event->destroy(event);
+		event->destroy = NULL;
+	}
+
+	if (event->pmu_ctx) {
+		/*
+		 * Make sure pmu->cpu_pmu_context is unused. An alternative is
+		 * to have it be pointers and make put_pmu_ctx()'s
+		 * epc->embedded case be another RCU free case.
+		 */
+		put_pmu_ctx(event->pmu_ctx);
+		event->pmu_ctx = NULL;
+	}
+
+	exclusive_event_destroy(event);
+	module_put(pmu->module);
+
+	event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+	struct perf_event_context *ctx;
+
+	ctx = perf_event_ctx_lock(event);
+	__pmu_detach_event(pmu, event, ctx);
+	perf_event_ctx_unlock(event, ctx);
+
+	scoped_guard(spinlock, &pmu->events_lock)
+		list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+	struct perf_event *event;
+
+	guard(spinlock)(&pmu->events_lock);
+	list_for_each_entry(event, &pmu->events, pmu_list) {
+		if (atomic_long_inc_not_zero(&event->refcount))
+			return event;
+	}
+
+	return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+	guard(spinlock)(&pmu->events_lock);
+	return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+	struct perf_event *event;
+
+	for (;;) {
+		event = pmu_get_event(pmu);
+		if (!event)
+			break;
+
+		pmu_detach_event(pmu, event);
+		put_event(event);
+	}
+
+	/*
+	 * wait for pending _free_event()s
+	 */
+	wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
+{
+	/*
+	 * FIXME!
+	 *
+	 *   perf_mmap_close(); event->pmu->event_unmapped()
+	 *
+	 * XXX this check is racy vs perf_event_alloc()
+	 */
+	if (pmu->event_unmapped && !pmu_empty(pmu))
+		return -EBUSY;
+
 	mutex_lock(&pmus_lock);
 	list_del_rcu(&pmu->entry);
+	idr_remove(&pmu_idr, pmu->type);
+	mutex_unlock(&pmus_lock);
 
 	/*
 	 * We dereference the pmu list under both SRCU and regular RCU, so
@@ -11844,16 +11999,29 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_srcu(&pmus_srcu);
 	synchronize_rcu();
 
-	free_percpu(pmu->pmu_disable_count);
-	idr_remove(&pmu_idr, pmu->type);
+	/*
+	 * PMU is removed from the pmus list, so no new events will
+	 * be created, now take care of the existing ones.
+	 */
+	pmu_detach_events(pmu);
+
+	/*
+	 * PMU is unused, make it go away.
+	 */
 	if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
 		if (pmu->nr_addr_filters)
 			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
 		device_del(pmu->dev);
 		put_device(pmu->dev);
 	}
+
+	/* 
+	 * XXX -- broken? can still contain SW events at this point?
+	 * audit more, make sure DETACH_GROUP DTRT
+	 */
 	free_pmu_context(pmu);
-	mutex_unlock(&pmus_lock);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 
@@ -12330,6 +12498,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	/* symmetric to unaccount_event() in _free_event() */
 	account_event(event);
 
+	scoped_guard(spinlock, &pmu->events_lock)
+		list_add(&event->pmu_list, &pmu->events);
+
 	return event;
 
 err_callchain_buffer:
@@ -12493,7 +12664,7 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
 }
 
 static int
-perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event, bool force)
 {
 	struct perf_buffer *rb = NULL;
 	int ret = -EINVAL;
@@ -12549,8 +12720,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
 set:
 	/* Can't redirect output if we've got an active mmap() */
-	if (atomic_read(&event->mmap_count))
-		goto unlock;
+	if (atomic_read(&event->mmap_count)) {
+		if (!force)
+			goto unlock;
+
+		WARN_ON_ONCE(event->pmu->event_unmapped);
+	}
 
 	if (output_event) {
 		/* get the rb we want to redirect to */
@@ -12740,6 +12915,10 @@ SYSCALL_DEFINE5(perf_event_open,
 		if (err)
 			goto err_fd;
 		group_leader = fd_file(group)->private_data;
+		if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+			err = -ENODEV;
+			goto err_group_fd;
+		}
 		if (flags & PERF_FLAG_FD_OUTPUT)
 			output_event = group_leader;
 		if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -12916,7 +13095,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	event->pmu_ctx = pmu_ctx;
 
 	if (output_event) {
-		err = perf_event_set_output(event, output_event);
+		err = perf_event_set_output(event, output_event, false);
 		if (err)
 			goto err_context;
 	}
@@ -13287,10 +13466,11 @@ static void sync_child_event(struct perf_event *child_event)
 }
 
 static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+		      struct perf_event_context *ctx, bool revoke)
 {
 	struct perf_event *parent_event = event->parent;
-	unsigned long detach_flags = 0;
+	unsigned long detach_flags = DETACH_EXIT;
 
 	if (parent_event) {
 		/*
@@ -13305,16 +13485,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
 		 * Do destroy all inherited groups, we don't care about those
 		 * and being thorough is better.
 		 */
-		detach_flags = DETACH_GROUP | DETACH_CHILD;
+		detach_flags |= DETACH_GROUP | DETACH_CHILD;
 		mutex_lock(&parent_event->child_mutex);
 	}
 
-	perf_remove_from_context(event, detach_flags);
+	if (revoke)
+		detach_flags |= DETACH_GROUP | DETACH_REVOKE;
 
-	raw_spin_lock_irq(&ctx->lock);
-	if (event->state > PERF_EVENT_STATE_EXIT)
-		perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
-	raw_spin_unlock_irq(&ctx->lock);
+	perf_remove_from_context(event, detach_flags);
 
 	/*
 	 * Child events can be freed.
@@ -13390,7 +13568,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
 	perf_event_task(child, child_ctx, 0);
 
 	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-		perf_event_exit_event(child_event, child_ctx);
+		perf_event_exit_event(child_event, child_ctx, false);
 
 	mutex_unlock(&child_ctx->mutex);
Lucas De Marchi Oct. 18, 2024, 7:46 p.m. UTC | #7
On Wed, Oct 16, 2024 at 02:03:02PM +0200, Peter Zijlstra wrote:
>On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:
>
>> Let me ponder that a little bit.
>
>So I did the thing on top of the get/put thing that would allow you to
>get rid of the ->closed thing, and before I was finished I already hated
>all of it :-(
>
>The thing is, if you're going to the effort of adding get/put and
>putting the responsibility on the implementation, then the ->closed
>thing is only a little extra ask.
>
>So... I wondered, how hard it would be for perf_pmu_unregister() to do
>what you want.
>
>Time passed, hacks were done...
>
>and while what I have is still riddled with holes (more work is
>definitely required), it does pass your dummy_pmu test script.
>
>I'll poke at this a little longer. Afaict it should be possible to make
>this good enough for what you want. Just needs more holes filled.
>
>---
> include/linux/perf_event.h |  13 ++-
> kernel/events/core.c       | 260 ++++++++++++++++++++++++++++++++++++++-------
> 2 files changed, 228 insertions(+), 45 deletions(-)
>
>diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>index fb908843f209..20995d33d27e 100644
>--- a/include/linux/perf_event.h
>+++ b/include/linux/perf_event.h
>@@ -318,6 +318,9 @@ struct perf_output_handle;
> struct pmu {
> 	struct list_head		entry;
>
>+	spinlock_t			events_lock;
>+	struct list_head		events;

oh... I looked at several lists and was wondering which one would
contain the events of our pmu. Now I see we didn't have that :)

>+
> 	struct module			*module;
> 	struct device			*dev;
> 	struct device			*parent;
>@@ -612,9 +615,10 @@ struct perf_addr_filter_range {
>  * enum perf_event_state - the states of an event:
>  */
> enum perf_event_state {
>-	PERF_EVENT_STATE_DEAD		= -4,
>-	PERF_EVENT_STATE_EXIT		= -3,
>-	PERF_EVENT_STATE_ERROR		= -2,
>+	PERF_EVENT_STATE_DEAD		= -5,
>+	PERF_EVENT_STATE_REVOKED	= -4, /* pmu gone, must not touch */
>+	PERF_EVENT_STATE_EXIT		= -3, /* task died, still inherit */
>+	PERF_EVENT_STATE_ERROR		= -2, /* scheduling error, can enable */
> 	PERF_EVENT_STATE_OFF		= -1,
> 	PERF_EVENT_STATE_INACTIVE	=  0,
> 	PERF_EVENT_STATE_ACTIVE		=  1,
>@@ -853,6 +857,7 @@ struct perf_event {
> 	void *security;
> #endif
> 	struct list_head		sb_list;
>+	struct list_head		pmu_list;
>
> 	/*
> 	 * Certain events gets forwarded to another pmu internally by over-
>@@ -1103,7 +1108,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
> extern void perf_event_itrace_started(struct perf_event *event);
>
> extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
>-extern void perf_pmu_unregister(struct pmu *pmu);
>+extern int perf_pmu_unregister(struct pmu *pmu);
>
> extern void __perf_event_task_sched_in(struct task_struct *prev,
> 				       struct task_struct *task);
>diff --git a/kernel/events/core.c b/kernel/events/core.c
>index cdd09769e6c5..e66827367a97 100644
>--- a/kernel/events/core.c
>+++ b/kernel/events/core.c
>@@ -2406,7 +2406,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
>
> #define DETACH_GROUP	0x01UL
> #define DETACH_CHILD	0x02UL
>-#define DETACH_DEAD	0x04UL
>+#define DETACH_EXIT	0x04UL
>+#define DETACH_REVOKE	0x08UL
>+#define DETACH_DEAD	0x10UL
>
> /*
>  * Cross CPU call to remove a performance event
>@@ -2421,6 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
> 			   void *info)
> {
> 	struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
>+	enum perf_event_state state = PERF_EVENT_STATE_OFF;
> 	unsigned long flags = (unsigned long)info;
>
> 	ctx_time_update(cpuctx, ctx);
>@@ -2429,16 +2432,22 @@ __perf_remove_from_context(struct perf_event *event,
> 	 * Ensure event_sched_out() switches to OFF, at the very least
> 	 * this avoids raising perf_pending_task() at this time.
> 	 */
>-	if (flags & DETACH_DEAD)
>+	if (flags & DETACH_EXIT)
>+		state = PERF_EVENT_STATE_EXIT;
>+	if (flags & DETACH_REVOKE)
>+		state = PERF_EVENT_STATE_REVOKED;
>+	if (flags & DETACH_DEAD) {
> 		event->pending_disable = 1;
>+		state = PERF_EVENT_STATE_DEAD;
>+	}
> 	event_sched_out(event, ctx);
> 	if (flags & DETACH_GROUP)
> 		perf_group_detach(event);
> 	if (flags & DETACH_CHILD)
> 		perf_child_detach(event);
> 	list_del_event(event, ctx);
>-	if (flags & DETACH_DEAD)
>-		event->state = PERF_EVENT_STATE_DEAD;
>+
>+	event->state = state;
>
> 	if (!pmu_ctx->nr_events) {
> 		pmu_ctx->rotate_necessary = 0;
>@@ -4508,7 +4517,8 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
>
> static void perf_remove_from_owner(struct perf_event *event);
> static void perf_event_exit_event(struct perf_event *event,
>-				  struct perf_event_context *ctx);
>+				  struct perf_event_context *ctx,
>+				  bool revoke);
>
> /*
>  * Removes all events from the current task that have been marked
>@@ -4535,7 +4545,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
>
> 		modified = true;
>
>-		perf_event_exit_event(event, ctx);
>+		perf_event_exit_event(event, ctx, false);
> 	}
>
> 	raw_spin_lock_irqsave(&ctx->lock, flags);
>@@ -5121,6 +5131,7 @@ static bool is_sb_event(struct perf_event *event)
> 	    attr->context_switch || attr->text_poke ||
> 	    attr->bpf_event)
> 		return true;
>+
> 	return false;
> }
>
>@@ -5321,6 +5332,8 @@ static void perf_pending_task_sync(struct perf_event *event)
>
> static void _free_event(struct perf_event *event)
> {
>+	struct pmu *pmu = event->pmu;
>+
> 	irq_work_sync(&event->pending_irq);
> 	irq_work_sync(&event->pending_disable_irq);
> 	perf_pending_task_sync(event);
>@@ -5330,6 +5343,7 @@ static void _free_event(struct perf_event *event)
> 	security_perf_event_free(event);
>
> 	if (event->rb) {
>+		WARN_ON_ONCE(!pmu);
> 		/*
> 		 * Can happen when we close an event with re-directed output.
> 		 *
>@@ -5349,12 +5363,16 @@ static void _free_event(struct perf_event *event)
> 			put_callchain_buffers();
> 	}
>
>-	perf_event_free_bpf_prog(event);
>-	perf_addr_filters_splice(event, NULL);
>-	kfree(event->addr_filter_ranges);
>+	if (pmu) {
>+		perf_event_free_bpf_prog(event);
>+		perf_addr_filters_splice(event, NULL);
>+		kfree(event->addr_filter_ranges);
>+	}
>
>-	if (event->destroy)
>+	if (event->destroy) {
>+		WARN_ON_ONCE(!pmu);
> 		event->destroy(event);
>+	}
>
> 	/*
> 	 * Must be after ->destroy(), due to uprobe_perf_close() using
>@@ -5363,8 +5381,10 @@ static void _free_event(struct perf_event *event)
> 	if (event->hw.target)
> 		put_task_struct(event->hw.target);
>
>-	if (event->pmu_ctx)
>+	if (event->pmu_ctx) {
>+		WARN_ON_ONCE(!pmu);
> 		put_pmu_ctx(event->pmu_ctx);
>+	}
>
> 	/*
> 	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
>@@ -5373,8 +5393,14 @@ static void _free_event(struct perf_event *event)
> 	if (event->ctx)
> 		put_ctx(event->ctx);
>
>-	exclusive_event_destroy(event);
>-	module_put(event->pmu->module);
>+	if (pmu) {
>+		exclusive_event_destroy(event);
>+		module_put(pmu->module);
>+		scoped_guard(spinlock, &pmu->events_lock) {
>+			list_del(&event->pmu_list);
>+			wake_up_var(pmu);
>+		}
>+	}
>
> 	call_rcu(&event->rcu_head, free_event_rcu);
> }
>@@ -5492,7 +5518,11 @@ int perf_event_release_kernel(struct perf_event *event)
> 	 * Thus this guarantees that we will in fact observe and kill _ALL_
> 	 * child events.
> 	 */
>-	perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
>+	if (event->state > PERF_EVENT_STATE_REVOKED) {
>+		perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
>+	} else {
>+		event->state = PERF_EVENT_STATE_DEAD;
>+	}
>
> 	perf_event_ctx_unlock(event, ctx);
>
>@@ -5803,7 +5833,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count)
> 	 * error state (i.e. because it was pinned but it couldn't be
> 	 * scheduled on to the CPU at some point).
> 	 */
>-	if (event->state == PERF_EVENT_STATE_ERROR)
>+	if (event->state <= PERF_EVENT_STATE_ERROR)
> 		return 0;
>
> 	if (count < event->read_size)
>@@ -5842,8 +5872,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
> 	struct perf_buffer *rb;
> 	__poll_t events = EPOLLHUP;
>
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return EPOLLERR;
>+
> 	poll_wait(file, &event->waitq, wait);
>
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return EPOLLERR;
>+
> 	if (is_event_hup(event))
> 		return events;
>
>@@ -6013,7 +6049,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
> }
>
> static int perf_event_set_output(struct perf_event *event,
>-				 struct perf_event *output_event);
>+				 struct perf_event *output_event, bool force);
> static int perf_event_set_filter(struct perf_event *event, void __user *arg);
> static int perf_copy_attr(struct perf_event_attr __user *uattr,
> 			  struct perf_event_attr *attr);
>@@ -6023,6 +6059,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
> 	void (*func)(struct perf_event *);
> 	u32 flags = arg;
>
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return -ENODEV;
>+
> 	switch (cmd) {
> 	case PERF_EVENT_IOC_ENABLE:
> 		func = _perf_event_enable;
>@@ -6065,10 +6104,10 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
> 			if (ret)
> 				return ret;
> 			output_event = fd_file(output)->private_data;
>-			ret = perf_event_set_output(event, output_event);
>+			ret = perf_event_set_output(event, output_event, false);
> 			fdput(output);
> 		} else {
>-			ret = perf_event_set_output(event, NULL);
>+			ret = perf_event_set_output(event, NULL, false);
> 		}
> 		return ret;
> 	}
>@@ -6472,6 +6511,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> 	unsigned long size = perf_data_size(rb);
> 	bool detach_rest = false;
>
>+	/* FIXIES vs perf_pmu_unregister() */
> 	if (event->pmu->event_unmapped)
> 		event->pmu->event_unmapped(event, vma->vm_mm);
>
>@@ -6591,7 +6631,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> 	unsigned long vma_size;
> 	unsigned long nr_pages;
> 	long user_extra = 0, extra = 0;
>-	int ret = 0, flags = 0;
>+	int ret, flags = 0;
>+
>+	ret = security_perf_event_read(event);
>+	if (ret)
>+		return ret;
>+
>+	/* XXX needs event->mmap_mutex? */
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return -ENODEV;
>
> 	/*
> 	 * Don't allow mmap() of inherited per-task counters. This would
>@@ -6604,10 +6652,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> 	if (!(vma->vm_flags & VM_SHARED))
> 		return -EINVAL;
>
>-	ret = security_perf_event_read(event);
>-	if (ret)
>-		return ret;
>-
> 	vma_size = vma->vm_end - vma->vm_start;
>
> 	if (vma->vm_pgoff == 0) {
>@@ -6810,6 +6854,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
> 	struct perf_event *event = filp->private_data;
> 	int retval;
>
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return -ENODEV;
>+
> 	inode_lock(inode);
> 	retval = fasync_helper(fd, filp, on, &event->fasync);
> 	inode_unlock(inode);
>@@ -11513,6 +11560,7 @@ static int perf_event_idx_default(struct perf_event *event)
>
> static void free_pmu_context(struct pmu *pmu)
> {
>+	free_percpu(pmu->pmu_disable_count);
> 	free_percpu(pmu->cpu_pmu_context);
> }
>
>@@ -11753,6 +11801,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> 	if (type >= 0)
> 		max = type;
>
>+	// XXX broken vs perf_init_event(), this publishes before @pmu is finalized
> 	ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
> 	if (ret < 0)
> 		goto free_pdc;
>@@ -11809,8 +11858,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> 	if (!pmu->event_idx)
> 		pmu->event_idx = perf_event_idx_default;
>
>-	list_add_rcu(&pmu->entry, &pmus);
> 	atomic_set(&pmu->exclusive_cnt, 0);
>+	INIT_LIST_HEAD(&pmu->events);
>+	spin_lock_init(&pmu->events_lock);
>+
>+	/*
>+	 * Publish the pmu after it is fully initialized.
>+	 */
>+	list_add_rcu(&pmu->entry, &pmus);
> 	ret = 0;
> unlock:
> 	mutex_unlock(&pmus_lock);
>@@ -11832,10 +11887,110 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> }
> EXPORT_SYMBOL_GPL(perf_pmu_register);
>
>-void perf_pmu_unregister(struct pmu *pmu)
>+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
>+			       struct perf_event_context *ctx)
> {
>+	/*
>+	 * De-schedule the event and mark it EXIT_PMU.
>+	 */
>+	perf_event_exit_event(event, ctx, true);
>+
>+	/*
>+	 * All _free_event() bits that rely on event->pmu:
>+	 */
>+	perf_event_set_output(event, NULL, true);
>+
>+	perf_event_free_bpf_prog(event);
>+	perf_addr_filters_splice(event, NULL);
>+	kfree(event->addr_filter_ranges);
>+
>+	if (event->destroy) {
>+		event->destroy(event);
>+		event->destroy = NULL;
>+	}
>+
>+	if (event->pmu_ctx) {
>+		/*
>+		 * Make sure pmu->cpu_pmu_context is unused. An alternative is
>+		 * to have it be pointers and make put_pmu_ctx()'s
>+		 * epc->embedded case be another RCU free case.
>+		 */
>+		put_pmu_ctx(event->pmu_ctx);
>+		event->pmu_ctx = NULL;
>+	}
>+
>+	exclusive_event_destroy(event);
>+	module_put(pmu->module);
>+
>+	event->pmu = NULL; /* force fault instead of UAF */
>+}
>+
>+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
>+{
>+	struct perf_event_context *ctx;
>+
>+	ctx = perf_event_ctx_lock(event);
>+	__pmu_detach_event(pmu, event, ctx);
>+	perf_event_ctx_unlock(event, ctx);
>+
>+	scoped_guard(spinlock, &pmu->events_lock)
>+		list_del(&event->pmu_list);
>+}
>+
>+static struct perf_event *pmu_get_event(struct pmu *pmu)
>+{
>+	struct perf_event *event;
>+
>+	guard(spinlock)(&pmu->events_lock);
>+	list_for_each_entry(event, &pmu->events, pmu_list) {
>+		if (atomic_long_inc_not_zero(&event->refcount))
>+			return event;
>+	}
>+
>+	return NULL;
>+}
>+
>+static bool pmu_empty(struct pmu *pmu)
>+{
>+	guard(spinlock)(&pmu->events_lock);
>+	return list_empty(&pmu->events);
>+}
>+
>+static void pmu_detach_events(struct pmu *pmu)
>+{
>+	struct perf_event *event;
>+
>+	for (;;) {
>+		event = pmu_get_event(pmu);
>+		if (!event)
>+			break;
>+
>+		pmu_detach_event(pmu, event);
>+		put_event(event);
>+	}
>+
>+	/*
>+	 * wait for pending _free_event()s
>+	 */
>+	wait_var_event(pmu, pmu_empty(pmu));


ok... so now we are synchronously moving the events to a revoked state
during unregister, so we wouldn't need the refcount on the driver side
anymore and can just free the objects right after it returns.
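
i.e. the driver unbind path could then be as simple as (hypothetical
sketch; struct my_device is made up):

	/* all events are detached synchronously inside
	 * perf_pmu_unregister(), so the container can be freed right away */
	static void my_driver_unbind(struct my_device *my)
	{
		perf_pmu_unregister(&my->pmu);
		kfree(my);
	}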

I will give this a try with i915 and/or xe.

thanks
Lucas De Marchi


>+}
>+
>+int perf_pmu_unregister(struct pmu *pmu)
>+{
>+	/*
>+	 * FIXME!
>+	 *
>+	 *   perf_mmap_close(); event->pmu->event_unmapped()
>+	 *
>+	 * XXX this check is racy vs perf_event_alloc()
>+	 */
>+	if (pmu->event_unmapped && !pmu_empty(pmu))
>+		return -EBUSY;
>+
> 	mutex_lock(&pmus_lock);
> 	list_del_rcu(&pmu->entry);
>+	idr_remove(&pmu_idr, pmu->type);
>+	mutex_unlock(&pmus_lock);
>
> 	/*
> 	 * We dereference the pmu list under both SRCU and regular RCU, so
>@@ -11844,16 +11999,29 @@ void perf_pmu_unregister(struct pmu *pmu)
> 	synchronize_srcu(&pmus_srcu);
> 	synchronize_rcu();
>
>-	free_percpu(pmu->pmu_disable_count);
>-	idr_remove(&pmu_idr, pmu->type);
>+	/*
>+	 * PMU is removed from the pmus list, so no new events will
>+	 * be created, now take care of the existing ones.
>+	 */
>+	pmu_detach_events(pmu);
>+
>+	/*
>+	 * PMU is unused, make it go away.
>+	 */
> 	if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
> 		if (pmu->nr_addr_filters)
> 			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
> 		device_del(pmu->dev);
> 		put_device(pmu->dev);
> 	}
>+
>+	/*
>+	 * XXX -- broken? can still contain SW events at this point?
>+	 * audit more, make sure DETACH_GROUP DTRT
>+	 */
> 	free_pmu_context(pmu);
>-	mutex_unlock(&pmus_lock);
>+
>+	return 0;
> }
> EXPORT_SYMBOL_GPL(perf_pmu_unregister);
>
>@@ -12330,6 +12498,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> 	/* symmetric to unaccount_event() in _free_event() */
> 	account_event(event);
>
>+	scoped_guard(spinlock, &pmu->events_lock)
>+		list_add(&event->pmu_list, &pmu->events);
>+
> 	return event;
>
> err_callchain_buffer:
>@@ -12493,7 +12664,7 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
> }
>
> static int
>-perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
>+perf_event_set_output(struct perf_event *event, struct perf_event *output_event, bool force)
> {
> 	struct perf_buffer *rb = NULL;
> 	int ret = -EINVAL;
>@@ -12549,8 +12720,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
> 	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
> set:
> 	/* Can't redirect output if we've got an active mmap() */
>-	if (atomic_read(&event->mmap_count))
>-		goto unlock;
>+	if (atomic_read(&event->mmap_count)) {
>+		if (!force)
>+			goto unlock;
>+
>+		WARN_ON_ONCE(event->pmu->event_unmapped);
>+	}
>
> 	if (output_event) {
> 		/* get the rb we want to redirect to */
>@@ -12740,6 +12915,10 @@ SYSCALL_DEFINE5(perf_event_open,
> 		if (err)
> 			goto err_fd;
> 		group_leader = fd_file(group)->private_data;
>+		if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
>+			err = -ENODEV;
>+			goto err_group_fd;
>+		}
> 		if (flags & PERF_FLAG_FD_OUTPUT)
> 			output_event = group_leader;
> 		if (flags & PERF_FLAG_FD_NO_GROUP)
>@@ -12916,7 +13095,7 @@ SYSCALL_DEFINE5(perf_event_open,
> 	event->pmu_ctx = pmu_ctx;
>
> 	if (output_event) {
>-		err = perf_event_set_output(event, output_event);
>+		err = perf_event_set_output(event, output_event, false);
> 		if (err)
> 			goto err_context;
> 	}
>@@ -13287,10 +13466,11 @@ static void sync_child_event(struct perf_event *child_event)
> }
>
> static void
>-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
>+perf_event_exit_event(struct perf_event *event,
>+		      struct perf_event_context *ctx, bool revoke)
> {
> 	struct perf_event *parent_event = event->parent;
>-	unsigned long detach_flags = 0;
>+	unsigned long detach_flags = DETACH_EXIT;
>
> 	if (parent_event) {
> 		/*
>@@ -13305,16 +13485,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> 		 * Do destroy all inherited groups, we don't care about those
> 		 * and being thorough is better.
> 		 */
>-		detach_flags = DETACH_GROUP | DETACH_CHILD;
>+		detach_flags |= DETACH_GROUP | DETACH_CHILD;
> 		mutex_lock(&parent_event->child_mutex);
> 	}
>
>-	perf_remove_from_context(event, detach_flags);
>+	if (revoke)
>+		detach_flags |= DETACH_GROUP | DETACH_REVOKE;
>
>-	raw_spin_lock_irq(&ctx->lock);
>-	if (event->state > PERF_EVENT_STATE_EXIT)
>-		perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
>-	raw_spin_unlock_irq(&ctx->lock);
>+	perf_remove_from_context(event, detach_flags);
>
> 	/*
> 	 * Child events can be freed.
>@@ -13390,7 +13568,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
> 	perf_event_task(child, child_ctx, 0);
>
> 	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
>-		perf_event_exit_event(child_event, child_ctx);
>+		perf_event_exit_event(child_event, child_ctx, false);
>
> 	mutex_unlock(&child_ctx->mutex);
>
Peter Zijlstra Oct. 22, 2024, 9:52 p.m. UTC | #8
On Fri, Oct 18, 2024 at 02:46:31PM -0500, Lucas De Marchi wrote:

> I will give this a try with i915 and/or xe.

Less horrible version here:

  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/pmu-unregister

I've just pushed it out to the robots, but it builds, passes perf test
and your dummy_pmu testcase (for me, on my one testbox and .config
etc..)

I'll let it sit with the robots for a few days and if they don't
complain I'll post it.

Thanks!
Lucas De Marchi Oct. 23, 2024, 5:07 a.m. UTC | #9
On Tue, Oct 22, 2024 at 11:52:10PM +0200, Peter Zijlstra wrote:
>On Fri, Oct 18, 2024 at 02:46:31PM -0500, Lucas De Marchi wrote:
>
>> I will give this a try with i915 and/or xe.
>
>Less horrible version here:
>
>  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/pmu-unregister
>
>I've just pushed it out to the robots, but it builds, passes perf test
>and your dummy_pmu testcase (for me, on my one testbox and .config
>etc..)

It passed for me as well with both dummy_pmu and with i915. I have some
changes to igt (i915/xe testsuite) that should bring some more coverage.
I minimized the pending test changes I had and posted to trigger CI:

https://patchwork.freedesktop.org/series/140355/

thanks
Lucas De Marchi


>
>I'll let it sit with the robots for a few days and if they don't
>complain I'll post it.
>
>Thanks!
Lucas De Marchi Oct. 31, 2024, 5:07 a.m. UTC | #10
On Wed, Oct 23, 2024 at 12:07:58AM -0500, Lucas De Marchi wrote:
>On Tue, Oct 22, 2024 at 11:52:10PM +0200, Peter Zijlstra wrote:
>>On Fri, Oct 18, 2024 at 02:46:31PM -0500, Lucas De Marchi wrote:
>>
>>>I will give this a try with i915 and/or xe.
>>
>>Less horrible version here:
>>
>> git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/pmu-unregister
>>
>>I've just pushed it out to the robots, but it builds, passes perf test
>>and your dummy_pmu testcase (for me, on my one testbox and .config
>>etc..)
>
>It passed for me as well with both dummy_pmu and with i915. I have some
>changes to igt (i915/xe testsuite) that should bring some more coverage.
>I minimized the pending test changes I had and posted to trigger CI:
>
>https://patchwork.freedesktop.org/series/140355/

Our CI also didn't trigger that pmu issue and the test ran
successfully.

I did get a report from kernel test robot though:

https://lore.kernel.org/all/202410281436.8246527d-lkp@intel.com/

thanks
Lucas De Marchi
Peter Zijlstra Oct. 31, 2024, 9:45 a.m. UTC | #11
On Thu, Oct 31, 2024 at 12:07:42AM -0500, Lucas De Marchi wrote:
> On Wed, Oct 23, 2024 at 12:07:58AM -0500, Lucas De Marchi wrote:
> > On Tue, Oct 22, 2024 at 11:52:10PM +0200, Peter Zijlstra wrote:
> > > On Fri, Oct 18, 2024 at 02:46:31PM -0500, Lucas De Marchi wrote:
> > > 
> > > > I will give this a try with i915 and/or xe.
> > > 
> > > Less horrible version here:
> > > 
> > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/pmu-unregister
> > > 
> > > I've just pushed it out to the robots, but it builds, passes perf test
> > > and your dummy_pmu testcase (for me, on my one testbox and .config
> > > etc..)
> > 
> > It passed for me as well with both dummy_pmu and with i915. I have some
> > changes to igt (i915/xe testsuite) that should bring some more coverage.
> > I minimized the pending test changes I had and posted to trigger CI:
> > 
> > https://patchwork.freedesktop.org/series/140355/
> 
> Our CI also didn't trigger that pmu issue and the test could run
> succesfully.

Excellent.

> I did get a report from kernel test robot though:
> 
> https://lore.kernel.org/all/202410281436.8246527d-lkp@intel.com/

Yeah, I think I fixed that one, but the robot gifted me another one that
I still need to find time to address. I'm hoping this week.

I'll repost properly once I fix it.

Thanks!
diff mbox series

Patch

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..d6983dbf5a45 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -561,6 +561,17 @@  struct pmu {
 	 * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
 	 */
 	int (*check_period)		(struct perf_event *event, u64 value); /* optional */
+
+	/*
+	 * Optional: get a reference. Typically needed by PMUs that are bound to a device
+	 * that can be hotplugged, either physically or through sysfs' bind/unbind. When provided,
+	 * pmu::put() is mandatory and it's the driver's responsibility to call perf_pmu_free() when
+	 * resources can be released.
+	 */
+	struct pmu *(*get)		(struct pmu *pmu);
+
+	/* Optional: put a reference. See pmu::get() */
+	void (*put)			(struct pmu *pmu);
 };
 
 enum perf_addr_filter_action_t {
@@ -1104,6 +1115,7 @@  extern void perf_event_itrace_started(struct perf_event *event);
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
+extern void perf_pmu_free(struct pmu *pmu);
 
 extern void __perf_event_task_sched_in(struct task_struct *prev,
 				       struct task_struct *task);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6395dbf67671..bf5b8fc8979e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5319,8 +5319,24 @@  static void perf_pending_task_sync(struct perf_event *event)
 	rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
 }
 
+static void pmu_module_put(struct pmu **ppmu)
+{
+	struct pmu *pmu = *ppmu;
+	struct module *module = pmu->module;
+
+	if (pmu->put)
+		pmu->put(pmu);
+
+	module_put(module);
+
+	/* Can't touch pmu anymore. */
+	*ppmu = NULL;
+}
+
 static void _free_event(struct perf_event *event)
 {
+	struct module *module;
+
 	irq_work_sync(&event->pending_irq);
 	irq_work_sync(&event->pending_disable_irq);
 	perf_pending_task_sync(event);
@@ -5374,7 +5390,8 @@  static void _free_event(struct perf_event *event)
 		put_ctx(event->ctx);
 
 	exclusive_event_destroy(event);
-	module_put(event->pmu->module);
+
+	pmu_module_put(&event->pmu);
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
@@ -11512,10 +11529,12 @@  static int perf_event_idx_default(struct perf_event *event)
 	return 0;
 }
 
-static void free_pmu_context(struct pmu *pmu)
+void perf_pmu_free(struct pmu *pmu)
 {
 	free_percpu(pmu->cpu_pmu_context);
+	free_percpu(pmu->pmu_disable_count);
 }
+EXPORT_SYMBOL_GPL(perf_pmu_free);
 
 /*
  * Let userspace know that this PMU supports address range filtering:
@@ -11749,6 +11768,11 @@  int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		goto free_pdc;
 	}
 
+	if (WARN_ONCE((!!pmu->get) ^ (!!pmu->put), "Cannot register a pmu with only get or put defined.\n")) {
+		ret = -EINVAL;
+		goto free_pdc;
+	}
+
 	pmu->name = name;
 
 	if (type >= 0)
@@ -11855,8 +11879,8 @@  void perf_pmu_unregister(struct pmu *pmu)
 
 	mutex_unlock(&pmus_lock);
 
-	free_percpu(pmu->pmu_disable_count);
-	free_pmu_context(pmu);
+	if (!pmu->put)
+		perf_pmu_free(pmu);
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 
@@ -11890,6 +11914,9 @@  static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 		BUG_ON(!ctx);
 	}
 
+	if (pmu->get)
+		pmu->get(pmu);
+
 	event->pmu = pmu;
 	ret = pmu->event_init(event);
 
@@ -11926,7 +11953,7 @@  static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 	}
 
 	if (ret)
-		module_put(pmu->module);
+		pmu_module_put(&pmu);
 
 	return ret;
 }
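
For reference, a driver would typically back the new get/put hooks
with a refcount on its own pmu instance and call perf_pmu_free() only
when the last reference is dropped. A minimal sketch (hypothetical
my_pmu names, not taken from i915/xe):

	#include <linux/kref.h>
	#include <linux/perf_event.h>
	#include <linux/slab.h>

	struct my_pmu {
		struct pmu base;
		struct kref ref;
	};

	static void my_pmu_release(struct kref *ref)
	{
		struct my_pmu *mp = container_of(ref, struct my_pmu, ref);

		/* Last user gone: free perf's percpu data, then ours. */
		perf_pmu_free(&mp->base);
		kfree(mp);
	}

	static struct pmu *my_pmu_get(struct pmu *pmu)
	{
		struct my_pmu *mp = container_of(pmu, struct my_pmu, base);

		kref_get(&mp->ref);
		return pmu;
	}

	static void my_pmu_put(struct pmu *pmu)
	{
		struct my_pmu *mp = container_of(pmu, struct my_pmu, base);

		kref_put(&mp->ref, my_pmu_release);
	}

On unbind, the driver calls perf_pmu_unregister() and then drops its
initial reference with kref_put(); any event still holding a reference
taken through pmu::get() keeps the instance alive until its
_free_event() ends up in pmu::put().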