diff mbox series

[6/7] drm/i915/pmu: Lazy unregister

Message ID 20240722210648.80892-7-lucas.demarchi@intel.com (mailing list archive)
State New, archived
Headers show
Series Fix i915 pmu on bind/unbind | expand

Commit Message

Lucas De Marchi July 22, 2024, 9:06 p.m. UTC
Instead of calling perf_pmu_unregister() when unbinding, defer that to
the destruction of i915 object. Since perf itself holds a reference in
the event, this only happens when all events are gone, which guarantees
i915 is not unregistering the pmu with live events.

Previously, running the following sequence would crash the system after
~2 tries:

	1) bind device to i915
	2) wait events to show up on sysfs
	3) start perf  stat -I 1000 -e i915/rcs0-busy/
	4) unbind driver
	5) kill perf

Most of the time this crashes in perf_pmu_disable() while accessing the
percpu pmu_disable_count. This happens because perf_pmu_unregister()
destroys it with free_percpu(pmu->pmu_disable_count).

With a lazy unbind, the pmu is only unregistered after (5) as opposed to
after (4). The downside is that if a new bind operation is attempted for
the same device/driver without killing the perf process, i915 will fail
to register the pmu (but still load successfully). This seems better
than completely crashing the system.

Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/i915/i915_pmu.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

Comments

Tvrtko Ursulin July 23, 2024, 8:03 a.m. UTC | #1
On 22/07/2024 22:06, Lucas De Marchi wrote:
> Instead of calling perf_pmu_unregister() when unbinding, defer that to
> the destruction of i915 object. Since perf itself holds a reference in
> the event, this only happens when all events are gone, which guarantees
> i915 is not unregistering the pmu with live events.
> 
> Previously, running the following sequence would crash the system after
> ~2 tries:
> 
> 	1) bind device to i915
> 	2) wait events to show up on sysfs
> 	3) start perf  stat -I 1000 -e i915/rcs0-busy/
> 	4) unbind driver
> 	5) kill perf
> 
> Most of the time this crashes in perf_pmu_disable() while accessing the
> percpu pmu_disable_count. This happens because perf_pmu_unregister()
> destroys it with free_percpu(pmu->pmu_disable_count).
> 
> With a lazy unbind, the pmu is only unregistered after (5) as opposed to
> after (4). The downside is that if a new bind operation is attempted for
> the same device/driver without killing the perf process, i915 will fail
> to register the pmu (but still load successfully). This seems better
> than completely crashing the system.

So effectively allows unbind to succeed without fully unbinding the 
driver from the device? That sounds like a significant drawback and if 
so, I wonder if a more complicated solution wouldn't be better after 
all. Or is there precedence for allowing userspace keeping their paws on 
unbound devices in this way?

Regards,

Tvrtko

> 
> Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_pmu.c | 24 +++++++++---------------
>   1 file changed, 9 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index 8708f905f4f4..df53a8fe53ec 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -1158,18 +1158,21 @@ static void free_pmu(struct drm_device *dev, void *res)
>   	struct i915_pmu *pmu = res;
>   	struct drm_i915_private *i915 = pmu_to_i915(pmu);
>   
> +	perf_pmu_unregister(&pmu->base);
>   	free_event_attributes(pmu);
>   	kfree(pmu->base.attr_groups);
>   	if (IS_DGFX(i915))
>   		kfree(pmu->name);
> +
> +	/*
> +	 * Make sure all currently running (but shortcut on pmu->closed) are
> +	 * gone before proceeding with free'ing the pmu object embedded in i915.
> +	 */
> +	synchronize_rcu();
>   }
>   
>   static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
>   {
> -	struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
> -
> -	GEM_BUG_ON(!pmu->base.event_init);
> -
>   	/* Select the first online CPU as a designated reader. */
>   	if (cpumask_empty(&i915_pmu_cpumask))
>   		cpumask_set_cpu(cpu, &i915_pmu_cpumask);
> @@ -1182,8 +1185,6 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
>   	struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
>   	unsigned int target = i915_pmu_target_cpu;
>   
> -	GEM_BUG_ON(!pmu->base.event_init);
> -
>   	/*
>   	 * Unregistering an instance generates a CPU offline event which we must
>   	 * ignore to avoid incorrectly modifying the shared i915_pmu_cpumask.
> @@ -1337,21 +1338,14 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
>   {
>   	struct i915_pmu *pmu = &i915->pmu;
>   
> -	if (!pmu->base.event_init)
> -		return;
> -
>   	/*
> -	 * "Disconnect" the PMU callbacks - since all are atomic synchronize_rcu
> -	 * ensures all currently executing ones will have exited before we
> -	 * proceed with unregistration.
> +	 * "Disconnect" the PMU callbacks - unregistering the pmu will be done
> +	 * later when all currently open events are gone
>   	 */
>   	pmu->closed = true;
> -	synchronize_rcu();
>   
>   	hrtimer_cancel(&pmu->timer);
> -
>   	i915_pmu_unregister_cpuhp_state(pmu);
> -	perf_pmu_unregister(&pmu->base);
>   
>   	pmu->base.event_init = NULL;
>   }
Lucas De Marchi July 23, 2024, 3:30 p.m. UTC | #2
On Tue, Jul 23, 2024 at 09:03:25AM GMT, Tvrtko Ursulin wrote:
>
>On 22/07/2024 22:06, Lucas De Marchi wrote:
>>Instead of calling perf_pmu_unregister() when unbinding, defer that to
>>the destruction of i915 object. Since perf itself holds a reference in
>>the event, this only happens when all events are gone, which guarantees
>>i915 is not unregistering the pmu with live events.
>>
>>Previously, running the following sequence would crash the system after
>>~2 tries:
>>
>>	1) bind device to i915
>>	2) wait events to show up on sysfs
>>	3) start perf  stat -I 1000 -e i915/rcs0-busy/
>>	4) unbind driver
>>	5) kill perf
>>
>>Most of the time this crashes in perf_pmu_disable() while accessing the
>>percpu pmu_disable_count. This happens because perf_pmu_unregister()
>>destroys it with free_percpu(pmu->pmu_disable_count).
>>
>>With a lazy unbind, the pmu is only unregistered after (5) as opposed to
>>after (4). The downside is that if a new bind operation is attempted for
>>the same device/driver without killing the perf process, i915 will fail
>>to register the pmu (but still load successfully). This seems better
>>than completely crashing the system.
>
>So effectively allows unbind to succeed without fully unbinding the 
>driver from the device? That sounds like a significant drawback and if 
>so, I wonder if a more complicated solution wouldn't be better after 
>all. Or is there precedence for allowing userspace keeping their paws 
>on unbound devices in this way?

keeping the resources alive but "unplunged" while the hardware
disappeared is a common thing to do... it's the whole point of the
drmm-managed resource for example. If you bind the driver and then
unbind it while userspace is holding a ref, next time you try to bind it
will come up with a different card number. A similar thing that could be
done is to adjust the name of the event - currently we add the mangled
pci slot.

That said, I agree a better approach would be to allow
perf_pmu_unregister() to do its job even when there are open events. On
top of that (or as a way to help achieve that), make perf core replace
the callbacks with stubs when pmu is unregistered - that would even kill
the need for i915's checks on pmu->closed (and fix the lack thereof in
other drivers).

It can be a can of worms though and may be pushed back by perf core
maintainers, so it'd be good have their feedback.

thanks
Lucas De Marchi

>
>Regards,
>
>Tvrtko
>
>>
>>Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
>>---
>>  drivers/gpu/drm/i915/i915_pmu.c | 24 +++++++++---------------
>>  1 file changed, 9 insertions(+), 15 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
>>index 8708f905f4f4..df53a8fe53ec 100644
>>--- a/drivers/gpu/drm/i915/i915_pmu.c
>>+++ b/drivers/gpu/drm/i915/i915_pmu.c
>>@@ -1158,18 +1158,21 @@ static void free_pmu(struct drm_device *dev, void *res)
>>  	struct i915_pmu *pmu = res;
>>  	struct drm_i915_private *i915 = pmu_to_i915(pmu);
>>+	perf_pmu_unregister(&pmu->base);
>>  	free_event_attributes(pmu);
>>  	kfree(pmu->base.attr_groups);
>>  	if (IS_DGFX(i915))
>>  		kfree(pmu->name);
>>+
>>+	/*
>>+	 * Make sure all currently running (but shortcut on pmu->closed) are
>>+	 * gone before proceeding with free'ing the pmu object embedded in i915.
>>+	 */
>>+	synchronize_rcu();
>>  }
>>  static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
>>  {
>>-	struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
>>-
>>-	GEM_BUG_ON(!pmu->base.event_init);
>>-
>>  	/* Select the first online CPU as a designated reader. */
>>  	if (cpumask_empty(&i915_pmu_cpumask))
>>  		cpumask_set_cpu(cpu, &i915_pmu_cpumask);
>>@@ -1182,8 +1185,6 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
>>  	struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
>>  	unsigned int target = i915_pmu_target_cpu;
>>-	GEM_BUG_ON(!pmu->base.event_init);
>>-
>>  	/*
>>  	 * Unregistering an instance generates a CPU offline event which we must
>>  	 * ignore to avoid incorrectly modifying the shared i915_pmu_cpumask.
>>@@ -1337,21 +1338,14 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
>>  {
>>  	struct i915_pmu *pmu = &i915->pmu;
>>-	if (!pmu->base.event_init)
>>-		return;
>>-
>>  	/*
>>-	 * "Disconnect" the PMU callbacks - since all are atomic synchronize_rcu
>>-	 * ensures all currently executing ones will have exited before we
>>-	 * proceed with unregistration.
>>+	 * "Disconnect" the PMU callbacks - unregistering the pmu will be done
>>+	 * later when all currently open events are gone
>>  	 */
>>  	pmu->closed = true;
>>-	synchronize_rcu();
>>  	hrtimer_cancel(&pmu->timer);
>>-
>>  	i915_pmu_unregister_cpuhp_state(pmu);
>>-	perf_pmu_unregister(&pmu->base);
>>  	pmu->base.event_init = NULL;
>>  }
Tvrtko Ursulin July 24, 2024, 7:48 a.m. UTC | #3
On 23/07/2024 16:30, Lucas De Marchi wrote:
> On Tue, Jul 23, 2024 at 09:03:25AM GMT, Tvrtko Ursulin wrote:
>>
>> On 22/07/2024 22:06, Lucas De Marchi wrote:
>>> Instead of calling perf_pmu_unregister() when unbinding, defer that to
>>> the destruction of i915 object. Since perf itself holds a reference in
>>> the event, this only happens when all events are gone, which guarantees
>>> i915 is not unregistering the pmu with live events.
>>>
>>> Previously, running the following sequence would crash the system after
>>> ~2 tries:
>>>
>>>     1) bind device to i915
>>>     2) wait events to show up on sysfs
>>>     3) start perf  stat -I 1000 -e i915/rcs0-busy/
>>>     4) unbind driver
>>>     5) kill perf
>>>
>>> Most of the time this crashes in perf_pmu_disable() while accessing the
>>> percpu pmu_disable_count. This happens because perf_pmu_unregister()
>>> destroys it with free_percpu(pmu->pmu_disable_count).
>>>
>>> With a lazy unbind, the pmu is only unregistered after (5) as opposed to
>>> after (4). The downside is that if a new bind operation is attempted for
>>> the same device/driver without killing the perf process, i915 will fail
>>> to register the pmu (but still load successfully). This seems better
>>> than completely crashing the system.
>>
>> So effectively allows unbind to succeed without fully unbinding the 
>> driver from the device? That sounds like a significant drawback and if 
>> so, I wonder if a more complicated solution wouldn't be better after 
>> all. Or is there precedence for allowing userspace keeping their paws 
>> on unbound devices in this way?
> 
> keeping the resources alive but "unplunged" while the hardware
> disappeared is a common thing to do... it's the whole point of the
> drmm-managed resource for example. If you bind the driver and then
> unbind it while userspace is holding a ref, next time you try to bind it
> will come up with a different card number. A similar thing that could be
> done is to adjust the name of the event - currently we add the mangled
> pci slot.

Yes.. but what my point was this from your commit message:

"""
The downside is that if a new bind operation is attempted for
the same device/driver without killing the perf process, i915 will fail
to register the pmu (but still load successfully).
"""

So the subsequent bind does not "come up with a different card number". 
Statement is it will come up with an error if we look at the PMU subset 
of functionality. I was wondering if there was precedent for that kind 
of situation.

Mangling the PMU driver name probably also wouldn't be great.

> That said, I agree a better approach would be to allow
> perf_pmu_unregister() to do its job even when there are open events. On
> top of that (or as a way to help achieve that), make perf core replace
> the callbacks with stubs when pmu is unregistered - that would even kill
> the need for i915's checks on pmu->closed (and fix the lack thereof in
> other drivers).
> 
> It can be a can of worms though and may be pushed back by perf core
> maintainers, so it'd be good have their feedback.

Yeah definitely would be essential.

Regards,

Tvrtko

>>> Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/i915_pmu.c | 24 +++++++++---------------
>>>  1 file changed, 9 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_pmu.c 
>>> b/drivers/gpu/drm/i915/i915_pmu.c
>>> index 8708f905f4f4..df53a8fe53ec 100644
>>> --- a/drivers/gpu/drm/i915/i915_pmu.c
>>> +++ b/drivers/gpu/drm/i915/i915_pmu.c
>>> @@ -1158,18 +1158,21 @@ static void free_pmu(struct drm_device *dev, 
>>> void *res)
>>>      struct i915_pmu *pmu = res;
>>>      struct drm_i915_private *i915 = pmu_to_i915(pmu);
>>> +    perf_pmu_unregister(&pmu->base);
>>>      free_event_attributes(pmu);
>>>      kfree(pmu->base.attr_groups);
>>>      if (IS_DGFX(i915))
>>>          kfree(pmu->name);
>>> +
>>> +    /*
>>> +     * Make sure all currently running (but shortcut on pmu->closed) 
>>> are
>>> +     * gone before proceeding with free'ing the pmu object embedded 
>>> in i915.
>>> +     */
>>> +    synchronize_rcu();
>>>  }
>>>  static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node 
>>> *node)
>>>  {
>>> -    struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), 
>>> cpuhp.node);
>>> -
>>> -    GEM_BUG_ON(!pmu->base.event_init);
>>> -
>>>      /* Select the first online CPU as a designated reader. */
>>>      if (cpumask_empty(&i915_pmu_cpumask))
>>>          cpumask_set_cpu(cpu, &i915_pmu_cpumask);
>>> @@ -1182,8 +1185,6 @@ static int i915_pmu_cpu_offline(unsigned int 
>>> cpu, struct hlist_node *node)
>>>      struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), 
>>> cpuhp.node);
>>>      unsigned int target = i915_pmu_target_cpu;
>>> -    GEM_BUG_ON(!pmu->base.event_init);
>>> -
>>>      /*
>>>       * Unregistering an instance generates a CPU offline event which 
>>> we must
>>>       * ignore to avoid incorrectly modifying the shared 
>>> i915_pmu_cpumask.
>>> @@ -1337,21 +1338,14 @@ void i915_pmu_unregister(struct 
>>> drm_i915_private *i915)
>>>  {
>>>      struct i915_pmu *pmu = &i915->pmu;
>>> -    if (!pmu->base.event_init)
>>> -        return;
>>> -
>>>      /*
>>> -     * "Disconnect" the PMU callbacks - since all are atomic 
>>> synchronize_rcu
>>> -     * ensures all currently executing ones will have exited before we
>>> -     * proceed with unregistration.
>>> +     * "Disconnect" the PMU callbacks - unregistering the pmu will 
>>> be done
>>> +     * later when all currently open events are gone
>>>       */
>>>      pmu->closed = true;
>>> -    synchronize_rcu();
>>>      hrtimer_cancel(&pmu->timer);
>>> -
>>>      i915_pmu_unregister_cpuhp_state(pmu);
>>> -    perf_pmu_unregister(&pmu->base);
>>>      pmu->base.event_init = NULL;
>>>  }
Peter Zijlstra July 24, 2024, 12:41 p.m. UTC | #4
On Tue, Jul 23, 2024 at 10:30:08AM -0500, Lucas De Marchi wrote:
> On Tue, Jul 23, 2024 at 09:03:25AM GMT, Tvrtko Ursulin wrote:
> > 
> > On 22/07/2024 22:06, Lucas De Marchi wrote:
> > > Instead of calling perf_pmu_unregister() when unbinding, defer that to
> > > the destruction of i915 object. Since perf itself holds a reference in
> > > the event, this only happens when all events are gone, which guarantees
> > > i915 is not unregistering the pmu with live events.
> > > 
> > > Previously, running the following sequence would crash the system after
> > > ~2 tries:
> > > 
> > > 	1) bind device to i915
> > > 	2) wait events to show up on sysfs
> > > 	3) start perf  stat -I 1000 -e i915/rcs0-busy/
> > > 	4) unbind driver
> > > 	5) kill perf
> > > 
> > > Most of the time this crashes in perf_pmu_disable() while accessing the
> > > percpu pmu_disable_count. This happens because perf_pmu_unregister()
> > > destroys it with free_percpu(pmu->pmu_disable_count).
> > > 
> > > With a lazy unbind, the pmu is only unregistered after (5) as opposed to
> > > after (4). The downside is that if a new bind operation is attempted for
> > > the same device/driver without killing the perf process, i915 will fail
> > > to register the pmu (but still load successfully). This seems better
> > > than completely crashing the system.
> > 
> > So effectively allows unbind to succeed without fully unbinding the
> > driver from the device? That sounds like a significant drawback and if
> > so, I wonder if a more complicated solution wouldn't be better after
> > all. Or is there precedence for allowing userspace keeping their paws on
> > unbound devices in this way?
> 
> keeping the resources alive but "unplunged" while the hardware
> disappeared is a common thing to do... it's the whole point of the
> drmm-managed resource for example. If you bind the driver and then
> unbind it while userspace is holding a ref, next time you try to bind it
> will come up with a different card number. A similar thing that could be
> done is to adjust the name of the event - currently we add the mangled
> pci slot.
> 
> That said, I agree a better approach would be to allow
> perf_pmu_unregister() to do its job even when there are open events. On
> top of that (or as a way to help achieve that), make perf core replace
> the callbacks with stubs when pmu is unregistered - that would even kill
> the need for i915's checks on pmu->closed (and fix the lack thereof in
> other drivers).
> 
> It can be a can of worms though and may be pushed back by perf core
> maintainers, so it'd be good have their feedback.

I don't think I understand the problem. I also don't understand drivers
much -- so that might be the problem.

So the problem appears to be that the device just disappears without
warning? How can a GPU go away like that?

Since you have a notion of this device, can't you do this stubbing you
talk about? That is, if your internal device reference becomes NULL, let
the PMU methods preserve the state like no-ops.

And then when the last event goes away, tear down the whole thing.

Again, I'm not sure I'm following.
Lucas De Marchi July 24, 2024, 3:39 p.m. UTC | #5
On Wed, Jul 24, 2024 at 02:41:05PM GMT, Peter Zijlstra wrote:
>On Tue, Jul 23, 2024 at 10:30:08AM -0500, Lucas De Marchi wrote:
>> On Tue, Jul 23, 2024 at 09:03:25AM GMT, Tvrtko Ursulin wrote:
>> >
>> > On 22/07/2024 22:06, Lucas De Marchi wrote:
>> > > Instead of calling perf_pmu_unregister() when unbinding, defer that to
>> > > the destruction of i915 object. Since perf itself holds a reference in
>> > > the event, this only happens when all events are gone, which guarantees
>> > > i915 is not unregistering the pmu with live events.
>> > >
>> > > Previously, running the following sequence would crash the system after
>> > > ~2 tries:
>> > >
>> > > 	1) bind device to i915
>> > > 	2) wait events to show up on sysfs
>> > > 	3) start perf  stat -I 1000 -e i915/rcs0-busy/
>> > > 	4) unbind driver
>> > > 	5) kill perf
>> > >
>> > > Most of the time this crashes in perf_pmu_disable() while accessing the
>> > > percpu pmu_disable_count. This happens because perf_pmu_unregister()
>> > > destroys it with free_percpu(pmu->pmu_disable_count).
>> > >
>> > > With a lazy unbind, the pmu is only unregistered after (5) as opposed to
>> > > after (4). The downside is that if a new bind operation is attempted for
>> > > the same device/driver without killing the perf process, i915 will fail
>> > > to register the pmu (but still load successfully). This seems better
>> > > than completely crashing the system.
>> >
>> > So effectively allows unbind to succeed without fully unbinding the
>> > driver from the device? That sounds like a significant drawback and if
>> > so, I wonder if a more complicated solution wouldn't be better after
>> > all. Or is there precedence for allowing userspace keeping their paws on
>> > unbound devices in this way?
>>
>> keeping the resources alive but "unplunged" while the hardware
>> disappeared is a common thing to do... it's the whole point of the
>> drmm-managed resource for example. If you bind the driver and then
>> unbind it while userspace is holding a ref, next time you try to bind it
>> will come up with a different card number. A similar thing that could be
>> done is to adjust the name of the event - currently we add the mangled
>> pci slot.
>>
>> That said, I agree a better approach would be to allow
>> perf_pmu_unregister() to do its job even when there are open events. On
>> top of that (or as a way to help achieve that), make perf core replace
>> the callbacks with stubs when pmu is unregistered - that would even kill
>> the need for i915's checks on pmu->closed (and fix the lack thereof in
>> other drivers).
>>
>> It can be a can of worms though and may be pushed back by perf core
>> maintainers, so it'd be good have their feedback.
>
>I don't think I understand the problem. I also don't understand drivers
>much -- so that might be the problem.

We can bind/unbind the driver to/from the pci device. Example:

	echo -n "0000:00:02.0" > /sys/bus/pci/drivers/i915/unbind

This is essentially unplugging the HW from the kernel.  This will
trigger the driver to deinitialize and free up all resources use by that
device.

So when the driver is binding it does:

	perf_pmu_register();

When it's unbinding:

	perf_pmu_unregister();
	
Reasons to unbind include:

	- driver testing so then we can unload the module and load it
	  again
	- device is toast - doing an FLR and rebinding may
	  fix/workaround it
	- For SR-IOV, in which we are exposing multiple instances of the
	  same underlying PCI device, we may need to bind/unbind
	  on-demand  (it's not yet clear if perf_pmu_register() would be
	  called on the VF instances, but listed here just to explain
	  the bind/unbind)
	- Hotplug

>
>So the problem appears to be that the device just disappears without
>warning? How can a GPU go away like that?
>
>Since you have a notion of this device, can't you do this stubbing you
>talk about? That is, if your internal device reference becomes NULL, let
>the PMU methods preserve the state like no-ops.

It's not clear if you are suggesting these stubs to be in each driver or
to be assigned by perf in perf_pmu_unregister(). Some downsides
of doing it in the driver:

	- you can't remove the module as perf will still call module
	  code
	- need to replicate the stubs in every driver (or the
	  equivalent of pmu->closed checks in i915_pmu.c)

I *think* the stubs would be quiet the same for every device, so could
be feasible to share them inside perf. Alternatively it could simply
shortcut the call, without stubs, by looking at event->hw.state and
a new pmu->state. I can give this a try.

thanks
Lucas De Marchi

>
>And then when the last event goes away, tear down the whole thing.
>
>Again, I'm not sure I'm following.
Lucas De Marchi Sept. 9, 2024, 9:03 p.m. UTC | #6
Hi Peter,

On Wed, Jul 24, 2024 at 10:39:57AM GMT, Lucas De Marchi wrote:
>On Wed, Jul 24, 2024 at 02:41:05PM GMT, Peter Zijlstra wrote:
>>On Tue, Jul 23, 2024 at 10:30:08AM -0500, Lucas De Marchi wrote:
>>>On Tue, Jul 23, 2024 at 09:03:25AM GMT, Tvrtko Ursulin wrote:
>>>>
>>>> On 22/07/2024 22:06, Lucas De Marchi wrote:
>>>> > Instead of calling perf_pmu_unregister() when unbinding, defer that to
>>>> > the destruction of i915 object. Since perf itself holds a reference in
>>>> > the event, this only happens when all events are gone, which guarantees
>>>> > i915 is not unregistering the pmu with live events.
>>>> >
>>>> > Previously, running the following sequence would crash the system after
>>>> > ~2 tries:
>>>> >
>>>> > 	1) bind device to i915
>>>> > 	2) wait events to show up on sysfs
>>>> > 	3) start perf  stat -I 1000 -e i915/rcs0-busy/
>>>> > 	4) unbind driver
>>>> > 	5) kill perf
>>>> >
>>>> > Most of the time this crashes in perf_pmu_disable() while accessing the
>>>> > percpu pmu_disable_count. This happens because perf_pmu_unregister()
>>>> > destroys it with free_percpu(pmu->pmu_disable_count).
>>>> >
>>>> > With a lazy unbind, the pmu is only unregistered after (5) as opposed to
>>>> > after (4). The downside is that if a new bind operation is attempted for
>>>> > the same device/driver without killing the perf process, i915 will fail
>>>> > to register the pmu (but still load successfully). This seems better
>>>> > than completely crashing the system.
>>>>
>>>> So effectively allows unbind to succeed without fully unbinding the
>>>> driver from the device? That sounds like a significant drawback and if
>>>> so, I wonder if a more complicated solution wouldn't be better after
>>>> all. Or is there precedence for allowing userspace keeping their paws on
>>>> unbound devices in this way?
>>>
>>>keeping the resources alive but "unplunged" while the hardware
>>>disappeared is a common thing to do... it's the whole point of the
>>>drmm-managed resource for example. If you bind the driver and then
>>>unbind it while userspace is holding a ref, next time you try to bind it
>>>will come up with a different card number. A similar thing that could be
>>>done is to adjust the name of the event - currently we add the mangled
>>>pci slot.
>>>
>>>That said, I agree a better approach would be to allow
>>>perf_pmu_unregister() to do its job even when there are open events. On
>>>top of that (or as a way to help achieve that), make perf core replace
>>>the callbacks with stubs when pmu is unregistered - that would even kill
>>>the need for i915's checks on pmu->closed (and fix the lack thereof in
>>>other drivers).
>>>
>>>It can be a can of worms though and may be pushed back by perf core
>>>maintainers, so it'd be good have their feedback.
>>
>>I don't think I understand the problem. I also don't understand drivers
>>much -- so that might be the problem.
>
>We can bind/unbind the driver to/from the pci device. Example:
>
>	echo -n "0000:00:02.0" > /sys/bus/pci/drivers/i915/unbind
>
>This is essentially unplugging the HW from the kernel.  This will
>trigger the driver to deinitialize and free up all resources use by that
>device.
>
>So when the driver is binding it does:
>
>	perf_pmu_register();
>
>When it's unbinding:
>
>	perf_pmu_unregister();
>	
>Reasons to unbind include:
>
>	- driver testing so then we can unload the module and load it
>	  again
>	- device is toast - doing an FLR and rebinding may
>	  fix/workaround it
>	- For SR-IOV, in which we are exposing multiple instances of the
>	  same underlying PCI device, we may need to bind/unbind
>	  on-demand  (it's not yet clear if perf_pmu_register() would be
>	  called on the VF instances, but listed here just to explain
>	  the bind/unbind)
>	- Hotplug
>
>>
>>So the problem appears to be that the device just disappears without
>>warning? How can a GPU go away like that?
>>
>>Since you have a notion of this device, can't you do this stubbing you
>>talk about? That is, if your internal device reference becomes NULL, let
>>the PMU methods preserve the state like no-ops.
>
>It's not clear if you are suggesting these stubs to be in each driver or
>to be assigned by perf in perf_pmu_unregister(). Some downsides
>of doing it in the driver:
>
>	- you can't remove the module as perf will still call module
>	  code
>	- need to replicate the stubs in every driver (or the
>	  equivalent of pmu->closed checks in i915_pmu.c)
>
>I *think* the stubs would be quiet the same for every device, so could
>be feasible to share them inside perf. Alternatively it could simply
>shortcut the call, without stubs, by looking at event->hw.state and
>a new pmu->state. I can give this a try.

trying to revive these patches to get this fixed. Are you ok with one of
those approaches, i.e. a) either add the stub in the perf side or b)
shortcut the calls after perf_pmu_unregister() is called ?


Lucas De Marchi

>
>thanks
>Lucas De Marchi
>
>>
>>And then when the last event goes away, tear down the whole thing.
>>
>>Again, I'm not sure I'm following.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 8708f905f4f4..df53a8fe53ec 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -1158,18 +1158,21 @@  static void free_pmu(struct drm_device *dev, void *res)
 	struct i915_pmu *pmu = res;
 	struct drm_i915_private *i915 = pmu_to_i915(pmu);
 
+	perf_pmu_unregister(&pmu->base);
 	free_event_attributes(pmu);
 	kfree(pmu->base.attr_groups);
 	if (IS_DGFX(i915))
 		kfree(pmu->name);
+
+	/*
+	 * Make sure all currently running (but shortcut on pmu->closed) are
+	 * gone before proceeding with free'ing the pmu object embedded in i915.
+	 */
+	synchronize_rcu();
 }
 
 static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
 {
-	struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
-
-	GEM_BUG_ON(!pmu->base.event_init);
-
 	/* Select the first online CPU as a designated reader. */
 	if (cpumask_empty(&i915_pmu_cpumask))
 		cpumask_set_cpu(cpu, &i915_pmu_cpumask);
@@ -1182,8 +1185,6 @@  static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
 	struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
 	unsigned int target = i915_pmu_target_cpu;
 
-	GEM_BUG_ON(!pmu->base.event_init);
-
 	/*
 	 * Unregistering an instance generates a CPU offline event which we must
 	 * ignore to avoid incorrectly modifying the shared i915_pmu_cpumask.
@@ -1337,21 +1338,14 @@  void i915_pmu_unregister(struct drm_i915_private *i915)
 {
 	struct i915_pmu *pmu = &i915->pmu;
 
-	if (!pmu->base.event_init)
-		return;
-
 	/*
-	 * "Disconnect" the PMU callbacks - since all are atomic synchronize_rcu
-	 * ensures all currently executing ones will have exited before we
-	 * proceed with unregistration.
+	 * "Disconnect" the PMU callbacks - unregistering the pmu will be done
+	 * later when all currently open events are gone
 	 */
 	pmu->closed = true;
-	synchronize_rcu();
 
 	hrtimer_cancel(&pmu->timer);
-
 	i915_pmu_unregister_cpuhp_state(pmu);
-	perf_pmu_unregister(&pmu->base);
 
 	pmu->base.event_init = NULL;
 }