mmc: core: Don't allocate IDA for OF aliases

Message ID	20210413003621.1403300-1-swboyd@chromium.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-mmc-owner@kernel.org> From: Stephen Boyd <swboyd@chromium.org> To: Ulf Hansson <ulf.hansson@linaro.org> Cc: linux-kernel@vger.kernel.org, linux-mmc@vger.kernel.org, Matthias Schiffer <matthias.schiffer@ew.tq-group.com>, Sujit Kautkar <sujitka@chromium.org>, Zubin Mithra <zsm@chromium.org> Subject: [PATCH] mmc: core: Don't allocate IDA for OF aliases Date: Mon, 12 Apr 2021 17:36:21 -0700 Message-Id: <20210413003621.1403300-1-swboyd@chromium.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	mmc: core: Don't allocate IDA for OF aliases \| expand mmc: core: Don't allocate IDA for OF aliases

Stephen Boyd April 13, 2021, 12:36 a.m. UTC

There's a chance that the IDA allocated in mmc_alloc_host() is not freed
for some time because it's freed as part of a class' release function
(see mmc_host_classdev_release() where the IDA is freed). If another
thread is holding a reference to the class, then only once all balancing
device_put() calls (in turn calling kobject_put()) have been made will
the IDA be released and usable again.

Normally this isn't a problem because the kobject is released before
anything else that may want to use the same number tries to again, but
with CONFIG_DEBUG_KOBJECT_RELEASE=y and OF aliases it becomes pretty
easy to try to allocate an alias from the IDA twice while the first time
it was allocated is still pending a call to ida_simple_remove(). It's
also possible to trigger it by using CONFIG_DEBUG_KOBJECT_RELEASE and
probe defering a driver at boot that calls mmc_alloc_host() before
trying to get resources that may defer likes clks or regulators.

Instead of allocating from the IDA in this scenario, let's just skip it
if we know this is an OF alias. The number is already "claimed" and
devices that aren't using OF aliases won't try to use the claimed
numbers anyway (see mmc_first_nonreserved_index()). This should avoid
any issues with mmc_alloc_host() returning failures from the
ida_simple_get() in the case that we're using an OF alias.

Cc: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
Cc: Sujit Kautkar <sujitka@chromium.org>
Reported-by: Zubin Mithra <zsm@chromium.org>
Fixes: fa2d0aa96941 ("mmc: core: Allow setting slot index via device tree alias")
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
---
 drivers/mmc/core/host.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

Ulf Hansson April 15, 2021, 8:56 a.m. UTC | #1

On Tue, 13 Apr 2021 at 02:36, Stephen Boyd <swboyd@chromium.org> wrote:
>
> There's a chance that the IDA allocated in mmc_alloc_host() is not freed
> for some time because it's freed as part of a class' release function
> (see mmc_host_classdev_release() where the IDA is freed). If another
> thread is holding a reference to the class, then only once all balancing
> device_put() calls (in turn calling kobject_put()) have been made will
> the IDA be released and usable again.
>
> Normally this isn't a problem because the kobject is released before
> anything else that may want to use the same number tries to again, but
> with CONFIG_DEBUG_KOBJECT_RELEASE=y and OF aliases it becomes pretty
> easy to try to allocate an alias from the IDA twice while the first time
> it was allocated is still pending a call to ida_simple_remove(). It's
> also possible to trigger it by using CONFIG_DEBUG_KOBJECT_RELEASE and
> probe defering a driver at boot that calls mmc_alloc_host() before
> trying to get resources that may defer likes clks or regulators.

Thanks for a very nice description of the problem.

>
> Instead of allocating from the IDA in this scenario, let's just skip it
> if we know this is an OF alias. The number is already "claimed" and
> devices that aren't using OF aliases won't try to use the claimed
> numbers anyway (see mmc_first_nonreserved_index()). This should avoid
> any issues with mmc_alloc_host() returning failures from the
> ida_simple_get() in the case that we're using an OF alias.

At first glance, this seems like a good idea, but I am not completely
sure, yet. See more below.

>
> Cc: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
> Cc: Sujit Kautkar <sujitka@chromium.org>
> Reported-by: Zubin Mithra <zsm@chromium.org>
> Fixes: fa2d0aa96941 ("mmc: core: Allow setting slot index via device tree alias")
> Signed-off-by: Stephen Boyd <swboyd@chromium.org>
> ---
>  drivers/mmc/core/host.c | 20 ++++++++++----------
>  1 file changed, 10 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
> index 9b89a91b6b47..137b4a769f62 100644
> --- a/drivers/mmc/core/host.c
> +++ b/drivers/mmc/core/host.c
> @@ -39,7 +39,8 @@ static void mmc_host_classdev_release(struct device *dev)
>  {
>         struct mmc_host *host = cls_dev_to_mmc_host(dev);
>         wakeup_source_unregister(host->ws);
> -       ida_simple_remove(&mmc_host_ida, host->index);
> +       if (of_alias_get_id(host->parent->of_node, "mmc") < 0)
> +               ida_simple_remove(&mmc_host_ida, host->index);
>         kfree(host);
>  }
>
> @@ -444,7 +445,7 @@ static int mmc_first_nonreserved_index(void)
>   */
>  struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
>  {
> -       int err;
> +       int index;
>         struct mmc_host *host;
>         int alias_id, min_idx, max_idx;
>
> @@ -457,20 +458,19 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
>
>         alias_id = of_alias_get_id(dev->of_node, "mmc");
>         if (alias_id >= 0) {
> -               min_idx = alias_id;
> -               max_idx = alias_id + 1;
> +               index = alias_id;
>         } else {
>                 min_idx = mmc_first_nonreserved_index();
>                 max_idx = 0;
> -       }
>
> -       err = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL);
> -       if (err < 0) {
> -               kfree(host);
> -               return NULL;
> +               index = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL);
> +               if (index < 0) {
> +                       kfree(host);
> +                       return NULL;
> +               }

This means that a DTB that is screwed up in a way that it has two mmc
aliases with the same index, would be allowed to use the same index.

What will happen when we continue the probe sequence in such a case?

>         }
>
> -       host->index = err;
> +       host->index = index;
>
>         dev_set_name(&host->class_dev, "mmc%d", host->index);
>         host->ws = wakeup_source_register(NULL, dev_name(&host->class_dev));

Another concern that could potentially be a problem, is that the
"thread" that holds the reference that prevents ida from being
removed, how would that react to a new mmc device to become
re-registered with the same index?

I wonder if we perhaps should return -EPROBE_DEFER instead, when
ida_simple_get() fails?

Kind regards
Uffe

Stephen Boyd April 15, 2021, 7:29 p.m. UTC | #2

Quoting Ulf Hansson (2021-04-15 01:56:12)
> On Tue, 13 Apr 2021 at 02:36, Stephen Boyd <swboyd@chromium.org> wrote:
> >
> > -       err = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL);
> > -       if (err < 0) {
> > -               kfree(host);
> > -               return NULL;
> > +               index = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL);
> > +               if (index < 0) {
> > +                       kfree(host);
> > +                       return NULL;
> > +               }
> 
> This means that a DTB that is screwed up in a way that it has two mmc
> aliases with the same index, would be allowed to use the same index.
> 
> What will happen when we continue the probe sequence in such a case?

Yeah I thought about this after sending the patch. So the problem would
be like this right?

	aliases {
		mmc1 = &sdhci0;
		mmc1 = &sdhci1;
	};

I have good news! DT won't compile it because it saw the same alias
assigned to twice. I tried it on my sc7180 board. 

arch/arm64/boot/dts/qcom/sc7180.dtsi:35.3-18:
ERROR (duplicate_property_names): /aliases:mmc1: Duplicate property name
ERROR: Input tree has errors, aborting (use -f to force output)
arch/arm64/boot/dts/qcom/sc7180-idp.dtb] Error 2

I suppose if someone forced the compilation it may be bad, but do we
really care?

TL;DR: this seems like it isn't a problem.

> 
> >         }
> >
> > -       host->index = err;
> > +       host->index = index;
> >
> >         dev_set_name(&host->class_dev, "mmc%d", host->index);
> >         host->ws = wakeup_source_register(NULL, dev_name(&host->class_dev));
> 
> Another concern that could potentially be a problem, is that the
> "thread" that holds the reference that prevents ida from being
> removed, how would that react to a new mmc device to become
> re-registered with the same index?
> 
> I wonder if we perhaps should return -EPROBE_DEFER instead, when
> ida_simple_get() fails?
> 

Don't think so. The device (with the kobject inside) is removed, and
thus the mmc1 device will be removed, but the kobject's release function
is delayed due to the config. This means that
mmc_host_classdev_release() is called at a later time. The only thing
inside that function is the IDA removal and the kfree of the container
object. Given that nothing else is in that release function I believe it
is safe to skip IDA allocation as it won't be blocking anything in the
reserved alias case. 

Furthermore, when the device is deleted in mmc_remove_host() there could
be other users of the device that haven't called put_device() yet.
Either way, those other users are keeping the device memory alive, but
otherwise device_del() has unlinked it from the various driver core
lists and sysfs has removed it too so it's in a state where code may be
referencing it but it's on the way out so users of the device will not
be able to do much with it during this time.

This sort of problem (if it exists which I don't think it does) would
have been there all along and can't be fixed at this level. When a
device that has an alias calls the mmc_alloc_host() function twice it
gets two different device structures created so there are two distinct
kobjects that will need to be released at some point. The index is
usually different for those two kobjects, but with aliases it turns out
it is the same. When it comes to registering that device with the same
name the second one will fail because a device with that name already
exists on the bus. This would be really hard to do given that it would
need to be the same aliased device in DT calling the mmc_add_host()
function without calling mmc_remove_host() for the first one it added in
between.

(Sorry if that is long. I'm sort of stream of conciousness writing it to
you here and not rewriting it to be more concise)

Ulf Hansson April 16, 2021, 7:17 a.m. UTC | #3

On Thu, 15 Apr 2021 at 21:29, Stephen Boyd <swboyd@chromium.org> wrote:
>
> Quoting Ulf Hansson (2021-04-15 01:56:12)
> > On Tue, 13 Apr 2021 at 02:36, Stephen Boyd <swboyd@chromium.org> wrote:
> > >
> > > -       err = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL);
> > > -       if (err < 0) {
> > > -               kfree(host);
> > > -               return NULL;
> > > +               index = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL);
> > > +               if (index < 0) {
> > > +                       kfree(host);
> > > +                       return NULL;
> > > +               }
> >
> > This means that a DTB that is screwed up in a way that it has two mmc
> > aliases with the same index, would be allowed to use the same index.
> >
> > What will happen when we continue the probe sequence in such a case?
>
> Yeah I thought about this after sending the patch. So the problem would
> be like this right?
>
>         aliases {
>                 mmc1 = &sdhci0;
>                 mmc1 = &sdhci1;
>         };

Correct.

>
> I have good news! DT won't compile it because it saw the same alias
> assigned to twice. I tried it on my sc7180 board.
>
> arch/arm64/boot/dts/qcom/sc7180.dtsi:35.3-18:
> ERROR (duplicate_property_names): /aliases:mmc1: Duplicate property name
> ERROR: Input tree has errors, aborting (use -f to force output)
> arch/arm64/boot/dts/qcom/sc7180-idp.dtb] Error 2
>
> I suppose if someone forced the compilation it may be bad, but do we
> really care?
>
> TL;DR: this seems like it isn't a problem.

Yep, I definitely tend to agree with you here. Thanks for doing the
test and sharing the result.

>
> >
> > >         }
> > >
> > > -       host->index = err;
> > > +       host->index = index;
> > >
> > >         dev_set_name(&host->class_dev, "mmc%d", host->index);
> > >         host->ws = wakeup_source_register(NULL, dev_name(&host->class_dev));
> >
> > Another concern that could potentially be a problem, is that the
> > "thread" that holds the reference that prevents ida from being
> > removed, how would that react to a new mmc device to become
> > re-registered with the same index?
> >
> > I wonder if we perhaps should return -EPROBE_DEFER instead, when
> > ida_simple_get() fails?
> >
>
> Don't think so. The device (with the kobject inside) is removed, and
> thus the mmc1 device will be removed, but the kobject's release function
> is delayed due to the config. This means that
> mmc_host_classdev_release() is called at a later time. The only thing
> inside that function is the IDA removal and the kfree of the container
> object. Given that nothing else is in that release function I believe it
> is safe to skip IDA allocation as it won't be blocking anything in the
> reserved alias case.
>
> Furthermore, when the device is deleted in mmc_remove_host() there could
> be other users of the device that haven't called put_device() yet.
> Either way, those other users are keeping the device memory alive, but
> otherwise device_del() has unlinked it from the various driver core
> lists and sysfs has removed it too so it's in a state where code may be
> referencing it but it's on the way out so users of the device will not
> be able to do much with it during this time.

Right, but see more below.

>
> This sort of problem (if it exists which I don't think it does) would
> have been there all along and can't be fixed at this level. When a
> device that has an alias calls the mmc_alloc_host() function twice it
> gets two different device structures created so there are two distinct
> kobjects that will need to be released at some point. The index is
> usually different for those two kobjects, but with aliases it turns out
> it is the same. When it comes to registering that device with the same
> name the second one will fail because a device with that name already
> exists on the bus. This would be really hard to do given that it would
> need to be the same aliased device in DT calling the mmc_add_host()
> function without calling mmc_remove_host() for the first one it added in
> between.

In fact, we have a few rare corner cases that can trigger KASAN splats
when mmc_remove_host() gets executed. Similar splats can be triggered
by just doing a sudden card removal. It's especially related to the
cases, where a thread holds a reference to the card/host object when
it's being removed. I am working on patches to fix these cases, but
haven't yet decided on the best solution.

That's the reason why I was thinking that maybe returning
-EPROBE_DEFER could be an option, but certainly I need to think more
about this.

>
> (Sorry if that is long. I'm sort of stream of conciousness writing it to
> you here and not rewriting it to be more concise)

No worries, thanks a lot for sharing your thoughts!

Kind regards
Uffe

Stephen Boyd April 16, 2021, 5:50 p.m. UTC | #4

Quoting Ulf Hansson (2021-04-16 00:17:10)
> On Thu, 15 Apr 2021 at 21:29, Stephen Boyd <swboyd@chromium.org> wrote:
> >
> >
> > Don't think so. The device (with the kobject inside) is removed, and
> > thus the mmc1 device will be removed, but the kobject's release function
> > is delayed due to the config. This means that
> > mmc_host_classdev_release() is called at a later time. The only thing
> > inside that function is the IDA removal and the kfree of the container
> > object. Given that nothing else is in that release function I believe it
> > is safe to skip IDA allocation as it won't be blocking anything in the
> > reserved alias case.
> >
> > Furthermore, when the device is deleted in mmc_remove_host() there could
> > be other users of the device that haven't called put_device() yet.
> > Either way, those other users are keeping the device memory alive, but
> > otherwise device_del() has unlinked it from the various driver core
> > lists and sysfs has removed it too so it's in a state where code may be
> > referencing it but it's on the way out so users of the device will not
> > be able to do much with it during this time.
> 
> Right, but see more below.
> 
> >
> > This sort of problem (if it exists which I don't think it does) would
> > have been there all along and can't be fixed at this level. When a
> > device that has an alias calls the mmc_alloc_host() function twice it
> > gets two different device structures created so there are two distinct
> > kobjects that will need to be released at some point. The index is
> > usually different for those two kobjects, but with aliases it turns out
> > it is the same. When it comes to registering that device with the same
> > name the second one will fail because a device with that name already
> > exists on the bus. This would be really hard to do given that it would
> > need to be the same aliased device in DT calling the mmc_add_host()
> > function without calling mmc_remove_host() for the first one it added in
> > between.
> 
> In fact, we have a few rare corner cases that can trigger KASAN splats
> when mmc_remove_host() gets executed. Similar splats can be triggered
> by just doing a sudden card removal. It's especially related to the
> cases, where a thread holds a reference to the card/host object when
> it's being removed. I am working on patches to fix these cases, but
> haven't yet decided on the best solution.

Ok. Can you share the KASAN reports? The memory allocated for this class
object and associated index from the IDA will be unique for each call
the mmc_alloc_host() so I don't see how KASAN could be noticing
something unless a reference to the host is leaking out without the
proper get_device() call being made to keep the underlying memory from
being freed.

> 
> That's the reason why I was thinking that maybe returning
> -EPROBE_DEFER could be an option, but certainly I need to think more
> about this.

I was hoping that you wouldn't need to think more about returning
-EPROBE_DEFER after my email. :( Returning EPROBE_DEFER looks like it's
a bandaid for bigger problems with reference counting the pointer
allocated in mmc_alloc_host(). I hope I convinced you that there isn't
any danger in reusing the IDA in the case of an alias because the only
way that is a problem is if the same device calls mmc_alloc_host() twice
without calling mmc_remove_host() in between. That should be a pretty
obvious problem in driver code? The check to see if that same device has
tried to alloc a host twice would still effectively happen after this
patch because when mmc_add_host() tries to add that second device to the
driver core it will complain about duplicate device names and fail.

Will you apply this patch?

Ulf Hansson April 16, 2021, 11:07 p.m. UTC | #5

On Fri, 16 Apr 2021 at 19:50, Stephen Boyd <swboyd@chromium.org> wrote:
>
> Quoting Ulf Hansson (2021-04-16 00:17:10)
> > On Thu, 15 Apr 2021 at 21:29, Stephen Boyd <swboyd@chromium.org> wrote:
> > >
> > >
> > > Don't think so. The device (with the kobject inside) is removed, and
> > > thus the mmc1 device will be removed, but the kobject's release function
> > > is delayed due to the config. This means that
> > > mmc_host_classdev_release() is called at a later time. The only thing
> > > inside that function is the IDA removal and the kfree of the container
> > > object. Given that nothing else is in that release function I believe it
> > > is safe to skip IDA allocation as it won't be blocking anything in the
> > > reserved alias case.
> > >
> > > Furthermore, when the device is deleted in mmc_remove_host() there could
> > > be other users of the device that haven't called put_device() yet.
> > > Either way, those other users are keeping the device memory alive, but
> > > otherwise device_del() has unlinked it from the various driver core
> > > lists and sysfs has removed it too so it's in a state where code may be
> > > referencing it but it's on the way out so users of the device will not
> > > be able to do much with it during this time.
> >
> > Right, but see more below.
> >
> > >
> > > This sort of problem (if it exists which I don't think it does) would
> > > have been there all along and can't be fixed at this level. When a
> > > device that has an alias calls the mmc_alloc_host() function twice it
> > > gets two different device structures created so there are two distinct
> > > kobjects that will need to be released at some point. The index is
> > > usually different for those two kobjects, but with aliases it turns out
> > > it is the same. When it comes to registering that device with the same
> > > name the second one will fail because a device with that name already
> > > exists on the bus. This would be really hard to do given that it would
> > > need to be the same aliased device in DT calling the mmc_add_host()
> > > function without calling mmc_remove_host() for the first one it added in
> > > between.
> >
> > In fact, we have a few rare corner cases that can trigger KASAN splats
> > when mmc_remove_host() gets executed. Similar splats can be triggered
> > by just doing a sudden card removal. It's especially related to the
> > cases, where a thread holds a reference to the card/host object when
> > it's being removed. I am working on patches to fix these cases, but
> > haven't yet decided on the best solution.
>
> Ok. Can you share the KASAN reports? The memory allocated for this class
> object and associated index from the IDA will be unique for each call
> the mmc_alloc_host() so I don't see how KASAN could be noticing
> something unless a reference to the host is leaking out without the
> proper get_device() call being made to keep the underlying memory from
> being freed.

Removing the host, also means removing the card. Although, as I said,
I need some more time to think of the best solution.

Here's a report, triggered with some manual hacks and by using the
mmc-utils usesland tool.

/mmc status get /dev/mmcblk1
[   95.905913] DEBUG: mmc_blk_open: Let's sleep for 10s..
[   98.806639] mmc1: card 0007 removed
[  105.972139] BUG: KASAN: use-after-free in mmc_blk_get+0x58/0xb8
[  105.979144] Read of size 4 at addr ffff00000a394a28 by task mmc/180
[  105.984945]
[  105.991209] CPU: 2 PID: 180 Comm: mmc Not tainted
5.10.0-rc4-00069-gcc758c8c7127-dirty #5
[  105.992943] Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
[  106.001010] Call trace:
[  106.007799]  dump_backtrace+0x0/0x2b4
[  106.009965]  show_stack+0x18/0x6c
[  106.013779]  dump_stack+0xfc/0x168
[  106.017083]  print_address_description.constprop.0+0x6c/0x488
[  106.020384]  kasan_report+0x118/0x210
[  106.026193]  __asan_load4+0x94/0xd0
[  106.029844]  mmc_blk_get+0x58/0xb8
[  106.033141]  mmc_blk_open+0x7c/0xdc
[  106.036614]  __blkdev_get+0x3b4/0x964
[  106.039996]  blkdev_get+0x64/0x100
[  106.043815]  blkdev_open+0xe8/0x104
[  106.047114]  do_dentry_open+0x234/0x61c
[  106.050498]  vfs_open+0x54/0x64
[  106.054324]  path_openat+0xe04/0x1584
[  106.057450]  do_filp_open+0xe8/0x1e4
[  106.061263]  do_sys_openat2+0x120/0x230
[  106.064911]  __arm64_sys_openat+0xf0/0x15c
[  106.068477]  el0_svc_common.constprop.0+0xac/0x234
[  106.072639]  do_el0_svc+0x84/0xa0
[  106.077410]  el0_sync_handler+0x264/0x270
[  106.080790]  el0_sync+0x174/0x180
[  106.084768]
[  106.088070] Allocated by task 33:
[  106.089647]  stack_trace_save+0x9c/0xdc
[  106.092858]  kasan_save_stack+0x28/0x60
[  106.096506]  __kasan_kmalloc.constprop.0+0xc8/0xf0
[  106.100325]  kasan_kmalloc+0x10/0x20
[  106.105183]  mmc_blk_alloc_req+0x94/0x4b0
[  106.108913]  mmc_blk_probe+0x2d4/0xaa4
[  106.112829]  mmc_bus_probe+0x34/0x4c
[  106.116471]  really_probe+0x148/0x6e0
[  106.120202]  driver_probe_device+0x78/0xec
[  106.123764]  __device_attach_driver+0x108/0x16c
[  106.127754]  bus_for_each_drv+0xf4/0x15c
[  106.132180]  __device_attach+0x168/0x240
[  106.136349]  device_initial_probe+0x14/0x20
[  106.140253]  bus_probe_device+0xec/0x100
[  106.144167]  device_add+0x55c/0xaf0
[  106.148332]  mmc_add_card+0x288/0x380
[  106.151540]  mmc_attach_sd+0x18c/0x22c
[  106.155363]  mmc_rescan+0x444/0x4f0
[  106.159014]  process_one_work+0x3b8/0x650
[  106.162396]  worker_thread+0xa0/0x724
[  106.166556]  kthread+0x218/0x220
[  106.170201]  ret_from_fork+0x10/0x38
[  106.173482]
[  106.177045] Freed by task 33:
[  106.178533]  stack_trace_save+0x9c/0xdc
[  106.181399]  kasan_save_stack+0x28/0x60
[  106.185045]  kasan_set_track+0x28/0x40
[  106.188868]  kasan_set_free_info+0x24/0x4c
[  106.192684]  __kasan_slab_free+0x100/0x180
[  106.196764]  kasan_slab_free+0x14/0x20
[  106.200838]  kfree+0xb8/0x46c
[  106.204583]  mmc_blk_put+0xe4/0x11c
[  106.207624]  mmc_blk_remove_req.part.0+0x6c/0xe4
[  106.210914]  mmc_blk_remove+0x368/0x370
[  106.215778]  mmc_bus_remove+0x34/0x50
[  106.219336]  __device_release_driver+0x228/0x31c
[  106.223155]  device_release_driver+0x2c/0x44
[  106.227840]  bus_remove_device+0x1e4/0x200
[  106.232100]  device_del+0x2b0/0x770
[  106.236005]  mmc_remove_card+0xf0/0x150
[  106.239383]  mmc_sd_detect+0x9c/0x150
[  106.243207]  mmc_rescan+0x110/0x4f0
[  106.247032]  process_one_work+0x3b8/0x650
[  106.250329]  worker_thread+0xa0/0x724
[  106.254488]  kthread+0x218/0x220
[  106.258134]  ret_from_fork+0x10/0x38
[  106.261416]
[  106.264986] The buggy address belongs to the object at ffff00000a394800
[  106.264986]  which belongs to the cache kmalloc-1k of size 1024
[  106.266485] The buggy address is located 552 bytes inside of
[  106.266485]  1024-byte region [ffff00000a394800, ffff00000a394c00)
[  106.278710] The buggy address belongs to the page:
[  106.290520] page:00000000ff84ed53 refcount:1 mapcount:0
mapping:0000000000000000 index:0x0 pfn:0x8a390
[  106.295381] head:00000000ff84ed53 order:3 compound_mapcount:0
compound_pincount:0
[  106.304669] flags: 0x3fffc0000010200(slab|head)
[  106.312234] raw: 03fffc0000010200 dead000000000100 dead000000000122
ffff000009f03800
[  106.316571] raw: 0000000000000000 0000000000100010 00000001ffffffff
0000000000000000
[  106.324537] page dumped because: kasan: bad access detected
[  106.332254]
[  106.337543] Memory state around the buggy address:
[  106.339302]  ffff00000a394900: fb fb fb fb fb fb fb fb fb fb fb fb
fb fb fb fb
[  106.343907]  ffff00000a394980: fb fb fb fb fb fb fb fb fb fb fb fb
fb fb fb fb
[  106.351111] >ffff00000a394a00: fb fb fb fb fb fb fb fb fb fb fb fb
fb fb fb fb
[  106.358303]                                   ^
[  106.365515]  ffff00000a394a80: fb fb fb fb fb fb fb fb fb fb fb fb
fb fb fb fb
[  106.369948]  ffff00000a394b00: fb fb fb fb fb fb fb fb fb fb fb fb
fb fb fb fb
[  106.377225] ==================================================================
[  106.384429] Disabling lock debugging due to kernel taint
open: No such device or address

>
> >
> > That's the reason why I was thinking that maybe returning
> > -EPROBE_DEFER could be an option, but certainly I need to think more
> > about this.
>
> I was hoping that you wouldn't need to think more about returning
> -EPROBE_DEFER after my email. :( Returning EPROBE_DEFER looks like it's
> a bandaid for bigger problems with reference counting the pointer
> allocated in mmc_alloc_host(). I hope I convinced you that there isn't
> any danger in reusing the IDA in the case of an alias because the only
> way that is a problem is if the same device calls mmc_alloc_host() twice
> without calling mmc_remove_host() in between. That should be a pretty
> obvious problem in driver code? The check to see if that same device has
> tried to alloc a host twice would still effectively happen after this
> patch because when mmc_add_host() tries to add that second device to the
> driver core it will complain about duplicate device names and fail.

You may very well be correct in you reasoning above, but I just need
to convince myself about it.

>
> Will you apply this patch?

It's likely, but allow me some more time to think about it. To make
sure we do the right thing.

Kind regards
Uffe

Stephen Boyd April 30, 2021, 10:19 p.m. UTC | #6

Quoting Ulf Hansson (2021-04-16 16:07:11)
> On Fri, 16 Apr 2021 at 19:50, Stephen Boyd <swboyd@chromium.org> wrote:
> >
> > Ok. Can you share the KASAN reports? The memory allocated for this class
> > object and associated index from the IDA will be unique for each call
> > the mmc_alloc_host() so I don't see how KASAN could be noticing
> > something unless a reference to the host is leaking out without the
> > proper get_device() call being made to keep the underlying memory from
> > being freed.
>
> Removing the host, also means removing the card. Although, as I said,
> I need some more time to think of the best solution.
>
> Here's a report, triggered with some manual hacks and by using the
> mmc-utils usesland tool.

Thanks! I'm just getting back to this mail on a Friday afternoon!

>
> /mmc status get /dev/mmcblk1
> [   95.905913] DEBUG: mmc_blk_open: Let's sleep for 10s..
> [   98.806639] mmc1: card 0007 removed
> [  105.972139] BUG: KASAN: use-after-free in mmc_blk_get+0x58/0xb8
> [  105.979144] Read of size 4 at addr ffff00000a394a28 by task mmc/180
> [  105.984945]
> [  105.991209] CPU: 2 PID: 180 Comm: mmc Not tainted
> 5.10.0-rc4-00069-gcc758c8c7127-dirty #5
> [  105.992943] Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
> [  106.001010] Call trace:
> [  106.007799]  dump_backtrace+0x0/0x2b4
> [  106.009965]  show_stack+0x18/0x6c
> [  106.013779]  dump_stack+0xfc/0x168
> [  106.017083]  print_address_description.constprop.0+0x6c/0x488
> [  106.020384]  kasan_report+0x118/0x210
> [  106.026193]  __asan_load4+0x94/0xd0
> [  106.029844]  mmc_blk_get+0x58/0xb8
> [  106.033141]  mmc_blk_open+0x7c/0xdc
> [  106.036614]  __blkdev_get+0x3b4/0x964
> [  106.039996]  blkdev_get+0x64/0x100
> [  106.043815]  blkdev_open+0xe8/0x104
> [  106.047114]  do_dentry_open+0x234/0x61c
> [  106.050498]  vfs_open+0x54/0x64
> [  106.054324]  path_openat+0xe04/0x1584
> [  106.057450]  do_filp_open+0xe8/0x1e4
> [  106.061263]  do_sys_openat2+0x120/0x230
> [  106.064911]  __arm64_sys_openat+0xf0/0x15c
> [  106.068477]  el0_svc_common.constprop.0+0xac/0x234
> [  106.072639]  do_el0_svc+0x84/0xa0
> [  106.077410]  el0_sync_handler+0x264/0x270
> [  106.080790]  el0_sync+0x174/0x180
> [  106.084768]
> [  106.088070] Allocated by task 33:
> [  106.089647]  stack_trace_save+0x9c/0xdc
> [  106.092858]  kasan_save_stack+0x28/0x60
> [  106.096506]  __kasan_kmalloc.constprop.0+0xc8/0xf0
> [  106.100325]  kasan_kmalloc+0x10/0x20
> [  106.105183]  mmc_blk_alloc_req+0x94/0x4b0

I see that this is a different IDA managed object, for mmc_blk_ida.
Presumably the object allocated is md?

	md = kzalloc(sizeof(struct mmc_blk_data), GFP_KERNEL);

Is that the line?

> [  106.108913]  mmc_blk_probe+0x2d4/0xaa4
> [  106.112829]  mmc_bus_probe+0x34/0x4c
> [  106.116471]  really_probe+0x148/0x6e0
> [  106.120202]  driver_probe_device+0x78/0xec
> [  106.123764]  __device_attach_driver+0x108/0x16c
> [  106.127754]  bus_for_each_drv+0xf4/0x15c
> [  106.132180]  __device_attach+0x168/0x240
> [  106.136349]  device_initial_probe+0x14/0x20
> [  106.140253]  bus_probe_device+0xec/0x100
> [  106.144167]  device_add+0x55c/0xaf0
> [  106.148332]  mmc_add_card+0x288/0x380
> [  106.151540]  mmc_attach_sd+0x18c/0x22c
> [  106.155363]  mmc_rescan+0x444/0x4f0
> [  106.159014]  process_one_work+0x3b8/0x650
> [  106.162396]  worker_thread+0xa0/0x724
> [  106.166556]  kthread+0x218/0x220
> [  106.170201]  ret_from_fork+0x10/0x38
> [  106.173482]
> [  106.177045] Freed by task 33:
> [  106.178533]  stack_trace_save+0x9c/0xdc
> [  106.181399]  kasan_save_stack+0x28/0x60
> [  106.185045]  kasan_set_track+0x28/0x40
> [  106.188868]  kasan_set_free_info+0x24/0x4c
> [  106.192684]  __kasan_slab_free+0x100/0x180
> [  106.196764]  kasan_slab_free+0x14/0x20
> [  106.200838]  kfree+0xb8/0x46c
> [  106.204583]  mmc_blk_put+0xe4/0x11c

Looking at mmc_blk_put() and mmc_blk_get() I see


	static struct mmc_blk_data *mmc_blk_get(struct gendisk *disk)
	{
		...
		md = disk->private_data;
		if (md && md->usage == 0)
			md = NULL;
		...
	}

	static void mmc_blk_put(struct mmc_blk_data *md)
	{
		...
		if (md->usage == 0) {
			int devidx = mmc_get_devidx(md->disk);
			blk_put_queue(md->queue.queue);
			ida_simple_remove(&mmc_blk_ida, devidx);
			put_disk(md->disk);
			kfree(md);
		}
		...
	}

and notice that mmc_blk_get() takes a gendisk but mmc_blk_put() takes a
mmc_blk_data. That's already weird, but then I notice that 'md' comes
from disk->private_data in mmc_blk_get() and in mmc_blk_put() we kfree
'md' if usage drops to 0. The storage for md is inside
disk->private_data, according to mmc_blk_alloc_req()

	md->disk->private_data = md

so 'md' points to itself through gendisk private_data. Alright.

Either way, KASAN is telling us that 'md' got freed in mmc_blk_put() but
the gendisk is still around and valid, because it's held alive via some
kobject_get(). When we go to blkdev_open() we have that gendisk that has
private_data pointing to the freed 'md'.

This is a classic dangling pointer bug.

Given that mmc_blk_get() is checking for disk->private_data being
non-NULL, I looked to see where that is assigned to NULL, but I don't
see it. Is it ever set to NULL? We could set private_data to NULL after
freeing it, but that feels hacky. The md->usage code looks like a kref
design open-coded; probably replace that with a kref that does the code
in the if (md->usage == 0) path on the kref release function and then we
wouldn't need a mutex around these get/put APIs.

Does this patch fix it?

---8<----
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index d666e24fbe0e..c7939e8fe76f 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -203,6 +203,7 @@ static void mmc_blk_put(struct mmc_blk_data *md)
 		int devidx = mmc_get_devidx(md->disk);
 		blk_put_queue(md->queue.queue);
 		ida_simple_remove(&mmc_blk_ida, devidx);
+		md->disk->private_data = NULL;
 		put_disk(md->disk);
 		kfree(md);
 	}

> [  106.207624]  mmc_blk_remove_req.part.0+0x6c/0xe4
> [  106.210914]  mmc_blk_remove+0x368/0x370
> [  106.215778]  mmc_bus_remove+0x34/0x50
> [  106.219336]  __device_release_driver+0x228/0x31c
> [  106.223155]  device_release_driver+0x2c/0x44
> [  106.227840]  bus_remove_device+0x1e4/0x200
> [  106.232100]  device_del+0x2b0/0x770
> [  106.236005]  mmc_remove_card+0xf0/0x150
> [  106.239383]  mmc_sd_detect+0x9c/0x150
> [  106.243207]  mmc_rescan+0x110/0x4f0
> [  106.247032]  process_one_work+0x3b8/0x650
> [  106.250329]  worker_thread+0xa0/0x724
> [  106.254488]  kthread+0x218/0x220
> [  106.258134]  ret_from_fork+0x10/0x38
> [  106.261416]
> [  106.264986] The buggy address belongs to the object at ffff00000a394800
> [  106.264986]  which belongs to the cache kmalloc-1k of size 1024
> [  106.266485] The buggy address is located 552 bytes inside of
> [  106.266485]  1024-byte region [ffff00000a394800, ffff00000a394c00)
> [  106.278710] The buggy address belongs to the page:
> [  106.290520] page:00000000ff84ed53 refcount:1 mapcount:0
> mapping:0000000000000000 index:0x0 pfn:0x8a390
> [  106.295381] head:00000000ff84ed53 order:3 compound_mapcount:0
> compound_pincount:0
> [  106.304669] flags: 0x3fffc0000010200(slab|head)
> [  106.312234] raw: 03fffc0000010200 dead000000000100 dead000000000122
> ffff000009f03800
> [  106.316571] raw: 0000000000000000 0000000000100010 00000001ffffffff
> 0000000000000000
> [  106.324537] page dumped because: kasan: bad access detected
> [  106.332254]
> [  106.337543] Memory state around the buggy address:
> [  106.339302]  ffff00000a394900: fb fb fb fb fb fb fb fb fb fb fb fb
> fb fb fb fb
> [  106.343907]  ffff00000a394980: fb fb fb fb fb fb fb fb fb fb fb fb
> fb fb fb fb
> [  106.351111] >ffff00000a394a00: fb fb fb fb fb fb fb fb fb fb fb fb
> fb fb fb fb
> [  106.358303]                                   ^
> [  106.365515]  ffff00000a394a80: fb fb fb fb fb fb fb fb fb fb fb fb
> fb fb fb fb
> [  106.369948]  ffff00000a394b00: fb fb fb fb fb fb fb fb fb fb fb fb
> fb fb fb fb

If you're interested in the kref patch it may also clean it up a bit. I
left the mutex in place, but otherwise now the refcount is an atomic
variable, so mmc_blk_put() can happen in parallel to mmc_blk_get() and
the release part can run later. This lets mmc_blk_get() start failing
earlier and returning NULL when the last user has called mmc_blk_put()
but the disk is still hanging around. No idea if this works though.

----8<----
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index d666e24fbe0e..8a300cc2c8be 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -27,6 +27,7 @@
 #include <linux/errno.h>
 #include <linux/hdreg.h>
 #include <linux/kdev_t.h>
+#include <linux/kref.h>
 #include <linux/blkdev.h>
 #include <linux/cdev.h>
 #include <linux/mutex.h>
@@ -110,7 +111,7 @@ struct mmc_blk_data {
 #define MMC_BLK_CMD23	(1 << 0)	/* Can do SET_BLOCK_COUNT for multiblock */
 #define MMC_BLK_REL_WR	(1 << 1)	/* MMC Reliable write support */

-	unsigned int	usage;
+	struct kref	kref;
 	unsigned int	read_only;
 	unsigned int	part_type;
 	unsigned int	reset_done;
@@ -180,10 +181,8 @@ static struct mmc_blk_data *mmc_blk_get(struct
gendisk *disk)

 	mutex_lock(&open_lock);
 	md = disk->private_data;
-	if (md && md->usage == 0)
+	if (md && !kref_get_unless_zero(&md->kref))
 		md = NULL;
-	if (md)
-		md->usage++;
 	mutex_unlock(&open_lock);

 	return md;
@@ -195,20 +194,26 @@ static inline int mmc_get_devidx(struct gendisk *disk)
 	return devidx;
 }

-static void mmc_blk_put(struct mmc_blk_data *md)
+static void mmc_blk_kref_release(struct kref *ref)
 {
+	struct mmc_blk_data *md = container_of(ref, struct mmc_blk_data, kref);
+	int devidx;
+
 	mutex_lock(&open_lock);
-	md->usage--;
-	if (md->usage == 0) {
-		int devidx = mmc_get_devidx(md->disk);
-		blk_put_queue(md->queue.queue);
-		ida_simple_remove(&mmc_blk_ida, devidx);
-		put_disk(md->disk);
-		kfree(md);
-	}
+	devidx = mmc_get_devidx(md->disk);
+	blk_put_queue(md->queue.queue);
+	ida_simple_remove(&mmc_blk_ida, devidx);
+	md->disk->private_data = NULL;
+	put_disk(md->disk);
+	kfree(md);
 	mutex_unlock(&open_lock);
 }

+static void mmc_blk_put(struct mmc_blk_data *md)
+{
+	kref_put(&md->kref, mmc_blk_kref_release);
+}
+
 static ssize_t power_ro_lock_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -2300,7 +2305,7 @@ static struct mmc_blk_data
*mmc_blk_alloc_req(struct mmc_card *card,

 	INIT_LIST_HEAD(&md->part);
 	INIT_LIST_HEAD(&md->rpmbs);
-	md->usage = 1;
+	kref_init(&md->kref);

 	ret = mmc_init_queue(&md->queue, card);
 	if (ret)

mmc: core: Don't allocate IDA for OF aliases

Commit Message

Comments

Patch