diff mbox series

[2/4] cxl/mem: Fix cdev_device_add() error handling

Message ID 161661971651.1721612.7457823773061754064.stgit@dwillia2-desk3.amr.corp.intel.com
State Superseded
Headers show
Series cxl/mem: Fix memdev device setup | expand

Commit Message

Dan Williams March 24, 2021, 9:01 p.m. UTC
If cdev_device_add() fails then the allocation performed by
dev_set_name() is leaked. Use put_device(), not open coded release, for
device_add() failures.

The comment is obsolete because direct err_id failures need not worry
about the device being live.

The release method expects the percpu_ref is already dead, so
percpu_ref_kill() is needed before put_device(). However, given that the
cdev was partially live wait_for_completion() also belongs in the
release method.

Fixes: b39cb1052a5c ("cxl/mem: Register CXL memX devices")
Reported-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/mem.c |   16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

Comments

Jason Gunthorpe March 25, 2021, 5:11 p.m. UTC | #1
On Wed, Mar 24, 2021 at 02:01:56PM -0700, Dan Williams wrote:
> If cdev_device_add() fails then the allocation performed by
> dev_set_name() is leaked. Use put_device(), not open coded release, for
> device_add() failures.
> 
> The comment is obsolete because direct err_id failures need not worry
> about the device being live.
> 
> The release method expects the percpu_ref is already dead, so
> percpu_ref_kill() is needed before put_device(). However, given that the
> cdev was partially live wait_for_completion() also belongs in the
> release method.
> 
> Fixes: b39cb1052a5c ("cxl/mem: Register CXL memX devices")
> Reported-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>  drivers/cxl/mem.c |   16 ++++++----------
>  1 file changed, 6 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> index 30bf4f0f3c17..e53d573ae4ab 100644
> +++ b/drivers/cxl/mem.c
> @@ -1049,6 +1049,7 @@ static void cxl_memdev_release(struct device *dev)
>  {
>  	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
>  
> +	wait_for_completion(&cxlmd->ops_dead);

This only works because the fops stuff is not right, a kref shouldn't
have a completion like this.

Also, don't use devm for unregister. That just makes it extra-hard to
write the driver remove function correctly.

> @@ -1157,7 +1158,6 @@ static void cxlmdev_unregister(void *_cxlmd)
>  
>  	percpu_ref_kill(&cxlmd->ops_active);
>  	cdev_device_del(&cxlmd->cdev, dev);
> -	wait_for_completion(&cxlmd->ops_dead);
>  	cxlmd->cxlm = NULL;
>  	put_device(dev);
>  }
> @@ -1210,20 +1210,16 @@ static int cxl_mem_add_memdev(struct cxl_mem *cxlm)
>  	cdev_init(cdev, &cxl_memdev_fops);
>  
>  	rc = cdev_device_add(cdev, dev);
> -	if (rc)
> -		goto err_add;
> +	if (rc) {
> +		percpu_ref_kill(&cxlmd->ops_active);
> +		put_device(dev);

This must be one high performance ioctl to warrant the percpu ref.. If
it is not high performance use a rwsem, otherwise I'd suggest srcu as
a faster/simpler alternative.

This is a use-after-free:

static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
			     unsigned long arg)
{
	struct cxl_memdev *cxlmd;
	struct inode *inode;
	int rc = -ENOTTY;

	inode = file_inode(file);
	cxlmd = container_of(inode->i_cdev, typeof(*cxlmd), cdev);
       ^^^^^ can be freed memory

ioctl needs to store the cxlmd in file->private_data and
open()/release() need to do get/put device on it so the memory stays
around. This is why open gets the inode as an argument and ioctl/etc
does not.

The ordering cxlmdev_unregister should mirror the ordering in create
so cdev_device_del should be first

Jason
Dan Williams March 29, 2021, 9:03 p.m. UTC | #2
On Thu, Mar 25, 2021 at 10:12 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Wed, Mar 24, 2021 at 02:01:56PM -0700, Dan Williams wrote:
> > If cdev_device_add() fails then the allocation performed by
> > dev_set_name() is leaked. Use put_device(), not open coded release, for
> > device_add() failures.
> >
> > The comment is obsolete because direct err_id failures need not worry
> > about the device being live.
> >
> > The release method expects the percpu_ref is already dead, so
> > percpu_ref_kill() is needed before put_device(). However, given that the
> > cdev was partially live wait_for_completion() also belongs in the
> > release method.
> >
> > Fixes: b39cb1052a5c ("cxl/mem: Register CXL memX devices")
> > Reported-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> >  drivers/cxl/mem.c |   16 ++++++----------
> >  1 file changed, 6 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> > index 30bf4f0f3c17..e53d573ae4ab 100644
> > +++ b/drivers/cxl/mem.c
> > @@ -1049,6 +1049,7 @@ static void cxl_memdev_release(struct device *dev)
> >  {
> >       struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> >
> > +     wait_for_completion(&cxlmd->ops_dead);
>
> This only works because the fops stuff is not right, a kref shouldn't
> have a completion like this.
>
> Also, don't use devm for unregister. That just makes it extra-hard to
> write the driver remove function correctly.

To date there is no driver remove function, however if that changes
then I expect all the devm needs to go.

>
> > @@ -1157,7 +1158,6 @@ static void cxlmdev_unregister(void *_cxlmd)
> >
> >       percpu_ref_kill(&cxlmd->ops_active);
> >       cdev_device_del(&cxlmd->cdev, dev);
> > -     wait_for_completion(&cxlmd->ops_dead);
> >       cxlmd->cxlm = NULL;
> >       put_device(dev);
> >  }
> > @@ -1210,20 +1210,16 @@ static int cxl_mem_add_memdev(struct cxl_mem *cxlm)
> >       cdev_init(cdev, &cxl_memdev_fops);
> >
> >       rc = cdev_device_add(cdev, dev);
> > -     if (rc)
> > -             goto err_add;
> > +     if (rc) {
> > +             percpu_ref_kill(&cxlmd->ops_active);
> > +             put_device(dev);
>
> This must be one high performance ioctl to warrant the percpu ref.. If
> it is not high performance use a rwsem, otherwise I'd suggest srcu as
> a faster/simpler alternative.

The plan is to refactor and share the same reference counted fops
mechanism as debugfs and make that common infrastructure. However, in
the meantime I think global srcu is suitable.

>
> This is a use-after-free:
>
> static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
>                              unsigned long arg)
> {
>         struct cxl_memdev *cxlmd;
>         struct inode *inode;
>         int rc = -ENOTTY;
>
>         inode = file_inode(file);
>         cxlmd = container_of(inode->i_cdev, typeof(*cxlmd), cdev);
>        ^^^^^ can be freed memory
>
> ioctl needs to store the cxlmd in file->private_data and
> open()/release() need to do get/put device on it so the memory stays
> around. This is why open gets the inode as an argument and ioctl/etc
> does not.

Ugh, exactly why I was motivated to attempt to preclude this with new
core infrastructure that attempted to fix this centrally [1]. Remove
the  possibility of "others" getting this wrong. However after my
initial idea bounced off Greg then I ended up shipping this bug in the
local rewrite. I think the debugfs api gets this right in terms of
centralizing the reference count management, and I want to see
something similar for common driver ioctl patterns.

[1]: http://lore.kernel.org/r/CAPcyv4hGxLZGEkfnqdLfF-a1CzfEjLux-TBxXztbknFhEe9mYA@mail.gmail.com

>
> The ordering cxlmdev_unregister should mirror the ordering in create
> so cdev_device_del should be first

Sure.
Jason Gunthorpe March 29, 2021, 10:44 p.m. UTC | #3
On Mon, Mar 29, 2021 at 02:03:37PM -0700, Dan Williams wrote:

> Ugh, exactly why I was motivated to attempt to preclude this with new
> core infrastructure that attempted to fix this centrally [1]. Remove
> the  possibility of "others" getting this wrong. However after my
> initial idea bounced off Greg then I ended up shipping this bug in the
> local rewrite. I think the debugfs api gets this right in terms of
> centralizing the reference count management, and I want to see
> something similar for common driver ioctl patterns.

There is a lot of variety here, I'm not sure how much valuable common
code there will be that could be lifted into the core.. srcu,
refcount, rwsem, percpu_ref, etc are all common implementations with
various properties.

The easist implementation is to just block driver destruction with a
refcount & completion pattern

The hardest is to allow the underlying HW driver to be removed from
the fops while the file remains open.

Usually whatever scheme is used has to flow into some in-kernel API as
well, so isolating it in cdev may no be entirely helpful.

The easisted helper API would be to add an 'unregistration lock' to
the struct device that blocks unregistration. A refcount & completion
for instance. I've seen that open coded enough times.

Jason
Dan Williams March 30, 2021, 4:48 a.m. UTC | #4
On Mon, Mar 29, 2021 at 3:44 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Mon, Mar 29, 2021 at 02:03:37PM -0700, Dan Williams wrote:
>
> > Ugh, exactly why I was motivated to attempt to preclude this with new
> > core infrastructure that attempted to fix this centrally [1]. Remove
> > the  possibility of "others" getting this wrong. However after my
> > initial idea bounced off Greg then I ended up shipping this bug in the
> > local rewrite. I think the debugfs api gets this right in terms of
> > centralizing the reference count management, and I want to see
> > something similar for common driver ioctl patterns.
>
> There is a lot of variety here, I'm not sure how much valuable common
> code there will be that could be lifted into the core.. srcu,
> refcount, rwsem, percpu_ref, etc are all common implementations with
> various properties.
>
> The easist implementation is to just block driver destruction with a
> refcount & completion pattern
>
> The hardest is to allow the underlying HW driver to be removed from
> the fops while the file remains open.
>
> Usually whatever scheme is used has to flow into some in-kernel API as
> well, so isolating it in cdev may no be entirely helpful.
>
> The easisted helper API would be to add an 'unregistration lock' to
> the struct device that blocks unregistration. A refcount & completion
> for instance. I've seen that open coded enough times.

I do agree there is too much variety to widely unify. At the same time
it is a common enough pattern for devices that allow removal before
final close, especially devices that support hot-removal disconnecting
is a better pattern than blocking unregisteration.

Just the small matter of time to see this through...
diff mbox series

Patch

diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 30bf4f0f3c17..e53d573ae4ab 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -1049,6 +1049,7 @@  static void cxl_memdev_release(struct device *dev)
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
 
+	wait_for_completion(&cxlmd->ops_dead);
 	percpu_ref_exit(&cxlmd->ops_active);
 	ida_free(&cxl_memdev_ida, cxlmd->id);
 	kfree(cxlmd);
@@ -1157,7 +1158,6 @@  static void cxlmdev_unregister(void *_cxlmd)
 
 	percpu_ref_kill(&cxlmd->ops_active);
 	cdev_device_del(&cxlmd->cdev, dev);
-	wait_for_completion(&cxlmd->ops_dead);
 	cxlmd->cxlm = NULL;
 	put_device(dev);
 }
@@ -1210,20 +1210,16 @@  static int cxl_mem_add_memdev(struct cxl_mem *cxlm)
 	cdev_init(cdev, &cxl_memdev_fops);
 
 	rc = cdev_device_add(cdev, dev);
-	if (rc)
-		goto err_add;
+	if (rc) {
+		percpu_ref_kill(&cxlmd->ops_active);
+		put_device(dev);
+		return rc;
+	}
 
 	return devm_add_action_or_reset(dev->parent, cxlmdev_unregister, cxlmd);
 
-err_add:
-	ida_free(&cxl_memdev_ida, cxlmd->id);
 err_id:
-	/*
-	 * Theoretically userspace could have already entered the fops,
-	 * so flush ops_active.
-	 */
 	percpu_ref_kill(&cxlmd->ops_active);
-	wait_for_completion(&cxlmd->ops_dead);
 	percpu_ref_exit(&cxlmd->ops_active);
 err_ref:
 	kfree(cxlmd);