[V1,for-next,6/7] IB/core: Add support for fd objects

Message ID 1485952745-58476-7-git-send-email-matanb@mellanox.com (mailing list archive)
State Superseded

Commit Message

Matan Barak Feb. 1, 2017, 12:39 p.m. UTC
The completion channel we use in the verbs infrastructure is FD based.
Previously, we had a separate way to manage this object. Since we
strive for a single way to manage any kind of object in this
infrastructure, we conceptually treat all objects as subclasses
of ib_uobject.

This commit adds the necessary mechanism to support FD based objects
like their IDR counterparts. FD object release needs to be synchronized
with context release. We use the cleanup_mutex on the uverbs_file for
that.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
---
 drivers/infiniband/core/rdma_core.c   | 157 +++++++++++++++++++++++++++++++++-
 drivers/infiniband/core/rdma_core.h   |   7 ++
 drivers/infiniband/core/uverbs.h      |   1 +
 drivers/infiniband/core/uverbs_main.c |   4 +-
 include/rdma/ib_verbs.h               |   6 ++
 include/rdma/uverbs_types.h           |  16 ++++
 6 files changed, 189 insertions(+), 2 deletions(-)
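
For context, a consumer of this mechanism (e.g. the completion channel
this series converts) would describe its fd object with the new
uverbs_obj_fd_type. A hypothetical sketch - the fops and hot_unplug
names below are placeholders, and the wiring of the embedded .type
(ops pointer, destroy_order) comes from the earlier patches and is
elided:

static const struct uverbs_obj_fd_type fd_type_comp_channel = {
	/* .type (.ops = &uverbs_fd_ops, .destroy_order = ...) set up as
	 * for the IDR types in the earlier patches; elided here */
	.obj_size	= sizeof(struct ib_uobject_file),
	.fops		= &comp_channel_fops,		/* placeholder */
	.name		= "[infinibandevent]",
	.flags		= O_RDONLY,
	.hot_unplug	= comp_channel_hot_unplug,	/* placeholder */
};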

Comments

Jason Gunthorpe Feb. 10, 2017, 9:03 p.m. UTC | #1
On Wed, Feb 01, 2017 at 02:39:04PM +0200, Matan Barak wrote:
> The completion channel we use in verbs infrastructure is FD based.
> Previously, we had a separate way to manage this object. Since we
> strive for a single way to manage any kind of object in this
> infrastructure, we conceptually treat all objects as subclasses
> of ib_uobject.
> 
> This commit adds the necessary mechanism to support FD based objects
> like their IDR counterparts. FD objects release need to be synchronized
> with context release. We use the cleanup_mutex on the uverbs_file for
> that.
> 
> Signed-off-by: Matan Barak <matanb@mellanox.com>
> Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
>  drivers/infiniband/core/rdma_core.c   | 157 +++++++++++++++++++++++++++++++++-
>  drivers/infiniband/core/rdma_core.h   |   7 ++
>  drivers/infiniband/core/uverbs.h      |   1 +
>  drivers/infiniband/core/uverbs_main.c |   4 +-
>  include/rdma/ib_verbs.h               |   6 ++
>  include/rdma/uverbs_types.h           |  16 ++++
>  6 files changed, 189 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
> index 7ce4d67..1d24f26 100644
> +++ b/drivers/infiniband/core/rdma_core.c
> @@ -160,6 +160,73 @@ static void uverbs_uobject_add(struct ib_uobject *uobject)
>  	mutex_unlock(&uobject->context->lock);
>  }
>  
> +static struct ib_uobject *alloc_begin_fd_uobject(const struct uverbs_obj_type *type,
> +						 struct ib_ucontext *ucontext)
> +{
> +	const struct uverbs_obj_fd_type *fd_type =
> +		container_of(type, struct uverbs_obj_fd_type, type);
> +	int new_fd;
> +	struct ib_uobject_file *uobj_file = NULL;
> +	struct file *filp;
> +
> +	new_fd = get_unused_fd_flags(O_CLOEXEC);
> +	if (new_fd < 0)
> +		return ERR_PTR(new_fd);
> +
> +	uobj_file = kmalloc(fd_type->obj_size, GFP_KERNEL);
> +	if (!uobj_file) {
> +		put_unused_fd(new_fd);
> +		return ERR_PTR(-ENOMEM);
> +	}

Move to alloc_uobj, see prior patches
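
A rough sketch of that refactor, assuming the alloc_uobj() helper from
the prior patches can be taught to allocate fd_type->obj_size (the
helper and its exact signature are assumptions, not this series' code):

static struct ib_uobject *alloc_begin_fd_uobject(const struct uverbs_obj_type *type,
						 struct ib_ucontext *ucontext)
{
	const struct uverbs_obj_fd_type *fd_type =
		container_of(type, struct uverbs_obj_fd_type, type);
	struct ib_uobject_file *uobj_file;
	struct ib_uobject *uobj;
	struct file *filp;
	int new_fd;

	new_fd = get_unused_fd_flags(O_CLOEXEC);
	if (new_fd < 0)
		return ERR_PTR(new_fd);

	/* assumed helper: kmallocs the object (here fd_type->obj_size)
	 * and initializes the kref and the uobjects list entry */
	uobj = alloc_uobj(ucontext, type);
	if (IS_ERR(uobj)) {
		put_unused_fd(new_fd);
		return uobj;
	}
	uobj_file = container_of(uobj, struct ib_uobject_file, uobj);

	filp = anon_inode_getfile(fd_type->name, fd_type->fops,
				  uobj_file, fd_type->flags);
	if (IS_ERR(filp)) {
		put_unused_fd(new_fd);
		uverbs_uobject_put(uobj);	/* frees via ->release */
		return ERR_CAST(filp);
	}

	uobj->id = new_fd;
	uobj->object = filp;
	uobj_file->ufile = ucontext->ufile;
	return uobj;
}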

> +static struct ib_uobject *lookup_get_fd_uobject(const struct uverbs_obj_type *type,
> +						struct ib_ucontext *ucontext,
> +						int id, bool write)
> +{
> +	struct file *f;
> +	struct ib_uobject *uobject;
> +	const struct uverbs_obj_fd_type *fd_type =
> +		container_of(type, struct uverbs_obj_fd_type, type);
> +
> +	if (write)
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	f = fget(id);
> +	if (!f)
> +		return ERR_PTR(-EBADF);
> +
> +	uobject = f->private_data;
> +	if (f->f_op != fd_type->fops ||
> +	    !uobject->context) {
> +		fput(f);
> +		return ERR_PTR(-EBADF);
> +	}

The read of uobject->context needs locking

> +static void alloc_commit_fd_uobject(struct ib_uobject *uobj)
> +{
> +	struct ib_uobject_file *uobj_file =
> +		container_of(uobj, struct ib_uobject_file, uobj);
> +
> +	kref_get(&uobj_file->ufile->ref);

Something has gone wrong with the krefs here..

kref_get needs to be done for anon_inode_getfile. That kref_get pairs
with the put in uverbs_close_fd.
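
In sketch form, in the alloc path (uverbs_uobject_get as the assumed
kref_get wrapper; not this patch's code):

	filp = anon_inode_getfile(fd_type->name, fd_type->fops,
				  uobj_file, fd_type->flags);
	if (!IS_ERR(filp))
		/* this reference belongs to the filp and is dropped by
		 * the kref_put in uverbs_close_fd() */
		uverbs_uobject_get(&uobj_file->uobj);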

> +	uverbs_uobject_add(&uobj_file->uobj);
> +	fd_install(uobj_file->uobj.id, uobj->object);
> +	/* This shouldn't be used anymore. Use the file object instead */
> +	uobj_file->uobj.id = 0;

Needs a put to pair with the kref_get done by alloc_uobj.

> +static void alloc_abort_fd_uobject(struct ib_uobject *uobj)
> +{
> +	struct ib_uobject_file *uobj_file =
> +		container_of(uobj, struct ib_uobject_file, uobj);
> +	struct file *filp = uobj->object;
> +
> +	/* Unsuccessful NEW */
> +	fput(filp);
> +	put_unused_fd(uobj_file->uobj.id);

And here is the bug that fixes: since fput(filp) will call
uverbs_close_fd it just put the last kref and free'd the struct.
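
A minimal sketch of a reordering that avoids the use-after-free,
assuming uverbs_close_fd() (run from the fops ->release during fput)
drops the kref taken at allocation:

static void alloc_abort_fd_uobject(struct ib_uobject *uobj)
{
	struct ib_uobject_file *uobj_file =
		container_of(uobj, struct ib_uobject_file, uobj);
	struct file *filp = uobj->object;
	int id = uobj_file->uobj.id;	/* read before the final fput() */

	/* Unsuccessful NEW: fput() may invoke uverbs_close_fd() and drop
	 * the last kref, so uobj_file must not be touched afterwards */
	put_unused_fd(id);
	fput(filp);
}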

> +static void destroy_commit_null_uobject(struct ib_uobject *uobj)
> +{
> +	WARN_ON(true);
> +}

So in the model I suggest, this would be 'remove', and its purpose is to
disconnect the driver and leave the FD 'detached'.

> +
>  const struct uverbs_obj_type_ops uverbs_idr_ops = {
>  	.alloc_begin = alloc_begin_idr_uobject,
>  	.lookup_get = lookup_get_idr_uobject,
> @@ -254,8 +363,15 @@ void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed)
>  
>  		/*
>  		 * This shouldn't run while executing other commands on this
> -		 * context, thus no lock is required.
> +		 * context. Thus, the only thing we should take care of is
> +		 * releasing a FD while traversing this list. The FD could be
> +		 * closed and released from the _release fop of this FD.
> +		 * In order to mitigate this, we add a lock.
> +		 * We take and release the lock per order traversal in order
> +		 * to let other threads (which might still use the FDs) chance
> +		 * to run.
>  		 */
> +		mutex_lock(&ucontext->lock);

I think you may as well move this lock to patch 2; it reads better that way.

>  		list_for_each_entry_safe(obj, next_obj, &ucontext->uobjects,
>  					 list)
>  			if (obj->type->destroy_order == cur_order) {
> @@ -265,6 +381,7 @@ void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed)
>  				next_order = min(next_order,
>  						 obj->type->destroy_order);
>  			}
> +		mutex_unlock(&ucontext->lock);

We had lots of problems with this sort of scheme in the hotplug
series; is this OK like this? It feels potentially dangerous...

I feel like we need an rwsem for cleanup. Write-lock the rwsem above,
which will inhibit changes to the list, and have all the remove paths
try_read the rwsem; if that fails, skip removing from the list and
doing the destroy, because we know uverbs_cleanup_ucontext is
running and will do both.

That sounds like a much more self-contained and clearer mechanism
compared to where we are today.

> +void uverbs_close_fd(struct file *f)
> +{
> +	struct ib_uobject_file *uobj_file = f->private_data;
> +
> +	mutex_lock(&uobj_file->ufile->cleanup_mutex);

I don't get what cleanup_mutex is doing. Nothing in here seems like it
needs that lock...

> +	if (uobj_file->uobj.context) {
> +		mutex_lock(&uobj_file->uobj.context->lock);
> +		list_del(&uobj_file->uobj.list);
> +		mutex_unlock(&uobj_file->uobj.context->lock);

Again, I don't like this. If we remove something from the list then it
should go through the whole destruction cycle. Why do we skip
is_closed=1 on this path, for instance? Seems wrong.

Jason
Matan Barak Feb. 15, 2017, 1:48 p.m. UTC | #2
On Fri, Feb 10, 2017 at 11:03 PM, Jason Gunthorpe
<jgunthorpe@obsidianresearch.com> wrote:
> On Wed, Feb 01, 2017 at 02:39:04PM +0200, Matan Barak wrote:
>> The completion channel we use in verbs infrastructure is FD based.
>> Previously, we had a separate way to manage this object. Since we
>> strive for a single way to manage any kind of object in this
>> infrastructure, we conceptually treat all objects as subclasses
>> of ib_uobject.
>>
>> This commit adds the necessary mechanism to support FD based objects
>> like their IDR counterparts. FD objects release need to be synchronized
>> with context release. We use the cleanup_mutex on the uverbs_file for
>> that.
>>
>> Signed-off-by: Matan Barak <matanb@mellanox.com>
>> Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
>>  drivers/infiniband/core/rdma_core.c   | 157 +++++++++++++++++++++++++++++++++-
>>  drivers/infiniband/core/rdma_core.h   |   7 ++
>>  drivers/infiniband/core/uverbs.h      |   1 +
>>  drivers/infiniband/core/uverbs_main.c |   4 +-
>>  include/rdma/ib_verbs.h               |   6 ++
>>  include/rdma/uverbs_types.h           |  16 ++++
>>  6 files changed, 189 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
>> index 7ce4d67..1d24f26 100644
>> +++ b/drivers/infiniband/core/rdma_core.c
>> @@ -160,6 +160,73 @@ static void uverbs_uobject_add(struct ib_uobject *uobject)
>>       mutex_unlock(&uobject->context->lock);
>>  }
>>
>> +static struct ib_uobject *alloc_begin_fd_uobject(const struct uverbs_obj_type *type,
>> +                                              struct ib_ucontext *ucontext)
>> +{
>> +     const struct uverbs_obj_fd_type *fd_type =
>> +             container_of(type, struct uverbs_obj_fd_type, type);
>> +     int new_fd;
>> +     struct ib_uobject_file *uobj_file = NULL;
>> +     struct file *filp;
>> +
>> +     new_fd = get_unused_fd_flags(O_CLOEXEC);
>> +     if (new_fd < 0)
>> +             return ERR_PTR(new_fd);
>> +
>> +     uobj_file = kmalloc(fd_type->obj_size, GFP_KERNEL);
>> +     if (!uobj_file) {
>> +             put_unused_fd(new_fd);
>> +             return ERR_PTR(-ENOMEM);
>> +     }
>
> Move to alloc_uobj, see prior patches
>

Ok

>> +static struct ib_uobject *lookup_get_fd_uobject(const struct uverbs_obj_type *type,
>> +                                             struct ib_ucontext *ucontext,
>> +                                             int id, bool write)
>> +{
>> +     struct file *f;
>> +     struct ib_uobject *uobject;
>> +     const struct uverbs_obj_fd_type *fd_type =
>> +             container_of(type, struct uverbs_obj_fd_type, type);
>> +
>> +     if (write)
>> +             return ERR_PTR(-EOPNOTSUPP);
>> +
>> +     f = fget(id);
>> +     if (!f)
>> +             return ERR_PTR(-EBADF);
>> +
>> +     uobject = f->private_data;
>> +     if (f->f_op != fd_type->fops ||
>> +         !uobject->context) {
>> +             fput(f);
>> +             return ERR_PTR(-EBADF);
>> +     }
>
> The read of uobject->context needs locking
>

All these callbacks are assumed to run while the context is alive.
Some upper synchronization mechanism (SRCU) should take care of not
calling uverbs_cleanup_ucontext while some other action is executing on
a uobject. The only call we allow the user to execute while this is in
process is uverbs_close_fd, and that's why it is the only call protected
by the cleanup_mutex. close_fd is called from the file's release
function. This can happen in two cases: either the object wasn't
exposed to user-space and commit_abort does the last fput, or
user-space closes the file. In the first case, nobody should use the fd
object, as it wasn't committed yet. In the second case, fget and
__close_fd protect us.

>> +static void alloc_commit_fd_uobject(struct ib_uobject *uobj)
>> +{
>> +     struct ib_uobject_file *uobj_file =
>> +             container_of(uobj, struct ib_uobject_file, uobj);
>> +
>> +     kref_get(&uobj_file->ufile->ref);
>
> Something has gone wrong with the krefs here..
>
> kref_get needs to be done for anon_inode_getfile. That kref_get pairs
> with the put in uverbs_close_fd.
>

The code assumes close_fd shall always be called from the release
function. So, I guess alloc_uobj should kref_init, and close_fd should
always be paired with it. You can reach close_fd from alloc_abort, from
the close system call, or from some external kernel code calling fput
(with an fget that was called before; in this case lookup_get isn't
valid anymore).

>> +     uverbs_uobject_add(&uobj_file->uobj);
>> +     fd_install(uobj_file->uobj.id, uobj->object);
>> +     /* This shouldn't be used anymore. Use the file object instead */
>> +     uobj_file->uobj.id = 0;
>
> Needs a put to pair with the kref_get done by alloc_uobj.
>
>> +static void alloc_abort_fd_uobject(struct ib_uobject *uobj)
>> +{
>> +     struct ib_uobject_file *uobj_file =
>> +             container_of(uobj, struct ib_uobject_file, uobj);
>> +     struct file *filp = uobj->object;
>> +
>> +     /* Unsuccessful NEW */
>> +     fput(filp);
>> +     put_unused_fd(uobj_file->uobj.id);
>
> And here is the bug that fixes: since fput(filp) will call
> uverbs_close_fd it just put the last kref and free'd the struct.
>

We could just remove the uverbs_uobject_put call.

>> +static void destroy_commit_null_uobject(struct ib_uobject *uobj)
>> +{
>> +     WARN_ON(true);
>> +}
>
> So in the model I suggest this would be remove and the purpose is to
> disconnect the driver and leave the FD 'detached'.
>

As stated in the previous patch, remove_commit (the new name) is
entirely different from cleanup. It just removes the object from the
objects list and decreases its reference count.
I guess if we want to enable that for fds too, we need some lighter
version of uverbs_close_fd:

Something like:
        mutex_lock(&uobj_file->ufile->cleanup_mutex);
        if (uobj_file->uobj.context) {
                mutex_lock(&uobj_file->uobj.context->uobjects_lock);
                list_del(&uobj_file->uobj.list);
                mutex_unlock(&uobj_file->uobj.context->uobjects_lock);
                uobj_file->uobj.context = NULL;
        }
        mutex_unlock(&uobj_file->ufile->cleanup_mutex);

It removes the object from the list and nullifies the context, so that
we don't do that again in close_fd.

The cleanup callback runs from the context termination. It actually destroys
the real object (or marks it as closed in the fd case).

>> +
>>  const struct uverbs_obj_type_ops uverbs_idr_ops = {
>>       .alloc_begin = alloc_begin_idr_uobject,
>>       .lookup_get = lookup_get_idr_uobject,
>> @@ -254,8 +363,15 @@ void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed)
>>
>>               /*
>>                * This shouldn't run while executing other commands on this
>> -              * context, thus no lock is required.
>> +              * context. Thus, the only thing we should take care of is
>> +              * releasing a FD while traversing this list. The FD could be
>> +              * closed and released from the _release fop of this FD.
>> +              * In order to mitigate this, we add a lock.
>> +              * We take and release the lock per order traversal in order
>> +              * to let other threads (which might still use the FDs) chance
>> +              * to run.
>>                */
>> +             mutex_lock(&ucontext->lock);
>
> I think you may as well move this lock to patch 2, reads better that way.
>

Ok

>>               list_for_each_entry_safe(obj, next_obj, &ucontext->uobjects,
>>                                        list)
>>                       if (obj->type->destroy_order == cur_order) {
>> @@ -265,6 +381,7 @@ void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed)
>>                               next_order = min(next_order,
>>                                                obj->type->destroy_order);
>>                       }
>> +             mutex_unlock(&ucontext->lock);
>
> We had lots of problems with this sort of scheme in the hotplug
> series, is this OK like this? Feels potentially dangerous..
>
> I feel like we need a rwsem for cleanup. Write lock the rwsem above
> which will inhibit changes to the list and have all the remove paths
> try_read the rwsem and if that fails then skip removing from the list
> and doing the destroy because we know uverbs_cleanup_ucontext is
> running and will do both.
>
> That sounds like a much more self-contained and clearer mechanism
> compared to where we are today.
>

We use SRCU to make sure all handlers have completed and no new handlers
can be executed. This means we're safe (I'll add a WARN_ON to be sure).
We could only race with closing fd objects, and that is handled with
the cleanup_mutex.
If we ran handlers and context cleanup at the same time, we could end
up with double cleanups of the objects themselves (without additional
locks, of course).

>> +void uverbs_close_fd(struct file *f)
>> +{
>> +     struct ib_uobject_file *uobj_file = f->private_data;
>> +
>> +     mutex_lock(&uobj_file->ufile->cleanup_mutex);
>
> I don't get what cleanup_mutex is doing. Nothing in here seems like it
> needs that lock...
>

cleanup_mutex protects against a race between context cleanup and
uverbs_close_fd. Both alter the uobjects list, so they need the lock on
the context. The problem here is that the context might be released and
its pointer nullified - it could even be nullified right after the
"if (uobj_file->uobj.context)" statement. To solve that, cleanup_context
delays "file->ucontext = NULL;" and thus makes this safe.


>> +     if (uobj_file->uobj.context) {
>> +             mutex_lock(&uobj_file->uobj.context->lock);
>> +             list_del(&uobj_file->uobj.list);
>> +             mutex_unlock(&uobj_file->uobj.context->lock);
>
> Again, I don't like this. If we remove something from the list then it
> should go through the whole destruction cycle. Why do we skip
> is_closed=1 on this path, for instance? Seems wrong.
>

An object can be destroyed in two ways:
a. handler + remove_commit
b. context cleanup

is_closed = 1 is part of the free function of the completion channel
objects. This series only refactors how these objects are managed and
tries hard not to change their internal implementation.

> Jason

Thanks for the review

Matan

Jason Gunthorpe Feb. 15, 2017, 6:54 p.m. UTC | #3
On Wed, Feb 15, 2017 at 03:48:40PM +0200, Matan Barak wrote:
> >> +static struct ib_uobject *lookup_get_fd_uobject(const struct uverbs_obj_type *type,
> >> +                                             struct ib_ucontext *ucontext,
> >> +                                             int id, bool write)
> >> +{
> >> +     struct file *f;
> >> +     struct ib_uobject *uobject;
> >> +     const struct uverbs_obj_fd_type *fd_type =
> >> +             container_of(type, struct uverbs_obj_fd_type, type);
> >> +
> >> +     if (write)
> >> +             return ERR_PTR(-EOPNOTSUPP);
> >> +
> >> +     f = fget(id);
> >> +     if (!f)
> >> +             return ERR_PTR(-EBADF);
> >> +
> >> +     uobject = f->private_data;
> >> +     if (f->f_op != fd_type->fops ||
> >> +         !uobject->context) {
> >> +             fput(f);
> >> +             return ERR_PTR(-EBADF);
> >> +     }
> >
> > The read of uobject->context needs locking
> >
> 
> All these callbacks are assumed to run when the context is alive.

'uobject->context' is touched outside the SRCU, e.g. in
uverbs_close_fd() - so it absolutely needs locking.

> The only call we allow the user to execute while this is in process
> is uverbs_close_fd and that's why this is the only call which is
> protected by the cleanup_mutex.

So cleanup_mutex would need to be held during lookup

> >> +static void alloc_commit_fd_uobject(struct ib_uobject *uobj)
> >> +{
> >> +     struct ib_uobject_file *uobj_file =
> >> +             container_of(uobj, struct ib_uobject_file, uobj);
> >> +
> >> +     kref_get(&uobj_file->ufile->ref);
> >
> > Something has gone wrong with the krefs here..
> >
> > kref_get needs to be done for anon_inode_getfile. That kref_get pairs
> > with the put in uverbs_close_fd.
> >
> 
> The code assumes close_fd shall always be called from the release
> function. So, I guess
> alloc_uobj should kref_init and close_fd should be always paired
> with it.

You could make this work, but it is not a great approach. For clarity,
a kref should never be shared by two pointers; that provides the
fewest surprises. For instance, this is very confusing:

uobj = alloc_uobj();   // uobj holds the kref
filp = anon_inode_getfile(); // now another uobj pointer is in filp
[..]
if (err) {
   fput(filp)
   // Oops! Now uobj is toast, but we still have a uobj pointer on our stack!

Make filp totally self-contained and make uobj on the stack totally
self-contained with regard to the krefs.
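
In sketch form (helper names assumed, placeholders as above):

	uobj = alloc_uobj(...);		/* kref == 1: our stack reference */
	filp = anon_inode_getfile(...);
	if (IS_ERR(filp))
		goto err_put;		/* our reference is still valid */
	uverbs_uobject_get(uobj);	/* kref == 2: filp's own reference */
	[..]
	fput(filp);			/* drops only the filp's reference */
	uverbs_uobject_put(uobj);	/* drops ours; only now is uobj gone */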

> Something like:
>         mutex_lock(&uobj_file->ufile->cleanup_mutex);
>         if (uobj_file->uobj.context) {
>                 mutex_lock(&uobj_file->uobj.context->uobjects_lock);
>                 list_del(&uobj_file->uobj.list);
>                 mutex_unlock(&uobj_file->uobj.context->uobjects_lock);
>                 uobj_file->uobj.context = NULL;
>         }
>         mutex_unlock(&uobj_file->ufile->cleanup_mutex);
> 
> It removes it from the list and nullify the context that we wouldn't do that
> again in close_fd.
> 
> The cleanup callback runs from the context termination. It actually destroys
> the real object (or marks it as closed in the fd case).

Maybe; I'd have to see this to comment, I think. But this locking is
looking nasty.

> > We had lots of problems with this sort of scheme in the hotplug
> > series, is this OK like this? Feels potentially dangerous..
> >
> > I feel like we need a rwsem for cleanup. Write lock the rwsem above
> > which will inhibit changes to the list and have all the remove paths
> > try_read the rwsem and if that fails then skip removing from the list
> > and doing the destroy because we know uverbs_cleanup_ucontext is
> > running and will do both.
> >
> > That sounds like a much more self-contained and clearer mechanism
> > compared to where we are today.

I think you should try something like this:

void add_object_to_cleanup(struct ib_uobject *uobj)
{
    // Cleanup is running. Calling this should have been impossible
    if (!down_read_trylock(&ucontext->cleanup_rwsem)) {
       WARN();
       return;
    }
    
    mutex_lock(&ucontext->uobjects_lock);
    // kref is held while the object is on the list
    get_uobj(uobj);
    list_add(&uobj->list, &ucontext->uobjects);
    mutex_unlock(&ucontext->uobjects_lock);

    up_read(&ucontext->cleanup_rwsem);
}

/* For an object on the cleanup list, remove it from the list and
   call the remove function. This is either done immediately
   or deferred to an ongoing cleanup. Caller must hold a kref on uobj */
void destroy_object_from_cleanup(struct ib_uobject *uobj,
                                 enum rdma_remove_reason why)
{
    // Cleanup is running, the object will be destroyed by the cleanup.
    if (!down_read_trylock(&ucontext->cleanup_rwsem))
       return;

    // Remove it from the list
    mutex_lock(&ucontext->uobjects_lock);
    list_del(&uobj->list);
    uobj->context = NULL;
    put_uobj(uobj);
    mutex_unlock(&ucontext->uobjects_lock);

    [..]->remove(uobj, why);

    up_read(&ucontext->cleanup_rwsem);
}

void cleanup()
{
    /* Claim ownership of the context list. Every add attempt will
       fail and every remove attempt will defer to this loop */
    down_write(&ucontext->cleanup_rwsem);

    /* Note all uobjects_lock's are nested inside cleanup_rwsem, by
       holding the write side we prove we have exclusive list access */
    
    for_each_safe(uobj) {
        [..]->remove(uobj, why);
        put_uobj(uobj);
    }

    // rwsem remains held forever, ucontext is going to be destroyed
}

void uverbs_close_fd(struct file *f)
{
   destroy_object_from_cleanup(uobj);
   put_uobj(uobj);
}

> >> +     if (uobj_file->uobj.context) {
> >> +             mutex_lock(&uobj_file->uobj.context->lock);
> >> +             list_del(&uobj_file->uobj.list);
> >> +             mutex_unlock(&uobj_file->uobj.context->lock);
> >
> > Again, I don't like this. If we remove something from the list then it
> > should go through the whole destruction cycle. Why do we skip
> > is_closed=1 on this path, for instance? Seems wrong.
> >
> 
> An object could be destroyed in two ways:
> a. handler + remove_commit
> b. context cleanup

Again, this doesn't really make sense and is just going to cause bugs

Jason
Matan Barak Feb. 19, 2017, 12:47 p.m. UTC | #4
On Wed, Feb 15, 2017 at 8:54 PM, Jason Gunthorpe
<jgunthorpe@obsidianresearch.com> wrote:
> On Wed, Feb 15, 2017 at 03:48:40PM +0200, Matan Barak wrote:
>> >> +static struct ib_uobject *lookup_get_fd_uobject(const struct uverbs_obj_type *type,
>> >> +                                             struct ib_ucontext *ucontext,
>> >> +                                             int id, bool write)
>> >> +{
>> >> +     struct file *f;
>> >> +     struct ib_uobject *uobject;
>> >> +     const struct uverbs_obj_fd_type *fd_type =
>> >> +             container_of(type, struct uverbs_obj_fd_type, type);
>> >> +
>> >> +     if (write)
>> >> +             return ERR_PTR(-EOPNOTSUPP);
>> >> +
>> >> +     f = fget(id);
>> >> +     if (!f)
>> >> +             return ERR_PTR(-EBADF);
>> >> +
>> >> +     uobject = f->private_data;
>> >> +     if (f->f_op != fd_type->fops ||
>> >> +         !uobject->context) {
>> >> +             fput(f);
>> >> +             return ERR_PTR(-EBADF);
>> >> +     }
>> >
>> > The read of uobject->context needs locking
>> >
>>
>> All these callbacks are assumed to run when the context is alive.
>
> 'uboject->context' is touched outside the SRCU, eg in
> uverbs_close_fd() - so it absolutely needs locking.
>

uobject->context isn't protected by SRCU as intended.
lookup_get_fd_uobject can't be executed concurrently with
uverbs_close_fd, as uverbs_close_fd is called only when the last
reference of the fd is put, and in that case the fget called by
lookup_get_fd_uobject would simply fail. Since fput and fget are
atomic, this should be safe.

>> The only call we allow the user to execute while this is in process
>> is uverbs_close_fd and that's why this is the only call which is
>> protected by the cleanup_mutex.
>
> So cleanup_mutex would need to be held during lookup
>

You can't run lookup_get and uverbs_close_fd concurrently, as stated
above.

Regarding the locking: assuming uverbs_close_fd is called from the
release file operation and SRCU protects us (as currently implemented),
we don't see locking issues here.

Yishai and Matan.
Jason Gunthorpe Feb. 21, 2017, 7:24 p.m. UTC | #5
On Sun, Feb 19, 2017 at 02:47:02PM +0200, Matan Barak wrote:

> >>
> >> All these callbacks are assumed to run when the context is alive.
> >
> > 'uboject->context' is touched outside the SRCU, eg in
> > uverbs_close_fd() - so it absolutely needs locking.
> >
> 
> uobject->context isn't protected by SRCU as intended.
> lookup_get_fd_uobject can't be executed concurrently with
> uverbs_close_fd, as uverbs_close_fd is called when the last
> reference of the fd is put and in that case fget called by
> lookup_get_fd_uobject would simply fail. Since fput and fget are
> atomically safe, this should be safe.

That is so tricky it needs a comment, but yes, that argument makes
sense for uverbs_close_fd().

But, you've just explained away every reason for this test:

+	if (f->f_op != fd_type->fops ||
+	    !uobject->context) {

It should probably be

	if (f->f_op != fd_type->fops) {
		fput(f);
		return ERR_PTR(-EBADF);
	}

	/* fget(id) ensures we are not currently running
	 * uverbs_close_fd, and the caller is expected to ensure
	 * that uverbs_cleanup_ucontext is never done while
	 * a call to lookup is possible. */
	WARN_ON(!uobject->context);
	if (!uobject->context) {
	        // Try to recover
		fput(f);
		return ERR_PTR(-EBADF);
	}

Or you need to explain where context = NULL comes from...

Jason
Patch

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 7ce4d67..1d24f26 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -160,6 +160,73 @@  static void uverbs_uobject_add(struct ib_uobject *uobject)
 	mutex_unlock(&uobject->context->lock);
 }
 
+static struct ib_uobject *alloc_begin_fd_uobject(const struct uverbs_obj_type *type,
+						 struct ib_ucontext *ucontext)
+{
+	const struct uverbs_obj_fd_type *fd_type =
+		container_of(type, struct uverbs_obj_fd_type, type);
+	int new_fd;
+	struct ib_uobject_file *uobj_file = NULL;
+	struct file *filp;
+
+	new_fd = get_unused_fd_flags(O_CLOEXEC);
+	if (new_fd < 0)
+		return ERR_PTR(new_fd);
+
+	uobj_file = kmalloc(fd_type->obj_size, GFP_KERNEL);
+	if (!uobj_file) {
+		put_unused_fd(new_fd);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	filp = anon_inode_getfile(fd_type->name,
+				  fd_type->fops,
+				  uobj_file,
+				  fd_type->flags);
+	if (IS_ERR(filp)) {
+		put_unused_fd(new_fd);
+		kfree(uobj_file);
+		return (void *)filp;
+	}
+
+	init_uobj(&uobj_file->uobj, ucontext, type);
+	uobj_file->uobj.id = new_fd;
+	uobj_file->uobj.object = filp;
+	uobj_file->ufile = ucontext->ufile;
+
+	return &uobj_file->uobj;
+}
+
+static struct ib_uobject *lookup_get_fd_uobject(const struct uverbs_obj_type *type,
+						struct ib_ucontext *ucontext,
+						int id, bool write)
+{
+	struct file *f;
+	struct ib_uobject *uobject;
+	const struct uverbs_obj_fd_type *fd_type =
+		container_of(type, struct uverbs_obj_fd_type, type);
+
+	if (write)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	f = fget(id);
+	if (!f)
+		return ERR_PTR(-EBADF);
+
+	uobject = f->private_data;
+	if (f->f_op != fd_type->fops ||
+	    !uobject->context) {
+		fput(f);
+		return ERR_PTR(-EBADF);
+	}
+
+	/*
+	 * No need to protect it with a ref count, as fget increases
+	 * f_count.
+	 */
+	return uobject;
+}
+
 static void alloc_commit_idr_uobject(struct ib_uobject *uobj)
 {
 	uverbs_uobject_add(uobj);
@@ -196,6 +263,38 @@  static void lookup_put_idr_uobject(struct ib_uobject *uobj, bool write)
 		up_read(&uobj->currently_used);
 }
 
+static void lookup_put_fd_uobject(struct ib_uobject *uobj, bool write)
+{
+	struct file *filp = uobj->object;
+
+	WARN_ON(write);
+	fput(filp);
+}
+
+static void alloc_commit_fd_uobject(struct ib_uobject *uobj)
+{
+	struct ib_uobject_file *uobj_file =
+		container_of(uobj, struct ib_uobject_file, uobj);
+
+	kref_get(&uobj_file->ufile->ref);
+	uverbs_uobject_add(&uobj_file->uobj);
+	fd_install(uobj_file->uobj.id, uobj->object);
+	/* This shouldn't be used anymore. Use the file object instead */
+	uobj_file->uobj.id = 0;
+}
+
+static void alloc_abort_fd_uobject(struct ib_uobject *uobj)
+{
+	struct ib_uobject_file *uobj_file =
+		container_of(uobj, struct ib_uobject_file, uobj);
+	struct file *filp = uobj->object;
+
+	/* Unsuccessful NEW */
+	fput(filp);
+	put_unused_fd(uobj_file->uobj.id);
+	uverbs_uobject_put(&uobj_file->uobj);
+}
+
 static void destroy_commit_idr_uobject(struct ib_uobject *uobj)
 {
 	uverbs_idr_remove_uobj(uobj);
@@ -233,6 +332,16 @@  static void release_idr_uobject(struct ib_uobject *uobj)
 	kfree_rcu(uobj, rcu);
 }
 
+static void release_fd_uobject(struct ib_uobject *uobj)
+{
+	kfree(container_of(uobj, struct ib_uobject_file, uobj));
+}
+
+static void destroy_commit_null_uobject(struct ib_uobject *uobj)
+{
+	WARN_ON(true);
+}
+
 const struct uverbs_obj_type_ops uverbs_idr_ops = {
 	.alloc_begin = alloc_begin_idr_uobject,
 	.lookup_get = lookup_get_idr_uobject,
@@ -254,8 +363,15 @@  void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed)
 
 		/*
 		 * This shouldn't run while executing other commands on this
-		 * context, thus no lock is required.
+		 * context. Thus, the only thing we should take care of is
+		 * releasing a FD while traversing this list. The FD could be
+		 * closed and released from the _release fop of this FD.
+		 * In order to mitigate this, we add a lock.
+		 * We take and release the lock per order traversal in order
+		 * to let other threads (which might still use the FDs) chance
+		 * to run.
 		 */
+		mutex_lock(&ucontext->lock);
 		list_for_each_entry_safe(obj, next_obj, &ucontext->uobjects,
 					 list)
 			if (obj->type->destroy_order == cur_order) {
@@ -265,6 +381,7 @@  void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed)
 				next_order = min(next_order,
 						 obj->type->destroy_order);
 			}
+		mutex_unlock(&ucontext->lock);
 		cur_order = next_order;
 	}
 }
@@ -275,3 +392,41 @@  void uverbs_initialize_ucontext(struct ib_ucontext *ucontext)
 	INIT_LIST_HEAD(&ucontext->uobjects);
 }
 
+static void hot_unplug_fd_uobject(struct ib_uobject *uobj, bool device_removed)
+{
+	const struct uverbs_obj_fd_type *fd_type =
+		container_of(uobj->type, struct uverbs_obj_fd_type, type);
+	struct ib_uobject_file *uobj_file =
+		container_of(uobj, struct ib_uobject_file, uobj);
+
+	fd_type->hot_unplug(uobj_file, device_removed);
+	uobj_file->uobj.context = NULL;
+}
+
+const struct uverbs_obj_type_ops uverbs_fd_ops = {
+	.alloc_begin = alloc_begin_fd_uobject,
+	.lookup_get = lookup_get_fd_uobject,
+	.alloc_commit = alloc_commit_fd_uobject,
+	.alloc_abort = alloc_abort_fd_uobject,
+	.lookup_put = lookup_put_fd_uobject,
+	.destroy_commit = destroy_commit_null_uobject,
+	.hot_unplug = hot_unplug_fd_uobject,
+	.release = release_fd_uobject,
+};
+
+void uverbs_close_fd(struct file *f)
+{
+	struct ib_uobject_file *uobj_file = f->private_data;
+
+	mutex_lock(&uobj_file->ufile->cleanup_mutex);
+	if (uobj_file->uobj.context) {
+		mutex_lock(&uobj_file->uobj.context->lock);
+		list_del(&uobj_file->uobj.list);
+		mutex_unlock(&uobj_file->uobj.context->lock);
+		uobj_file->uobj.context = NULL;
+	}
+	mutex_unlock(&uobj_file->ufile->cleanup_mutex);
+	kref_put(&uobj_file->ufile->ref, ib_uverbs_release_file);
+	uverbs_uobject_put(&uobj_file->uobj);
+}
+
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index ab665a6..da4e808 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -52,4 +52,11 @@ 
 void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed);
 void uverbs_initialize_ucontext(struct ib_ucontext *ucontext);
 
+/*
+ * Indicate this fd is no longer used by this consumer, but its memory isn't
+ * released yet. Internally we call uverbs_uobject_put. When the last reference
+ * is put, we release the memory.
+ */
+void uverbs_close_fd(struct file *f);
+
 #endif /* RDMA_CORE_H */
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 9fe5e02..20632ff 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -193,6 +193,7 @@  void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
 			   struct ib_ucq_object *uobj);
 void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
 			      struct ib_uevent_object *uobj);
+void ib_uverbs_release_file(struct kref *ref);
 
 void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
 void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 0eb4538..784eccc 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -229,7 +229,7 @@  static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
 	complete(&dev->comp);
 }
 
-static void ib_uverbs_release_file(struct kref *ref)
+void ib_uverbs_release_file(struct kref *ref)
 {
 	struct ib_uverbs_file *file =
 		container_of(ref, struct ib_uverbs_file, ref);
@@ -1128,7 +1128,9 @@  static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
 			 * (e.g mmput).
 			 */
 			ib_dev->disassociate_ucontext(ucontext);
+			mutex_lock(&file->cleanup_mutex);
 			ib_uverbs_cleanup_ucontext(file, ucontext, true);
+			mutex_unlock(&file->cleanup_mutex);
 		}
 
 		mutex_lock(&uverbs_dev->lists_mutex);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 7ddb08f..941a764 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1372,6 +1372,12 @@  struct ib_uobject {
 	const struct uverbs_obj_type *type;
 };
 
+struct ib_uobject_file {
+	struct ib_uobject	uobj;
+	/* ufile contains the lock between context release and file close */
+	struct ib_uverbs_file	*ufile;
+};
+
 struct ib_udata {
 	const void __user *inbuf;
 	void __user *outbuf;
diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h
index cdbf352..48b5e4e 100644
--- a/include/rdma/uverbs_types.h
+++ b/include/rdma/uverbs_types.h
@@ -101,6 +101,22 @@  struct uverbs_obj_idr_type {
 	void (*hot_unplug)(struct ib_uobject *uobj);
 };
 
+struct uverbs_obj_fd_type {
+	/*
+	 * In fd based objects, uverbs_obj_type_ops points to a generic
+	 * fd operations. In order to specialize the underlying types (e.g.
+	 * completion_channel), we use obj_size, fops, name and flags for fd
+	 * creation and hot_unplug for specific release callback.
+	 */
+	struct uverbs_obj_type  type;
+	size_t			obj_size;
+	void (*hot_unplug)(struct ib_uobject_file *uobj_file,
+			   bool device_removed);
+	const struct file_operations	*fops;
+	const char			*name;
+	int				flags;
+};
+
 extern const struct uverbs_obj_type_ops uverbs_idr_ops;
 
 #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) -	\