diff mbox

[V3,for-next,2/7] IB/core: Add support for idr types

Message ID 1491301907-32290-3-git-send-email-matanb@mellanox.com (mailing list archive)
State Accepted
Headers show

Commit Message

Matan Barak April 4, 2017, 10:31 a.m. UTC
The new ioctl infrastructure supports driver specific objects.
Each such object type has a hot unplug function, allocation size and
an order of destruction.

When a ucontext is created, a new list is created in this ib_ucontext.
This list contains all objects created under this ib_ucontext.
When a ib_ucontext is destroyed, we traverse this list several time
destroying the various objects by the order mentioned in the object
type description. If few object types have the same destruction order,
they are destroyed in an order opposite to their creation.

Adding an object is done in two parts.
First, an object is allocated and added to idr tree. Then, the
command's handlers (in downstream patches) could work on this object
and fill in its required details.
After a successful command, the commit part is called and the user
objects become ucontext visible. If the handler failed, alloc_abort
should be called.

Removing an uboject is done by calling lookup_get with the write flag
and finalizing it with destroy_commit. A major change from the previous
code is that we actually destroy the kernel object itself in
destroy_commit (rather than just the uobject).

We should make sure idr (per-uverbs-file) and list (per-ucontext) could
be accessed concurrently without corrupting them.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
---
 drivers/infiniband/core/Makefile    |   3 +-
 drivers/infiniband/core/rdma_core.c | 450 ++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/rdma_core.h |  55 +++++
 include/rdma/ib_verbs.h             |  21 ++
 include/rdma/uverbs_types.h         | 132 +++++++++++
 5 files changed, 660 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/rdma_core.c
 create mode 100644 drivers/infiniband/core/rdma_core.h
 create mode 100644 include/rdma/uverbs_types.h

Comments

Hefty, Sean April 5, 2017, 12:43 a.m. UTC | #1
> diff --git a/drivers/infiniband/core/rdma_core.c
> b/drivers/infiniband/core/rdma_core.c
> new file mode 100644
> index 0000000..1cbc053
> --- /dev/null
> +++ b/drivers/infiniband/core/rdma_core.c
> @@ -0,0 +1,450 @@
> +/*
> + * Copyright (c) 2016, Mellanox Technologies inc.  All rights
> reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the
> GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the
> following
> + *     conditions are met:
> + *
> + *      - Redistributions of source code must retain the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer.
> + *
> + *      - Redistributions in binary form must reproduce the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer in the documentation and/or other materials
> + *        provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
> HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#include <linux/file.h>
> +#include <linux/anon_inodes.h>
> +#include <rdma/ib_verbs.h>
> +#include <rdma/uverbs_types.h>
> +#include <linux/rcupdate.h>
> +#include "uverbs.h"
> +#include "core_priv.h"
> +#include "rdma_core.h"
> +
> +void uverbs_uobject_get(struct ib_uobject *uobject)
> +{
> +	kref_get(&uobject->ref);
> +}
> +
> +static void uverbs_uobject_put_ref(struct kref *ref)
> +{
> +	struct ib_uobject *uobj =
> +		container_of(ref, struct ib_uobject, ref);
> +
> +	if (uobj->type->type_class->needs_kfree_rcu)
> +		kfree_rcu(uobj, rcu);
> +	else
> +		kfree(uobj);
> +}

I would rename 'put' to 'free'.

> +
> +void uverbs_uobject_put(struct ib_uobject *uobject)
> +{
> +	kref_put(&uobject->ref, uverbs_uobject_put_ref);
> +}
> +
> +static int uverbs_try_lock_object(struct ib_uobject *uobj, bool
> write)
> +{
> +	/*
> +	 * When a read is required, we use a positive counter. Each read
> +	 * request checks that the value != -1 and increment it. Write
> +	 * requires an exclusive access, thus we check that the counter
> is
> +	 * zero (nobody claimed this object) and we set it to -1.
> +	 * Releasing a read lock is done by simply decreasing the
> counter.
> +	 * As for writes, since only a single write is permitted,
> setting
> +	 * it to zero is enough for releasing it.
> +	 */
> +	if (!write)
> +		return __atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ?
> +			-EBUSY : 0;
> +
> +	/* lock is either WRITE or DESTROY - should be exclusive */
> +	return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY;
> +}

I would replace 'write' with 'exclusive'.

> +static struct ib_uobject *alloc_uobj(struct ib_ucontext *context,
> +				     const struct uverbs_obj_type *type)
> +{
> +	struct ib_uobject *uobj = kmalloc(type->obj_size, GFP_KERNEL);

kzalloc?

> +
> +	if (!uobj)
> +		return ERR_PTR(-ENOMEM);
> +	/*
> +	 * user_handle should be filled by the handler,
> +	 * The object is added to the list in the commit stage.
> +	 */
> +	uobj->context = context;
> +	uobj->type = type;
> +	atomic_set(&uobj->usecnt, 0);
> +	kref_init(&uobj->ref);
> +
> +	return uobj;
> +}
> +
> +static int idr_add_uobj(struct ib_uobject *uobj)
> +{
> +	int ret;
> +
> +	idr_preload(GFP_KERNEL);
> +	spin_lock(&uobj->context->ufile->idr_lock);
> +
> +	/*
> +	 * We start with allocating an idr pointing to NULL. This
> represents an
> +	 * object which isn't initialized yet. We'll replace it later on
> with
> +	 * the real object once we commit.
> +	 */
> +	ret = idr_alloc(&uobj->context->ufile->idr, NULL, 0,
> +			min_t(unsigned long, U32_MAX - 1, INT_MAX),
> GFP_NOWAIT);
> +	if (ret >= 0)
> +		uobj->id = ret;
> +
> +	spin_unlock(&uobj->context->ufile->idr_lock);
> +	idr_preload_end();
> +
> +	return ret < 0 ? ret : 0;
> +}
> +
> +/*
> + * It only removes it from the uobjects list, uverbs_uobject_put() is
> still
> + * required.
> + */
> +static void uverbs_idr_remove_uobj(struct ib_uobject *uobj)
> +{
> +	spin_lock(&uobj->context->ufile->idr_lock);
> +	idr_remove(&uobj->context->ufile->idr, uobj->id);
> +	spin_unlock(&uobj->context->ufile->idr_lock);
> +}
> +
> +/* Returns the ib_uobject or an error. The caller should check for
> IS_ERR. */
> +static struct ib_uobject *lookup_get_idr_uobject(const struct
> uverbs_obj_type *type,
> +						 struct ib_ucontext *ucontext,
> +						 int id, bool write)
> +{
> +	struct ib_uobject *uobj;
> +
> +	rcu_read_lock();
> +	/* object won't be released as we're protected in rcu */
> +	uobj = idr_find(&ucontext->ufile->idr, id);
> +	if (!uobj) {
> +		uobj = ERR_PTR(-ENOENT);
> +		goto free;
> +	}
> +
> +	uverbs_uobject_get(uobj);
> +free:
> +	rcu_read_unlock();
> +	return uobj;
> +}
> +
> +struct ib_uobject *rdma_lookup_get_uobject(const struct
> uverbs_obj_type *type,
> +					   struct ib_ucontext *ucontext,
> +					   int id, bool write)
> +{
> +	struct ib_uobject *uobj;
> +	int ret;
> +
> +	uobj = type->type_class->lookup_get(type, ucontext, id, write);
> +	if (IS_ERR(uobj))
> +		return uobj;
> +
> +	if (uobj->type != type) {
> +		ret = -EINVAL;
> +		goto free;
> +	}
> +
> +	ret = uverbs_try_lock_object(uobj, write);
> +	if (ret) {
> +		WARN(ucontext->cleanup_reason,
> +		     "ib_uverbs: Trying to lookup_get while cleanup
> context\n");
> +		goto free;
> +	}
> +
> +	return uobj;
> +free:
> +	uobj->type->type_class->lookup_put(uobj, write);
> +	uverbs_uobject_put(uobj);

There's an unexpected asymmetry here.  lookup_get is pairing with lookup_put + uobject_put.

> +	return ERR_PTR(ret);
> +}
> +
> +static struct ib_uobject *alloc_begin_idr_uobject(const struct
> uverbs_obj_type *type,
> +						  struct ib_ucontext *ucontext)
> +{
> +	int ret;
> +	struct ib_uobject *uobj;
> +
> +	uobj = alloc_uobj(ucontext, type);
> +	if (IS_ERR(uobj))
> +		return uobj;
> +
> +	ret = idr_add_uobj(uobj);
> +	if (ret)
> +		goto uobj_put;
> +
> +	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ucontext->device,
> +				   RDMACG_RESOURCE_HCA_OBJECT);
> +	if (ret)
> +		goto idr_remove;
> +
> +	return uobj;
> +
> +idr_remove:
> +	uverbs_idr_remove_uobj(uobj);
> +uobj_put:
> +	uverbs_uobject_put(uobj);
> +	return ERR_PTR(ret);
> +}
> +
> +struct ib_uobject *rdma_alloc_begin_uobject(const struct
> uverbs_obj_type *type,
> +					    struct ib_ucontext *ucontext)
> +{
> +	return type->type_class->alloc_begin(type, ucontext);
> +}
> +
> +static void uverbs_uobject_add(struct ib_uobject *uobject)
> +{
> +	mutex_lock(&uobject->context->uobjects_lock);
> +	list_add(&uobject->list, &uobject->context->uobjects);
> +	mutex_unlock(&uobject->context->uobjects_lock);
> +}
> +
> +static int __must_check remove_commit_idr_uobject(struct ib_uobject
> *uobj,
> +						  enum rdma_remove_reason why)
> +{
> +	const struct uverbs_obj_idr_type *idr_type =
> +		container_of(uobj->type, struct uverbs_obj_idr_type,
> +			     type);
> +	int ret = idr_type->destroy_object(uobj, why);
> +
> +	/*
> +	 * We can only fail gracefully if the user requested to destroy
> the
> +	 * object. In the rest of the cases, just remove whatever you
> can.
> +	 */
> +	if (why == RDMA_REMOVE_DESTROY && ret)
> +		return ret;
> +
> +	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
> +			   RDMACG_RESOURCE_HCA_OBJECT);
> +	uverbs_idr_remove_uobj(uobj);
> +
> +	return ret;
> +}
> +
> +static void lockdep_check(struct ib_uobject *uobj, bool write)
> +{
> +#ifdef CONFIG_LOCKDEP
> +	if (write)
> +		WARN_ON(atomic_read(&uobj->usecnt) > 0);
> +	else
> +		WARN_ON(atomic_read(&uobj->usecnt) == -1);
> +#endif
> +}
> +
> +static int __must_check _rdma_remove_commit_uobject(struct ib_uobject
> *uobj,
> +						    enum rdma_remove_reason why,
> +						    bool lock)
> +{
> +	int ret;
> +	struct ib_ucontext *ucontext = uobj->context;
> +
> +	ret = uobj->type->type_class->remove_commit(uobj, why);
> +	if (ret && why == RDMA_REMOVE_DESTROY) {
> +		/* We couldn't remove the object, so just unlock the
> uobject */
> +		atomic_set(&uobj->usecnt, 0);
> +		uobj->type->type_class->lookup_put(uobj, true);
> +	} else {
> +		if (lock)
> +			mutex_lock(&ucontext->uobjects_lock);
> +		list_del(&uobj->list);
> +		if (lock)
> +			mutex_unlock(&ucontext->uobjects_lock);
> +		/* put the ref we took when we created the object */
> +		uverbs_uobject_put(uobj);

Please try to restructure the code so that locking state doesn't need to be carried through to functions like this.


> +	}
> +
> +	return ret;
> +}
> +
> +/* This is called only for user requested DESTROY reasons */
> +int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj)
> +{
> +	int ret;
> +	struct ib_ucontext *ucontext = uobj->context;
> +
> +	/* put the ref count we took at lookup_get */
> +	uverbs_uobject_put(uobj);
> +	/* Cleanup is running. Calling this should have been impossible
> */
> +	if (!down_read_trylock(&ucontext->cleanup_rwsem)) {
> +		WARN(true, "ib_uverbs: Cleanup is running while removing
> an uobject\n");
> +		return 0;
> +	}
> +	lockdep_check(uobj, true);
> +	ret = _rdma_remove_commit_uobject(uobj, RDMA_REMOVE_DESTROY,
> true);
> +
> +	up_read(&ucontext->cleanup_rwsem);
> +	return ret;
> +}
> +
> +static void alloc_commit_idr_uobject(struct ib_uobject *uobj)
> +{
> +	uverbs_uobject_add(uobj);
> +	spin_lock(&uobj->context->ufile->idr_lock);
> +	/*
> +	 * We already allocated this IDR with a NULL object, so
> +	 * this shouldn't fail.
> +	 */
> +	WARN_ON(idr_replace(&uobj->context->ufile->idr,
> +			    uobj, uobj->id));
> +	spin_unlock(&uobj->context->ufile->idr_lock);
> +}
> +
> +int rdma_alloc_commit_uobject(struct ib_uobject *uobj)
> +{
> +	/* Cleanup is running. Calling this should have been impossible
> */
> +	if (!down_read_trylock(&uobj->context->cleanup_rwsem)) {
> +		int ret;
> +
> +		WARN(true, "ib_uverbs: Cleanup is running while allocating
> an uobject\n");
> +		ret = uobj->type->type_class->remove_commit(uobj,
> +
> RDMA_REMOVE_DURING_CLEANUP);
> +		if (ret)
> +			pr_warn("ib_uverbs: cleanup of idr object %d
> failed\n",
> +				uobj->id);
> +		return ret;
> +	}
> +
> +	uobj->type->type_class->alloc_commit(uobj);
> +	up_read(&uobj->context->cleanup_rwsem);
> +
> +	return 0;
> +}
> +
> +static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
> +{
> +	uverbs_idr_remove_uobj(uobj);
> +	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
> +			   RDMACG_RESOURCE_HCA_OBJECT);
> +	uverbs_uobject_put(uobj);
> +}
> +
> +void rdma_alloc_abort_uobject(struct ib_uobject *uobj)
> +{
> +	uobj->type->type_class->alloc_abort(uobj);
> +}
> +
> +static void lookup_put_idr_uobject(struct ib_uobject *uobj, bool
> write)
> +{
> +}
> +
> +void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool write)
> +{
> +	lockdep_check(uobj, write);
> +	uobj->type->type_class->lookup_put(uobj, write);
> +	/*
> +	 * In order to unlock an object, either decrease its usecnt for
> +	 * read access or zero it in case of write access. See
> +	 * uverbs_try_lock_object for locking schema information.
> +	 */
> +	if (!write)
> +		atomic_dec(&uobj->usecnt);
> +	else
> +		atomic_set(&uobj->usecnt, 0);
> +
> +	uverbs_uobject_put(uobj);
> +}
> +
> +const struct uverbs_obj_type_class uverbs_idr_class = {
> +	.alloc_begin = alloc_begin_idr_uobject,
> +	.lookup_get = lookup_get_idr_uobject,
> +	.alloc_commit = alloc_commit_idr_uobject,
> +	.alloc_abort = alloc_abort_idr_uobject,
> +	.lookup_put = lookup_put_idr_uobject,
> +	.remove_commit = remove_commit_idr_uobject,
> +	/*
> +	 * When we destroy an object, we first just lock it for WRITE
> and
> +	 * actually DESTROY it in the finalize stage. So, the
> problematic
> +	 * scenario is when we just started the finalize stage of the
> +	 * destruction (nothing was executed yet). Now, the other thread
> +	 * fetched the object for READ access, but it didn't lock it
> yet.
> +	 * The DESTROY thread continues and starts destroying the
> object.
> +	 * When the other thread continue - without the RCU, it would
> +	 * access freed memory. However, the rcu_read_lock delays the
> free
> +	 * until the rcu_read_lock of the READ operation quits. Since
> the
> +	 * write lock of the object is still taken by the DESTROY flow,
> the
> +	 * READ operation will get -EBUSY and it'll just bail out.
> +	 */
> +	.needs_kfree_rcu = true,
> +};
> +
> +void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool
> device_removed)
> +{
> +	enum rdma_remove_reason reason = device_removed ?
> +		RDMA_REMOVE_DRIVER_REMOVE : RDMA_REMOVE_CLOSE;
> +	unsigned int cur_order = 0;
> +
> +	ucontext->cleanup_reason = reason;
> +	/*
> +	 * Waits for all remove_commit and alloc_commit to finish.
> Logically, We
> +	 * want to hold this forever as the context is going to be
> destroyed,
> +	 * but we'll release it since it causes a "held lock freed" BUG
> message.
> +	 */
> +	down_write(&ucontext->cleanup_rwsem);
> +
> +	while (!list_empty(&ucontext->uobjects)) {
> +		struct ib_uobject *obj, *next_obj;
> +		unsigned int next_order = UINT_MAX;
> +
> +		/*
> +		 * This shouldn't run while executing other commands on
> this
> +		 * context.
> +		 */
> +		mutex_lock(&ucontext->uobjects_lock);
> +		list_for_each_entry_safe(obj, next_obj, &ucontext-
> >uobjects,
> +					 list)

Please add braces

> +			if (obj->type->destroy_order == cur_order) {
> +				int ret;
> +
> +				/*
> +				 * if we hit this WARN_ON, that means we are
> +				 * racing with a lookup_get.
> +				 */
> +				WARN_ON(uverbs_try_lock_object(obj, true));
> +				ret = _rdma_remove_commit_uobject(obj, reason,
> +								  false);
> +				if (ret)
> +					pr_warn("ib_uverbs: failed to remove
> uobject id %d order %u\n",
> +						obj->id, cur_order);
> +			} else {
> +				next_order = min(next_order,
> +						 obj->type->destroy_order);
> +			}
> +		mutex_unlock(&ucontext->uobjects_lock);
> +		cur_order = next_order;
> +	}
> +	up_write(&ucontext->cleanup_rwsem);
> +}
> +
> +void uverbs_initialize_ucontext(struct ib_ucontext *ucontext)
> +{
> +	ucontext->cleanup_reason = 0;
> +	mutex_init(&ucontext->uobjects_lock);
> +	INIT_LIST_HEAD(&ucontext->uobjects);
> +	init_rwsem(&ucontext->cleanup_rwsem);
> +}
> +
> diff --git a/drivers/infiniband/core/rdma_core.h
> b/drivers/infiniband/core/rdma_core.h
> new file mode 100644
> index 0000000..ab665a6
> --- /dev/null
> +++ b/drivers/infiniband/core/rdma_core.h
> @@ -0,0 +1,55 @@
> +/*
> + * Copyright (c) 2005 Topspin Communications.  All rights reserved.
> + * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
> + * Copyright (c) 2005-2017 Mellanox Technologies. All rights
> reserved.
> + * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
> + * Copyright (c) 2005 PathScale, Inc. All rights reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the
> GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the
> following
> + *     conditions are met:
> + *
> + *      - Redistributions of source code must retain the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer.
> + *
> + *      - Redistributions in binary form must reproduce the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer in the documentation and/or other materials
> + *        provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
> HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#ifndef RDMA_CORE_H
> +#define RDMA_CORE_H
> +
> +#include <linux/idr.h>
> +#include <rdma/uverbs_types.h>
> +#include <rdma/ib_verbs.h>
> +#include <linux/mutex.h>
> +
> +/*
> + * These functions initialize the context and cleanups its uobjects.
> + * The context has a list of objects which is protected by a mutex
> + * on the context. initialize_ucontext should be called when we
> create
> + * a context.
> + * cleanup_ucontext removes all uobjects from the context and puts
> them.
> + */
> +void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool
> device_removed);
> +void uverbs_initialize_ucontext(struct ib_ucontext *ucontext);
> +
> +#endif /* RDMA_CORE_H */
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 319e691..d3efd22 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -1357,6 +1357,17 @@ struct ib_fmr_attr {
> 
>  struct ib_umem;
> 
> +enum rdma_remove_reason {
> +	/* Userspace requested uobject deletion. Call could fail */
> +	RDMA_REMOVE_DESTROY,
> +	/* Context deletion. This call should delete the actual object
> itself */
> +	RDMA_REMOVE_CLOSE,
> +	/* Driver is being hot-unplugged. This call should delete the
> actual object itself */
> +	RDMA_REMOVE_DRIVER_REMOVE,
> +	/* Context is being cleaned-up, but commit was just completed */
> +	RDMA_REMOVE_DURING_CLEANUP,
> +};
> +
>  struct ib_rdmacg_object {
>  #ifdef CONFIG_CGROUP_RDMA
>  	struct rdma_cgroup	*cg;		/* owner rdma cgroup */
> @@ -1379,6 +1390,13 @@ struct ib_ucontext {
>  	struct list_head	rwq_ind_tbl_list;
>  	int			closing;
> 
> +	/* locking the uobjects_list */
> +	struct mutex		uobjects_lock;
> +	struct list_head	uobjects;
> +	/* protects cleanup process from other actions */
> +	struct rw_semaphore	cleanup_rwsem;
> +	enum rdma_remove_reason cleanup_reason;
> +
>  	struct pid             *tgid;
>  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
>  	struct rb_root      umem_tree;
> @@ -1409,8 +1427,11 @@ struct ib_uobject {
>  	int			id;		/* index into kernel idr */
>  	struct kref		ref;
>  	struct rw_semaphore	mutex;		/* protects .live */
> +	atomic_t		usecnt;		/* protects exclusive access
> */
>  	struct rcu_head		rcu;		/* kfree_rcu() overhead */
>  	int			live;
> +
> +	const struct uverbs_obj_type *type;
>  };
> 
>  struct ib_udata {
> diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h
> new file mode 100644
> index 0000000..0777e40
> --- /dev/null
> +++ b/include/rdma/uverbs_types.h
> @@ -0,0 +1,132 @@
> +/*
> + * Copyright (c) 2017, Mellanox Technologies inc.  All rights
> reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the
> GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the
> following
> + *     conditions are met:
> + *
> + *      - Redistributions of source code must retain the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer.
> + *
> + *      - Redistributions in binary form must reproduce the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer in the documentation and/or other materials
> + *        provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
> HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#ifndef _UVERBS_TYPES_
> +#define _UVERBS_TYPES_
> +
> +#include <linux/kernel.h>
> +#include <rdma/ib_verbs.h>
> +
> +struct uverbs_obj_type;
> +
> +struct uverbs_obj_type_class {
> +	/*
> +	 * Get an ib_uobject that corresponds to the given id from
> ucontext,
> +	 * These functions could create or destroy objects if required.
> +	 * The action will be finalized only when commit, abort or put
> fops are
> +	 * called.
> +	 * The flow of the different actions is:
> +	 * [alloc]:	 Starts with alloc_begin. The handlers logic is than
> +	 *		 executed. If the handler is successful,
> alloc_commit
> +	 *		 is called and the object is inserted to the
> repository.
> +	 *		 Once alloc_commit completes the object is visible
> to
> +	 *		 other threads and userspace.
> +	 e		 Otherwise, alloc_abort is called and the object is
> +	 *		 destroyed.
> +	 * [lookup]:	 Starts with lookup_get which fetches and
> locks the
> +	 *		 object. After the handler finished using the
> object, it
> +	 *		 needs to call lookup_put to unlock it. The write
> flag
> +	 *		 indicates if the object is locked for exclusive
> access.
> +	 * [remove]:	 Starts with lookup_get with write flag set.
> This locks
> +	 *		 the object for exclusive access. If the handler
> code
> +	 *		 completed successfully, remove_commit is called and
> +	 *		 the ib_uobject is removed from the context's
> uobjects
> +	 *		 repository and put. The object itself is destroyed
> as
> +	 *		 well. Once remove succeeds new krefs to the object
> +	 *		 cannot be acquired by other threads or userspace
> and
> +	 *		 the hardware driver is removed from the object.
> +	 *		 Other krefs on the object may still exist.
> +	 *		 If the handler code failed, lookup_put should be
> +	 *		 called. This callback is used when the context
> +	 *		 is destroyed as well (process termination,
> +	 *		 reset flow).
> +	 */
> +	struct ib_uobject *(*alloc_begin)(const struct uverbs_obj_type
> *type,
> +					  struct ib_ucontext *ucontext);
> +	void (*alloc_commit)(struct ib_uobject *uobj);
> +	void (*alloc_abort)(struct ib_uobject *uobj);
> +
> +	struct ib_uobject *(*lookup_get)(const struct uverbs_obj_type
> *type,
> +					 struct ib_ucontext *ucontext, int id,
> +					 bool write);
> +	void (*lookup_put)(struct ib_uobject *uobj, bool write);

Rather than passing in a write/exclusive flag to a bunch of different calls, why not just have separate calls?  E.g. get_shared/put_shared, get_excl/put_excl?

> +	/*
> +	 * Must be called with the write lock held. If successful uobj
> is
> +	 * invalid on return. On failure uobject is left completely
> +	 * unchanged
> +	 */
> +	int __must_check (*remove_commit)(struct ib_uobject *uobj,
> +					  enum rdma_remove_reason why);

Or add matching remove_begin()/remove_abort() calls.

> +	u8    needs_kfree_rcu;
> +};
> +
> +struct uverbs_obj_type {
> +	const struct uverbs_obj_type_class * const type_class;
> +	size_t	     obj_size;
> +	unsigned int destroy_order;
> +};
> +
> +/*
> + * Objects type classes which support a detach state (object is still
> alive but
> + * it's not attached to any context need to make sure:
> + * (a) no call through to a driver after a detach is called
> + * (b) detach isn't called concurrently with context_cleanup
> + */
> +
> +struct uverbs_obj_idr_type {
> +	/*
> +	 * In idr based objects, uverbs_obj_type_class points to a
> generic
> +	 * idr operations. In order to specialize the underlying types
> (e.g. CQ,
> +	 * QPs, etc.), we add destroy_object specific callbacks.
> +	 */
> +	struct uverbs_obj_type  type;
> +
> +	/* Free driver resources from the uobject, make the driver
> uncallable,
> +	 * and move the uobject to the detached state. If the object was
> +	 * destroyed by the user's request, a failure should leave the
> uobject
> +	 * completely unchanged.
> +	 */
> +	int __must_check (*destroy_object)(struct ib_uobject *uobj,
> +					   enum rdma_remove_reason why);
> +};
> +
> +struct ib_uobject *rdma_lookup_get_uobject(const struct
> uverbs_obj_type *type,
> +					   struct ib_ucontext *ucontext,
> +					   int id, bool write);
> +void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool write);
> +struct ib_uobject *rdma_alloc_begin_uobject(const struct
> uverbs_obj_type *type,
> +					    struct ib_ucontext *ucontext);
> +void rdma_alloc_abort_uobject(struct ib_uobject *uobj);
> +int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj);
> +int rdma_alloc_commit_uobject(struct ib_uobject *uobj);
> +
> +#endif

In general, this code requires a lot of in-function commenting, which suggests complexity.  The general approach seems reasonable based on what I've read so far.

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matan Barak April 5, 2017, 10:55 a.m. UTC | #2
On Wed, Apr 5, 2017 at 3:43 AM, Hefty, Sean <sean.hefty@intel.com> wrote:
>> diff --git a/drivers/infiniband/core/rdma_core.c
>> b/drivers/infiniband/core/rdma_core.c
>> new file mode 100644
>> index 0000000..1cbc053
>> --- /dev/null
>> +++ b/drivers/infiniband/core/rdma_core.c
>> @@ -0,0 +1,450 @@
>> +/*
>> + * Copyright (c) 2016, Mellanox Technologies inc.  All rights
>> reserved.
>> + *
>> + * This software is available to you under a choice of one of two
>> + * licenses.  You may choose to be licensed under the terms of the
>> GNU
>> + * General Public License (GPL) Version 2, available from the file
>> + * COPYING in the main directory of this source tree, or the
>> + * OpenIB.org BSD license below:
>> + *
>> + *     Redistribution and use in source and binary forms, with or
>> + *     without modification, are permitted provided that the
>> following
>> + *     conditions are met:
>> + *
>> + *      - Redistributions of source code must retain the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer.
>> + *
>> + *      - Redistributions in binary form must reproduce the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer in the documentation and/or other materials
>> + *        provided with the distribution.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
>> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
>> HOLDERS
>> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
>> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
>> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
>> + * SOFTWARE.
>> + */
>> +
>> +#include <linux/file.h>
>> +#include <linux/anon_inodes.h>
>> +#include <rdma/ib_verbs.h>
>> +#include <rdma/uverbs_types.h>
>> +#include <linux/rcupdate.h>
>> +#include "uverbs.h"
>> +#include "core_priv.h"
>> +#include "rdma_core.h"
>> +
>> +void uverbs_uobject_get(struct ib_uobject *uobject)
>> +{
>> +     kref_get(&uobject->ref);
>> +}
>> +
>> +static void uverbs_uobject_put_ref(struct kref *ref)
>> +{
>> +     struct ib_uobject *uobj =
>> +             container_of(ref, struct ib_uobject, ref);
>> +
>> +     if (uobj->type->type_class->needs_kfree_rcu)
>> +             kfree_rcu(uobj, rcu);
>> +     else
>> +             kfree(uobj);
>> +}
>
> I would rename 'put' to 'free'.
>

Ok

>> +
>> +void uverbs_uobject_put(struct ib_uobject *uobject)
>> +{
>> +     kref_put(&uobject->ref, uverbs_uobject_put_ref);
>> +}
>> +
>> +static int uverbs_try_lock_object(struct ib_uobject *uobj, bool
>> write)
>> +{
>> +     /*
>> +      * When a read is required, we use a positive counter. Each read
>> +      * request checks that the value != -1 and increment it. Write
>> +      * requires an exclusive access, thus we check that the counter
>> is
>> +      * zero (nobody claimed this object) and we set it to -1.
>> +      * Releasing a read lock is done by simply decreasing the
>> counter.
>> +      * As for writes, since only a single write is permitted,
>> setting
>> +      * it to zero is enough for releasing it.
>> +      */
>> +     if (!write)
>> +             return __atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ?
>> +                     -EBUSY : 0;
>> +
>> +     /* lock is either WRITE or DESTROY - should be exclusive */
>> +     return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY;
>> +}
>
> I would replace 'write' with 'exclusive'.
>

Ok

>> +static struct ib_uobject *alloc_uobj(struct ib_ucontext *context,
>> +                                  const struct uverbs_obj_type *type)
>> +{
>> +     struct ib_uobject *uobj = kmalloc(type->obj_size, GFP_KERNEL);
>
> kzalloc?
>

All the standard uobject fields are initialized, but since it can be
used to allocate
a user struct that contains this uobject, it might help some users to
detect their bugs
earlier. No problem changing this.

>> +
>> +     if (!uobj)
>> +             return ERR_PTR(-ENOMEM);
>> +     /*
>> +      * user_handle should be filled by the handler,
>> +      * The object is added to the list in the commit stage.
>> +      */
>> +     uobj->context = context;
>> +     uobj->type = type;
>> +     atomic_set(&uobj->usecnt, 0);
>> +     kref_init(&uobj->ref);
>> +
>> +     return uobj;
>> +}
>> +
>> +static int idr_add_uobj(struct ib_uobject *uobj)
>> +{
>> +     int ret;
>> +
>> +     idr_preload(GFP_KERNEL);
>> +     spin_lock(&uobj->context->ufile->idr_lock);
>> +
>> +     /*
>> +      * We start with allocating an idr pointing to NULL. This
>> represents an
>> +      * object which isn't initialized yet. We'll replace it later on
>> with
>> +      * the real object once we commit.
>> +      */
>> +     ret = idr_alloc(&uobj->context->ufile->idr, NULL, 0,
>> +                     min_t(unsigned long, U32_MAX - 1, INT_MAX),
>> GFP_NOWAIT);
>> +     if (ret >= 0)
>> +             uobj->id = ret;
>> +
>> +     spin_unlock(&uobj->context->ufile->idr_lock);
>> +     idr_preload_end();
>> +
>> +     return ret < 0 ? ret : 0;
>> +}
>> +
>> +/*
>> + * It only removes it from the uobjects list, uverbs_uobject_put() is
>> still
>> + * required.
>> + */
>> +static void uverbs_idr_remove_uobj(struct ib_uobject *uobj)
>> +{
>> +     spin_lock(&uobj->context->ufile->idr_lock);
>> +     idr_remove(&uobj->context->ufile->idr, uobj->id);
>> +     spin_unlock(&uobj->context->ufile->idr_lock);
>> +}
>> +
>> +/* Returns the ib_uobject or an error. The caller should check for
>> IS_ERR. */
>> +static struct ib_uobject *lookup_get_idr_uobject(const struct
>> uverbs_obj_type *type,
>> +                                              struct ib_ucontext *ucontext,
>> +                                              int id, bool write)
>> +{
>> +     struct ib_uobject *uobj;
>> +
>> +     rcu_read_lock();
>> +     /* object won't be released as we're protected in rcu */
>> +     uobj = idr_find(&ucontext->ufile->idr, id);
>> +     if (!uobj) {
>> +             uobj = ERR_PTR(-ENOENT);
>> +             goto free;
>> +     }
>> +
>> +     uverbs_uobject_get(uobj);
>> +free:
>> +     rcu_read_unlock();
>> +     return uobj;
>> +}
>> +
>> +struct ib_uobject *rdma_lookup_get_uobject(const struct
>> uverbs_obj_type *type,
>> +                                        struct ib_ucontext *ucontext,
>> +                                        int id, bool write)
>> +{
>> +     struct ib_uobject *uobj;
>> +     int ret;
>> +
>> +     uobj = type->type_class->lookup_get(type, ucontext, id, write);
>> +     if (IS_ERR(uobj))
>> +             return uobj;
>> +
>> +     if (uobj->type != type) {
>> +             ret = -EINVAL;
>> +             goto free;
>> +     }
>> +
>> +     ret = uverbs_try_lock_object(uobj, write);
>> +     if (ret) {
>> +             WARN(ucontext->cleanup_reason,
>> +                  "ib_uverbs: Trying to lookup_get while cleanup
>> context\n");
>> +             goto free;
>> +     }
>> +
>> +     return uobj;
>> +free:
>> +     uobj->type->type_class->lookup_put(uobj, write);
>> +     uverbs_uobject_put(uobj);
>
> There's an unexpected asymmetry here.  lookup_get is pairing with lookup_put + uobject_put.
>

lookup_get also calls uverbs_uobject_get. It's done in the idr/fd's
callback, as sometimes we need
to wrap it in rcu (or some other equivalent mechanism). In the
previous version, it was more symmetrical
but Jason suggested simplicity over symmetry and I think it looks
better this way.

>> +     return ERR_PTR(ret);
>> +}
>> +
>> +static struct ib_uobject *alloc_begin_idr_uobject(const struct
>> uverbs_obj_type *type,
>> +                                               struct ib_ucontext *ucontext)
>> +{
>> +     int ret;
>> +     struct ib_uobject *uobj;
>> +
>> +     uobj = alloc_uobj(ucontext, type);
>> +     if (IS_ERR(uobj))
>> +             return uobj;
>> +
>> +     ret = idr_add_uobj(uobj);
>> +     if (ret)
>> +             goto uobj_put;
>> +
>> +     ret = ib_rdmacg_try_charge(&uobj->cg_obj, ucontext->device,
>> +                                RDMACG_RESOURCE_HCA_OBJECT);
>> +     if (ret)
>> +             goto idr_remove;
>> +
>> +     return uobj;
>> +
>> +idr_remove:
>> +     uverbs_idr_remove_uobj(uobj);
>> +uobj_put:
>> +     uverbs_uobject_put(uobj);
>> +     return ERR_PTR(ret);
>> +}
>> +
>> +struct ib_uobject *rdma_alloc_begin_uobject(const struct
>> uverbs_obj_type *type,
>> +                                         struct ib_ucontext *ucontext)
>> +{
>> +     return type->type_class->alloc_begin(type, ucontext);
>> +}
>> +
>> +static void uverbs_uobject_add(struct ib_uobject *uobject)
>> +{
>> +     mutex_lock(&uobject->context->uobjects_lock);
>> +     list_add(&uobject->list, &uobject->context->uobjects);
>> +     mutex_unlock(&uobject->context->uobjects_lock);
>> +}
>> +
>> +static int __must_check remove_commit_idr_uobject(struct ib_uobject
>> *uobj,
>> +                                               enum rdma_remove_reason why)
>> +{
>> +     const struct uverbs_obj_idr_type *idr_type =
>> +             container_of(uobj->type, struct uverbs_obj_idr_type,
>> +                          type);
>> +     int ret = idr_type->destroy_object(uobj, why);
>> +
>> +     /*
>> +      * We can only fail gracefully if the user requested to destroy
>> the
>> +      * object. In the rest of the cases, just remove whatever you
>> can.
>> +      */
>> +     if (why == RDMA_REMOVE_DESTROY && ret)
>> +             return ret;
>> +
>> +     ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
>> +                        RDMACG_RESOURCE_HCA_OBJECT);
>> +     uverbs_idr_remove_uobj(uobj);
>> +
>> +     return ret;
>> +}
>> +
>> +static void lockdep_check(struct ib_uobject *uobj, bool write)
>> +{
>> +#ifdef CONFIG_LOCKDEP
>> +     if (write)
>> +             WARN_ON(atomic_read(&uobj->usecnt) > 0);
>> +     else
>> +             WARN_ON(atomic_read(&uobj->usecnt) == -1);
>> +#endif
>> +}
>> +
>> +static int __must_check _rdma_remove_commit_uobject(struct ib_uobject
>> *uobj,
>> +                                                 enum rdma_remove_reason why,
>> +                                                 bool lock)
>> +{
>> +     int ret;
>> +     struct ib_ucontext *ucontext = uobj->context;
>> +
>> +     ret = uobj->type->type_class->remove_commit(uobj, why);
>> +     if (ret && why == RDMA_REMOVE_DESTROY) {
>> +             /* We couldn't remove the object, so just unlock the
>> uobject */
>> +             atomic_set(&uobj->usecnt, 0);
>> +             uobj->type->type_class->lookup_put(uobj, true);
>> +     } else {
>> +             if (lock)
>> +                     mutex_lock(&ucontext->uobjects_lock);
>> +             list_del(&uobj->list);
>> +             if (lock)
>> +                     mutex_unlock(&ucontext->uobjects_lock);
>> +             /* put the ref we took when we created the object */
>> +             uverbs_uobject_put(uobj);
>
> Please try to restructure the code so that locking state doesn't need to be carried through to functions like this.
>
>

Yeah, actually, I could just put the uobj->...->lookup_put + list_del
+ uobject_put straight in the cleanup_ucontext code.

>> +     }
>> +
>> +     return ret;
>> +}
>> +
>> +/* This is called only for user requested DESTROY reasons */
>> +int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj)
>> +{
>> +     int ret;
>> +     struct ib_ucontext *ucontext = uobj->context;
>> +
>> +     /* put the ref count we took at lookup_get */
>> +     uverbs_uobject_put(uobj);
>> +     /* Cleanup is running. Calling this should have been impossible
>> */
>> +     if (!down_read_trylock(&ucontext->cleanup_rwsem)) {
>> +             WARN(true, "ib_uverbs: Cleanup is running while removing
>> an uobject\n");
>> +             return 0;
>> +     }
>> +     lockdep_check(uobj, true);
>> +     ret = _rdma_remove_commit_uobject(uobj, RDMA_REMOVE_DESTROY,
>> true);
>> +
>> +     up_read(&ucontext->cleanup_rwsem);
>> +     return ret;
>> +}
>> +
>> +static void alloc_commit_idr_uobject(struct ib_uobject *uobj)
>> +{
>> +     uverbs_uobject_add(uobj);
>> +     spin_lock(&uobj->context->ufile->idr_lock);
>> +     /*
>> +      * We already allocated this IDR with a NULL object, so
>> +      * this shouldn't fail.
>> +      */
>> +     WARN_ON(idr_replace(&uobj->context->ufile->idr,
>> +                         uobj, uobj->id));
>> +     spin_unlock(&uobj->context->ufile->idr_lock);
>> +}
>> +
>> +int rdma_alloc_commit_uobject(struct ib_uobject *uobj)
>> +{
>> +     /* Cleanup is running. Calling this should have been impossible
>> */
>> +     if (!down_read_trylock(&uobj->context->cleanup_rwsem)) {
>> +             int ret;
>> +
>> +             WARN(true, "ib_uverbs: Cleanup is running while allocating
>> an uobject\n");
>> +             ret = uobj->type->type_class->remove_commit(uobj,
>> +
>> RDMA_REMOVE_DURING_CLEANUP);
>> +             if (ret)
>> +                     pr_warn("ib_uverbs: cleanup of idr object %d
>> failed\n",
>> +                             uobj->id);
>> +             return ret;
>> +     }
>> +
>> +     uobj->type->type_class->alloc_commit(uobj);
>> +     up_read(&uobj->context->cleanup_rwsem);
>> +
>> +     return 0;
>> +}
>> +
>> +static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
>> +{
>> +     uverbs_idr_remove_uobj(uobj);
>> +     ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
>> +                        RDMACG_RESOURCE_HCA_OBJECT);
>> +     uverbs_uobject_put(uobj);
>> +}
>> +
>> +void rdma_alloc_abort_uobject(struct ib_uobject *uobj)
>> +{
>> +     uobj->type->type_class->alloc_abort(uobj);
>> +}
>> +
>> +static void lookup_put_idr_uobject(struct ib_uobject *uobj, bool
>> write)
>> +{
>> +}
>> +
>> +void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool write)
>> +{
>> +     lockdep_check(uobj, write);
>> +     uobj->type->type_class->lookup_put(uobj, write);
>> +     /*
>> +      * In order to unlock an object, either decrease its usecnt for
>> +      * read access or zero it in case of write access. See
>> +      * uverbs_try_lock_object for locking schema information.
>> +      */
>> +     if (!write)
>> +             atomic_dec(&uobj->usecnt);
>> +     else
>> +             atomic_set(&uobj->usecnt, 0);
>> +
>> +     uverbs_uobject_put(uobj);
>> +}
>> +
>> +const struct uverbs_obj_type_class uverbs_idr_class = {
>> +     .alloc_begin = alloc_begin_idr_uobject,
>> +     .lookup_get = lookup_get_idr_uobject,
>> +     .alloc_commit = alloc_commit_idr_uobject,
>> +     .alloc_abort = alloc_abort_idr_uobject,
>> +     .lookup_put = lookup_put_idr_uobject,
>> +     .remove_commit = remove_commit_idr_uobject,
>> +     /*
>> +      * When we destroy an object, we first just lock it for WRITE
>> and
>> +      * actually DESTROY it in the finalize stage. So, the
>> problematic
>> +      * scenario is when we just started the finalize stage of the
>> +      * destruction (nothing was executed yet). Now, the other thread
>> +      * fetched the object for READ access, but it didn't lock it
>> yet.
>> +      * The DESTROY thread continues and starts destroying the
>> object.
>> +      * When the other thread continue - without the RCU, it would
>> +      * access freed memory. However, the rcu_read_lock delays the
>> free
>> +      * until the rcu_read_lock of the READ operation quits. Since
>> the
>> +      * write lock of the object is still taken by the DESTROY flow,
>> the
>> +      * READ operation will get -EBUSY and it'll just bail out.
>> +      */
>> +     .needs_kfree_rcu = true,
>> +};
>> +
>> +void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool
>> device_removed)
>> +{
>> +     enum rdma_remove_reason reason = device_removed ?
>> +             RDMA_REMOVE_DRIVER_REMOVE : RDMA_REMOVE_CLOSE;
>> +     unsigned int cur_order = 0;
>> +
>> +     ucontext->cleanup_reason = reason;
>> +     /*
>> +      * Waits for all remove_commit and alloc_commit to finish.
>> Logically, We
>> +      * want to hold this forever as the context is going to be
>> destroyed,
>> +      * but we'll release it since it causes a "held lock freed" BUG
>> message.
>> +      */
>> +     down_write(&ucontext->cleanup_rwsem);
>> +
>> +     while (!list_empty(&ucontext->uobjects)) {
>> +             struct ib_uobject *obj, *next_obj;
>> +             unsigned int next_order = UINT_MAX;
>> +
>> +             /*
>> +              * This shouldn't run while executing other commands on
>> this
>> +              * context.
>> +              */
>> +             mutex_lock(&ucontext->uobjects_lock);
>> +             list_for_each_entry_safe(obj, next_obj, &ucontext-
>> >uobjects,
>> +                                      list)
>
> Please add braces
>

Sure :)

>> +                     if (obj->type->destroy_order == cur_order) {
>> +                             int ret;
>> +
>> +                             /*
>> +                              * if we hit this WARN_ON, that means we are
>> +                              * racing with a lookup_get.
>> +                              */
>> +                             WARN_ON(uverbs_try_lock_object(obj, true));
>> +                             ret = _rdma_remove_commit_uobject(obj, reason,
>> +                                                               false);
>> +                             if (ret)
>> +                                     pr_warn("ib_uverbs: failed to remove
>> uobject id %d order %u\n",
>> +                                             obj->id, cur_order);
>> +                     } else {
>> +                             next_order = min(next_order,
>> +                                              obj->type->destroy_order);
>> +                     }
>> +             mutex_unlock(&ucontext->uobjects_lock);
>> +             cur_order = next_order;
>> +     }
>> +     up_write(&ucontext->cleanup_rwsem);
>> +}
>> +
>> +void uverbs_initialize_ucontext(struct ib_ucontext *ucontext)
>> +{
>> +     ucontext->cleanup_reason = 0;
>> +     mutex_init(&ucontext->uobjects_lock);
>> +     INIT_LIST_HEAD(&ucontext->uobjects);
>> +     init_rwsem(&ucontext->cleanup_rwsem);
>> +}
>> +
>> diff --git a/drivers/infiniband/core/rdma_core.h
>> b/drivers/infiniband/core/rdma_core.h
>> new file mode 100644
>> index 0000000..ab665a6
>> --- /dev/null
>> +++ b/drivers/infiniband/core/rdma_core.h
>> @@ -0,0 +1,55 @@
>> +/*
>> + * Copyright (c) 2005 Topspin Communications.  All rights reserved.
>> + * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
>> + * Copyright (c) 2005-2017 Mellanox Technologies. All rights
>> reserved.
>> + * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
>> + * Copyright (c) 2005 PathScale, Inc. All rights reserved.
>> + *
>> + * This software is available to you under a choice of one of two
>> + * licenses.  You may choose to be licensed under the terms of the
>> GNU
>> + * General Public License (GPL) Version 2, available from the file
>> + * COPYING in the main directory of this source tree, or the
>> + * OpenIB.org BSD license below:
>> + *
>> + *     Redistribution and use in source and binary forms, with or
>> + *     without modification, are permitted provided that the
>> following
>> + *     conditions are met:
>> + *
>> + *      - Redistributions of source code must retain the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer.
>> + *
>> + *      - Redistributions in binary form must reproduce the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer in the documentation and/or other materials
>> + *        provided with the distribution.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
>> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
>> HOLDERS
>> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
>> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
>> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
>> + * SOFTWARE.
>> + */
>> +
>> +#ifndef RDMA_CORE_H
>> +#define RDMA_CORE_H
>> +
>> +#include <linux/idr.h>
>> +#include <rdma/uverbs_types.h>
>> +#include <rdma/ib_verbs.h>
>> +#include <linux/mutex.h>
>> +
>> +/*
>> + * These functions initialize the context and cleanups its uobjects.
>> + * The context has a list of objects which is protected by a mutex
>> + * on the context. initialize_ucontext should be called when we
>> create
>> + * a context.
>> + * cleanup_ucontext removes all uobjects from the context and puts
>> them.
>> + */
>> +void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool
>> device_removed);
>> +void uverbs_initialize_ucontext(struct ib_ucontext *ucontext);
>> +
>> +#endif /* RDMA_CORE_H */
>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
>> index 319e691..d3efd22 100644
>> --- a/include/rdma/ib_verbs.h
>> +++ b/include/rdma/ib_verbs.h
>> @@ -1357,6 +1357,17 @@ struct ib_fmr_attr {
>>
>>  struct ib_umem;
>>
>> +enum rdma_remove_reason {
>> +     /* Userspace requested uobject deletion. Call could fail */
>> +     RDMA_REMOVE_DESTROY,
>> +     /* Context deletion. This call should delete the actual object
>> itself */
>> +     RDMA_REMOVE_CLOSE,
>> +     /* Driver is being hot-unplugged. This call should delete the
>> actual object itself */
>> +     RDMA_REMOVE_DRIVER_REMOVE,
>> +     /* Context is being cleaned-up, but commit was just completed */
>> +     RDMA_REMOVE_DURING_CLEANUP,
>> +};
>> +
>>  struct ib_rdmacg_object {
>>  #ifdef CONFIG_CGROUP_RDMA
>>       struct rdma_cgroup      *cg;            /* owner rdma cgroup */
>> @@ -1379,6 +1390,13 @@ struct ib_ucontext {
>>       struct list_head        rwq_ind_tbl_list;
>>       int                     closing;
>>
>> +     /* locking the uobjects_list */
>> +     struct mutex            uobjects_lock;
>> +     struct list_head        uobjects;
>> +     /* protects cleanup process from other actions */
>> +     struct rw_semaphore     cleanup_rwsem;
>> +     enum rdma_remove_reason cleanup_reason;
>> +
>>       struct pid             *tgid;
>>  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
>>       struct rb_root      umem_tree;
>> @@ -1409,8 +1427,11 @@ struct ib_uobject {
>>       int                     id;             /* index into kernel idr */
>>       struct kref             ref;
>>       struct rw_semaphore     mutex;          /* protects .live */
>> +     atomic_t                usecnt;         /* protects exclusive access
>> */
>>       struct rcu_head         rcu;            /* kfree_rcu() overhead */
>>       int                     live;
>> +
>> +     const struct uverbs_obj_type *type;
>>  };
>>
>>  struct ib_udata {
>> diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h
>> new file mode 100644
>> index 0000000..0777e40
>> --- /dev/null
>> +++ b/include/rdma/uverbs_types.h
>> @@ -0,0 +1,132 @@
>> +/*
>> + * Copyright (c) 2017, Mellanox Technologies inc.  All rights
>> reserved.
>> + *
>> + * This software is available to you under a choice of one of two
>> + * licenses.  You may choose to be licensed under the terms of the
>> GNU
>> + * General Public License (GPL) Version 2, available from the file
>> + * COPYING in the main directory of this source tree, or the
>> + * OpenIB.org BSD license below:
>> + *
>> + *     Redistribution and use in source and binary forms, with or
>> + *     without modification, are permitted provided that the
>> following
>> + *     conditions are met:
>> + *
>> + *      - Redistributions of source code must retain the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer.
>> + *
>> + *      - Redistributions in binary form must reproduce the above
>> + *        copyright notice, this list of conditions and the following
>> + *        disclaimer in the documentation and/or other materials
>> + *        provided with the distribution.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
>> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
>> HOLDERS
>> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
>> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
>> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
>> + * SOFTWARE.
>> + */
>> +
>> +#ifndef _UVERBS_TYPES_
>> +#define _UVERBS_TYPES_
>> +
>> +#include <linux/kernel.h>
>> +#include <rdma/ib_verbs.h>
>> +
>> +struct uverbs_obj_type;
>> +
>> +struct uverbs_obj_type_class {
>> +     /*
>> +      * Get an ib_uobject that corresponds to the given id from
>> ucontext,
>> +      * These functions could create or destroy objects if required.
>> +      * The action will be finalized only when commit, abort or put
>> fops are
>> +      * called.
>> +      * The flow of the different actions is:
>> +      * [alloc]:      Starts with alloc_begin. The handlers logic is than
>> +      *               executed. If the handler is successful,
>> alloc_commit
>> +      *               is called and the object is inserted to the
>> repository.
>> +      *               Once alloc_commit completes the object is visible
>> to
>> +      *               other threads and userspace.
>> +      e               Otherwise, alloc_abort is called and the object is
>> +      *               destroyed.
>> +      * [lookup]:     Starts with lookup_get which fetches and
>> locks the
>> +      *               object. After the handler finished using the
>> object, it
>> +      *               needs to call lookup_put to unlock it. The write
>> flag
>> +      *               indicates if the object is locked for exclusive
>> access.
>> +      * [remove]:     Starts with lookup_get with write flag set.
>> This locks
>> +      *               the object for exclusive access. If the handler
>> code
>> +      *               completed successfully, remove_commit is called and
>> +      *               the ib_uobject is removed from the context's
>> uobjects
>> +      *               repository and put. The object itself is destroyed
>> as
>> +      *               well. Once remove succeeds new krefs to the object
>> +      *               cannot be acquired by other threads or userspace
>> and
>> +      *               the hardware driver is removed from the object.
>> +      *               Other krefs on the object may still exist.
>> +      *               If the handler code failed, lookup_put should be
>> +      *               called. This callback is used when the context
>> +      *               is destroyed as well (process termination,
>> +      *               reset flow).
>> +      */
>> +     struct ib_uobject *(*alloc_begin)(const struct uverbs_obj_type
>> *type,
>> +                                       struct ib_ucontext *ucontext);
>> +     void (*alloc_commit)(struct ib_uobject *uobj);
>> +     void (*alloc_abort)(struct ib_uobject *uobj);
>> +
>> +     struct ib_uobject *(*lookup_get)(const struct uverbs_obj_type
>> *type,
>> +                                      struct ib_ucontext *ucontext, int id,
>> +                                      bool write);
>> +     void (*lookup_put)(struct ib_uobject *uobj, bool write);
>
> Rather than passing in a write/exclusive flag to a bunch of different calls, why not just have separate calls?  E.g. get_shared/put_shared, get_excl/put_excl?
>

Actually, there are only two functions which get "exclusive" flag.
That's the lookup_get and lookup_put.
Currently, in respect of idr/fd class types, this flag only used by fd
in order to forbid exclusive access.
I don't think that qualifies another set of _excel and _shared
callbacks. Maybe, instead of having these callbacks,
we could add .allow_exclusive flag on the type itself.

Regarding the rdma_lookup_get/put_uobject APIs, we could consider
splitting them to two separate functions.
However, they are so similar that I think sharing the code might be
better than having two separate calls. Trying
to sketch this up looks like:

struct ib_uobject *rdma_lookup_get_uobject_excl(const struct
uverbs_obj_type *type,
                                                struct ib_ucontext *ucontext,
                                                int id)
{
        struct ib_uobject *uobj;
        int ret;

        if (!type->type_class->allows_exclusive_access)
                return ERR_PTR(-EINVAL);

        uobj = type->type_class->lookup_get(type, ucontext, id);
        if (IS_ERR(uobj))
                return uobj;

        if (uobj->type != type) {
                ret = -EINVAL;
                goto free;
        }

        ret = uverbs_try_lock_object_excl(uobj);
        if (ret) {
                WARN(ucontext->cleanup_reason,
                     "ib_uverbs: Trying to lookup_get while cleanup context\n");
                goto free;
        }

        return uobj;
free:
        uobj->type->type_class->lookup_put(uobj);
        uverbs_uobject_put(uobj);
        return ERR_PTR(ret);
}

struct ib_uobject *rdma_lookup_get_uobject_shared(const struct
uverbs_obj_type *type,
                                                  struct ib_ucontext *ucontext,
                                                  int id)
{
        struct ib_uobject *uobj;
        int ret;

        uobj = type->type_class->lookup_get(type, ucontext, id);
        if (IS_ERR(uobj))
                return uobj;

        if (uobj->type != type) {
                ret = -EINVAL;
                goto free;
        }

        ret = uverbs_try_lock_object_shared(uobj);
        if (ret) {
                WARN(ucontext->cleanup_reason,
                     "ib_uverbs: Trying to lookup_get while cleanup context\n");
                goto free;
        }

        return uobj;
free:
        uobj->type->type_class->lookup_put(uobj);
        uverbs_uobject_put(uobj);
        return ERR_PTR(ret);
}

>> +     /*
>> +      * Must be called with the write lock held. If successful uobj
>> is
>> +      * invalid on return. On failure uobject is left completely
>> +      * unchanged
>> +      */
>> +     int __must_check (*remove_commit)(struct ib_uobject *uobj,
>> +                                       enum rdma_remove_reason why);
>
> Or add matching remove_begin()/remove_abort() calls.
>

Not sure we really need them. They're identical (functionality wise)
to callbacks we already have.
If you think about that, remove requires an exclusive access. When we
do that lookup_get, we have
no idea if remove is going to succeed.
Since remove can fail, it's possible that after the removal attampt
the only thing we'll need is to put this object (unlock its state)
So lookup_get and lookup_put covers this logic exactly. For the term
of documentation, we can wrap them with
macros or inline functions, but I'm not sure this is even necassary or
make the code more readable.

>> +     u8    needs_kfree_rcu;
>> +};
>> +
>> +struct uverbs_obj_type {
>> +     const struct uverbs_obj_type_class * const type_class;
>> +     size_t       obj_size;
>> +     unsigned int destroy_order;
>> +};
>> +
>> +/*
>> + * Objects type classes which support a detach state (object is still
>> alive but
>> + * it's not attached to any context need to make sure:
>> + * (a) no call through to a driver after a detach is called
>> + * (b) detach isn't called concurrently with context_cleanup
>> + */
>> +
>> +struct uverbs_obj_idr_type {
>> +     /*
>> +      * In idr based objects, uverbs_obj_type_class points to a
>> generic
>> +      * idr operations. In order to specialize the underlying types
>> (e.g. CQ,
>> +      * QPs, etc.), we add destroy_object specific callbacks.
>> +      */
>> +     struct uverbs_obj_type  type;
>> +
>> +     /* Free driver resources from the uobject, make the driver
>> uncallable,
>> +      * and move the uobject to the detached state. If the object was
>> +      * destroyed by the user's request, a failure should leave the
>> uobject
>> +      * completely unchanged.
>> +      */
>> +     int __must_check (*destroy_object)(struct ib_uobject *uobj,
>> +                                        enum rdma_remove_reason why);
>> +};
>> +
>> +struct ib_uobject *rdma_lookup_get_uobject(const struct
>> uverbs_obj_type *type,
>> +                                        struct ib_ucontext *ucontext,
>> +                                        int id, bool write);
>> +void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool write);
>> +struct ib_uobject *rdma_alloc_begin_uobject(const struct
>> uverbs_obj_type *type,
>> +                                         struct ib_ucontext *ucontext);
>> +void rdma_alloc_abort_uobject(struct ib_uobject *uobj);
>> +int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj);
>> +int rdma_alloc_commit_uobject(struct ib_uobject *uobj);
>> +
>> +#endif
>
> In general, this code requires a lot of in-function commenting, which suggests complexity.  The general approach seems reasonable based on what I've read so far.
>

Thanks for the review.
Regarding the simple cosmetic changes - If Doug prefers, I can send
another round. However, I think integrating these patches so people
could start evaluate them
and send these cosmetic changes as fixups is preferrable.

> - Sean

Matan

> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe April 5, 2017, 3:50 p.m. UTC | #3
On Wed, Apr 05, 2017 at 01:55:22PM +0300, Matan Barak wrote:
> >> +struct ib_uobject *rdma_lookup_get_uobject(const struct
> >> uverbs_obj_type *type,
> >> +                                        struct ib_ucontext *ucontext,
> >> +                                        int id, bool write)
> >> +{
> >> +     struct ib_uobject *uobj;
> >> +     int ret;
> >> +
> >> +     uobj = type->type_class->lookup_get(type, ucontext, id, write);
> >> +     if (IS_ERR(uobj))
> >> +             return uobj;
> >> +
> >> +     if (uobj->type != type) {
> >> +             ret = -EINVAL;
> >> +             goto free;
> >> +     }
> >> +
> >> +     ret = uverbs_try_lock_object(uobj, write);
> >> +     if (ret) {
> >> +             WARN(ucontext->cleanup_reason,
> >> +                  "ib_uverbs: Trying to lookup_get while cleanup
> >> context\n");
> >> +             goto free;
> >> +     }
> >> +
> >> +     return uobj;
> >> +free:
> >> +     uobj->type->type_class->lookup_put(uobj, write);
> >> +     uverbs_uobject_put(uobj);
> >
> > There's an unexpected asymmetry here.  lookup_get is pairing with lookup_put + uobject_put.
> >
> 
> lookup_get also calls uverbs_uobject_get. It's done in the idr/fd's
> callback, as sometimes we need to wrap it in rcu (or some other
> equivalent mechanism). In the previous version, it was more
> symmetrical but Jason suggested simplicity over symmetry and I think
> it looks better this way.

The real problem here is that we have 'rdma_lookup_put' and
'uvbers_uobject_put' with very similar names which is very confusing.

Do we really need to have lookup_put at all? This is only to hold on
to the 'struct file *' across the lookup, which doesn't seem
important.

I suspect we can simplify this by eliminating the implicit fget held
by lookup_get and instead use an accessor to access the 'struct file
*' in the few places that need to do that:

  struct file *uverbs_get_file(struct ib_uobject *object)

We don't really care about the ordering here, if a caller does

 uobj = rdma_lookup_get_uobject(...);
 filp = uverbs_get_file(uobj);
 fput(filep);
 uverbs_uobject_put(uobj);

And filp is NULL because it raced with close(), we can cope with it
just fine.

With this approach we could get rid of the confusing rdma_lookup_put
entirely.

> >> +      */
> >> +     struct ib_uobject *(*alloc_begin)(const struct uverbs_obj_type
> >> *type,
> >> +                                       struct ib_ucontext *ucontext);
> >> +     void (*alloc_commit)(struct ib_uobject *uobj);
> >> +     void (*alloc_abort)(struct ib_uobject *uobj);
> >> +
> >> +     struct ib_uobject *(*lookup_get)(const struct uverbs_obj_type
> >> *type,
> >> +                                      struct ib_ucontext *ucontext, int id,
> >> +                                      bool write);
> >> +     void (*lookup_put)(struct ib_uobject *uobj, bool write);
> >
> > Rather than passing in a write/exclusive flag to a bunch of different calls, why not just have separate calls?  E.g. get_shared/put_shared, get_excl/put_excl?
> >
> 
> Actually, there are only two functions which get "exclusive" flag.
> That's the lookup_get and lookup_put.
> Currently, in respect of idr/fd class types, this flag only used by fd
> in order to forbid exclusive access.

Why doesn't uverbs_try_lock_object work with FDs? I understand that we
don't use it right now, but that doesn't seem to explain why we
couldn't.

try_lock_object for a FD could hold the flip and the refcount?

> I don't think that qualifies another set of _excel and _shared
> callbacks. Maybe, instead of having these callbacks,
> we could add .allow_exclusive flag on the type itself.

Yes, that is nicer if we need this.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford April 5, 2017, 5:33 p.m. UTC | #4
On Wed, 2017-04-05 at 13:55 +0300, Matan Barak wrote:
> Thanks for the review.
> Regarding the simple cosmetic changes - If Doug prefers, I can send
> another round. However, I think integrating these patches so people
> could start evaluate them
> and send these cosmetic changes as fixups is preferrable.

I agree, I've pulled in the v3 patchset.  I'll push it out today and
further changes can be incremental.
Leon Romanovsky April 5, 2017, 5:49 p.m. UTC | #5
On Wed, Apr 05, 2017 at 01:33:59PM -0400, Doug Ledford wrote:
> On Wed, 2017-04-05 at 13:55 +0300, Matan Barak wrote:
> > Thanks for the review.
> > Regarding the simple cosmetic changes - If Doug prefers, I can send
> > another round. However, I think integrating these patches so people
> > could start evaluate them
> > and send these cosmetic changes as fixups is preferrable.
>
> I agree, I've pulled in the v3 patchset.  I'll push it out today and
> further changes can be incremental.

Thanks Doug,

Are you going to merge it into your for-4.12 branch? I need to know it,
so I'll be able to properly build our development branches.

>
> --
> Doug Ledford <dledford@redhat.com>
>     GPG KeyID: B826A3330E572FDD
>    
> Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford April 5, 2017, 6:51 p.m. UTC | #6
On Wed, 2017-04-05 at 20:49 +0300, Leon Romanovsky wrote:
> On Wed, Apr 05, 2017 at 01:33:59PM -0400, Doug Ledford wrote:
> > 
> > On Wed, 2017-04-05 at 13:55 +0300, Matan Barak wrote:
> > > 
> > > Thanks for the review.
> > > Regarding the simple cosmetic changes - If Doug prefers, I can
> > > send
> > > another round. However, I think integrating these patches so
> > > people
> > > could start evaluate them
> > > and send these cosmetic changes as fixups is preferrable.
> > 
> > I agree, I've pulled in the v3 patchset.  I'll push it out today
> > and
> > further changes can be incremental.
> 
> Thanks Doug,
> 
> Are you going to merge it into your for-4.12 branch? I need to know
> it,
> so I'll be able to properly build our development branches.

Well, I can't merge it into that branch because that's where I was when
I applied the patches ;-)

They've been pushed out, so you should be able to see what you need.
diff mbox

Patch

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index e426ac8..d29f910 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -29,4 +29,5 @@  ib_umad-y :=			user_mad.o
 
 ib_ucm-y :=			ucm.o
 
-ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o
+ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
+				rdma_core.o
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
new file mode 100644
index 0000000..1cbc053
--- /dev/null
+++ b/drivers/infiniband/core/rdma_core.c
@@ -0,0 +1,450 @@ 
+/*
+ * Copyright (c) 2016, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_types.h>
+#include <linux/rcupdate.h>
+#include "uverbs.h"
+#include "core_priv.h"
+#include "rdma_core.h"
+
+void uverbs_uobject_get(struct ib_uobject *uobject)
+{
+	kref_get(&uobject->ref);
+}
+
+static void uverbs_uobject_put_ref(struct kref *ref)
+{
+	struct ib_uobject *uobj =
+		container_of(ref, struct ib_uobject, ref);
+
+	if (uobj->type->type_class->needs_kfree_rcu)
+		kfree_rcu(uobj, rcu);
+	else
+		kfree(uobj);
+}
+
+void uverbs_uobject_put(struct ib_uobject *uobject)
+{
+	kref_put(&uobject->ref, uverbs_uobject_put_ref);
+}
+
+static int uverbs_try_lock_object(struct ib_uobject *uobj, bool write)
+{
+	/*
+	 * When a read is required, we use a positive counter. Each read
+	 * request checks that the value != -1 and increment it. Write
+	 * requires an exclusive access, thus we check that the counter is
+	 * zero (nobody claimed this object) and we set it to -1.
+	 * Releasing a read lock is done by simply decreasing the counter.
+	 * As for writes, since only a single write is permitted, setting
+	 * it to zero is enough for releasing it.
+	 */
+	if (!write)
+		return __atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ?
+			-EBUSY : 0;
+
+	/* lock is either WRITE or DESTROY - should be exclusive */
+	return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY;
+}
+
+static struct ib_uobject *alloc_uobj(struct ib_ucontext *context,
+				     const struct uverbs_obj_type *type)
+{
+	struct ib_uobject *uobj = kmalloc(type->obj_size, GFP_KERNEL);
+
+	if (!uobj)
+		return ERR_PTR(-ENOMEM);
+	/*
+	 * user_handle should be filled by the handler,
+	 * The object is added to the list in the commit stage.
+	 */
+	uobj->context = context;
+	uobj->type = type;
+	atomic_set(&uobj->usecnt, 0);
+	kref_init(&uobj->ref);
+
+	return uobj;
+}
+
+static int idr_add_uobj(struct ib_uobject *uobj)
+{
+	int ret;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&uobj->context->ufile->idr_lock);
+
+	/*
+	 * We start with allocating an idr pointing to NULL. This represents an
+	 * object which isn't initialized yet. We'll replace it later on with
+	 * the real object once we commit.
+	 */
+	ret = idr_alloc(&uobj->context->ufile->idr, NULL, 0,
+			min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT);
+	if (ret >= 0)
+		uobj->id = ret;
+
+	spin_unlock(&uobj->context->ufile->idr_lock);
+	idr_preload_end();
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * It only removes it from the uobjects list, uverbs_uobject_put() is still
+ * required.
+ */
+static void uverbs_idr_remove_uobj(struct ib_uobject *uobj)
+{
+	spin_lock(&uobj->context->ufile->idr_lock);
+	idr_remove(&uobj->context->ufile->idr, uobj->id);
+	spin_unlock(&uobj->context->ufile->idr_lock);
+}
+
+/* Returns the ib_uobject or an error. The caller should check for IS_ERR. */
+static struct ib_uobject *lookup_get_idr_uobject(const struct uverbs_obj_type *type,
+						 struct ib_ucontext *ucontext,
+						 int id, bool write)
+{
+	struct ib_uobject *uobj;
+
+	rcu_read_lock();
+	/* object won't be released as we're protected in rcu */
+	uobj = idr_find(&ucontext->ufile->idr, id);
+	if (!uobj) {
+		uobj = ERR_PTR(-ENOENT);
+		goto free;
+	}
+
+	uverbs_uobject_get(uobj);
+free:
+	rcu_read_unlock();
+	return uobj;
+}
+
+struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_obj_type *type,
+					   struct ib_ucontext *ucontext,
+					   int id, bool write)
+{
+	struct ib_uobject *uobj;
+	int ret;
+
+	uobj = type->type_class->lookup_get(type, ucontext, id, write);
+	if (IS_ERR(uobj))
+		return uobj;
+
+	if (uobj->type != type) {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	ret = uverbs_try_lock_object(uobj, write);
+	if (ret) {
+		WARN(ucontext->cleanup_reason,
+		     "ib_uverbs: Trying to lookup_get while cleanup context\n");
+		goto free;
+	}
+
+	return uobj;
+free:
+	uobj->type->type_class->lookup_put(uobj, write);
+	uverbs_uobject_put(uobj);
+	return ERR_PTR(ret);
+}
+
+static struct ib_uobject *alloc_begin_idr_uobject(const struct uverbs_obj_type *type,
+						  struct ib_ucontext *ucontext)
+{
+	int ret;
+	struct ib_uobject *uobj;
+
+	uobj = alloc_uobj(ucontext, type);
+	if (IS_ERR(uobj))
+		return uobj;
+
+	ret = idr_add_uobj(uobj);
+	if (ret)
+		goto uobj_put;
+
+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ucontext->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto idr_remove;
+
+	return uobj;
+
+idr_remove:
+	uverbs_idr_remove_uobj(uobj);
+uobj_put:
+	uverbs_uobject_put(uobj);
+	return ERR_PTR(ret);
+}
+
+struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type,
+					    struct ib_ucontext *ucontext)
+{
+	return type->type_class->alloc_begin(type, ucontext);
+}
+
+static void uverbs_uobject_add(struct ib_uobject *uobject)
+{
+	mutex_lock(&uobject->context->uobjects_lock);
+	list_add(&uobject->list, &uobject->context->uobjects);
+	mutex_unlock(&uobject->context->uobjects_lock);
+}
+
+static int __must_check remove_commit_idr_uobject(struct ib_uobject *uobj,
+						  enum rdma_remove_reason why)
+{
+	const struct uverbs_obj_idr_type *idr_type =
+		container_of(uobj->type, struct uverbs_obj_idr_type,
+			     type);
+	int ret = idr_type->destroy_object(uobj, why);
+
+	/*
+	 * We can only fail gracefully if the user requested to destroy the
+	 * object. In the rest of the cases, just remove whatever you can.
+	 */
+	if (why == RDMA_REMOVE_DESTROY && ret)
+		return ret;
+
+	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
+			   RDMACG_RESOURCE_HCA_OBJECT);
+	uverbs_idr_remove_uobj(uobj);
+
+	return ret;
+}
+
+static void lockdep_check(struct ib_uobject *uobj, bool write)
+{
+#ifdef CONFIG_LOCKDEP
+	if (write)
+		WARN_ON(atomic_read(&uobj->usecnt) > 0);
+	else
+		WARN_ON(atomic_read(&uobj->usecnt) == -1);
+#endif
+}
+
+static int __must_check _rdma_remove_commit_uobject(struct ib_uobject *uobj,
+						    enum rdma_remove_reason why,
+						    bool lock)
+{
+	int ret;
+	struct ib_ucontext *ucontext = uobj->context;
+
+	ret = uobj->type->type_class->remove_commit(uobj, why);
+	if (ret && why == RDMA_REMOVE_DESTROY) {
+		/* We couldn't remove the object, so just unlock the uobject */
+		atomic_set(&uobj->usecnt, 0);
+		uobj->type->type_class->lookup_put(uobj, true);
+	} else {
+		if (lock)
+			mutex_lock(&ucontext->uobjects_lock);
+		list_del(&uobj->list);
+		if (lock)
+			mutex_unlock(&ucontext->uobjects_lock);
+		/* put the ref we took when we created the object */
+		uverbs_uobject_put(uobj);
+	}
+
+	return ret;
+}
+
+/* This is called only for user requested DESTROY reasons */
+int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj)
+{
+	int ret;
+	struct ib_ucontext *ucontext = uobj->context;
+
+	/* put the ref count we took at lookup_get */
+	uverbs_uobject_put(uobj);
+	/* Cleanup is running. Calling this should have been impossible */
+	if (!down_read_trylock(&ucontext->cleanup_rwsem)) {
+		WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n");
+		return 0;
+	}
+	lockdep_check(uobj, true);
+	ret = _rdma_remove_commit_uobject(uobj, RDMA_REMOVE_DESTROY, true);
+
+	up_read(&ucontext->cleanup_rwsem);
+	return ret;
+}
+
+static void alloc_commit_idr_uobject(struct ib_uobject *uobj)
+{
+	uverbs_uobject_add(uobj);
+	spin_lock(&uobj->context->ufile->idr_lock);
+	/*
+	 * We already allocated this IDR with a NULL object, so
+	 * this shouldn't fail.
+	 */
+	WARN_ON(idr_replace(&uobj->context->ufile->idr,
+			    uobj, uobj->id));
+	spin_unlock(&uobj->context->ufile->idr_lock);
+}
+
+int rdma_alloc_commit_uobject(struct ib_uobject *uobj)
+{
+	/* Cleanup is running. Calling this should have been impossible */
+	if (!down_read_trylock(&uobj->context->cleanup_rwsem)) {
+		int ret;
+
+		WARN(true, "ib_uverbs: Cleanup is running while allocating an uobject\n");
+		ret = uobj->type->type_class->remove_commit(uobj,
+							    RDMA_REMOVE_DURING_CLEANUP);
+		if (ret)
+			pr_warn("ib_uverbs: cleanup of idr object %d failed\n",
+				uobj->id);
+		return ret;
+	}
+
+	uobj->type->type_class->alloc_commit(uobj);
+	up_read(&uobj->context->cleanup_rwsem);
+
+	return 0;
+}
+
+static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
+{
+	uverbs_idr_remove_uobj(uobj);
+	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
+			   RDMACG_RESOURCE_HCA_OBJECT);
+	uverbs_uobject_put(uobj);
+}
+
+void rdma_alloc_abort_uobject(struct ib_uobject *uobj)
+{
+	uobj->type->type_class->alloc_abort(uobj);
+}
+
+static void lookup_put_idr_uobject(struct ib_uobject *uobj, bool write)
+{
+}
+
+void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool write)
+{
+	lockdep_check(uobj, write);
+	uobj->type->type_class->lookup_put(uobj, write);
+	/*
+	 * In order to unlock an object, either decrease its usecnt for
+	 * read access or zero it in case of write access. See
+	 * uverbs_try_lock_object for locking schema information.
+	 */
+	if (!write)
+		atomic_dec(&uobj->usecnt);
+	else
+		atomic_set(&uobj->usecnt, 0);
+
+	uverbs_uobject_put(uobj);
+}
+
+const struct uverbs_obj_type_class uverbs_idr_class = {
+	.alloc_begin = alloc_begin_idr_uobject,
+	.lookup_get = lookup_get_idr_uobject,
+	.alloc_commit = alloc_commit_idr_uobject,
+	.alloc_abort = alloc_abort_idr_uobject,
+	.lookup_put = lookup_put_idr_uobject,
+	.remove_commit = remove_commit_idr_uobject,
+	/*
+	 * When we destroy an object, we first just lock it for WRITE and
+	 * actually DESTROY it in the finalize stage. So, the problematic
+	 * scenario is when we just started the finalize stage of the
+	 * destruction (nothing was executed yet). Now, the other thread
+	 * fetched the object for READ access, but it didn't lock it yet.
+	 * The DESTROY thread continues and starts destroying the object.
+	 * When the other thread continue - without the RCU, it would
+	 * access freed memory. However, the rcu_read_lock delays the free
+	 * until the rcu_read_lock of the READ operation quits. Since the
+	 * write lock of the object is still taken by the DESTROY flow, the
+	 * READ operation will get -EBUSY and it'll just bail out.
+	 */
+	.needs_kfree_rcu = true,
+};
+
+void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed)
+{
+	enum rdma_remove_reason reason = device_removed ?
+		RDMA_REMOVE_DRIVER_REMOVE : RDMA_REMOVE_CLOSE;
+	unsigned int cur_order = 0;
+
+	ucontext->cleanup_reason = reason;
+	/*
+	 * Waits for all remove_commit and alloc_commit to finish. Logically, We
+	 * want to hold this forever as the context is going to be destroyed,
+	 * but we'll release it since it causes a "held lock freed" BUG message.
+	 */
+	down_write(&ucontext->cleanup_rwsem);
+
+	while (!list_empty(&ucontext->uobjects)) {
+		struct ib_uobject *obj, *next_obj;
+		unsigned int next_order = UINT_MAX;
+
+		/*
+		 * This shouldn't run while executing other commands on this
+		 * context.
+		 */
+		mutex_lock(&ucontext->uobjects_lock);
+		list_for_each_entry_safe(obj, next_obj, &ucontext->uobjects,
+					 list)
+			if (obj->type->destroy_order == cur_order) {
+				int ret;
+
+				/*
+				 * if we hit this WARN_ON, that means we are
+				 * racing with a lookup_get.
+				 */
+				WARN_ON(uverbs_try_lock_object(obj, true));
+				ret = _rdma_remove_commit_uobject(obj, reason,
+								  false);
+				if (ret)
+					pr_warn("ib_uverbs: failed to remove uobject id %d order %u\n",
+						obj->id, cur_order);
+			} else {
+				next_order = min(next_order,
+						 obj->type->destroy_order);
+			}
+		mutex_unlock(&ucontext->uobjects_lock);
+		cur_order = next_order;
+	}
+	up_write(&ucontext->cleanup_rwsem);
+}
+
+void uverbs_initialize_ucontext(struct ib_ucontext *ucontext)
+{
+	ucontext->cleanup_reason = 0;
+	mutex_init(&ucontext->uobjects_lock);
+	INIT_LIST_HEAD(&ucontext->uobjects);
+	init_rwsem(&ucontext->cleanup_rwsem);
+}
+
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
new file mode 100644
index 0000000..ab665a6
--- /dev/null
+++ b/drivers/infiniband/core/rdma_core.h
@@ -0,0 +1,55 @@ 
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005-2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMA_CORE_H
+#define RDMA_CORE_H
+
+#include <linux/idr.h>
+#include <rdma/uverbs_types.h>
+#include <rdma/ib_verbs.h>
+#include <linux/mutex.h>
+
+/*
+ * These functions initialize the context and cleanups its uobjects.
+ * The context has a list of objects which is protected by a mutex
+ * on the context. initialize_ucontext should be called when we create
+ * a context.
+ * cleanup_ucontext removes all uobjects from the context and puts them.
+ */
+void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed);
+void uverbs_initialize_ucontext(struct ib_ucontext *ucontext);
+
+#endif /* RDMA_CORE_H */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 319e691..d3efd22 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1357,6 +1357,17 @@  struct ib_fmr_attr {
 
 struct ib_umem;
 
+enum rdma_remove_reason {
+	/* Userspace requested uobject deletion. Call could fail */
+	RDMA_REMOVE_DESTROY,
+	/* Context deletion. This call should delete the actual object itself */
+	RDMA_REMOVE_CLOSE,
+	/* Driver is being hot-unplugged. This call should delete the actual object itself */
+	RDMA_REMOVE_DRIVER_REMOVE,
+	/* Context is being cleaned-up, but commit was just completed */
+	RDMA_REMOVE_DURING_CLEANUP,
+};
+
 struct ib_rdmacg_object {
 #ifdef CONFIG_CGROUP_RDMA
 	struct rdma_cgroup	*cg;		/* owner rdma cgroup */
@@ -1379,6 +1390,13 @@  struct ib_ucontext {
 	struct list_head	rwq_ind_tbl_list;
 	int			closing;
 
+	/* locking the uobjects_list */
+	struct mutex		uobjects_lock;
+	struct list_head	uobjects;
+	/* protects cleanup process from other actions */
+	struct rw_semaphore	cleanup_rwsem;
+	enum rdma_remove_reason cleanup_reason;
+
 	struct pid             *tgid;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	struct rb_root      umem_tree;
@@ -1409,8 +1427,11 @@  struct ib_uobject {
 	int			id;		/* index into kernel idr */
 	struct kref		ref;
 	struct rw_semaphore	mutex;		/* protects .live */
+	atomic_t		usecnt;		/* protects exclusive access */
 	struct rcu_head		rcu;		/* kfree_rcu() overhead */
 	int			live;
+
+	const struct uverbs_obj_type *type;
 };
 
 struct ib_udata {
diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h
new file mode 100644
index 0000000..0777e40
--- /dev/null
+++ b/include/rdma/uverbs_types.h
@@ -0,0 +1,132 @@ 
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _UVERBS_TYPES_
+#define _UVERBS_TYPES_
+
+#include <linux/kernel.h>
+#include <rdma/ib_verbs.h>
+
+struct uverbs_obj_type;
+
+struct uverbs_obj_type_class {
+	/*
+	 * Get an ib_uobject that corresponds to the given id from ucontext,
+	 * These functions could create or destroy objects if required.
+	 * The action will be finalized only when commit, abort or put fops are
+	 * called.
+	 * The flow of the different actions is:
+	 * [alloc]:	 Starts with alloc_begin. The handlers logic is than
+	 *		 executed. If the handler is successful, alloc_commit
+	 *		 is called and the object is inserted to the repository.
+	 *		 Once alloc_commit completes the object is visible to
+	 *		 other threads and userspace.
+	 e		 Otherwise, alloc_abort is called and the object is
+	 *		 destroyed.
+	 * [lookup]:	 Starts with lookup_get which fetches and locks the
+	 *		 object. After the handler finished using the object, it
+	 *		 needs to call lookup_put to unlock it. The write flag
+	 *		 indicates if the object is locked for exclusive access.
+	 * [remove]:	 Starts with lookup_get with write flag set. This locks
+	 *		 the object for exclusive access. If the handler code
+	 *		 completed successfully, remove_commit is called and
+	 *		 the ib_uobject is removed from the context's uobjects
+	 *		 repository and put. The object itself is destroyed as
+	 *		 well. Once remove succeeds new krefs to the object
+	 *		 cannot be acquired by other threads or userspace and
+	 *		 the hardware driver is removed from the object.
+	 *		 Other krefs on the object may still exist.
+	 *		 If the handler code failed, lookup_put should be
+	 *		 called. This callback is used when the context
+	 *		 is destroyed as well (process termination,
+	 *		 reset flow).
+	 */
+	struct ib_uobject *(*alloc_begin)(const struct uverbs_obj_type *type,
+					  struct ib_ucontext *ucontext);
+	void (*alloc_commit)(struct ib_uobject *uobj);
+	void (*alloc_abort)(struct ib_uobject *uobj);
+
+	struct ib_uobject *(*lookup_get)(const struct uverbs_obj_type *type,
+					 struct ib_ucontext *ucontext, int id,
+					 bool write);
+	void (*lookup_put)(struct ib_uobject *uobj, bool write);
+	/*
+	 * Must be called with the write lock held. If successful uobj is
+	 * invalid on return. On failure uobject is left completely
+	 * unchanged
+	 */
+	int __must_check (*remove_commit)(struct ib_uobject *uobj,
+					  enum rdma_remove_reason why);
+	u8    needs_kfree_rcu;
+};
+
+struct uverbs_obj_type {
+	const struct uverbs_obj_type_class * const type_class;
+	size_t	     obj_size;
+	unsigned int destroy_order;
+};
+
+/*
+ * Objects type classes which support a detach state (object is still alive but
+ * it's not attached to any context need to make sure:
+ * (a) no call through to a driver after a detach is called
+ * (b) detach isn't called concurrently with context_cleanup
+ */
+
+struct uverbs_obj_idr_type {
+	/*
+	 * In idr based objects, uverbs_obj_type_class points to a generic
+	 * idr operations. In order to specialize the underlying types (e.g. CQ,
+	 * QPs, etc.), we add destroy_object specific callbacks.
+	 */
+	struct uverbs_obj_type  type;
+
+	/* Free driver resources from the uobject, make the driver uncallable,
+	 * and move the uobject to the detached state. If the object was
+	 * destroyed by the user's request, a failure should leave the uobject
+	 * completely unchanged.
+	 */
+	int __must_check (*destroy_object)(struct ib_uobject *uobj,
+					   enum rdma_remove_reason why);
+};
+
+struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_obj_type *type,
+					   struct ib_ucontext *ucontext,
+					   int id, bool write);
+void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool write);
+struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type,
+					    struct ib_ucontext *ucontext);
+void rdma_alloc_abort_uobject(struct ib_uobject *uobj);
+int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj);
+int rdma_alloc_commit_uobject(struct ib_uobject *uobj);
+
+#endif