
[v2,8/8] IB/isert: use ib_client API to wrap ib_device

Message ID 20180604123003.24748-9-roman.penyaev@profitbricks.com (mailing list archive)
State New, archived

Commit Message

Roman Pen June 4, 2018, 12:30 p.m. UTC
ib_client API provides a way to wrap an ib_device with a specific ULP
structure.  Using that API local lists and mutexes can be completely
avoided and allocation/removal paths become a bit cleaner.

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Doug Ledford <dledford@redhat.com>
Cc: target-devel@vger.kernel.org
---
 drivers/infiniband/ulp/isert/ib_isert.c | 96 ++++++++++++++++++++++-----------
 1 file changed, 65 insertions(+), 31 deletions(-)

Comments

Jason Gunthorpe June 4, 2018, 9:58 p.m. UTC | #1
On Mon, Jun 04, 2018 at 02:30:03PM +0200, Roman Pen wrote:
> ib_client API provides a way to wrap an ib_device with a specific ULP
> structure.  Using that API local lists and mutexes can be completely
> avoided and allocation/removal paths become a bit cleaner.
> 
> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.de>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Steve Wise <swise@opengridcomputing.com>
> Cc: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Sagi Grimberg <sagi@grimberg.me>
> Cc: Doug Ledford <dledford@redhat.com>
> Cc: target-devel@vger.kernel.org
>  drivers/infiniband/ulp/isert/ib_isert.c | 96 ++++++++++++++++++++++-----------
>  1 file changed, 65 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
> index 08dc9d27e5b1..c80f31573919 100644
> +++ b/drivers/infiniband/ulp/isert/ib_isert.c
> @@ -42,8 +42,7 @@ static int isert_debug_level;
>  module_param_named(debug_level, isert_debug_level, int, 0644);
>  MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)");
>  
> -static DEFINE_MUTEX(device_list_mutex);
> -static LIST_HEAD(device_list);
> +static struct ib_client isert_rdma_ib_client;
>  static struct workqueue_struct *isert_comp_wq;
>  static struct workqueue_struct *isert_release_wq;
>  
> @@ -341,7 +340,6 @@ isert_free_device_ib_res(struct isert_device *device)
>  static void
>  isert_device_put(struct isert_device *device)
>  {
> -	mutex_lock(&device_list_mutex);
>  	isert_info("device %p refcount %d\n", device,
>  		   refcount_read(&device->refcount));
>  	if (refcount_dec_and_test(&device->refcount)) {
> @@ -349,48 +347,44 @@ isert_device_put(struct isert_device *device)
>  		list_del(&device->dev_node);
>  		kfree(device);
>  	}
> -	mutex_unlock(&device_list_mutex);
>  }
>  
>  static struct isert_device *
>  isert_device_get(struct rdma_cm_id *cma_id)
>  {
>  	struct isert_device *device;
> -	int ret;
>  
> -	mutex_lock(&device_list_mutex);
> -	list_for_each_entry(device, &device_list, dev_node) {
> -		if (device->ib_device->node_guid == cma_id->device->node_guid) {
> -			refcount_inc(&device->refcount);
> -			isert_info("Found iser device %p refcount %d\n",
> -				   device, refcount_read(&device->refcount));
> -			mutex_unlock(&device_list_mutex);
> -			return device;
> -		}
> -	}
> +	/* Paired with isert_ib_client_remove_one() */
> +	rcu_read_lock();
> +	device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
> +	if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
> +		device = NULL;
> +	rcu_read_unlock();

I think I'd prefer a managed kref API instead of tricky
RCU..

ib_get_client_data doesn't use rcu_dereference, so this isn't
*technically* right

Something like:

 /* Returns NULL or a valid pointer holding a kref. The kref must be
    freed by the caller */
 struct kref_t *ib_get_client_kref(...);

 /* Atomically releases any kref already held and sets to a new
    value. If non-null a kref is obtained. */
 void ib_set_client_kref(struct kref_t *);

And a client_kref_put function pointer to struct ib_client to point to
the putter function.

The core code can safely manage all the required locking to guarantee
it returns a valid kref'd pointer, without needing to expose RCU.

Looks like many clients fit into this model already, so may as well
help them out.
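
A rough ULP-side sketch of what I have in mind, using isert as the
example (hypothetical names; it assumes isert_device holds a plain
struct kref, none of this exists today):

  static void isert_device_kref_release(struct kref *kref)
  {
          struct isert_device *device =
                  container_of(kref, struct isert_device, kref);

          isert_free_device_ib_res(device);
          kfree(device);
  }

  /* the 'putter' the core calls when it drops its own reference */
  static void isert_device_kref_put(struct kref *kref)
  {
          kref_put(kref, isert_device_kref_release);
  }

  static struct ib_client isert_rdma_ib_client = {
          .name            = "isert_client_ib",
          .add             = isert_ib_client_add_one,
          .remove          = isert_ib_client_remove_one,
          .client_kref_put = isert_device_kref_put,
  };

  static struct isert_device *isert_device_get(struct rdma_cm_id *cma_id)
  {
          struct kref *kref;

          /* NULL, or a pointer with a reference already taken for us */
          kref = ib_get_client_kref(cma_id->device, &isert_rdma_ib_client);
          if (!kref)
                  return NULL;

          return container_of(kref, struct isert_device, kref);
  }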

Jason
Roman Pen June 5, 2018, 10:20 a.m. UTC | #2
On Mon, Jun 4, 2018 at 11:58 PM, Jason Gunthorpe <jgg@ziepe.ca> wrote:
> On Mon, Jun 04, 2018 at 02:30:03PM +0200, Roman Pen wrote:
>> ib_client API provides a way to wrap an ib_device with a specific ULP
>> structure.  Using that API local lists and mutexes can be completely
>> avoided and allocation/removal paths become a bit cleaner.
>>
>> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.de>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: Steve Wise <swise@opengridcomputing.com>
>> Cc: Bart Van Assche <bart.vanassche@sandisk.com>
>> Cc: Sagi Grimberg <sagi@grimberg.me>
>> Cc: Doug Ledford <dledford@redhat.com>
>> Cc: target-devel@vger.kernel.org
>>  drivers/infiniband/ulp/isert/ib_isert.c | 96 ++++++++++++++++++++++-----------
>>  1 file changed, 65 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
>> index 08dc9d27e5b1..c80f31573919 100644
>> +++ b/drivers/infiniband/ulp/isert/ib_isert.c
>> @@ -42,8 +42,7 @@ static int isert_debug_level;
>>  module_param_named(debug_level, isert_debug_level, int, 0644);
>>  MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)");
>>
>> -static DEFINE_MUTEX(device_list_mutex);
>> -static LIST_HEAD(device_list);
>> +static struct ib_client isert_rdma_ib_client;
>>  static struct workqueue_struct *isert_comp_wq;
>>  static struct workqueue_struct *isert_release_wq;
>>
>> @@ -341,7 +340,6 @@ isert_free_device_ib_res(struct isert_device *device)
>>  static void
>>  isert_device_put(struct isert_device *device)
>>  {
>> -     mutex_lock(&device_list_mutex);
>>       isert_info("device %p refcount %d\n", device,
>>                  refcount_read(&device->refcount));
>>       if (refcount_dec_and_test(&device->refcount)) {
>> @@ -349,48 +347,44 @@ isert_device_put(struct isert_device *device)
>>               list_del(&device->dev_node);
>>               kfree(device);
>>       }
>> -     mutex_unlock(&device_list_mutex);
>>  }
>>
>>  static struct isert_device *
>>  isert_device_get(struct rdma_cm_id *cma_id)
>>  {
>>       struct isert_device *device;
>> -     int ret;
>>
>> -     mutex_lock(&device_list_mutex);
>> -     list_for_each_entry(device, &device_list, dev_node) {
>> -             if (device->ib_device->node_guid == cma_id->device->node_guid) {
>> -                     refcount_inc(&device->refcount);
>> -                     isert_info("Found iser device %p refcount %d\n",
>> -                                device, refcount_read(&device->refcount));
>> -                     mutex_unlock(&device_list_mutex);
>> -                     return device;
>> -             }
>> -     }
>> +     /* Paired with isert_ib_client_remove_one() */
>> +     rcu_read_lock();
>> +     device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
>> +     if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
>> +             device = NULL;
>> +     rcu_read_unlock();
>
> I think I'd prefer a managed kref API instead of tricky
> RCU..
>
> ib_get_client_data doesn't use rcu_dereference, so this isn't
> *technically* right

Why? ib_(get|set)_client_data() takes a spin lock, which guarantees
the pointer will be observed correctly and nothing will be reordered.
rcu_dereference() does READ_ONCE(), which in turn expands to a
volatile cast, which prevents the compiler from optimizing the access.
Of course the spin lock in ib_*_client_data() has stronger guarantees.

Here I used RCU only in order not to introduce another lock.
I need a grace period to be sure everybody either observes the
pointer as NULL *or* has safely increased the reference.

So these two variants are similar:

RCU

        /* READ PATH */

        rcu_read_lock();
        device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
        if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
                device = NULL;
        rcu_read_unlock();


        /* FREE PATH */

        ib_set_client_data(ib_device, &isert_rdma_ib_client, NULL);
        synchronize_rcu();
        isert_device_put(device);


LOCK

        /* READ PATH */

        spin_lock(&lock);
        device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
        if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
                device = NULL;
        spin_unlock(&lock);


        /* FREE PATH */

        ib_set_client_data(ib_device, &isert_rdma_ib_client, NULL);
        /* Blink with the lock to be sure everybody has left critical section */
        spin_lock(&lock);
        spin_unlock(&lock);
        isert_device_put(device);

        /* or even another FREE PATH, wrap ib_set_client_data() with
           lock/unlock */

        spin_lock(&lock);
        ib_set_client_data(ib_device, &isert_rdma_ib_client, NULL);
        spin_unlock(&lock);
        isert_device_put(device);

>
> Something like:
>
>  /* Returns NULL or a valid pointer holding a kref. The kref must be
>     freed by the caller */
>  struct kref_t *ib_get_client_kref(...);
>
>  /* Atomically releases any kref already held and sets to a new
>     value. If non-null a kref is obtained. */
>  void ib_set_client_kref(struct kref_t *);
>
> And a client_kref_put function pointer to struct ib_client to point to
> the putter function.
>
> The core code can safely manage all the required locking to guarantee
> it returns a valid kref'd pointer, without needing to expose RCU.

I understand your idea.  If refcount_inc_not_zero() can be invoked
directly from ib_get_client_data() under the spin lock, that will of
course eliminate the need for any locks outside of device.c.

But it seems this is not enough.  The proposed kref protects the read
path, when the ULP device is already set, but what if it is not?  I.e.
there should be some help from the core API to create the ULP device
and set it if client_data has not been set yet, i.e. something like
the following should be done under the lock inside device.c:

        mutex_lock(&nvme_devices_mutex);
        ndev = ib_get_client_data(cm_id->device, &nvme_rdma_ib_client);
        if (ndev && WARN_ON(!nvme_rdma_dev_get(ndev)))
                ndev = NULL;
        else if (!ndev) {
               ndev = nvme_rdma_alloc_device(cm_id->device);
               if (likely(ndev))
                   ib_set_client_data(cm_id->device, &nvme_rdma_ib_client,
                                      ndev);
        }
        mutex_unlock(&nvme_devices_mutex);

* code hunk is stolen from a previous email, where we discussed that
* the ULP device can be created on demand.

And it seems that such an approach will require some new ib_client
callbacks to create and free the ULP device, which I personally do
not like.

Can we instead follow the way of cmpxchg semantics:

        static struct nvme_rdma_device *
        nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
        {
                struct nvme_rdma_device *ndev;
                struct kref *kref;

                kref = ib_get_client_kref(cm_id->device,
                                          &nvme_rdma_ib_client);
                if (!kref) {
                        ndev = nvme_rdma_alloc_device(cm_id->device);
                        WARN_ON(!ndev); /* ignore error path for now */

                        kref = ib_cmpxchg_client_kref(cm_id->device,
&nvme_rdma_ib_client,
                                                      NULL, &ndev->kref);
                        if (kref) {
                                /* We raced with someone and lost, free ndev */
                                nvme_rdma_dev_put(ndev);
                        } else
                                /* We won! */
                                kref = &ndev->kref;
                }

                return container_of(kref, typeof(*ndev), kref);
        }


The major difference is that the pointer will be updated IFF it
was NULL.

IMO that can be worth doing because no extra callbacks or locks are
required and the ULP device has full control.
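
Where the core-side helper would be something like this (sketch only;
find_client_data()/store_client_data() stand in for whatever device.c
already does to look up and store per-client data):

        struct kref *ib_cmpxchg_client_kref(struct ib_device *device,
                                            struct ib_client *client,
                                            struct kref *old, struct kref *new)
        {
                struct kref *cur;
                unsigned long flags;

                spin_lock_irqsave(&device->client_data_lock, flags);
                cur = find_client_data(device, client);
                if (cur == old) {
                        store_client_data(device, client, new);
                        /* whether the core takes its own reference here
                         * is a separate design choice */
                        if (new)
                                kref_get(new);
                }
                spin_unlock_irqrestore(&device->client_data_lock, flags);

                /* old value; NULL means the caller won the race */
                return cur;
        }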

What do you think?

--
Roman
Jason Gunthorpe June 5, 2018, 2:22 p.m. UTC | #3
On Tue, Jun 05, 2018 at 12:20:49PM +0200, Roman Penyaev wrote:

> > I think I'd prefer a managed kref API instead of tricky
> > RCU..
> >
> > ib_get_client_data doesn't use rcu_dereference, so this isn't
> > *technically* right
> 
> Why? ib_(get|set)_client_data() takes spin_lock, which guarantees
> pointer will be observed correctly and nothing will be reordered.
> rcu_dereference() does READ_ONCE(), which in its turn expands to
> volatile cast, which prevents compiler from optimization.
> Of course spin lock in ib_*_client_data() has stronger guarantees.

Maybe, or maybe Alpha does something insane. Who knows with RCU..

But, generally, using RCU and locks together like this is sort of a
nonsense thing to do; the point of RCU is to avoid locks, so if you
have locks, then use the locks properly and forget about the RCU.

> > Something like:
> >
> >  /* Returns NULL or a valid pointer holding a kref. The kref must be
> >     freed by the caller */
> >  struct kref_t *ib_get_client_kref(...);
> >
> >  /* Atomically releases any kref already held and sets to a new
> >     value. If non-null a kref is obtained. */
> >  void ib_set_client_kref(struct kref_t *);
> >
> > And a client_kref_put function pointer to struct ib_client to point to
> > the putter function.
> >
> > The core code can safely manage all the required locking to guarantee
> > it returns a valid kref'd pointer, without needing to expose RCU.
> 
> I understand your idea.  If refcount_inc_not_zero() can be invoked
> directly from ib_get_client_data() under spin lock, that will of
> course eliminate the need to do any locks outside the device.c.

Well, this wouldn't need refcount_inc_not_zero() anymore, as the kref
is now guaranteed to be positive when in the core's storage list.

> But seems this is not enough.  Proposed kref protects read path,
> when ulp device is already set, but if not?  I.e. there should
> be some help from core API to create ulp device and set it,
> if client_data has not been set yet, i.e. something the following
> should be done under the lock inside device.c:

Yes, it would be nice for the core code to handle this atomically for
the ULP.

>         mutex_lock(&nvme_devices_mutex);
>         ndev = ib_get_client_data(cm_id->device, &nvme_rdma_ib_client);
>         if (ndev && WARN_ON(!nvme_rdma_dev_get(ndev)))
>                 ndev = NULL;
>         else if (!ndev) {
>                ndev = nvme_rdma_alloc_device(cmd_id->device);
>                if (likely(ndev))
>                    ib_set_client_data(cm_id->device, &nvmet_rdma_ib_client,
>                                       ndev);
>         }
>         mutex_unlock(&nvme_devices_mutex);

And this is too much complexity for ULPs.

> And it seems that such approach will require some new ib_client
> callbacks to create and free ulp device, which I personally do
> not like.

Why? Seems like a fine idea to me, and simplifies all the ULPs

Add two new callbacks:
 create_kref
 release_kref

And the core code will call create the very first time the kref data
is accessed, and do so safely under some kind of locking.

The core code will call release_kref when the ib_device is removed.
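
Very roughly, on the core side (a sketch only; client_data_mutex and
find_client_data()/store_client_data() are made-up stand-ins for
whatever device.c already does to look up and store per-client data,
and the lock would have to be a sleepable one):

  struct kref *ib_get_client_kref(struct ib_device *device,
                                  struct ib_client *client)
  {
          struct kref *kref;

          mutex_lock(&device->client_data_mutex);
          kref = find_client_data(device, client);
          if (!kref && client->create_kref) {
                  /* first access: let the ULP allocate its device */
                  kref = client->create_kref(device);
                  if (kref)
                          store_client_data(device, client, kref);
          }
          if (kref)
                  kref_get(kref);
          mutex_unlock(&device->client_data_mutex);

          return kref;
  }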

> Can we follow instead the way with cmpxchg semantics:
> 
>         static struct nvme_rdma_device *
>         nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
>         {
>                 struct nvme_rdma_device *ndev;
>                 struct kref *kref;
> 
>                 kref = ib_get_client_kref(cm_id->device,
>                                           &nvme_rdma_ib_client);
>                 if (!kref) {
>                         ndev = nvme_rdma_alloc_device(cm_id->device);
>                         WARN_ON(!ndev); /* ignore error path for now */
> 
>                         kref = ib_cmpxchg_client_kref(cm_id->device,
> &nvme_rdma_ib_client,
>                                                       NULL, &ndev->kref);
>                         if (kref) {
>                                 /* We raced with someone and lost, free ndev */
>                                 nvme_rdma_dev_put(ndev);
>                         } else
>                                 /* We won! */
>                                 kref = &ndev->kref;
>                 }
> 
>                 return container_of(kref, typeof(*ndev), kref);
>         }

Bleck, even more complexity.

Why? That is too much code duplication in every ULP for what?

Why such a mess to avoid a simple and well defined callback?

Jason
Roman Pen June 11, 2018, 8:55 a.m. UTC | #4
On Tue, Jun 5, 2018 at 4:22 PM, Jason Gunthorpe <jgg@ziepe.ca> wrote:
> On Tue, Jun 05, 2018 at 12:20:49PM +0200, Roman Penyaev wrote:
>
>> > I think I'd prefer a managed kref API instead of tricky
>> > RCU..
>> >
>> > ib_get_client_data doesn't use rcu_dereference, so this isn't
>> > *technically* right
>>
>> Why? ib_(get|set)_client_data() takes spin_lock, which guarantees
>> pointer will be observed correctly and nothing will be reordered.
>> rcu_dereference() does READ_ONCE(), which in its turn expands to
>> volatile cast, which prevents compiler from optimization.
>> Of course spin lock in ib_*_client_data() has stronger guarantees.
>
> Maybe, or maybe Alpha does something insane. Who knows with RCU..

The thing is that in this case it does not matter what memory model
your architecture has; we are trying to avoid a compiler optimization,
i.e. the following case:

   rcu_read_lock();
   if (*ptr)
       do_something();
   rcu_read_unlock();

can be optimized by the *compiler* (again, only compiler optimization
matters):

   tmp = *ptr;         <<< NOT NICE
   rcu_read_lock();
   if (tmp)
       do_something();
   rcu_read_unlock();

To guarantee that the load from @ptr happens after rcu_read_lock(),
the access to @ptr should be volatile (that is exactly what
rcu_dereference() does), i.e. it tells the compiler explicitly: keep
the exact order of these two lines.

To avoid confusion with all these RCU macros, the chunk can be
expanded and rewritten as follows:

    preempt_count_inc();      <<< just increase percpu variable
    barrier();                <<< compiler barrier, which prevents
                                  other volatile accesses from leaking up
    if (*(volatile int *)ptr) <<< explicit volatile access
        do_something();

    barrier();                <<< compiler barrier, prevents from
                                  leaking down
    preempt_count_dec();

Returning to my case:

    rcu_read_lock();
    device = ib_get_client_data(...);
    ...
    rcu_read_unlock();

Previously I mentioned that the ib_get_client_data() function has a
spin lock, but this actually also does not matter; what matters
is that this is a real function *call*.  The compiler in this case
is not able to predict what will happen in this function, and thus
keeps the order.  (The C standard explicitly defines a function call
as a sequence point (C99 Annex C), i.e. between sequence points,
where a volatile access is also a sequence point, the compiler
can't change the order of execution.)

Of course I do not consider here that ib_get_client_data() could
become a macro in the future.  In that case the spin lock inside
would start playing the main role.

> But, generally, using RCU and locks together like this is sort of a
> non-sense thing to do, the point of RCU is to avoid locks, so if you
> have locks, then use them locks properly and forget about the RCU.

I agree, it looks like a mess, but without support from the core API
it is impossible to do without additional protection: locks, RCU or
whatever you like.

>> > Something like:
>> >
>> >  /* Returns NULL or a valid pointer holding a kref. The kref must be
>> >     freed by the caller */
>> >  struct kref_t *ib_get_client_kref(...);
>> >
>> >  /* Atomically releases any kref already held and sets to a new
>> >     value. If non-null a kref is obtained. */
>> >  void ib_set_client_kref(struct kref_t *);
>> >
>> > And a client_kref_put function pointer to struct ib_client to point to
>> > the putter function.
>> >
>> > The core code can safely manage all the required locking to guarantee
>> > it returns a valid kref'd pointer, without needing to expose RCU.
>>
>> I understand your idea.  If refcount_inc_not_zero() can be invoked
>> directly from ib_get_client_data() under spin lock, that will of
>> course eliminate the need to do any locks outside the device.c.
>
> Well, this wouldn't need refcount_inc_not_zero() anymore, as the kref
> is now guaranteed to be positive when in the core's storage list.
>
>> But seems this is not enough.  Proposed kref protects read path,
>> when ulp device is already set, but if not?  I.e. there should
>> be some help from core API to create ulp device and set it,
>> if client_data has not been set yet, i.e. something the following
>> should be done under the lock inside device.c:
>
> Yes, it would be nice for the core code to handle this atomically for
> the ULP.
>
>>         mutex_lock(&nvme_devices_mutex);
>>         ndev = ib_get_client_data(cm_id->device, &nvme_rdma_ib_client);
>>         if (ndev && WARN_ON(!nvme_rdma_dev_get(ndev)))
>>                 ndev = NULL;
>>         else if (!ndev) {
>>                ndev = nvme_rdma_alloc_device(cmd_id->device);
>>                if (likely(ndev))
>>                    ib_set_client_data(cm_id->device, &nvmet_rdma_ib_client,
>>                                       ndev);
>>         }
>>         mutex_unlock(&nvme_devices_mutex);
>
> And this is too much complexity for ULPs.

Indeed, but it is explicit.  And please keep in mind I considered
here only the NVMe case.

>> And it seems that such approach will require some new ib_client
>> callbacks to create and free ulp device, which I personally do
>> not like.
>
> Why? Seems like a fine idea to me, and simplifies all the ULPs
>
> Add two new callbacks:
>  create_kref
>  release_kref
>
> And the core code will call create the very first time the kref data
> is accessed, and do so safely under some kind of locking.

The thing is that a spin lock is used inside ib_(get|set)_client_data(),
because those two can be called from any context; thus an
ib_client.alloc() callback can't be called under the spin lock,
because alloc() can sleep.

If we assume that each ULP knows that the very first
ib_alloc_or_get_client_data() (or any analogue) call happens on a path
where sleeping is allowed - then yes, that should work out.  But it
seems that should be covered by a fat comment.

But what if we go even further and make "struct ib_client_data" public,
with a kref inside?  That will require changes to the ib_client API,
please take a look:

--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h

@@

+struct ib_client_data;
+
 struct ib_client {
        char  *name;
        void (*add)   (struct ib_device *);
        void (*remove)(struct ib_device *, void *client_data);

+       struct ib_client_data *(*alloc_client_data)(struct ib_device *);
+       void (*free_client_data)(struct ib_client_data *);
+

@@

+struct ib_client_data {
+       struct kref kref;
+       struct ib_device *device;
+       struct ib_client *client;
+       struct list_head device_entry;
+       struct list_head client_entry;
+ };
+

- void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
- void  ib_set_client_data(struct ib_device *device, struct ib_client *client,
-                          void *data);

+ struct ib_client_data *
+ ib_alloc_or_get_client_data(struct ib_device *device,
+                             struct ib_client *client);
+ struct ib_client_data *
+ ib_get_client_data(struct ib_device *device,
+                    struct ib_client *client);
+ bool ib_put_client_data(struct ib_client_data *data);


Then each existing ULP should be changed accordingly, e.g.:

static void ulp_add_one(struct ib_device *device)
{
       struct ib_client_data *data;

       /*
        * Called here for the first time, i.e. ib_client.alloc_client_data()
        * would be invoked.  We do not put the reference and keep ULP dev
        * structure in core lists.
        */
       (void)ib_alloc_or_get_client_data(device, &ib_client);
}

static void ulp_remove_one(struct ib_device *device, void *client_data)
{
       ...
       /* Should be last reference put */
       WARN_ON(!ib_put_client_data(&sdev->ib_data));
}

static void ulp_event_handler(struct ib_event_handler *handler,
                              struct ib_event *event)
{
       struct ib_client_data *data;
       struct ulp_device *ulp_dev;

       data = ib_get_client_data(event->device, &ib_client);
       if (unlikely(!data))
                return;

       ulp_dev = container_of(data, typeof(*ulp_dev), ib_data);

       /*  ...
          Use ulp_dev
          ...
        */

       /* One reference is still alive */
       WARN_ON(ib_put_client_data(data));
}

And the NVMe and iSER cases would indeed become much simpler:
ib_alloc_or_get_client_data() can be called just from the cm_handler(),
where sleep is allowed.
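
E.g. the nvme_rdma_find_get_device() hunk from above would then collapse
to something like this (sketch, assuming struct nvme_rdma_device embeds
the ib_client_data as ->ib_data):

        static struct nvme_rdma_device *
        nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
        {
                struct ib_client_data *data;

                /* allocates via .alloc_client_data() on first use, may sleep */
                data = ib_alloc_or_get_client_data(cm_id->device,
                                                   &nvme_rdma_ib_client);
                if (unlikely(!data))
                        return NULL;

                return container_of(data, struct nvme_rdma_device, ib_data);
        }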


> The core code will call release_kref when the ib_device is removed.
>
>> Can we follow instead the way with cmpxchg semantics:
>>
>>         static struct nvme_rdma_device *
>>         nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
>>         {
>>                 struct nvme_rdma_device *ndev;
>>                 struct kref *kref;
>>
>>                 kref = ib_get_client_kref(cm_id->device,
>>                                           &nvme_rdma_ib_client);
>>                 if (!kref) {
>>                         ndev = nvme_rdma_alloc_device(cm_id->device);
>>                         WARN_ON(!ndev); /* ignore error path for now */
>>
>>                         kref = ib_cmpxchg_client_kref(cm_id->device,
>> &nvme_rdma_ib_client,
>>                                                       NULL, &ndev->kref);
>>                         if (kref) {
>>                                 /* We raced with someone and lost, free ndev */
>>                                 nvme_rdma_dev_put(ndev);
>>                         } else
>>                                 /* We won! */
>>                                 kref = &ndev->kref;
>>                 }
>>
>>                 return container_of(kref, typeof(*ndev), kref);
>>         }
>
> Bleck, even more complexity.
>
> Why? That is too much code duplication in every ULP for what?

Well, I considered only NVMe case, I did not plan to touch others.

> Why such a mess to avoid a simple and well defined callback?

Indeed we can, if change API and refactor internals of device.c a bit.
I can send an RFC in couple of days with making ib_client_data public
and the stuff which I described above.  What do you think?

--
Roman
Jason Gunthorpe June 11, 2018, 4:51 p.m. UTC | #5
On Mon, Jun 11, 2018 at 10:55:39AM +0200, Roman Penyaev wrote:

> can be optimized by the *compiler* (again, only compiler optimization
> matters):
> 
>    tmp = *ptr;         <<< NOT NICE
>    rcu_read_lock();
>    if (tmp)
>        do_something();
>    rcu_read_unlock();

No, this already isn't possible.

>     preempt_count_inc();      <<< just increase percpu variable
>     barrier();                <<< compiler barrier, which prevents
>                                   other volatile accesses leak up

barrier() prevents everything from moving out.

>     if (*(volatile int *)ptr) <<< explicit volatile access
>         do_something();

Alpha inserts a special CPU barrier in here.

> barrier();                <<< compiler barrier, prevents from
>                              leaking down
> preempt_count_dec();

The point of rcu_dereference is to pair with rcu_assign_pointer so the
code can make a control flow decision without re-ordering, eg:

 if (rcu_dereference(ptr) != 0)
     assert(*something != NULL);

Guarantees that if the store side does

 rcu_assign_pointer(ptr, 0);
 *something = NULL;

Then the assert cannot fire.

It has nothing to do with accesses leaking outside the RCU critical
section.

> Returning to my case:
> 
>     rcu_read_lock();
> device = ib_get_client_data(...);
> ...
> rcu_read_unlock();
> 
> Previously I mentioned that ib_get_client_data() function has a
> spin lock, but this actually also does not matter, what matters
> is that this is a real function *call*.

No, it does matter.  Code should never rely on a function call to
create an implicit compiler barrier, e.g. we have LTO now, which
washes that side-effect away.

The spinlock was the right answer, because there is no RCU protected
data here, and none of the special properties of RCU are required.

> >> And it seems that such approach will require some new ib_client
> >> callbacks to create and free ulp device, which I personally do
> >> not like.
> >
> > Why? Seems like a fine idea to me, and simplifies all the ULPs
> >
> > Add two new callbacks:
> >  create_kref
> >  release_kref
> >
> > And the core code will call create the very first time the kref data
> > is accessed, and do so safely under some kind of locking.
> 
> The thing is that inside ib_(get|set)_client_data() spin lock is used,
> because those two can be called from any context, thus ib_client.alloc()
> callback can't be called under spin lock, because alloc() can sleep.

> If we assume that each ULP knows that for the first time
> ib_alloc_or_get_client_data() (or any analogue) is called on the path,
> where sleep is allowed - then yes, that should work out.  But seems
> that should be covered by fat comment.

Seems reasonable..

> +struct ib_client_data;
> +
>  struct ib_client {
>         char  *name;
>         void (*add)   (struct ib_device *);
>         void (*remove)(struct ib_device *, void *client_data);
> 
> +       struct ib_client_data *(*alloc_client_data)(struct ib_device *);
> +       void (*free_client_data)(struct ib_client_data *);
> +

> +struct ib_client_data {
> +       struct kref kref;
> +       struct ib_device *device;
> +       struct ib_client *client;
> +       struct list_head device_entry;
> +       struct list_head client_entry;
> + };

Well, the point of the alloc callback was to not allocate data until
the client became activated; exposing this struct doesn't really seem
to help that, since you'd still need a void * in here with the data
pointer.

> > Why such a mess to avoid a simple and well defined callback?
> 
> Indeed we can, if change API and refactor internals of device.c a bit.
> I can send an RFC in couple of days with making ib_client_data public
> and the stuff which I described above.  What do you think?

Sure, you can mess around with the core code to make a nicer cleanup.

Jason
Roman Pen June 11, 2018, 5:43 p.m. UTC | #6
On Mon, Jun 11, 2018 at 6:51 PM, Jason Gunthorpe <jgg@ziepe.ca> wrote:
> On Mon, Jun 11, 2018 at 10:55:39AM +0200, Roman Penyaev wrote:
>
>> can be optimized by the *compiler* (again, only compiler optimization
>> matters):
>>
>>    tmp = *ptr;         <<< NOT NICE
>>    rcu_read_lock();
>>    if (tmp)
>>        do_something();
>>    rcu_read_unlock();
>
> No, this already isn't possible.
>
>>     preempt_count_inc();      <<< just increase percpu variable
>>     barrier();                <<< compiler barrier, which prevents
>>                                   other volatile accesses leak up
>
> barrier() prevents everything from moving out.
>
>>     if (*(volatile int *)ptr) <<< explicit volatile access
>>         do_something();
>
> Alpha inserts a special CPU barrier in here.
>
>> barrier();                <<< compiler barrier, prevents from
>>                              leaking down
>> preempt_count_dec();
>
> The point of rcu_dereference is to pair with rcu_assign_pointer so the
> code can make a control flow decision without re-ordering, eg:
>
>  if (rcu_dereference(ptr) != 0)
>      assert(*something != NULL);
>
> Guarantees that if the store side does
>
>  rcu_assign_pointer(ptr, 0);
>  *something = NULL;
>
> Then the assert cannot fire.
>
> It has nothing to do with accesses leaking outside the rcu critical,
> section.
>
>> Returning to my case:
>>
>>     rcu_read_lock();
>> device = ib_get_client_data(...);
>> ...
>> rcu_read_unlock();
>>
>> Previously I mentioned that ib_get_client_data() function has a
>> spin lock, but this actually also does not matter, what matters
>> is that this is a real function *call*.
>
> No, it does matter. Code should never rely on a function call to
> create an implicit compiler barrier. eg We have LTO now which washes
> that side-effect away.
>
> The spinlock was the right answer, because there is no RCU protected
> data here, and none of the special properties of RCU are required.

Indeed, I missed the Alpha case completely.  I was confused by the
volatile cast on x86_64, but it is needed only to prevent the compiler
from deducing the resulting pointer, not to prevent reordering.  Thanks
for pointing that out.

>
>> >> And it seems that such approach will require some new ib_client
>> >> callbacks to create and free ulp device, which I personally do
>> >> not like.
>> >
>> > Why? Seems like a fine idea to me, and simplifies all the ULPs
>> >
>> > Add two new callbacks:
>> >  create_kref
>> >  release_kref
>> >
>> > And the core code will call create the very first time the kref data
>> > is accessed, and do so safely under some kind of locking.
>>
>> The thing is that inside ib_(get|set)_client_data() spin lock is used,
>> because those two can be called from any context, thus ib_client.alloc()
>> callback can't be called under spin lock, because alloc() can sleep.
>
>> If we assume that each ULP knows that for the first time
>> ib_alloc_or_get_client_data() (or any analogue) is called on the path,
>> where sleep is allowed - then yes, that should work out.  But seems
>> that should be covered by fat comment.
>
> Seems reasonable..
>
>> +struct ib_client_data;
>> +
>>  struct ib_client {
>>         char  *name;
>>         void (*add)   (struct ib_device *);
>>         void (*remove)(struct ib_device *, void *client_data);
>>
>> +       struct ib_client_data *(*alloc_client_data)(struct ib_device *);
>> +       void (*free_client_data)(struct ib_client_data *);
>> +
>
>> +struct ib_client_data {
>> +       struct kref kref;
>> +       struct ib_device *device;
>> +       struct ib_client *client;
>> +       struct list_head device_entry;
>> +       struct list_head client_entry;
>> + };
>
> Well, the point of the alloc callback was to not alloc data until the
> client became activated,

That is true, but from the very beginning that can be used only for
new code: the NVMe and iSER cases.  Other ULP drivers can be switched
to the new API without significant changes, but still allocate data in
add_one() (i.e. preallocate), and then step by step that can also be
changed.  I am not sure I can refactor all the ULP drivers in one go.

> exposing this struct doesn't really seem to
> help that since you'd still need a void * in here with the data pointer.

Why?  "struct ib_client_data" can be embedded inside a ULP-specific
device structure, and then ulp_dev->ib_data can be linked into all the
lists inside device.c, so by exposing the ib_client_data structure we
can avoid setting a void * at all.
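
I.e. something like this (sketch):

        struct ulp_device {
                struct ib_client_data ib_data;  /* embedded, linked by device.c */
                /* ... ULP specific members ... */
        };

        static struct ib_client_data *
        ulp_alloc_client_data(struct ib_device *device)
        {
                struct ulp_device *ulp_dev;

                ulp_dev = kzalloc(sizeof(*ulp_dev), GFP_KERNEL);
                if (unlikely(!ulp_dev))
                        return NULL;

                /* device.c links this into its lists, no void * involved */
                return &ulp_dev->ib_data;
        }

and container_of() on any get path gives the ULP device back.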

>
>> > Why such a mess to avoid a simple and well defined callback?
>>
>> Indeed we can, if change API and refactor internals of device.c a bit.
>> I can send an RFC in couple of days with making ib_client_data public
>> and the stuff which I described above.  What do you think?
>
> Sure, you can mess around with the core code to make a nicer cleanup.

Ok, I will try to prepare an RFC.

--
Roman

Patch

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 08dc9d27e5b1..c80f31573919 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -42,8 +42,7 @@  static int isert_debug_level;
 module_param_named(debug_level, isert_debug_level, int, 0644);
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)");
 
-static DEFINE_MUTEX(device_list_mutex);
-static LIST_HEAD(device_list);
+static struct ib_client isert_rdma_ib_client;
 static struct workqueue_struct *isert_comp_wq;
 static struct workqueue_struct *isert_release_wq;
 
@@ -341,7 +340,6 @@  isert_free_device_ib_res(struct isert_device *device)
 static void
 isert_device_put(struct isert_device *device)
 {
-	mutex_lock(&device_list_mutex);
 	isert_info("device %p refcount %d\n", device,
 		   refcount_read(&device->refcount));
 	if (refcount_dec_and_test(&device->refcount)) {
@@ -349,48 +347,44 @@  isert_device_put(struct isert_device *device)
 		list_del(&device->dev_node);
 		kfree(device);
 	}
-	mutex_unlock(&device_list_mutex);
 }
 
 static struct isert_device *
 isert_device_get(struct rdma_cm_id *cma_id)
 {
 	struct isert_device *device;
-	int ret;
 
-	mutex_lock(&device_list_mutex);
-	list_for_each_entry(device, &device_list, dev_node) {
-		if (device->ib_device->node_guid == cma_id->device->node_guid) {
-			refcount_inc(&device->refcount);
-			isert_info("Found iser device %p refcount %d\n",
-				   device, refcount_read(&device->refcount));
-			mutex_unlock(&device_list_mutex);
-			return device;
-		}
-	}
+	/* Paired with isert_ib_client_remove_one() */
+	rcu_read_lock();
+	device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
+	if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
+		device = NULL;
+	rcu_read_unlock();
 
-	device = kzalloc(sizeof(struct isert_device), GFP_KERNEL);
-	if (!device) {
-		mutex_unlock(&device_list_mutex);
-		return ERR_PTR(-ENOMEM);
-	}
+	return device;
+}
+
+static struct isert_device *
+isert_device_alloc(struct ib_device *ib_device)
+{
+	struct isert_device *device;
+	int ret;
+
+	device = kzalloc(sizeof(*device), GFP_KERNEL);
+	if (unlikely(!device))
+		return NULL;
 
 	INIT_LIST_HEAD(&device->dev_node);
 	mutex_init(&device->comp_mutex);
-
-	device->ib_device = cma_id->device;
+	refcount_set(&device->refcount, 1);
+	device->ib_device = ib_device;
 	ret = isert_create_device_ib_res(device);
 	if (ret) {
 		kfree(device);
-		mutex_unlock(&device_list_mutex);
-		return ERR_PTR(ret);
+		return NULL;
 	}
-
-	refcount_inc(&device->refcount);
-	list_add_tail(&device->dev_node, &device_list);
 	isert_info("Created a new iser device %p refcount %d\n",
 		   device, refcount_read(&device->refcount));
-	mutex_unlock(&device_list_mutex);
 
 	return device;
 }
@@ -530,8 +524,8 @@  isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 		goto out;
 
 	device = isert_device_get(cma_id);
-	if (IS_ERR(device)) {
-		ret = PTR_ERR(device);
+	if (unlikely(!device)) {
+		ret = -ENODEV;
 		goto out_rsp_dma_map;
 	}
 	isert_conn->device = device;
@@ -2681,15 +2675,52 @@  static struct iscsit_transport iser_target_transport = {
 	.iscsit_get_sup_prot_ops = isert_get_sup_prot_ops,
 };
 
+static void isert_ib_client_add_one(struct ib_device *ib_device)
+{
+	struct isert_device *device;
+
+	device = isert_device_alloc(ib_device);
+	if (likely(device))
+		ib_set_client_data(ib_device, &isert_rdma_ib_client, device);
+	else
+		pr_err("Allocation of a device failed.\n");
+}
+
+static void isert_ib_client_remove_one(struct ib_device *ib_device,
+				       void *client_data)
+{
+	struct isert_device *device = client_data;
+
+	if (unlikely(!device))
+		return;
+
+	ib_set_client_data(ib_device, &isert_rdma_ib_client, NULL);
+	/* Paired with isert_device_get() */
+	synchronize_rcu();
+	isert_device_put(device);
+}
+
+static struct ib_client isert_rdma_ib_client = {
+	.name   = "isert_client_ib",
+	.add    = isert_ib_client_add_one,
+	.remove = isert_ib_client_remove_one
+};
+
 static int __init isert_init(void)
 {
 	int ret;
 
+	ret = ib_register_client(&isert_rdma_ib_client);
+	if (unlikely(ret)) {
+		isert_err("ib_register_client(): %d\n", ret);
+		return ret;
+	}
 	isert_comp_wq = alloc_workqueue("isert_comp_wq",
 					WQ_UNBOUND | WQ_HIGHPRI, 0);
 	if (!isert_comp_wq) {
 		isert_err("Unable to allocate isert_comp_wq\n");
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto unregister_client;
 	}
 
 	isert_release_wq = alloc_workqueue("isert_release_wq", WQ_UNBOUND,
@@ -2707,6 +2738,8 @@  static int __init isert_init(void)
 
 destroy_comp_wq:
 	destroy_workqueue(isert_comp_wq);
+unregister_client:
+	ib_unregister_client(&isert_rdma_ib_client);
 
 	return ret;
 }
@@ -2717,6 +2750,7 @@  static void __exit isert_exit(void)
 	destroy_workqueue(isert_release_wq);
 	destroy_workqueue(isert_comp_wq);
 	iscsit_unregister_transport(&iser_target_transport);
+	ib_unregister_client(&isert_rdma_ib_client);
 	isert_info("iSER_TARGET[0] - Released iser_target_transport\n");
 }