Message ID | 20180604123003.24748-9-roman.penyaev@profitbricks.com (mailing list archive)
---|---
State | New, archived
On Mon, Jun 04, 2018 at 02:30:03PM +0200, Roman Pen wrote:
> The ib_client API provides a way to wrap an ib_device with a ULP-specific
> structure. Using that API, local lists and mutexes can be completely
> avoided and the allocation/removal paths become a bit cleaner.
>
> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.de>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Steve Wise <swise@opengridcomputing.com>
> Cc: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Sagi Grimberg <sagi@grimberg.me>
> Cc: Doug Ledford <dledford@redhat.com>
> Cc: target-devel@vger.kernel.org
> ---
>  drivers/infiniband/ulp/isert/ib_isert.c | 96 ++++++++++++++++++++++-----------
>  1 file changed, 65 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
> index 08dc9d27e5b1..c80f31573919 100644
> --- a/drivers/infiniband/ulp/isert/ib_isert.c
> +++ b/drivers/infiniband/ulp/isert/ib_isert.c
> @@ -42,8 +42,7 @@ static int isert_debug_level;
>  module_param_named(debug_level, isert_debug_level, int, 0644);
>  MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)");
>
> -static DEFINE_MUTEX(device_list_mutex);
> -static LIST_HEAD(device_list);
> +static struct ib_client isert_rdma_ib_client;
>  static struct workqueue_struct *isert_comp_wq;
>  static struct workqueue_struct *isert_release_wq;
>
> @@ -341,7 +340,6 @@ isert_free_device_ib_res(struct isert_device *device)
>  static void
>  isert_device_put(struct isert_device *device)
>  {
> -	mutex_lock(&device_list_mutex);
>  	isert_info("device %p refcount %d\n", device,
>  		   refcount_read(&device->refcount));
>  	if (refcount_dec_and_test(&device->refcount)) {
> @@ -349,48 +347,44 @@ isert_device_put(struct isert_device *device)
>  		list_del(&device->dev_node);
>  		kfree(device);
>  	}
> -	mutex_unlock(&device_list_mutex);
>  }
>
>  static struct isert_device *
>  isert_device_get(struct rdma_cm_id *cma_id)
>  {
>  	struct isert_device *device;
> -	int ret;
>
> -	mutex_lock(&device_list_mutex);
> -	list_for_each_entry(device, &device_list, dev_node) {
> -		if (device->ib_device->node_guid == cma_id->device->node_guid) {
> -			refcount_inc(&device->refcount);
> -			isert_info("Found iser device %p refcount %d\n",
> -				   device, refcount_read(&device->refcount));
> -			mutex_unlock(&device_list_mutex);
> -			return device;
> -		}
> -	}
> +	/* Paired with isert_ib_client_remove_one() */
> +	rcu_read_lock();
> +	device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
> +	if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
> +		device = NULL;
> +	rcu_read_unlock();

I think I'd prefer a managed kref API instead of tricky RCU..

ib_get_client_data doesn't use rcu_dereference, so this isn't
*technically* right.

Something like:

  /* Returns NULL or a valid pointer holding a kref. The kref must be
     freed by the caller */
  struct kref_t *ib_get_client_kref(...);

  /* Atomically releases any kref already held and sets to a new
     value. If non-null a kref is obtained. */
  void ib_set_client_kref(struct kref_t *);

And a client_kref_put function pointer added to struct ib_client,
pointing to the putter function.

The core code can safely manage all the required locking to guarantee
it returns a valid kref'd pointer, without needing to expose RCU.

Looks like many clients fit into this model already, so we may as well
help them out.

Jason
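For illustration, a minimal sketch of the lookup half of the API Jason
outlines above. This is an assumption-laden sketch, not an existing kernel
API: ib_get_client_kref() and the __ib_lookup_client_data() helper are
hypothetical; only the client_data_lock locking mirrors what
ib_get_client_data() already does in core/device.c.

  /*
   * Hypothetical sketch of the kref-managed lookup Jason proposes.
   * __ib_lookup_client_data() is an invented helper; the locking
   * follows the existing ib_get_client_data() in core/device.c.
   */
  struct kref *ib_get_client_kref(struct ib_device *device,
                                  struct ib_client *client)
  {
          unsigned long flags;
          struct kref *kref;

          spin_lock_irqsave(&device->client_data_lock, flags);
          kref = __ib_lookup_client_data(device, client);
          if (kref)
                  kref_get(kref); /* count stays > 0 while stored */
          spin_unlock_irqrestore(&device->client_data_lock, flags);

          return kref;
  }

The point of the design is that the get/put pair is managed entirely inside
device.c, so a ULP never sees a window where the pointer is still visible
but its refcount has already dropped to zero.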
On Mon, Jun 4, 2018 at 11:58 PM, Jason Gunthorpe <jgg@ziepe.ca> wrote:
> On Mon, Jun 04, 2018 at 02:30:03PM +0200, Roman Pen wrote:
>> The ib_client API provides a way to wrap an ib_device with a ULP-specific
>> structure. Using that API, local lists and mutexes can be completely
>> avoided and the allocation/removal paths become a bit cleaner.
>>
>> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.de>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: Steve Wise <swise@opengridcomputing.com>
>> Cc: Bart Van Assche <bart.vanassche@sandisk.com>
>> Cc: Sagi Grimberg <sagi@grimberg.me>
>> Cc: Doug Ledford <dledford@redhat.com>
>> Cc: target-devel@vger.kernel.org
>> ---
>>  drivers/infiniband/ulp/isert/ib_isert.c | 96 ++++++++++++++++++++++-----------
>>  1 file changed, 65 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
>> index 08dc9d27e5b1..c80f31573919 100644
>> --- a/drivers/infiniband/ulp/isert/ib_isert.c
>> +++ b/drivers/infiniband/ulp/isert/ib_isert.c
>> @@ -42,8 +42,7 @@ static int isert_debug_level;
>>  module_param_named(debug_level, isert_debug_level, int, 0644);
>>  MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)");
>>
>> -static DEFINE_MUTEX(device_list_mutex);
>> -static LIST_HEAD(device_list);
>> +static struct ib_client isert_rdma_ib_client;
>>  static struct workqueue_struct *isert_comp_wq;
>>  static struct workqueue_struct *isert_release_wq;
>>
>> @@ -341,7 +340,6 @@ isert_free_device_ib_res(struct isert_device *device)
>>  static void
>>  isert_device_put(struct isert_device *device)
>>  {
>> -	mutex_lock(&device_list_mutex);
>>  	isert_info("device %p refcount %d\n", device,
>>  		   refcount_read(&device->refcount));
>>  	if (refcount_dec_and_test(&device->refcount)) {
>> @@ -349,48 +347,44 @@ isert_device_put(struct isert_device *device)
>>  		list_del(&device->dev_node);
>>  		kfree(device);
>>  	}
>> -	mutex_unlock(&device_list_mutex);
>>  }
>>
>>  static struct isert_device *
>>  isert_device_get(struct rdma_cm_id *cma_id)
>>  {
>>  	struct isert_device *device;
>> -	int ret;
>>
>> -	mutex_lock(&device_list_mutex);
>> -	list_for_each_entry(device, &device_list, dev_node) {
>> -		if (device->ib_device->node_guid == cma_id->device->node_guid) {
>> -			refcount_inc(&device->refcount);
>> -			isert_info("Found iser device %p refcount %d\n",
>> -				   device, refcount_read(&device->refcount));
>> -			mutex_unlock(&device_list_mutex);
>> -			return device;
>> -		}
>> -	}
>> +	/* Paired with isert_ib_client_remove_one() */
>> +	rcu_read_lock();
>> +	device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
>> +	if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
>> +		device = NULL;
>> +	rcu_read_unlock();
>
> I think I'd prefer a managed kref API instead of tricky
> RCU..
>
> ib_get_client_data doesn't use rcu_dereference, so this isn't
> *technically* right

Why? ib_(get|set)_client_data() takes a spin_lock, which guarantees the
pointer will be observed correctly and nothing will be reordered.
rcu_dereference() does READ_ONCE(), which in its turn expands to a
volatile cast, which prevents the compiler from optimizing the access.
Of course the spin lock in ib_*_client_data() has stronger guarantees.
Here I used RCU only in order not to introduce another lock. I need a
grace period to be sure everybody observes the pointer as NULL *or* the
reference is safely increased. So these two variants are similar:

RCU

  /* READ PATH */

  rcu_read_lock();
  device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
  if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
          device = NULL;
  rcu_read_unlock();

  /* FREE PATH */

  ib_set_client_data(ib_device, &isert_rdma_ib_client, NULL);
  synchronize_rcu();
  isert_device_put(device);

LOCK

  /* READ PATH */

  spin_lock(&lock);
  device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
  if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
          device = NULL;
  spin_unlock(&lock);

  /* FREE PATH */

  ib_set_client_data(ib_device, &isert_rdma_ib_client, NULL);
  /* Blink with the lock to be sure everybody has left the critical section */
  spin_lock(&lock);
  spin_unlock(&lock);
  isert_device_put(device);

  /* or even another FREE PATH, wrap ib_set_client_data() with lock/unlock */

  spin_lock(&lock);
  ib_set_client_data(ib_device, &isert_rdma_ib_client, NULL);
  spin_unlock(&lock);
  isert_device_put(device);

> Something like:
>
>   /* Returns NULL or a valid pointer holding a kref. The kref must be
>      freed by the caller */
>   struct kref_t *ib_get_client_kref(...);
>
>   /* Atomically releases any kref already held and sets to a new
>      value. If non-null a kref is obtained. */
>   void ib_set_client_kref(struct kref_t *);
>
> And a client_kref_put function pointer added to struct ib_client,
> pointing to the putter function.
>
> The core code can safely manage all the required locking to guarantee
> it returns a valid kref'd pointer, without needing to expose RCU.

I understand your idea. If refcount_inc_not_zero() can be invoked
directly from ib_get_client_data() under the spin lock, that will of
course eliminate the need to take any locks outside of device.c.

But it seems this is not enough. The proposed kref protects the read
path, when the ULP device is already set, but what if it is not?
I.e. there should be some help from the core API to create the ULP
device and set it if client_data has not been set yet, i.e. something
like the following should be done under the lock inside device.c:

  mutex_lock(&nvme_devices_mutex);
  ndev = ib_get_client_data(cm_id->device, &nvme_rdma_ib_client);
  if (ndev && WARN_ON(!nvme_rdma_dev_get(ndev)))
          ndev = NULL;
  else if (!ndev) {
          ndev = nvme_rdma_alloc_device(cm_id->device);
          if (likely(ndev))
                  ib_set_client_data(cm_id->device, &nvme_rdma_ib_client,
                                     ndev);
  }
  mutex_unlock(&nvme_devices_mutex);

  /* code hunk is stolen from a previous email, where we discussed that
   * the ULP device can be created on demand. */

And it seems that such an approach will require some new ib_client
callbacks to create and free the ULP device, which I personally do
not like. Can we follow instead the way with cmpxchg semantics:

  static struct nvme_rdma_device *
  nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
  {
          struct nvme_rdma_device *ndev;
          struct kref *kref;

          kref = ib_get_client_kref(cm_id->device,
                                    &nvme_rdma_ib_client);
          if (!kref) {
                  ndev = nvme_rdma_alloc_device(cm_id->device);
                  WARN_ON(!ndev); /* ignore error path for now */

                  kref = ib_cmpxchg_client_kref(cm_id->device,
                                                &nvme_rdma_ib_client,
                                                NULL, &ndev->kref);
                  if (kref) {
                          /* We raced with someone and lost, free ndev */
                          nvme_rdma_dev_put(ndev);
                  } else {
                          /* We won! */
                          kref = &ndev->kref;
                  }
          }

          return container_of(kref, typeof(*ndev), kref);
  }

The major difference is that the pointer will be updated IFF it was
NULL. IMO that can be worth doing because no extra callbacks and locks
are required and the ULP device has full control. What do you think?

--
Roman
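Roman's snippet above shows only the ULP-side caller; a core-side sketch of
the ib_cmpxchg_client_kref() helper his proposal implies could look as
follows. All names here are hypothetical, and the locking again reuses the
existing client_data_lock from core/device.c.

  /*
   * Hypothetical: install @new only if the slot currently holds @old.
   * Returns NULL when the swap succeeded, otherwise the current value
   * with a reference taken, so a losing caller can drop its own object
   * and continue with the winner's.
   */
  struct kref *ib_cmpxchg_client_kref(struct ib_device *device,
                                      struct ib_client *client,
                                      struct kref *old, struct kref *new)
  {
          unsigned long flags;
          struct kref *cur;

          spin_lock_irqsave(&device->client_data_lock, flags);
          cur = __ib_lookup_client_data(device, client);  /* hypothetical */
          if (cur == old) {
                  __ib_store_client_data(device, client, new); /* hypothetical */
                  cur = NULL;     /* swap succeeded */
          } else if (cur) {
                  kref_get(cur);  /* lost the race, return a reference */
          }
          spin_unlock_irqrestore(&device->client_data_lock, flags);

          return cur;
  }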
On Tue, Jun 05, 2018 at 12:20:49PM +0200, Roman Penyaev wrote:
> > I think I'd prefer a managed kref API instead of tricky
> > RCU..
> >
> > ib_get_client_data doesn't use rcu_dereference, so this isn't
> > *technically* right
>
> Why? ib_(get|set)_client_data() takes a spin_lock, which guarantees the
> pointer will be observed correctly and nothing will be reordered.
> rcu_dereference() does READ_ONCE(), which in its turn expands to a
> volatile cast, which prevents the compiler from optimizing the access.
> Of course the spin lock in ib_*_client_data() has stronger guarantees.

Maybe, or maybe Alpha does something insane. Who knows with RCU..

But, generally, using RCU and locks together like this is sort of a
nonsense thing to do; the point of RCU is to avoid locks, so if you
have locks, then use the locks properly and forget about the RCU.

> > Something like:
> >
> >   /* Returns NULL or a valid pointer holding a kref. The kref must be
> >      freed by the caller */
> >   struct kref_t *ib_get_client_kref(...);
> >
> >   /* Atomically releases any kref already held and sets to a new
> >      value. If non-null a kref is obtained. */
> >   void ib_set_client_kref(struct kref_t *);
> >
> > And a client_kref_put function pointer added to struct ib_client,
> > pointing to the putter function.
> >
> > The core code can safely manage all the required locking to guarantee
> > it returns a valid kref'd pointer, without needing to expose RCU.
>
> I understand your idea. If refcount_inc_not_zero() can be invoked
> directly from ib_get_client_data() under the spin lock, that will of
> course eliminate the need to take any locks outside of device.c.

Well, this wouldn't need refcount_inc_not_zero() anymore, as the kref
is now guaranteed to be positive while in the core's storage list.

> But it seems this is not enough. The proposed kref protects the read
> path, when the ULP device is already set, but what if it is not?
> I.e. there should be some help from the core API to create the ULP
> device and set it if client_data has not been set yet, i.e. something
> like the following should be done under the lock inside device.c:

Yes, it would be nice for the core code to handle this atomically for
the ULP.

>   mutex_lock(&nvme_devices_mutex);
>   ndev = ib_get_client_data(cm_id->device, &nvme_rdma_ib_client);
>   if (ndev && WARN_ON(!nvme_rdma_dev_get(ndev)))
>           ndev = NULL;
>   else if (!ndev) {
>           ndev = nvme_rdma_alloc_device(cm_id->device);
>           if (likely(ndev))
>                   ib_set_client_data(cm_id->device, &nvme_rdma_ib_client,
>                                      ndev);
>   }
>   mutex_unlock(&nvme_devices_mutex);

And this is too much complexity for ULPs.

> And it seems that such an approach will require some new ib_client
> callbacks to create and free the ULP device, which I personally do
> not like.

Why? Seems like a fine idea to me, and it simplifies all the ULPs.

Add two new callbacks:

  create_kref
  release_kref

And the core code will call create_kref the very first time the kref
data is accessed, and do so safely under some kind of locking. The
core code will call release_kref when the ib_device is removed.

> Can we follow instead the way with cmpxchg semantics:
>
>   static struct nvme_rdma_device *
>   nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
>   {
>           struct nvme_rdma_device *ndev;
>           struct kref *kref;
>
>           kref = ib_get_client_kref(cm_id->device,
>                                     &nvme_rdma_ib_client);
>           if (!kref) {
>                   ndev = nvme_rdma_alloc_device(cm_id->device);
>                   WARN_ON(!ndev); /* ignore error path for now */
>
>                   kref = ib_cmpxchg_client_kref(cm_id->device,
>                                                 &nvme_rdma_ib_client,
>                                                 NULL, &ndev->kref);
>                   if (kref) {
>                           /* We raced with someone and lost, free ndev */
>                           nvme_rdma_dev_put(ndev);
>                   } else {
>                           /* We won! */
>                           kref = &ndev->kref;
>                   }
>           }
>
>           return container_of(kref, typeof(*ndev), kref);
>   }

Bleck, even more complexity. Why? That is too much code duplication in
every ULP, and for what? Why such a mess to avoid a simple and well
defined callback?

Jason
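As a sketch of how a ULP might wire up the two callbacks Jason suggests:
the create_kref/release_kref members do not exist in struct ib_client, and
this assumes struct isert_device embeds a struct kref (the driver actually
uses a refcount_t today), so everything below is hypothetical.

  static struct kref *isert_create_kref(struct ib_device *ib_device)
  {
          struct isert_device *device = isert_device_alloc(ib_device);

          /* kref starts at 1; the core holds this reference while stored */
          return device ? &device->kref : NULL;
  }

  static void isert_release_kref(struct kref *kref)
  {
          struct isert_device *device =
                  container_of(kref, struct isert_device, kref);

          isert_free_device_ib_res(device);
          kfree(device);
  }

  static struct ib_client isert_rdma_ib_client = {
          .name         = "isert_client_ib",
          .create_kref  = isert_create_kref,      /* hypothetical member */
          .release_kref = isert_release_kref,     /* hypothetical member */
  };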
On Tue, Jun 5, 2018 at 4:22 PM, Jason Gunthorpe <jgg@ziepe.ca> wrote:
> On Tue, Jun 05, 2018 at 12:20:49PM +0200, Roman Penyaev wrote:
>
>> > I think I'd prefer a managed kref API instead of tricky
>> > RCU..
>> >
>> > ib_get_client_data doesn't use rcu_dereference, so this isn't
>> > *technically* right
>>
>> Why? ib_(get|set)_client_data() takes a spin_lock, which guarantees the
>> pointer will be observed correctly and nothing will be reordered.
>> rcu_dereference() does READ_ONCE(), which in its turn expands to a
>> volatile cast, which prevents the compiler from optimizing the access.
>> Of course the spin lock in ib_*_client_data() has stronger guarantees.
>
> Maybe, or maybe Alpha does something insane. Who knows with RCU..

The thing is that in this case it does not matter what memory model
your architecture has; we are trying to avoid a compiler optimization,
i.e. the following case:

  rcu_read_lock();
  if (*ptr)
          do_something();
  rcu_read_unlock();

can be optimized by the *compiler* (again, only compiler optimization
matters):

  tmp = *ptr;     <<< NOT NICE
  rcu_read_lock();
  if (tmp)
          do_something();
  rcu_read_unlock();

To guarantee that the load from @ptr happens after rcu_read_lock(),
@ptr should be volatile (that is exactly what rcu_dereference() does),
i.e. that tells the compiler explicitly: keep the exact order of these
two lines. To avoid confusion with all these RCU macros, the chunk
above can be expanded and rewritten as follows:

  preempt_count_inc();          <<< just increase a percpu variable
  barrier();                    <<< compiler barrier, which prevents other
                                    volatile accesses from leaking up
  if (*(volatile int *)ptr)     <<< explicit volatile access
          do_something();
  barrier();                    <<< compiler barrier, prevents leaking down
  preempt_count_dec();

Returning to my case:

  rcu_read_lock();
  device = ib_get_client_data(...);
  ...
  rcu_read_unlock();

Previously I mentioned that ib_get_client_data() has a spin lock
inside, but this actually also does not matter; what matters is that
this is a real function *call*. The compiler in this case is not able
to predict what will happen inside this function, thus it keeps the
same order. (The C standard explicitly defines a function call as a
sequence point (C99 Annex C), i.e. between sequence points, where a
volatile access is also a sequence point, the compiler can't change the
order of execution.) Of course I do not consider here that
ib_get_client_data() can become a macro in the future. In that case the
spin lock inside starts playing the main role.

> But, generally, using RCU and locks together like this is sort of a
> nonsense thing to do; the point of RCU is to avoid locks, so if you
> have locks, then use the locks properly and forget about the RCU.

I agree, it looks like a mess, but without support from the core API
that is impossible to do without additional protections: locks, RCU or
whatever you like.

>> > Something like:
>> >
>> >   /* Returns NULL or a valid pointer holding a kref. The kref must be
>> >      freed by the caller */
>> >   struct kref_t *ib_get_client_kref(...);
>> >
>> >   /* Atomically releases any kref already held and sets to a new
>> >      value. If non-null a kref is obtained. */
>> >   void ib_set_client_kref(struct kref_t *);
>> >
>> > And a client_kref_put function pointer added to struct ib_client,
>> > pointing to the putter function.
>> >
>> > The core code can safely manage all the required locking to guarantee
>> > it returns a valid kref'd pointer, without needing to expose RCU.
>>
>> I understand your idea. If refcount_inc_not_zero() can be invoked
>> directly from ib_get_client_data() under the spin lock, that will of
>> course eliminate the need to take any locks outside of device.c.
>
> Well, this wouldn't need refcount_inc_not_zero() anymore, as the kref
> is now guaranteed to be positive while in the core's storage list.
>
>> But it seems this is not enough. The proposed kref protects the read
>> path, when the ULP device is already set, but what if it is not?
>> I.e. there should be some help from the core API to create the ULP
>> device and set it if client_data has not been set yet, i.e. something
>> like the following should be done under the lock inside device.c:
>
> Yes, it would be nice for the core code to handle this atomically for
> the ULP.
>
>>   mutex_lock(&nvme_devices_mutex);
>>   ndev = ib_get_client_data(cm_id->device, &nvme_rdma_ib_client);
>>   if (ndev && WARN_ON(!nvme_rdma_dev_get(ndev)))
>>           ndev = NULL;
>>   else if (!ndev) {
>>           ndev = nvme_rdma_alloc_device(cm_id->device);
>>           if (likely(ndev))
>>                   ib_set_client_data(cm_id->device, &nvme_rdma_ib_client,
>>                                      ndev);
>>   }
>>   mutex_unlock(&nvme_devices_mutex);
>
> And this is too much complexity for ULPs.

Indeed, but it is explicit. And please keep in mind I considered here
only the NVMe case.

>> And it seems that such an approach will require some new ib_client
>> callbacks to create and free the ULP device, which I personally do
>> not like.
>
> Why? Seems like a fine idea to me, and it simplifies all the ULPs.
>
> Add two new callbacks:
>
>   create_kref
>   release_kref
>
> And the core code will call create_kref the very first time the kref
> data is accessed, and do so safely under some kind of locking.

The thing is that inside ib_(get|set)_client_data() a spin lock is
used, because those two can be called from any context, thus the
ib_client.alloc() callback can't be called under the spin lock, because
alloc() can sleep. If we assume that each ULP knows that the first call
of ib_alloc_or_get_client_data() (or any analogue) happens on a path
where sleep is allowed - then yes, that should work out. But it seems
that should be covered by a fat comment.

But what if we go even further and make "struct ib_client_data" public
with a kref inside? That will require changes to the ib_client API,
please take a look:

  --- a/include/rdma/ib_verbs.h
  +++ b/include/rdma/ib_verbs.h
  @@
  +struct ib_client_data;
  +
   struct ib_client {
           char  *name;
           void (*add)   (struct ib_device *);
           void (*remove)(struct ib_device *, void *client_data);
  +
  +        struct ib_client_data *(*alloc_client_data)(struct ib_device *);
  +        void (*free_client_data)(struct ib_client_data *);
  @@
  +struct ib_client_data {
  +        struct kref       kref;
  +        struct ib_device *device;
  +        struct ib_client *client;
  +        struct list_head  device_entry;
  +        struct list_head  client_entry;
  +};
  +
  -void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
  -void  ib_set_client_data(struct ib_device *device, struct ib_client *client,
  -                         void *data);
  +struct ib_client_data *
  +ib_alloc_or_get_client_data(struct ib_device *device,
  +                            struct ib_client *client);
  +struct ib_client_data *
  +ib_get_client_data(struct ib_device *device,
  +                   struct ib_client *client);
  +bool ib_put_client_data(struct ib_client_data *data);

Then each existing ULP should be changed accordingly, e.g.:

  static void ulp_add_one(struct ib_device *device)
  {
          /*
           * Called here for the first time, i.e.
           * ib_client.alloc_client_data() would be invoked. We do not
           * put the reference and thus keep the ULP dev structure in
           * the core lists.
           */
          (void)ib_alloc_or_get_client_data(device, &ib_client);
  }

  static void ulp_remove_one(struct ib_device *device, void *client_data)
  {
          ...
          /* Should be the last reference put */
          WARN_ON(!ib_put_client_data(&sdev->ib_data));
  }

  static void ulp_event_handler(struct ib_event_handler *handler,
                                struct ib_event *event)
  {
          struct ib_client_data *data;
          struct ulp_device *ulp_dev;

          data = ib_get_client_data(event->device, &srpt_client);
          if (unlikely(!data))
                  return;

          ulp_dev = container_of(data, typeof(*ulp_dev), ib_data);

          /* ... Use ulp_dev ... */

          /* One reference is still alive */
          WARN_ON(ib_put_client_data(data));
  }

And the NVMe and iSER cases would become indeed much simpler:
ib_alloc_or_get_client_data() can be called just from the cm_handler(),
where sleep is allowed.

> The core code will call release_kref when the ib_device is removed.
>
>> Can we follow instead the way with cmpxchg semantics:
>>
>>   static struct nvme_rdma_device *
>>   nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
>>   {
>>           struct nvme_rdma_device *ndev;
>>           struct kref *kref;
>>
>>           kref = ib_get_client_kref(cm_id->device,
>>                                     &nvme_rdma_ib_client);
>>           if (!kref) {
>>                   ndev = nvme_rdma_alloc_device(cm_id->device);
>>                   WARN_ON(!ndev); /* ignore error path for now */
>>
>>                   kref = ib_cmpxchg_client_kref(cm_id->device,
>>                                                 &nvme_rdma_ib_client,
>>                                                 NULL, &ndev->kref);
>>                   if (kref) {
>>                           /* We raced with someone and lost, free ndev */
>>                           nvme_rdma_dev_put(ndev);
>>                   } else {
>>                           /* We won! */
>>                           kref = &ndev->kref;
>>                   }
>>           }
>>
>>           return container_of(kref, typeof(*ndev), kref);
>>   }
>
> Bleck, even more complexity. Why? That is too much code duplication in
> every ULP, and for what?

Well, I considered only the NVMe case, I did not plan to touch others.

> Why such a mess to avoid a simple and well defined callback?

Indeed we can, if we change the API and refactor the internals of
device.c a bit. I can send an RFC in a couple of days making
ib_client_data public, with the stuff which I described above. What do
you think?

--
Roman
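A core-side sketch of what ib_alloc_or_get_client_data() could do under
Roman's proposal: the client_data_mutex and the lookup/store helpers are
hypothetical, and a sleepable mutex deliberately replaces the existing
client_data_lock spinlock so that alloc_client_data() is allowed to sleep;
the function must therefore only be called from a context that may sleep.

  struct ib_client_data *
  ib_alloc_or_get_client_data(struct ib_device *device,
                              struct ib_client *client)
  {
          struct ib_client_data *data;

          mutex_lock(&device->client_data_mutex);         /* hypothetical */
          data = __ib_lookup_client_data(device, client); /* hypothetical */
          if (data) {
                  kref_get(&data->kref);
          } else if (client->alloc_client_data) {
                  /* First user: allocate and publish, kref starts at 1 */
                  data = client->alloc_client_data(device);
                  if (data)
                          __ib_store_client_data(device, client, data);
          }
          mutex_unlock(&device->client_data_mutex);

          return data;
  }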
On Mon, Jun 11, 2018 at 10:55:39AM +0200, Roman Penyaev wrote:
> can be optimized by the *compiler* (again, only compiler optimization
> matters):
>
>   tmp = *ptr;     <<< NOT NICE
>   rcu_read_lock();
>   if (tmp)
>           do_something();
>   rcu_read_unlock();

No, this already isn't possible.

>   preempt_count_inc();          <<< just increase a percpu variable
>   barrier();                    <<< compiler barrier, which prevents other
>                                     volatile accesses from leaking up

barrier() prevents everything from moving out.

>   if (*(volatile int *)ptr)     <<< explicit volatile access
>           do_something();

Alpha inserts a special CPU barrier in here.

>   barrier();                    <<< compiler barrier, prevents leaking down
>   preempt_count_dec();

The point of rcu_dereference is to pair with rcu_assign_pointer so the
code can make a control flow decision without re-ordering, e.g.:

  if (rcu_dereference(ptr) != 0)
          assert(*something != NULL);

Guarantees that if the store side does

  rcu_assign_pointer(ptr, 0);
  *something = NULL;

then the assert cannot fire. It has nothing to do with accesses leaking
outside the RCU critical section.

> Returning to my case:
>
>   rcu_read_lock();
>   device = ib_get_client_data(...);
>   ...
>   rcu_read_unlock();
>
> Previously I mentioned that ib_get_client_data() has a spin lock
> inside, but this actually also does not matter; what matters is that
> this is a real function *call*.

No, it does matter. Code should never rely on a function call to create
an implicit compiler barrier, e.g. we have LTO now, which washes that
side-effect away.

The spinlock was the right answer, because there is no RCU protected
data here, and none of the special properties of RCU are required.

> >> And it seems that such an approach will require some new ib_client
> >> callbacks to create and free the ULP device, which I personally do
> >> not like.
> >
> > Why? Seems like a fine idea to me, and it simplifies all the ULPs.
> >
> > Add two new callbacks:
> >
> >   create_kref
> >   release_kref
> >
> > And the core code will call create_kref the very first time the kref
> > data is accessed, and do so safely under some kind of locking.
>
> The thing is that inside ib_(get|set)_client_data() a spin lock is
> used, because those two can be called from any context, thus the
> ib_client.alloc() callback can't be called under the spin lock, because
> alloc() can sleep.
>
> If we assume that each ULP knows that the first call of
> ib_alloc_or_get_client_data() (or any analogue) happens on a path
> where sleep is allowed - then yes, that should work out. But it seems
> that should be covered by a fat comment.

Seems reasonable..

> +struct ib_client_data;
> +
>  struct ib_client {
>          char  *name;
>          void (*add)   (struct ib_device *);
>          void (*remove)(struct ib_device *, void *client_data);
>
> +        struct ib_client_data *(*alloc_client_data)(struct ib_device *);
> +        void (*free_client_data)(struct ib_client_data *);
> +
> +struct ib_client_data {
> +        struct kref       kref;
> +        struct ib_device *device;
> +        struct ib_client *client;
> +        struct list_head  device_entry;
> +        struct list_head  client_entry;
> +};

Well, the point of the alloc callback was to not alloc data until the
client became activated; exposing this struct doesn't really seem to
help that, since you'd still need a void * in here with the data
pointer.

> > Why such a mess to avoid a simple and well defined callback?
>
> Indeed we can, if we change the API and refactor the internals of
> device.c a bit. I can send an RFC in a couple of days making
> ib_client_data public, with the stuff which I described above. What do
> you think?

Sure, you can mess around with the core code to make a nicer cleanup.

Jason
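To make the ordering point concrete, here is the classic publish/subscribe
pairing with placeholder names (not tied to any of the code above):
rcu_assign_pointer() orders the initialization before publication, and
rcu_dereference() orders the pointer load before any dereference through
it, even on Alpha.

  struct obj {
          int ready;
  };

  static struct obj __rcu *gp;

  static void publish(struct obj *p)
  {
          p->ready = 1;
          rcu_assign_pointer(gp, p);      /* pairs with rcu_dereference() */
  }

  static void subscribe(void)
  {
          struct obj *p;

          rcu_read_lock();
          p = rcu_dereference(gp);        /* ordered load of the pointer */
          if (p)
                  WARN_ON(p->ready != 1); /* cannot fire */
          rcu_read_unlock();
  }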
On Mon, Jun 11, 2018 at 6:51 PM, Jason Gunthorpe <jgg@ziepe.ca> wrote:
> On Mon, Jun 11, 2018 at 10:55:39AM +0200, Roman Penyaev wrote:
>
>> can be optimized by the *compiler* (again, only compiler optimization
>> matters):
>>
>>   tmp = *ptr;     <<< NOT NICE
>>   rcu_read_lock();
>>   if (tmp)
>>           do_something();
>>   rcu_read_unlock();
>
> No, this already isn't possible.
>
>>   preempt_count_inc();          <<< just increase a percpu variable
>>   barrier();                    <<< compiler barrier, which prevents other
>>                                     volatile accesses from leaking up
>
> barrier() prevents everything from moving out.
>
>>   if (*(volatile int *)ptr)     <<< explicit volatile access
>>           do_something();
>
> Alpha inserts a special CPU barrier in here.
>
>>   barrier();                    <<< compiler barrier, prevents leaking down
>>   preempt_count_dec();
>
> The point of rcu_dereference is to pair with rcu_assign_pointer so the
> code can make a control flow decision without re-ordering, e.g.:
>
>   if (rcu_dereference(ptr) != 0)
>           assert(*something != NULL);
>
> Guarantees that if the store side does
>
>   rcu_assign_pointer(ptr, 0);
>   *something = NULL;
>
> then the assert cannot fire. It has nothing to do with accesses leaking
> outside the RCU critical section.
>
>> Returning to my case:
>>
>>   rcu_read_lock();
>>   device = ib_get_client_data(...);
>>   ...
>>   rcu_read_unlock();
>>
>> Previously I mentioned that ib_get_client_data() has a spin lock
>> inside, but this actually also does not matter; what matters is that
>> this is a real function *call*.
>
> No, it does matter. Code should never rely on a function call to create
> an implicit compiler barrier, e.g. we have LTO now, which washes that
> side-effect away.
>
> The spinlock was the right answer, because there is no RCU protected
> data here, and none of the special properties of RCU are required.

Indeed, I missed the Alpha case completely. I was confused by the
volatile cast on x86_64, but it is needed only to prevent the compiler
from deducing the resulting pointer, not from reordering. Thanks for
pointing that out.

>> >> And it seems that such an approach will require some new ib_client
>> >> callbacks to create and free the ULP device, which I personally do
>> >> not like.
>> >
>> > Why? Seems like a fine idea to me, and it simplifies all the ULPs.
>> >
>> > Add two new callbacks:
>> >
>> >   create_kref
>> >   release_kref
>> >
>> > And the core code will call create_kref the very first time the kref
>> > data is accessed, and do so safely under some kind of locking.
>>
>> The thing is that inside ib_(get|set)_client_data() a spin lock is
>> used, because those two can be called from any context, thus the
>> ib_client.alloc() callback can't be called under the spin lock, because
>> alloc() can sleep.
>
>> If we assume that each ULP knows that the first call of
>> ib_alloc_or_get_client_data() (or any analogue) happens on a path
>> where sleep is allowed - then yes, that should work out. But it seems
>> that should be covered by a fat comment.
>
> Seems reasonable..

>> +struct ib_client_data;
>> +
>>  struct ib_client {
>>          char  *name;
>>          void (*add)   (struct ib_device *);
>>          void (*remove)(struct ib_device *, void *client_data);
>>
>> +        struct ib_client_data *(*alloc_client_data)(struct ib_device *);
>> +        void (*free_client_data)(struct ib_client_data *);
>> +
>> +struct ib_client_data {
>> +        struct kref       kref;
>> +        struct ib_device *device;
>> +        struct ib_client *client;
>> +        struct list_head  device_entry;
>> +        struct list_head  client_entry;
>> +};
>
> Well, the point of the alloc callback was to not alloc data until the
> client became activated,

That is true, but from the very beginning that can be used only for new
code: the NVMe and iSER cases. Other ULPs can be switched to the new
API without significant changes, but still allocate data on add_one()
(i.e. preallocate), and then step by step that can also be changed. I
am not sure that I can refactor all the ULPs in one go.

> exposing this struct doesn't really seem to
> help that, since you'd still need a void * in here with the data
> pointer.

Why? "struct ib_client_data" can be embedded inside a ULP-specific
structure and then ulp_dev->ib_data can be linked into all the lists
inside device.c, so by exposing the ib_client_data structure we can
avoid setting a void *.

>> > Why such a mess to avoid a simple and well defined callback?
>>
>> Indeed we can, if we change the API and refactor the internals of
>> device.c a bit. I can send an RFC in a couple of days making
>> ib_client_data public, with the stuff which I described above. What do
>> you think?
>
> Sure, you can mess around with the core code to make a nicer cleanup.

Ok, I will try to prepare an RFC.

--
Roman
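A small sketch of the embedding Roman means, with a hypothetical ulp_device:
the core links the embedded ib_client_data into its lists, and the ULP
recovers its own structure with container_of(), so no void * is needed
anywhere.

  /* Hypothetical ULP structure embedding the proposed ib_client_data. */
  struct ulp_device {
          struct ib_client_data   ib_data;        /* linked into core lists */
          struct ib_device        *ib_device;
          /* ... ULP-specific resources ... */
  };

  static inline struct ulp_device *to_ulp_device(struct ib_client_data *data)
  {
          return container_of(data, struct ulp_device, ib_data);
  }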
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 08dc9d27e5b1..c80f31573919 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -42,8 +42,7 @@ static int isert_debug_level;
 module_param_named(debug_level, isert_debug_level, int, 0644);
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)");
 
-static DEFINE_MUTEX(device_list_mutex);
-static LIST_HEAD(device_list);
+static struct ib_client isert_rdma_ib_client;
 static struct workqueue_struct *isert_comp_wq;
 static struct workqueue_struct *isert_release_wq;
 
@@ -341,7 +340,6 @@ isert_free_device_ib_res(struct isert_device *device)
 static void
 isert_device_put(struct isert_device *device)
 {
-	mutex_lock(&device_list_mutex);
 	isert_info("device %p refcount %d\n", device,
 		   refcount_read(&device->refcount));
 	if (refcount_dec_and_test(&device->refcount)) {
@@ -349,48 +347,44 @@ isert_device_put(struct isert_device *device)
 		list_del(&device->dev_node);
 		kfree(device);
 	}
-	mutex_unlock(&device_list_mutex);
 }
 
 static struct isert_device *
 isert_device_get(struct rdma_cm_id *cma_id)
 {
 	struct isert_device *device;
-	int ret;
 
-	mutex_lock(&device_list_mutex);
-	list_for_each_entry(device, &device_list, dev_node) {
-		if (device->ib_device->node_guid == cma_id->device->node_guid) {
-			refcount_inc(&device->refcount);
-			isert_info("Found iser device %p refcount %d\n",
-				   device, refcount_read(&device->refcount));
-			mutex_unlock(&device_list_mutex);
-			return device;
-		}
-	}
+	/* Paired with isert_ib_client_remove_one() */
+	rcu_read_lock();
+	device = ib_get_client_data(cma_id->device, &isert_rdma_ib_client);
+	if (device && WARN_ON(!refcount_inc_not_zero(&device->refcount)))
+		device = NULL;
+	rcu_read_unlock();
 
-	device = kzalloc(sizeof(struct isert_device), GFP_KERNEL);
-	if (!device) {
-		mutex_unlock(&device_list_mutex);
-		return ERR_PTR(-ENOMEM);
-	}
+	return device;
+}
+
+static struct isert_device *
+isert_device_alloc(struct ib_device *ib_device)
+{
+	struct isert_device *device;
+	int ret;
+
+	device = kzalloc(sizeof(*device), GFP_KERNEL);
+	if (unlikely(!device))
+		return NULL;
 
 	INIT_LIST_HEAD(&device->dev_node);
 	mutex_init(&device->comp_mutex);
-
-	device->ib_device = cma_id->device;
+	refcount_set(&device->refcount, 1);
+	device->ib_device = ib_device;
 	ret = isert_create_device_ib_res(device);
 	if (ret) {
 		kfree(device);
-		mutex_unlock(&device_list_mutex);
-		return ERR_PTR(ret);
+		return NULL;
 	}
-
-	refcount_inc(&device->refcount);
-	list_add_tail(&device->dev_node, &device_list);
 	isert_info("Created a new iser device %p refcount %d\n",
 		   device, refcount_read(&device->refcount));
-	mutex_unlock(&device_list_mutex);
 
 	return device;
 }
@@ -530,8 +524,8 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 		goto out;
 
 	device = isert_device_get(cma_id);
-	if (IS_ERR(device)) {
-		ret = PTR_ERR(device);
+	if (unlikely(!device)) {
+		ret = -ENODEV;
 		goto out_rsp_dma_map;
 	}
 	isert_conn->device = device;
@@ -2681,15 +2675,52 @@ static struct iscsit_transport iser_target_transport = {
 	.iscsit_get_sup_prot_ops = isert_get_sup_prot_ops,
 };
 
+static void isert_ib_client_add_one(struct ib_device *ib_device)
+{
+	struct isert_device *device;
+
+	device = isert_device_alloc(ib_device);
+	if (likely(device))
+		ib_set_client_data(ib_device, &isert_rdma_ib_client, device);
+	else
+		pr_err("Allocation of a device failed.\n");
+}
+
+static void isert_ib_client_remove_one(struct ib_device *ib_device,
+				       void *client_data)
+{
+	struct isert_device *device = client_data;
+
+	if (unlikely(!device))
+		return;
+
+	ib_set_client_data(ib_device, &isert_rdma_ib_client, NULL);
+	/* Paired with isert_device_get() */
+	synchronize_rcu();
+	isert_device_put(device);
+}
+
+static struct ib_client isert_rdma_ib_client = {
+	.name   = "isert_client_ib",
+	.add    = isert_ib_client_add_one,
+	.remove = isert_ib_client_remove_one
+};
+
 static int __init isert_init(void)
 {
 	int ret;
 
+	ret = ib_register_client(&isert_rdma_ib_client);
+	if (unlikely(ret)) {
+		isert_err("ib_register_client(): %d\n", ret);
+		return ret;
+	}
+
 	isert_comp_wq = alloc_workqueue("isert_comp_wq",
 					WQ_UNBOUND | WQ_HIGHPRI, 0);
 	if (!isert_comp_wq) {
 		isert_err("Unable to allocate isert_comp_wq\n");
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto unregister_client;
 	}
 
 	isert_release_wq = alloc_workqueue("isert_release_wq", WQ_UNBOUND,
@@ -2707,6 +2738,8 @@ static int __init isert_init(void)
 
 destroy_comp_wq:
 	destroy_workqueue(isert_comp_wq);
+unregister_client:
+	ib_unregister_client(&isert_rdma_ib_client);
 
 	return ret;
 }
@@ -2717,6 +2750,7 @@ static void __exit isert_exit(void)
 	destroy_workqueue(isert_release_wq);
 	destroy_workqueue(isert_comp_wq);
 	iscsit_unregister_transport(&iser_target_transport);
+	ib_unregister_client(&isert_rdma_ib_client);
 	isert_info("iSER_TARGET[0] - Released iser_target_transport\n");
 }
The ib_client API provides a way to wrap an ib_device with a ULP-specific
structure. Using that API, local lists and mutexes can be completely
avoided and the allocation/removal paths become a bit cleaner.

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Doug Ledford <dledford@redhat.com>
Cc: target-devel@vger.kernel.org
---
 drivers/infiniband/ulp/isert/ib_isert.c | 96 ++++++++++++++++++++++-----------
 1 file changed, 65 insertions(+), 31 deletions(-)