Message ID | 1501017730-12797-11-git-send-email-sstabellini@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
> +int pvcalls_front_release(struct socket *sock) > +{ > + struct pvcalls_bedata *bedata; > + struct sock_mapping *map; > + int req_id, notify; > + struct xen_pvcalls_request *req; > + > + if (!pvcalls_front_dev) > + return -EIO; > + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); > + if (!bedata) > + return -EIO; Some (all?) other ops don't check bedata validity. Should they all do? > + > + if (sock->sk == NULL) > + return 0; > + > + map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head); > + if (map == NULL) > + return 0; > + > + spin_lock(&bedata->pvcallss_lock); > + req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); > + if (RING_FULL(&bedata->ring) || > + READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) { > + spin_unlock(&bedata->pvcallss_lock); > + return -EAGAIN; > + } > + WRITE_ONCE(sock->sk->sk_send_head, NULL); > + > + req = RING_GET_REQUEST(&bedata->ring, req_id); > + req->req_id = req_id; > + req->cmd = PVCALLS_RELEASE; > + req->u.release.id = (uint64_t)sock; > + > + bedata->ring.req_prod_pvt++; > + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); > + spin_unlock(&bedata->pvcallss_lock); > + if (notify) > + notify_remote_via_irq(bedata->irq); > + > + wait_event(bedata->inflight_req, > + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); > + > + if (map->active_socket) { > + /* > + * Set in_error and wake up inflight_conn_req to force > + * recvmsg waiters to exit. > + */ > + map->active.ring->in_error = -EBADF; > + wake_up_interruptible(&map->active.inflight_conn_req); > + > + mutex_lock(&map->active.in_mutex); > + mutex_lock(&map->active.out_mutex); > + pvcalls_front_free_map(bedata, map); > + mutex_unlock(&map->active.out_mutex); > + mutex_unlock(&map->active.in_mutex); > + kfree(map); Since you are locking here I assume you expect that someone else might also be trying to lock the map. But you are freeing it immediately after unlocking. 
Wouldn't that mean that whoever is trying to grab the lock might then dereference freed memory? -boris > + } else { > + spin_lock(&bedata->pvcallss_lock); > + list_del_init(&map->list); > + kfree(map); > + spin_unlock(&bedata->pvcallss_lock); > + } > + WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID); > + > + return 0; > +} > + > static const struct xenbus_device_id pvcalls_front_ids[] = { > { "pvcalls" }, > { "" } > diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h > index 25e05b8..3332978 100644 > --- a/drivers/xen/pvcalls-front.h > +++ b/drivers/xen/pvcalls-front.h > @@ -23,5 +23,6 @@ int pvcalls_front_recvmsg(struct socket *sock, > unsigned int pvcalls_front_poll(struct file *file, > struct socket *sock, > poll_table *wait); > +int pvcalls_front_release(struct socket *sock); > > #endif
On Thu, 27 Jul 2017, Boris Ostrovsky wrote: > > +int pvcalls_front_release(struct socket *sock) > > +{ > > + struct pvcalls_bedata *bedata; > > + struct sock_mapping *map; > > + int req_id, notify; > > + struct xen_pvcalls_request *req; > > + > > + if (!pvcalls_front_dev) > > + return -EIO; > > + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); > > + if (!bedata) > > + return -EIO; > > Some (all?) other ops don't check bedata validity. Should they all do? No, I don't think they should: dev_set_drvdata is called in the probe function (pvcalls_front_probe). I'll remove it. > > + > > + if (sock->sk == NULL) > > + return 0; > > + > > + map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head); > > + if (map == NULL) > > + return 0; > > + > > + spin_lock(&bedata->pvcallss_lock); > > + req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); > > + if (RING_FULL(&bedata->ring) || > > + READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) { > > + spin_unlock(&bedata->pvcallss_lock); > > + return -EAGAIN; > > + } > > + WRITE_ONCE(sock->sk->sk_send_head, NULL); > > + > > + req = RING_GET_REQUEST(&bedata->ring, req_id); > > + req->req_id = req_id; > > + req->cmd = PVCALLS_RELEASE; > > + req->u.release.id = (uint64_t)sock; > > + > > + bedata->ring.req_prod_pvt++; > > + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); > > + spin_unlock(&bedata->pvcallss_lock); > > + if (notify) > > + notify_remote_via_irq(bedata->irq); > > + > > + wait_event(bedata->inflight_req, > > + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); > > + > > + if (map->active_socket) { > > + /* > > + * Set in_error and wake up inflight_conn_req to force > > + * recvmsg waiters to exit. 
> > + */ > > + map->active.ring->in_error = -EBADF; > > + wake_up_interruptible(&map->active.inflight_conn_req); > > + > > + mutex_lock(&map->active.in_mutex); > > + mutex_lock(&map->active.out_mutex); > > + pvcalls_front_free_map(bedata, map); > > + mutex_unlock(&map->active.out_mutex); > > + mutex_unlock(&map->active.in_mutex); > > + kfree(map); > > Since you are locking here I assume you expect that someone else might > also be trying to lock the map. But you are freeing it immediately after > unlocking. Wouldn't that mean that whoever is trying to grab the lock > might then dereference freed memory? The lock is to make sure there are no recvmsg or sendmsg in progress. We are sure that no newer sendmsg or recvmsg are waiting for pvcalls_front_release to release the lock because before send a message to the backend we set sk_send_head to NULL. > > + } else { > > + spin_lock(&bedata->pvcallss_lock); > > + list_del_init(&map->list); > > + kfree(map); > > + spin_unlock(&bedata->pvcallss_lock); > > + } > > + WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID); > > + > > + return 0; > > +} > > + > > static const struct xenbus_device_id pvcalls_front_ids[] = { > > { "pvcalls" }, > > { "" } > > diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h > > index 25e05b8..3332978 100644 > > --- a/drivers/xen/pvcalls-front.h > > +++ b/drivers/xen/pvcalls-front.h > > @@ -23,5 +23,6 @@ int pvcalls_front_recvmsg(struct socket *sock, > > unsigned int pvcalls_front_poll(struct file *file, > > struct socket *sock, > > poll_table *wait); > > +int pvcalls_front_release(struct socket *sock); > > > > #endif
On 07/31/2017 06:34 PM, Stefano Stabellini wrote: > On Thu, 27 Jul 2017, Boris Ostrovsky wrote: >>> +int pvcalls_front_release(struct socket *sock) >>> +{ >>> + struct pvcalls_bedata *bedata; >>> + struct sock_mapping *map; >>> + int req_id, notify; >>> + struct xen_pvcalls_request *req; >>> + >>> + if (!pvcalls_front_dev) >>> + return -EIO; >>> + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); >>> + if (!bedata) >>> + return -EIO; >> Some (all?) other ops don't check bedata validity. Should they all do? > No, I don't think they should: dev_set_drvdata is called in the probe > function (pvcalls_front_probe). I'll remove it. > > >>> + >>> + if (sock->sk == NULL) >>> + return 0; >>> + >>> + map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head); >>> + if (map == NULL) >>> + return 0; >>> + >>> + spin_lock(&bedata->pvcallss_lock); >>> + req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); >>> + if (RING_FULL(&bedata->ring) || >>> + READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) { >>> + spin_unlock(&bedata->pvcallss_lock); >>> + return -EAGAIN; >>> + } >>> + WRITE_ONCE(sock->sk->sk_send_head, NULL); >>> + >>> + req = RING_GET_REQUEST(&bedata->ring, req_id); >>> + req->req_id = req_id; >>> + req->cmd = PVCALLS_RELEASE; >>> + req->u.release.id = (uint64_t)sock; >>> + >>> + bedata->ring.req_prod_pvt++; >>> + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); >>> + spin_unlock(&bedata->pvcallss_lock); >>> + if (notify) >>> + notify_remote_via_irq(bedata->irq); >>> + >>> + wait_event(bedata->inflight_req, >>> + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); >>> + >>> + if (map->active_socket) { >>> + /* >>> + * Set in_error and wake up inflight_conn_req to force >>> + * recvmsg waiters to exit. 
>>> + */ >>> + map->active.ring->in_error = -EBADF; >>> + wake_up_interruptible(&map->active.inflight_conn_req); >>> + >>> + mutex_lock(&map->active.in_mutex); >>> + mutex_lock(&map->active.out_mutex); >>> + pvcalls_front_free_map(bedata, map); >>> + mutex_unlock(&map->active.out_mutex); >>> + mutex_unlock(&map->active.in_mutex); >>> + kfree(map); >> Since you are locking here I assume you expect that someone else might >> also be trying to lock the map. But you are freeing it immediately after >> unlocking. Wouldn't that mean that whoever is trying to grab the lock >> might then dereference freed memory? > The lock is to make sure there are no recvmsg or sendmsg in progress. We > are sure that no newer sendmsg or recvmsg are waiting for > pvcalls_front_release to release the lock because before send a message > to the backend we set sk_send_head to NULL. Is there a chance that whoever is potentially calling send/rcvmsg has checked that sk_send_head is non-NULL but hasn't grabbed the lock yet? Freeing a structure containing a lock right after releasing the lock looks weird (to me). Is there any other way to synchronize with sender/receiver? Any other lock? BTW, I also noticed that in rcvmsg you are calling wait_event_interruptible() while holding the lock. Have you tested with CONFIG_DEBUG_ATOMIC_SLEEP? (or maybe it's some other config option that would complain about those sorts of thing) -boris
On 01/08/17 17:23, Boris Ostrovsky wrote: > On 07/31/2017 06:34 PM, Stefano Stabellini wrote: >> On Thu, 27 Jul 2017, Boris Ostrovsky wrote: >>>> +int pvcalls_front_release(struct socket *sock) >>>> +{ >>>> + struct pvcalls_bedata *bedata; >>>> + struct sock_mapping *map; >>>> + int req_id, notify; >>>> + struct xen_pvcalls_request *req; >>>> + >>>> + if (!pvcalls_front_dev) >>>> + return -EIO; >>>> + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); >>>> + if (!bedata) >>>> + return -EIO; >>> Some (all?) other ops don't check bedata validity. Should they all do? >> No, I don't think they should: dev_set_drvdata is called in the probe >> function (pvcalls_front_probe). I'll remove it. >> >> >>>> + >>>> + if (sock->sk == NULL) >>>> + return 0; >>>> + >>>> + map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head); >>>> + if (map == NULL) >>>> + return 0; >>>> + >>>> + spin_lock(&bedata->pvcallss_lock); >>>> + req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); >>>> + if (RING_FULL(&bedata->ring) || >>>> + READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) { >>>> + spin_unlock(&bedata->pvcallss_lock); >>>> + return -EAGAIN; >>>> + } >>>> + WRITE_ONCE(sock->sk->sk_send_head, NULL); >>>> + >>>> + req = RING_GET_REQUEST(&bedata->ring, req_id); >>>> + req->req_id = req_id; >>>> + req->cmd = PVCALLS_RELEASE; >>>> + req->u.release.id = (uint64_t)sock; >>>> + >>>> + bedata->ring.req_prod_pvt++; >>>> + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); >>>> + spin_unlock(&bedata->pvcallss_lock); >>>> + if (notify) >>>> + notify_remote_via_irq(bedata->irq); >>>> + >>>> + wait_event(bedata->inflight_req, >>>> + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); >>>> + >>>> + if (map->active_socket) { >>>> + /* >>>> + * Set in_error and wake up inflight_conn_req to force >>>> + * recvmsg waiters to exit. 
>>>> + */ >>>> + map->active.ring->in_error = -EBADF; >>>> + wake_up_interruptible(&map->active.inflight_conn_req); >>>> + >>>> + mutex_lock(&map->active.in_mutex); >>>> + mutex_lock(&map->active.out_mutex); >>>> + pvcalls_front_free_map(bedata, map); >>>> + mutex_unlock(&map->active.out_mutex); >>>> + mutex_unlock(&map->active.in_mutex); >>>> + kfree(map); >>> Since you are locking here I assume you expect that someone else might >>> also be trying to lock the map. But you are freeing it immediately after >>> unlocking. Wouldn't that mean that whoever is trying to grab the lock >>> might then dereference freed memory? >> The lock is to make sure there are no recvmsg or sendmsg in progress. We >> are sure that no newer sendmsg or recvmsg are waiting for >> pvcalls_front_release to release the lock because before send a message >> to the backend we set sk_send_head to NULL. > > Is there a chance that whoever is potentially calling send/rcvmsg has > checked that sk_send_head is non-NULL but hasn't grabbed the lock yet? > > Freeing a structure containing a lock right after releasing the lock > looks weird (to me). Is there any other way to synchronize with > sender/receiver? Any other lock? Right. This looks fishy. Either you don't need the locks or you can't just free the area right after releasing the lock. > BTW, I also noticed that in rcvmsg you are calling > wait_event_interruptible() while holding the lock. Have you tested with > CONFIG_DEBUG_ATOMIC_SLEEP? (or maybe it's some other config option that > would complain about those sorts of thing) I believe sleeping while holding a mutex is allowed. Sleeping in spinlocked paths is bad. BTW: You are looking for CONFIG_DEBUG_MUTEXES (see Documentation/locking/mutex-design.txt ). Juergen
>> BTW, I also noticed that in recvmsg you are calling >> wait_event_interruptible() while holding the lock. Have you tested with >> CONFIG_DEBUG_ATOMIC_SLEEP? (or maybe it's some other config option that >> would complain about those sorts of things) > I believe sleeping while holding a mutex is allowed. Sleeping in > spinlocked paths is bad. Oh, right. I was thinking about spinlocks. Sorry. -boris > > BTW: You are looking for CONFIG_DEBUG_MUTEXES (see > Documentation/locking/mutex-design.txt ). > > > Juergen
On Tue, 1 Aug 2017, Juergen Gross wrote: > >>>> + if (sock->sk == NULL) > >>>> + return 0; > >>>> + > >>>> + map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head); > >>>> + if (map == NULL) > >>>> + return 0; > >>>> + > >>>> + spin_lock(&bedata->pvcallss_lock); > >>>> + req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); > >>>> + if (RING_FULL(&bedata->ring) || > >>>> + READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) { > >>>> + spin_unlock(&bedata->pvcallss_lock); > >>>> + return -EAGAIN; > >>>> + } > >>>> + WRITE_ONCE(sock->sk->sk_send_head, NULL); > >>>> + > >>>> + req = RING_GET_REQUEST(&bedata->ring, req_id); > >>>> + req->req_id = req_id; > >>>> + req->cmd = PVCALLS_RELEASE; > >>>> + req->u.release.id = (uint64_t)sock; > >>>> + > >>>> + bedata->ring.req_prod_pvt++; > >>>> + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); > >>>> + spin_unlock(&bedata->pvcallss_lock); > >>>> + if (notify) > >>>> + notify_remote_via_irq(bedata->irq); > >>>> + > >>>> + wait_event(bedata->inflight_req, > >>>> + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); > >>>> + > >>>> + if (map->active_socket) { > >>>> + /* > >>>> + * Set in_error and wake up inflight_conn_req to force > >>>> + * recvmsg waiters to exit. > >>>> + */ > >>>> + map->active.ring->in_error = -EBADF; > >>>> + wake_up_interruptible(&map->active.inflight_conn_req); > >>>> + > >>>> + mutex_lock(&map->active.in_mutex); > >>>> + mutex_lock(&map->active.out_mutex); > >>>> + pvcalls_front_free_map(bedata, map); > >>>> + mutex_unlock(&map->active.out_mutex); > >>>> + mutex_unlock(&map->active.in_mutex); > >>>> + kfree(map); > >>> Since you are locking here I assume you expect that someone else might > >>> also be trying to lock the map. But you are freeing it immediately after > >>> unlocking. Wouldn't that mean that whoever is trying to grab the lock > >>> might then dereference freed memory? > >> The lock is to make sure there are no recvmsg or sendmsg in progress. 
We > >> are sure that no newer sendmsg or recvmsg are waiting for > >> pvcalls_front_release to release the lock because before sending a message > >> to the backend we set sk_send_head to NULL. > > > > Is there a chance that whoever is potentially calling send/recvmsg has > > checked that sk_send_head is non-NULL but hasn't grabbed the lock yet? > > > > Freeing a structure containing a lock right after releasing the lock > > looks weird (to me). Is there any other way to synchronize with > > sender/receiver? Any other lock? > > Right. This looks fishy. Either you don't need the locks or you can't > just free the area right after releasing the lock. I changed this code; you'll see it soon in the new patch series I am going to send. There were two very similar mutex_unlock/kfree problems: 1) pvcalls_front_release 2) pvcalls_front_remove For 2), I introduced a refcount. I only free the data structs when the refcount reaches 0. For 1), I could introduce a similar refcount that would serve the same purpose, but instead I used mutex_trylock, effectively using the internal count in in_mutex and out_mutex for the same purpose.
diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 833b717..5a4040e 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -184,6 +184,23 @@ static irqreturn_t pvcalls_front_conn_handler(int irq, void *sock_map) return IRQ_HANDLED; } +static void pvcalls_front_free_map(struct pvcalls_bedata *bedata, + struct sock_mapping *map) +{ + int i; + + spin_lock(&bedata->pvcallss_lock); + if (!list_empty(&map->list)) + list_del_init(&map->list); + spin_unlock(&bedata->pvcallss_lock); + + for (i = 0; i < (1 << map->active.ring->ring_order); i++) + gnttab_end_foreign_access(map->active.ring->ref[i], 0, 0); + gnttab_end_foreign_access(map->active.ref, 0, 0); + free_page((unsigned long)map->active.ring); + unbind_from_irqhandler(map->active.irq, map); +} + int pvcalls_front_socket(struct socket *sock) { struct pvcalls_bedata *bedata; @@ -819,6 +836,74 @@ unsigned int pvcalls_front_poll(struct file *file, struct socket *sock, return pvcalls_front_poll_passive(file, bedata, map, wait); } +int pvcalls_front_release(struct socket *sock) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + int req_id, notify; + struct xen_pvcalls_request *req; + + if (!pvcalls_front_dev) + return -EIO; + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + if (!bedata) + return -EIO; + + if (sock->sk == NULL) + return 0; + + map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head); + if (map == NULL) + return 0; + + spin_lock(&bedata->pvcallss_lock); + req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); + if (RING_FULL(&bedata->ring) || + READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) { + spin_unlock(&bedata->pvcallss_lock); + return -EAGAIN; + } + WRITE_ONCE(sock->sk->sk_send_head, NULL); + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_RELEASE; + req->u.release.id = (uint64_t)sock; + + bedata->ring.req_prod_pvt++; + 
RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->pvcallss_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + if (map->active_socket) { + /* + * Set in_error and wake up inflight_conn_req to force + * recvmsg waiters to exit. + */ + map->active.ring->in_error = -EBADF; + wake_up_interruptible(&map->active.inflight_conn_req); + + mutex_lock(&map->active.in_mutex); + mutex_lock(&map->active.out_mutex); + pvcalls_front_free_map(bedata, map); + mutex_unlock(&map->active.out_mutex); + mutex_unlock(&map->active.in_mutex); + kfree(map); + } else { + spin_lock(&bedata->pvcallss_lock); + list_del_init(&map->list); + kfree(map); + spin_unlock(&bedata->pvcallss_lock); + } + WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID); + + return 0; +} + static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index 25e05b8..3332978 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -23,5 +23,6 @@ int pvcalls_front_recvmsg(struct socket *sock, unsigned int pvcalls_front_poll(struct file *file, struct socket *sock, poll_table *wait); +int pvcalls_front_release(struct socket *sock); #endif
Send PVCALLS_RELEASE to the backend and wait for a reply. Take both in_mutex and out_mutex to avoid concurrent accesses. Then, free the socket. Signed-off-by: Stefano Stabellini <stefano@aporeto.com> CC: boris.ostrovsky@oracle.com CC: jgross@suse.com --- drivers/xen/pvcalls-front.c | 85 +++++++++++++++++++++++++++++++++++++++++++++ drivers/xen/pvcalls-front.h | 1 + 2 files changed, 86 insertions(+)