diff mbox

[v5,08/13] xen/pvcalls: implement accept command

Message ID 1507336227-20477-8-git-send-email-sstabellini@kernel.org (mailing list archive)
State New, archived
Headers show

Commit Message

Stefano Stabellini Oct. 7, 2017, 12:30 a.m. UTC
Introduce a waitqueue to allow only one outstanding accept command at
any given time and to implement polling on the passive socket. Introduce
a flags field to keep track of in-flight accept and poll commands.

Send PVCALLS_ACCEPT to the backend. Allocate a new active socket. Make
sure that only one accept command is executed at any given time by
setting PVCALLS_FLAG_ACCEPT_INFLIGHT and waiting on the
inflight_accept_req waitqueue.

Convert the new struct sock_mapping pointer into an uint64_t and use it
as id for the new socket to pass to the backend.

Check if the accept call is non-blocking: in that case, after sending the
ACCEPT command to the backend, store the sock_mapping pointer of the new
struct and the inflight req_id, then return -EAGAIN (the backend will
respond only when there is something to accept). Next time accept is
called, we'll check if the ACCEPT command has been answered: if so we'll
pick up where we left off, otherwise we return -EAGAIN again.

Note that, unlike the other commands, we can use
wait_event_interruptible (instead of wait_event) in the case of accept
as we are able to track the req_id of the ACCEPT response that we are
waiting for.

Signed-off-by: Stefano Stabellini <stefano@aporeto.com>
CC: boris.ostrovsky@oracle.com
CC: jgross@suse.com
---
 drivers/xen/pvcalls-front.c | 146 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/xen/pvcalls-front.h |   3 +
 2 files changed, 149 insertions(+)

Comments

Boris Ostrovsky Oct. 17, 2017, 6:34 p.m. UTC | #1
On 10/06/2017 08:30 PM, Stefano Stabellini wrote:
> Introduce a waitqueue to allow only one outstanding accept command at
> any given time and to implement polling on the passive socket. Introduce
> a flags field to keep track of in-flight accept and poll commands.
> 
> Send PVCALLS_ACCEPT to the backend. Allocate a new active socket. Make
> sure that only one accept command is executed at any given time by
> setting PVCALLS_FLAG_ACCEPT_INFLIGHT and waiting on the
> inflight_accept_req waitqueue.
> 
> Convert the new struct sock_mapping pointer into an uint64_t and use it
> as id for the new socket to pass to the backend.
> 
> Check if the accept call is non-blocking: in that case after sending the
> ACCEPT command to the backend store the sock_mapping pointer of the new
> struct and the inflight req_id then return -EAGAIN (which will respond
> only when there is something to accept). Next time accept is called,
> we'll check if the ACCEPT command has been answered, if so we'll pick up
> where we left off, otherwise we return -EAGAIN again.
> 
> Note that, differently from the other commands, we can use
> wait_event_interruptible (instead of wait_event) in the case of accept
> as we are able to track the req_id of the ACCEPT response that we are
> waiting.
> 
> Signed-off-by: Stefano Stabellini <stefano@aporeto.com>
> CC: boris.ostrovsky@oracle.com
> CC: jgross@suse.com
> ---
>  drivers/xen/pvcalls-front.c | 146 ++++++++++++++++++++++++++++++++++++++++++++
>  drivers/xen/pvcalls-front.h |   3 +
>  2 files changed, 149 insertions(+)
> 
> diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
> index 5433fae..8958e74 100644
> --- a/drivers/xen/pvcalls-front.c
> +++ b/drivers/xen/pvcalls-front.c
> @@ -77,6 +77,16 @@ struct sock_mapping {
>  #define PVCALLS_STATUS_BIND          1
>  #define PVCALLS_STATUS_LISTEN        2
>  			uint8_t status;
> +		/*
> +		 * Internal state-machine flags.
> +		 * Only one accept operation can be inflight for a socket.
> +		 * Only one poll operation can be inflight for a given socket.
> +		 */
> +#define PVCALLS_FLAG_ACCEPT_INFLIGHT 0
> +			uint8_t flags;
> +			uint32_t inflight_req_id;
> +			struct sock_mapping *accept_map;
> +			wait_queue_head_t inflight_accept_req;
>  		} passive;
>  	};
>  };
> @@ -392,6 +402,8 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
>  	memcpy(req->u.bind.addr, addr, sizeof(*addr));
>  	req->u.bind.len = addr_len;
>  
> +	init_waitqueue_head(&map->passive.inflight_accept_req);
> +
>  	map->active_socket = false;
>  
>  	bedata->ring.req_prod_pvt++;
> @@ -470,6 +482,140 @@ int pvcalls_front_listen(struct socket *sock, int backlog)
>  	return ret;
>  }
>  
> +int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
> +{
> +	struct pvcalls_bedata *bedata;
> +	struct sock_mapping *map;
> +	struct sock_mapping *map2 = NULL;
> +	struct xen_pvcalls_request *req;
> +	int notify, req_id, ret, evtchn, nonblock;
> +
> +	pvcalls_enter();
> +	if (!pvcalls_front_dev) {
> +		pvcalls_exit();
> +		return -ENOTCONN;
> +	}
> +	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
> +
> +	map = (struct sock_mapping *) sock->sk->sk_send_head;
> +	if (!map) {
> +		pvcalls_exit();
> +		return -ENOTSOCK;
> +	}
> +
> +	if (map->passive.status != PVCALLS_STATUS_LISTEN) {
> +		pvcalls_exit();
> +		return -EINVAL;
> +	}
> +
> +	nonblock = flags & SOCK_NONBLOCK;
> +	/*
> +	 * Backend only supports 1 inflight accept request, will return
> +	 * errors for the others
> +	 */
> +	if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> +			     (void *)&map->passive.flags)) {
> +		req_id = READ_ONCE(map->passive.inflight_req_id);
> +		if (req_id != PVCALLS_INVALID_ID &&
> +		    READ_ONCE(bedata->rsp[req_id].req_id) == req_id) {


READ_ONCE (especially the second one)? I know I may sound fixated on
this but I really don't understand how the compiler may do anything wrong
if straight reads were used.

For the first case, I guess, theoretically the compiler may decide to
re-fetch map->passive.inflight_req_id. But even if it did, would that be
a problem? Both of these READ_ONCE targets are updated below before
PVCALLS_FLAG_ACCEPT_INFLIGHT is cleared so there should not be any
change between re-fetching, I think. (The only exception is the nonblock
case, which does a WRITE_ONCE that I don't understand either.)


> +			map2 = map->passive.accept_map;
> +			goto received;
> +		}
> +		if (nonblock) {
> +			pvcalls_exit();
> +			return -EAGAIN;
> +		}
> +		if (wait_event_interruptible(map->passive.inflight_accept_req,
> +			!test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> +					  (void *)&map->passive.flags))) {
> +			pvcalls_exit();
> +			return -EINTR;
> +		}
> +	}
> +
> +	spin_lock(&bedata->socket_lock);
> +	ret = get_request(bedata, &req_id);
> +	if (ret < 0) {
> +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> +			  (void *)&map->passive.flags);
> +		spin_unlock(&bedata->socket_lock);
> +		pvcalls_exit();
> +		return ret;
> +	}
> +	map2 = kzalloc(sizeof(*map2), GFP_KERNEL);
> +	if (map2 == NULL) {
> +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> +			  (void *)&map->passive.flags);
> +		spin_unlock(&bedata->socket_lock);
> +		pvcalls_exit();
> +		return -ENOMEM;
> +	}
> +	ret =  create_active(map2, &evtchn);
> +	if (ret < 0) {
> +		kfree(map2);
> +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> +			  (void *)&map->passive.flags);
> +		spin_unlock(&bedata->socket_lock);
> +		pvcalls_exit();
> +		return -ENOMEM;

Why not ret?

-boris


> +	}
> +	list_add_tail(&map2->list, &bedata->socket_mappings);
> +
> +	req = RING_GET_REQUEST(&bedata->ring, req_id);
> +	req->req_id = req_id;
> +	req->cmd = PVCALLS_ACCEPT;
> +	req->u.accept.id = (uint64_t) map;
> +	req->u.accept.ref = map2->active.ref;
> +	req->u.accept.id_new = (uint64_t) map2;
> +	req->u.accept.evtchn = evtchn;
> +	map->passive.accept_map = map2;
> +
> +	bedata->ring.req_prod_pvt++;
> +	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
> +	spin_unlock(&bedata->socket_lock);
> +	if (notify)
> +		notify_remote_via_irq(bedata->irq);
> +	/* We could check if we have received a response before returning. */
> +	if (nonblock) {
> +		WRITE_ONCE(map->passive.inflight_req_id, req_id);
> +		pvcalls_exit();
> +		return -EAGAIN;
> +	}
> +
> +	if (wait_event_interruptible(bedata->inflight_req,
> +		READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) {
> +		pvcalls_exit();
> +		return -EINTR;
> +	}
> +	/* read req_id, then the content */
> +	smp_rmb();
> +
> +received:
> +	map2->sock = newsock;
> +	newsock->sk = kzalloc(sizeof(*newsock->sk), GFP_KERNEL);
> +	if (!newsock->sk) {
> +		bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
> +		map->passive.inflight_req_id = PVCALLS_INVALID_ID;
> +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> +			  (void *)&map->passive.flags);
> +		pvcalls_front_free_map(bedata, map2);
> +		kfree(map2);
> +		pvcalls_exit();
> +		return -ENOMEM;
> +	}
> +	newsock->sk->sk_send_head = (void *)map2;
> +
> +	ret = bedata->rsp[req_id].ret;
> +	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
> +	map->passive.inflight_req_id = PVCALLS_INVALID_ID;
> +
> +	clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags);
> +	wake_up(&map->passive.inflight_accept_req);
> +
> +	pvcalls_exit();
> +	return ret;
> +}
> +
>  static const struct xenbus_device_id pvcalls_front_ids[] = {
>  	{ "pvcalls" },
>  	{ "" }
> diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h
> index aa8fe10..ab4f1da 100644
> --- a/drivers/xen/pvcalls-front.h
> +++ b/drivers/xen/pvcalls-front.h
> @@ -10,5 +10,8 @@ int pvcalls_front_bind(struct socket *sock,
>  		       struct sockaddr *addr,
>  		       int addr_len);
>  int pvcalls_front_listen(struct socket *sock, int backlog);
> +int pvcalls_front_accept(struct socket *sock,
> +			 struct socket *newsock,
> +			 int flags);
>  
>  #endif
>
Stefano Stabellini Oct. 23, 2017, 11:03 p.m. UTC | #2
On Tue, 17 Oct 2017, Boris Ostrovsky wrote:
> On 10/06/2017 08:30 PM, Stefano Stabellini wrote:
> > Introduce a waitqueue to allow only one outstanding accept command at
> > any given time and to implement polling on the passive socket. Introduce
> > a flags field to keep track of in-flight accept and poll commands.
> > 
> > Send PVCALLS_ACCEPT to the backend. Allocate a new active socket. Make
> > sure that only one accept command is executed at any given time by
> > setting PVCALLS_FLAG_ACCEPT_INFLIGHT and waiting on the
> > inflight_accept_req waitqueue.
> > 
> > Convert the new struct sock_mapping pointer into an uint64_t and use it
> > as id for the new socket to pass to the backend.
> > 
> > Check if the accept call is non-blocking: in that case after sending the
> > ACCEPT command to the backend store the sock_mapping pointer of the new
> > struct and the inflight req_id then return -EAGAIN (which will respond
> > only when there is something to accept). Next time accept is called,
> > we'll check if the ACCEPT command has been answered, if so we'll pick up
> > where we left off, otherwise we return -EAGAIN again.
> > 
> > Note that, differently from the other commands, we can use
> > wait_event_interruptible (instead of wait_event) in the case of accept
> > as we are able to track the req_id of the ACCEPT response that we are
> > waiting.
> > 
> > Signed-off-by: Stefano Stabellini <stefano@aporeto.com>
> > CC: boris.ostrovsky@oracle.com
> > CC: jgross@suse.com
> > ---
> >  drivers/xen/pvcalls-front.c | 146 ++++++++++++++++++++++++++++++++++++++++++++
> >  drivers/xen/pvcalls-front.h |   3 +
> >  2 files changed, 149 insertions(+)
> > 
> > diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
> > index 5433fae..8958e74 100644
> > --- a/drivers/xen/pvcalls-front.c
> > +++ b/drivers/xen/pvcalls-front.c
> > @@ -77,6 +77,16 @@ struct sock_mapping {
> >  #define PVCALLS_STATUS_BIND          1
> >  #define PVCALLS_STATUS_LISTEN        2
> >  			uint8_t status;
> > +		/*
> > +		 * Internal state-machine flags.
> > +		 * Only one accept operation can be inflight for a socket.
> > +		 * Only one poll operation can be inflight for a given socket.
> > +		 */
> > +#define PVCALLS_FLAG_ACCEPT_INFLIGHT 0
> > +			uint8_t flags;
> > +			uint32_t inflight_req_id;
> > +			struct sock_mapping *accept_map;
> > +			wait_queue_head_t inflight_accept_req;
> >  		} passive;
> >  	};
> >  };
> > @@ -392,6 +402,8 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
> >  	memcpy(req->u.bind.addr, addr, sizeof(*addr));
> >  	req->u.bind.len = addr_len;
> >  
> > +	init_waitqueue_head(&map->passive.inflight_accept_req);
> > +
> >  	map->active_socket = false;
> >  
> >  	bedata->ring.req_prod_pvt++;
> > @@ -470,6 +482,140 @@ int pvcalls_front_listen(struct socket *sock, int backlog)
> >  	return ret;
> >  }
> >  
> > +int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
> > +{
> > +	struct pvcalls_bedata *bedata;
> > +	struct sock_mapping *map;
> > +	struct sock_mapping *map2 = NULL;
> > +	struct xen_pvcalls_request *req;
> > +	int notify, req_id, ret, evtchn, nonblock;
> > +
> > +	pvcalls_enter();
> > +	if (!pvcalls_front_dev) {
> > +		pvcalls_exit();
> > +		return -ENOTCONN;
> > +	}
> > +	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
> > +
> > +	map = (struct sock_mapping *) sock->sk->sk_send_head;
> > +	if (!map) {
> > +		pvcalls_exit();
> > +		return -ENOTSOCK;
> > +	}
> > +
> > +	if (map->passive.status != PVCALLS_STATUS_LISTEN) {
> > +		pvcalls_exit();
> > +		return -EINVAL;
> > +	}
> > +
> > +	nonblock = flags & SOCK_NONBLOCK;
> > +	/*
> > +	 * Backend only supports 1 inflight accept request, will return
> > +	 * errors for the others
> > +	 */
> > +	if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> > +			     (void *)&map->passive.flags)) {
> > +		req_id = READ_ONCE(map->passive.inflight_req_id);
> > +		if (req_id != PVCALLS_INVALID_ID &&
> > +		    READ_ONCE(bedata->rsp[req_id].req_id) == req_id) {
> 
> 
> READ_ONCE (especially the second one)? I know I may sound fixated on
> this but I really don't understand how compiler may do anything wrong if
> straight reads were used.
> 
> For the first case, I guess, theoretically the compiler may decide to
> re-fetch map->passive.inflight_req_id. But even if it did, would that be
> a problem? Both of these READ_ONCE targets are updated below before
> PVCALLS_FLAG_ACCEPT_INFLIGHT is cleared so there should not be any
> change between re-fetching, I think. (The only exception is the noblock
> case, which does WRITE_ONCE that don't understand either)

READ_ONCE is reasonably cheap: do we really want to have this kind of
conversation every time we touch this code in the future? Personally, I
would have used READ/WRITE_ONCE everywhere for inflight_req_id and
req_id, because it makes the code easier to understand. 

We have already limited their usage, but at least we have followed a set
of guidelines. Doing further optimizations on this code seems
unnecessary and prone to confuse the reader.


> > +			map2 = map->passive.accept_map;
> > +			goto received;
> > +		}
> > +		if (nonblock) {
> > +			pvcalls_exit();
> > +			return -EAGAIN;
> > +		}
> > +		if (wait_event_interruptible(map->passive.inflight_accept_req,
> > +			!test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> > +					  (void *)&map->passive.flags))) {
> > +			pvcalls_exit();
> > +			return -EINTR;
> > +		}
> > +	}
> > +
> > +	spin_lock(&bedata->socket_lock);
> > +	ret = get_request(bedata, &req_id);
> > +	if (ret < 0) {
> > +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> > +			  (void *)&map->passive.flags);
> > +		spin_unlock(&bedata->socket_lock);
> > +		pvcalls_exit();
> > +		return ret;
> > +	}
> > +	map2 = kzalloc(sizeof(*map2), GFP_KERNEL);
> > +	if (map2 == NULL) {
> > +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> > +			  (void *)&map->passive.flags);
> > +		spin_unlock(&bedata->socket_lock);
> > +		pvcalls_exit();
> > +		return -ENOMEM;
> > +	}
> > +	ret =  create_active(map2, &evtchn);
> > +	if (ret < 0) {
> > +		kfree(map2);
> > +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> > +			  (void *)&map->passive.flags);
> > +		spin_unlock(&bedata->socket_lock);
> > +		pvcalls_exit();
> > +		return -ENOMEM;
> 
> Why not ret?

yes, good idea.


> 
> > +	}
> > +	list_add_tail(&map2->list, &bedata->socket_mappings);
> > +
> > +	req = RING_GET_REQUEST(&bedata->ring, req_id);
> > +	req->req_id = req_id;
> > +	req->cmd = PVCALLS_ACCEPT;
> > +	req->u.accept.id = (uint64_t) map;
> > +	req->u.accept.ref = map2->active.ref;
> > +	req->u.accept.id_new = (uint64_t) map2;
> > +	req->u.accept.evtchn = evtchn;
> > +	map->passive.accept_map = map2;
> > +
> > +	bedata->ring.req_prod_pvt++;
> > +	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
> > +	spin_unlock(&bedata->socket_lock);
> > +	if (notify)
> > +		notify_remote_via_irq(bedata->irq);
> > +	/* We could check if we have received a response before returning. */
> > +	if (nonblock) {
> > +		WRITE_ONCE(map->passive.inflight_req_id, req_id);
> > +		pvcalls_exit();
> > +		return -EAGAIN;
> > +	}
> > +
> > +	if (wait_event_interruptible(bedata->inflight_req,
> > +		READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) {
> > +		pvcalls_exit();
> > +		return -EINTR;
> > +	}
> > +	/* read req_id, then the content */
> > +	smp_rmb();
> > +
> > +received:
> > +	map2->sock = newsock;
> > +	newsock->sk = kzalloc(sizeof(*newsock->sk), GFP_KERNEL);
> > +	if (!newsock->sk) {
> > +		bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
> > +		map->passive.inflight_req_id = PVCALLS_INVALID_ID;
> > +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> > +			  (void *)&map->passive.flags);
> > +		pvcalls_front_free_map(bedata, map2);
> > +		kfree(map2);
> > +		pvcalls_exit();
> > +		return -ENOMEM;
> > +	}
> > +	newsock->sk->sk_send_head = (void *)map2;
> > +
> > +	ret = bedata->rsp[req_id].ret;
> > +	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
> > +	map->passive.inflight_req_id = PVCALLS_INVALID_ID;
> > +
> > +	clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags);
> > +	wake_up(&map->passive.inflight_accept_req);
> > +
> > +	pvcalls_exit();
> > +	return ret;
> > +}
> > +
> >  static const struct xenbus_device_id pvcalls_front_ids[] = {
> >  	{ "pvcalls" },
> >  	{ "" }
> > diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h
> > index aa8fe10..ab4f1da 100644
> > --- a/drivers/xen/pvcalls-front.h
> > +++ b/drivers/xen/pvcalls-front.h
> > @@ -10,5 +10,8 @@ int pvcalls_front_bind(struct socket *sock,
> >  		       struct sockaddr *addr,
> >  		       int addr_len);
> >  int pvcalls_front_listen(struct socket *sock, int backlog);
> > +int pvcalls_front_accept(struct socket *sock,
> > +			 struct socket *newsock,
> > +			 int flags);
> >  
> >  #endif
> > 
>
Boris Ostrovsky Oct. 24, 2017, 1:52 p.m. UTC | #3
On 10/23/2017 07:03 PM, Stefano Stabellini wrote:
> On Tue, 17 Oct 2017, Boris Ostrovsky wrote:
>> On 10/06/2017 08:30 PM, Stefano Stabellini wrote:
>>> +	/*
>>> +	 * Backend only supports 1 inflight accept request, will return
>>> +	 * errors for the others
>>> +	 */
>>> +	if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
>>> +			     (void *)&map->passive.flags)) {
>>> +		req_id = READ_ONCE(map->passive.inflight_req_id);
>>> +		if (req_id != PVCALLS_INVALID_ID &&
>>> +		    READ_ONCE(bedata->rsp[req_id].req_id) == req_id) {
>>
>> READ_ONCE (especially the second one)? I know I may sound fixated on
>> this but I really don't understand how compiler may do anything wrong if
>> straight reads were used.
>>
>> For the first case, I guess, theoretically the compiler may decide to
>> re-fetch map->passive.inflight_req_id. But even if it did, would that be
>> a problem? Both of these READ_ONCE targets are updated below before
>> PVCALLS_FLAG_ACCEPT_INFLIGHT is cleared so there should not be any
>> change between re-fetching, I think. (The only exception is the noblock
>> case, which does WRITE_ONCE that don't understand either)
> READ_ONCE is reasonably cheap: do we really want to have this kind of
> conversation every time we touch this code in the future? Personally, I
> would have used READ/WRITE_ONCE everywhere for inflight_req_id and
> req_id, because it makes the code easier to understand.

I guess it's a matter of opinion. I actually think it's harder to read.

But it doesn't make the code wrong so...

>
> We have already limited their usage, but at least we have followed a set
> of guidelines. Doing further optimizations on this code seems
> unnecessary and prone to confuse the reader.
>
>

>>> +	ret =  create_active(map2, &evtchn);
>>> +	if (ret < 0) {
>>> +		kfree(map2);
>>> +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
>>> +			  (void *)&map->passive.flags);
>>> +		spin_unlock(&bedata->socket_lock);
>>> +		pvcalls_exit();
>>> +		return -ENOMEM;
>> Why not ret?
> yes, good idea.

With that fixed (and extra space removed in 'ret =  create_active(map2,
&evtchn);')

Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Stefano Stabellini Oct. 24, 2017, 4:42 p.m. UTC | #4
On Tue, 24 Oct 2017, Boris Ostrovsky wrote:
> On 10/23/2017 07:03 PM, Stefano Stabellini wrote:
> > On Tue, 17 Oct 2017, Boris Ostrovsky wrote:
> >> On 10/06/2017 08:30 PM, Stefano Stabellini wrote:
> >>> +	/*
> >>> +	 * Backend only supports 1 inflight accept request, will return
> >>> +	 * errors for the others
> >>> +	 */
> >>> +	if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> >>> +			     (void *)&map->passive.flags)) {
> >>> +		req_id = READ_ONCE(map->passive.inflight_req_id);
> >>> +		if (req_id != PVCALLS_INVALID_ID &&
> >>> +		    READ_ONCE(bedata->rsp[req_id].req_id) == req_id) {
> >>
> >> READ_ONCE (especially the second one)? I know I may sound fixated on
> >> this but I really don't understand how compiler may do anything wrong if
> >> straight reads were used.
> >>
> >> For the first case, I guess, theoretically the compiler may decide to
> >> re-fetch map->passive.inflight_req_id. But even if it did, would that be
> >> a problem? Both of these READ_ONCE targets are updated below before
> >> PVCALLS_FLAG_ACCEPT_INFLIGHT is cleared so there should not be any
> >> change between re-fetching, I think. (The only exception is the noblock
> >> case, which does WRITE_ONCE that don't understand either)
> > READ_ONCE is reasonably cheap: do we really want to have this kind of
> > conversation every time we touch this code in the future? Personally, I
> > would have used READ/WRITE_ONCE everywhere for inflight_req_id and
> > req_id, because it makes the code easier to understand.
> 
> I guess it's a matter of opinion. I actually think it's harder to read.
> 
> But it doesn't make the code wrong so...
> 
> >
> > We have already limited their usage, but at least we have followed a set
> > of guidelines. Doing further optimizations on this code seems
> > unnecessary and prone to confuse the reader.
> >
> >
> 
> >>> +	ret =  create_active(map2, &evtchn);
> >>> +	if (ret < 0) {
> >>> +		kfree(map2);
> >>> +		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
> >>> +			  (void *)&map->passive.flags);
> >>> +		spin_unlock(&bedata->socket_lock);
> >>> +		pvcalls_exit();
> >>> +		return -ENOMEM;
> >> Why not ret?
> > yes, good idea.
> 
> With that fixed (and extra space removed in 'ret =  create_active(map2,
> &evtchn);')
> 
> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>

Thank you!
diff mbox

Patch

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 5433fae..8958e74 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -77,6 +77,16 @@  struct sock_mapping {
 #define PVCALLS_STATUS_BIND          1
 #define PVCALLS_STATUS_LISTEN        2
 			uint8_t status;
+		/*
+		 * Internal state-machine flags.
+		 * Only one accept operation can be inflight for a socket.
+		 * Only one poll operation can be inflight for a given socket.
+		 */
+#define PVCALLS_FLAG_ACCEPT_INFLIGHT 0
+			uint8_t flags;
+			uint32_t inflight_req_id;
+			struct sock_mapping *accept_map;
+			wait_queue_head_t inflight_accept_req;
 		} passive;
 	};
 };
@@ -392,6 +402,8 @@  int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	memcpy(req->u.bind.addr, addr, sizeof(*addr));
 	req->u.bind.len = addr_len;
 
+	init_waitqueue_head(&map->passive.inflight_accept_req);
+
 	map->active_socket = false;
 
 	bedata->ring.req_prod_pvt++;
@@ -470,6 +482,140 @@  int pvcalls_front_listen(struct socket *sock, int backlog)
 	return ret;
 }
 
+int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct pvcalls_bedata *bedata;
+	struct sock_mapping *map;
+	struct sock_mapping *map2 = NULL;
+	struct xen_pvcalls_request *req;
+	int notify, req_id, ret, evtchn, nonblock;
+
+	pvcalls_enter();
+	if (!pvcalls_front_dev) {
+		pvcalls_exit();
+		return -ENOTCONN;
+	}
+	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+	map = (struct sock_mapping *) sock->sk->sk_send_head;
+	if (!map) {
+		pvcalls_exit();
+		return -ENOTSOCK;
+	}
+
+	if (map->passive.status != PVCALLS_STATUS_LISTEN) {
+		pvcalls_exit();
+		return -EINVAL;
+	}
+
+	nonblock = flags & SOCK_NONBLOCK;
+	/*
+	 * Backend only supports 1 inflight accept request, will return
+	 * errors for the others
+	 */
+	if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			     (void *)&map->passive.flags)) {
+		req_id = READ_ONCE(map->passive.inflight_req_id);
+		if (req_id != PVCALLS_INVALID_ID &&
+		    READ_ONCE(bedata->rsp[req_id].req_id) == req_id) {
+			map2 = map->passive.accept_map;
+			goto received;
+		}
+		if (nonblock) {
+			pvcalls_exit();
+			return -EAGAIN;
+		}
+		if (wait_event_interruptible(map->passive.inflight_accept_req,
+			!test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+					  (void *)&map->passive.flags))) {
+			pvcalls_exit();
+			return -EINTR;
+		}
+	}
+
+	spin_lock(&bedata->socket_lock);
+	ret = get_request(bedata, &req_id);
+	if (ret < 0) {
+		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			  (void *)&map->passive.flags);
+		spin_unlock(&bedata->socket_lock);
+		pvcalls_exit();
+		return ret;
+	}
+	map2 = kzalloc(sizeof(*map2), GFP_KERNEL);
+	if (map2 == NULL) {
+		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			  (void *)&map->passive.flags);
+		spin_unlock(&bedata->socket_lock);
+		pvcalls_exit();
+		return -ENOMEM;
+	}
+	ret =  create_active(map2, &evtchn);
+	if (ret < 0) {
+		kfree(map2);
+		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			  (void *)&map->passive.flags);
+		spin_unlock(&bedata->socket_lock);
+		pvcalls_exit();
+		return -ENOMEM;
+	}
+	list_add_tail(&map2->list, &bedata->socket_mappings);
+
+	req = RING_GET_REQUEST(&bedata->ring, req_id);
+	req->req_id = req_id;
+	req->cmd = PVCALLS_ACCEPT;
+	req->u.accept.id = (uint64_t) map;
+	req->u.accept.ref = map2->active.ref;
+	req->u.accept.id_new = (uint64_t) map2;
+	req->u.accept.evtchn = evtchn;
+	map->passive.accept_map = map2;
+
+	bedata->ring.req_prod_pvt++;
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+	spin_unlock(&bedata->socket_lock);
+	if (notify)
+		notify_remote_via_irq(bedata->irq);
+	/* We could check if we have received a response before returning. */
+	if (nonblock) {
+		WRITE_ONCE(map->passive.inflight_req_id, req_id);
+		pvcalls_exit();
+		return -EAGAIN;
+	}
+
+	if (wait_event_interruptible(bedata->inflight_req,
+		READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) {
+		pvcalls_exit();
+		return -EINTR;
+	}
+	/* read req_id, then the content */
+	smp_rmb();
+
+received:
+	map2->sock = newsock;
+	newsock->sk = kzalloc(sizeof(*newsock->sk), GFP_KERNEL);
+	if (!newsock->sk) {
+		bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
+		map->passive.inflight_req_id = PVCALLS_INVALID_ID;
+		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			  (void *)&map->passive.flags);
+		pvcalls_front_free_map(bedata, map2);
+		kfree(map2);
+		pvcalls_exit();
+		return -ENOMEM;
+	}
+	newsock->sk->sk_send_head = (void *)map2;
+
+	ret = bedata->rsp[req_id].ret;
+	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
+	map->passive.inflight_req_id = PVCALLS_INVALID_ID;
+
+	clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags);
+	wake_up(&map->passive.inflight_accept_req);
+
+	pvcalls_exit();
+	return ret;
+}
+
 static const struct xenbus_device_id pvcalls_front_ids[] = {
 	{ "pvcalls" },
 	{ "" }
diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h
index aa8fe10..ab4f1da 100644
--- a/drivers/xen/pvcalls-front.h
+++ b/drivers/xen/pvcalls-front.h
@@ -10,5 +10,8 @@  int pvcalls_front_bind(struct socket *sock,
 		       struct sockaddr *addr,
 		       int addr_len);
 int pvcalls_front_listen(struct socket *sock, int backlog);
+int pvcalls_front_accept(struct socket *sock,
+			 struct socket *newsock,
+			 int flags);
 
 #endif