diff mbox series

[v2,2/5] vfio-ccw: concurrent I/O handling

Message ID 20190121110354.2247-3-cohuck@redhat.com (mailing list archive)
State New, archived
Headers show
Series vfio-ccw: support hsch/csch (kernel part) | expand

Commit Message

Cornelia Huck Jan. 21, 2019, 11:03 a.m. UTC
Rework handling of multiple I/O requests to return -EAGAIN if
we are already processing an I/O request. Introduce a mutex
to disallow concurrent writes to the I/O region.

The expectation is that userspace simply retries the operation
if it gets -EAGAIN.

We currently don't allow multiple ssch requests at the same
time, as we don't have support for keeping channel programs
around for more than one request.

Signed-off-by: Cornelia Huck <cohuck@redhat.com>
---
 drivers/s390/cio/vfio_ccw_drv.c     |  1 +
 drivers/s390/cio/vfio_ccw_fsm.c     |  8 +++-----
 drivers/s390/cio/vfio_ccw_ops.c     | 31 +++++++++++++++++++----------
 drivers/s390/cio/vfio_ccw_private.h |  2 ++
 4 files changed, 26 insertions(+), 16 deletions(-)

Comments

Halil Pasic Jan. 21, 2019, 8:20 p.m. UTC | #1
On Mon, 21 Jan 2019 12:03:51 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> Rework handling of multiple I/O requests to return -EAGAIN if
> we are already processing an I/O request. Introduce a mutex
> to disallow concurrent writes to the I/O region.
> 
> The expectation is that userspace simply retries the operation
> if it gets -EAGAIN.
> 
> We currently don't allow multiple ssch requests at the same
> time, as we don't have support for keeping channel programs
> around for more than one request.
> 
> Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> ---

[..]

>  static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
>  {
>  	struct vfio_ccw_private *private;
>  	struct ccw_io_region *region;
> +	int ret;
>  
>  	if (*ppos + count > sizeof(*region))
>  		return -EINVAL;
>  
>  	private = dev_get_drvdata(mdev_parent_dev(mdev));
> -	if (private->state != VFIO_CCW_STATE_IDLE)
> +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> +	    private->state == VFIO_CCW_STATE_STANDBY)
>  		return -EACCES;
> +	if (!mutex_trylock(&private->io_mutex))
> +		return -EAGAIN;
>  
>  	region = private->io_region;
> -	if (copy_from_user((void *)region + *ppos, buf, count))
> -		return -EFAULT;
> +	if (copy_from_user((void *)region + *ppos, buf, count)) {

This might race with vfio_ccw_sch_io_todo() on
private->io_region->irb_area, or?

> +		ret = -EFAULT;
> +		goto out_unlock;
> +	}
>  
>  	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ);
> -	if (region->ret_code != 0) {
> -		private->state = VFIO_CCW_STATE_IDLE;
> -		return region->ret_code;
> -	}
> +	ret = (region->ret_code != 0) ? region->ret_code : count;
>  
> -	return count;
> +out_unlock:
> +	mutex_unlock(&private->io_mutex);
> +	return ret;
>  }
>  
[..]
Cornelia Huck Jan. 22, 2019, 10:29 a.m. UTC | #2
On Mon, 21 Jan 2019 21:20:18 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Mon, 21 Jan 2019 12:03:51 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > Rework handling of multiple I/O requests to return -EAGAIN if
> > we are already processing an I/O request. Introduce a mutex
> > to disallow concurrent writes to the I/O region.
> > 
> > The expectation is that userspace simply retries the operation
> > if it gets -EAGAIN.
> > 
> > We currently don't allow multiple ssch requests at the same
> > time, as we don't have support for keeping channel programs
> > around for more than one request.
> > 
> > Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> > ---  
> 
> [..]
> 
> >  static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> >  {
> >  	struct vfio_ccw_private *private;
> >  	struct ccw_io_region *region;
> > +	int ret;
> >  
> >  	if (*ppos + count > sizeof(*region))
> >  		return -EINVAL;
> >  
> >  	private = dev_get_drvdata(mdev_parent_dev(mdev));
> > -	if (private->state != VFIO_CCW_STATE_IDLE)
> > +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> > +	    private->state == VFIO_CCW_STATE_STANDBY)
> >  		return -EACCES;
> > +	if (!mutex_trylock(&private->io_mutex))
> > +		return -EAGAIN;
> >  
> >  	region = private->io_region;
> > -	if (copy_from_user((void *)region + *ppos, buf, count))
> > -		return -EFAULT;
> > +	if (copy_from_user((void *)region + *ppos, buf, count)) {  
> 
> This might race with vfio_ccw_sch_io_todo() on
> private->io_region->irb_area, or?

Ah yes, this should also take the mutex (should work because we're on a
workqueue).

> 
> > +		ret = -EFAULT;
> > +		goto out_unlock;
> > +	}
> >  
> >  	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ);
> > -	if (region->ret_code != 0) {
> > -		private->state = VFIO_CCW_STATE_IDLE;
> > -		return region->ret_code;
> > -	}
> > +	ret = (region->ret_code != 0) ? region->ret_code : count;
> >  
> > -	return count;
> > +out_unlock:
> > +	mutex_unlock(&private->io_mutex);
> > +	return ret;
> >  }
> >    
> [..]
>
Halil Pasic Jan. 22, 2019, 11:17 a.m. UTC | #3
On Tue, 22 Jan 2019 11:29:26 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Mon, 21 Jan 2019 21:20:18 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > On Mon, 21 Jan 2019 12:03:51 +0100
> > Cornelia Huck <cohuck@redhat.com> wrote:
> > 
> > > Rework handling of multiple I/O requests to return -EAGAIN if
> > > we are already processing an I/O request. Introduce a mutex
> > > to disallow concurrent writes to the I/O region.
> > > 
> > > The expectation is that userspace simply retries the operation
> > > if it gets -EAGAIN.
> > > 
> > > We currently don't allow multiple ssch requests at the same
> > > time, as we don't have support for keeping channel programs
> > > around for more than one request.
> > > 
> > > Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> > > ---  
> > 
> > [..]
> > 
> > >  static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > > @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > >  {
> > >  	struct vfio_ccw_private *private;
> > >  	struct ccw_io_region *region;
> > > +	int ret;
> > >  
> > >  	if (*ppos + count > sizeof(*region))
> > >  		return -EINVAL;
> > >  
> > >  	private = dev_get_drvdata(mdev_parent_dev(mdev));
> > > -	if (private->state != VFIO_CCW_STATE_IDLE)
> > > +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> > > +	    private->state == VFIO_CCW_STATE_STANDBY)
> > >  		return -EACCES;
> > > +	if (!mutex_trylock(&private->io_mutex))
> > > +		return -EAGAIN;
> > >  
> > >  	region = private->io_region;
> > > -	if (copy_from_user((void *)region + *ppos, buf, count))
> > > -		return -EFAULT;
> > > +	if (copy_from_user((void *)region + *ppos, buf, count)) {  
> > 
> > This might race with vfio_ccw_sch_io_todo() on
> > private->io_region->irb_area, or?
> 
> Ah yes, this should also take the mutex (should work because we're on a
> workqueue).
> 

I'm not sure that will do the trick (assumed I understood the
intention correctly). Let's say the things happen in this order:
1) vfio_ccw_sch_io_todo() goes first, I guess updates
private->io_region->irb_area and releases the mutex.
2) Then vfio_ccw_mdev_write() destroys the irb_area by zeriong it out,
and finally,
3) userspace reads the destroyed irb_area using vfio_ccw_mdev_read().

Or am I misunderstanding something? 

Regards,
Halil
Cornelia Huck Jan. 22, 2019, 11:53 a.m. UTC | #4
On Tue, 22 Jan 2019 12:17:37 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 22 Jan 2019 11:29:26 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Mon, 21 Jan 2019 21:20:18 +0100
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> > > On Mon, 21 Jan 2019 12:03:51 +0100
> > > Cornelia Huck <cohuck@redhat.com> wrote:
> > >   
> > > > Rework handling of multiple I/O requests to return -EAGAIN if
> > > > we are already processing an I/O request. Introduce a mutex
> > > > to disallow concurrent writes to the I/O region.
> > > > 
> > > > The expectation is that userspace simply retries the operation
> > > > if it gets -EAGAIN.
> > > > 
> > > > We currently don't allow multiple ssch requests at the same
> > > > time, as we don't have support for keeping channel programs
> > > > around for more than one request.
> > > > 
> > > > Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> > > > ---    
> > > 
> > > [..]
> > >   
> > > >  static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > > > @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > > >  {
> > > >  	struct vfio_ccw_private *private;
> > > >  	struct ccw_io_region *region;
> > > > +	int ret;
> > > >  
> > > >  	if (*ppos + count > sizeof(*region))
> > > >  		return -EINVAL;
> > > >  
> > > >  	private = dev_get_drvdata(mdev_parent_dev(mdev));
> > > > -	if (private->state != VFIO_CCW_STATE_IDLE)
> > > > +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> > > > +	    private->state == VFIO_CCW_STATE_STANDBY)
> > > >  		return -EACCES;
> > > > +	if (!mutex_trylock(&private->io_mutex))
> > > > +		return -EAGAIN;
> > > >  
> > > >  	region = private->io_region;
> > > > -	if (copy_from_user((void *)region + *ppos, buf, count))
> > > > -		return -EFAULT;
> > > > +	if (copy_from_user((void *)region + *ppos, buf, count)) {    
> > > 
> > > This might race with vfio_ccw_sch_io_todo() on
> > > private->io_region->irb_area, or?  
> > 
> > Ah yes, this should also take the mutex (should work because we're on a
> > workqueue).
> >   
> 
> I'm not sure that will do the trick (assumed I understood the
> intention correctly). Let's say the things happen in this order:
> 1) vfio_ccw_sch_io_todo() goes first, I guess updates
> private->io_region->irb_area and releases the mutex.
> 2) Then vfio_ccw_mdev_write() destroys the irb_area by zeriong it out,
> and finally,
> 3) userspace reads the destroyed irb_area using vfio_ccw_mdev_read().
> 
> Or am I misunderstanding something? 

You're not, but dealing with that race is outside the scope of this
patch. If userspace submits a request and then tries to get the old
data for a prior request, I suggest that userspace needs to fix their
sequencing.
Halil Pasic Jan. 22, 2019, 12:46 p.m. UTC | #5
On Tue, 22 Jan 2019 12:53:22 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Tue, 22 Jan 2019 12:17:37 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > On Tue, 22 Jan 2019 11:29:26 +0100
> > Cornelia Huck <cohuck@redhat.com> wrote:
> > 
> > > On Mon, 21 Jan 2019 21:20:18 +0100
> > > Halil Pasic <pasic@linux.ibm.com> wrote:
> > >   
> > > > On Mon, 21 Jan 2019 12:03:51 +0100
> > > > Cornelia Huck <cohuck@redhat.com> wrote:
> > > >   
> > > > > Rework handling of multiple I/O requests to return -EAGAIN if
> > > > > we are already processing an I/O request. Introduce a mutex
> > > > > to disallow concurrent writes to the I/O region.
> > > > > 
> > > > > The expectation is that userspace simply retries the operation
> > > > > if it gets -EAGAIN.
> > > > > 
> > > > > We currently don't allow multiple ssch requests at the same
> > > > > time, as we don't have support for keeping channel programs
> > > > > around for more than one request.
> > > > > 
> > > > > Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> > > > > ---    
> > > > 
> > > > [..]
> > > >   
> > > > >  static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > > > > @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > > > >  {
> > > > >  	struct vfio_ccw_private *private;
> > > > >  	struct ccw_io_region *region;
> > > > > +	int ret;
> > > > >  
> > > > >  	if (*ppos + count > sizeof(*region))
> > > > >  		return -EINVAL;
> > > > >  
> > > > >  	private = dev_get_drvdata(mdev_parent_dev(mdev));
> > > > > -	if (private->state != VFIO_CCW_STATE_IDLE)
> > > > > +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> > > > > +	    private->state == VFIO_CCW_STATE_STANDBY)
> > > > >  		return -EACCES;
> > > > > +	if (!mutex_trylock(&private->io_mutex))
> > > > > +		return -EAGAIN;
> > > > >  
> > > > >  	region = private->io_region;
> > > > > -	if (copy_from_user((void *)region + *ppos, buf, count))
> > > > > -		return -EFAULT;
> > > > > +	if (copy_from_user((void *)region + *ppos, buf, count)) {    
> > > > 
> > > > This might race with vfio_ccw_sch_io_todo() on
> > > > private->io_region->irb_area, or?  
> > > 
> > > Ah yes, this should also take the mutex (should work because we're on a
> > > workqueue).
> > >   
> > 
> > I'm not sure that will do the trick (assumed I understood the
> > intention correctly). Let's say the things happen in this order:
> > 1) vfio_ccw_sch_io_todo() goes first, I guess updates
> > private->io_region->irb_area and releases the mutex.
> > 2) Then vfio_ccw_mdev_write() destroys the irb_area by zeriong it out,
> > and finally,
> > 3) userspace reads the destroyed irb_area using vfio_ccw_mdev_read().
> > 
> > Or am I misunderstanding something? 
> 
> You're not, but dealing with that race is outside the scope of this
> patch. If userspace submits a request and then tries to get the old
> data for a prior request, I suggest that userspace needs to fix their
> sequencing.
> 

I tend to disagree, because I think this would be a degradation compared
to what we have right now.

Let me explain. I guess the current idea is that the private->state !=
VFIO_CCW_STATE_IDLE check safeguards against this. Yes we lack proper
synchronization (atomic/interlocked access or locks) that would guarantee
that different thread observe state transitions as required -- no
splint brain. But if state were atomic the scenario I have in mind can
not happen, because we get the solicited interrupt in BUSY state (and
set IDLE in vfio_ccw_sch_io_todo()). Unsolicited interrupts are another
piece of cake -- I have no idea how may of those do we get. And because
of this the broken sequencing in userspace could actually be the kernels
fault.

Regards,
Halil
Cornelia Huck Jan. 22, 2019, 5:26 p.m. UTC | #6
On Tue, 22 Jan 2019 13:46:12 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 22 Jan 2019 12:53:22 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Tue, 22 Jan 2019 12:17:37 +0100
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> > > On Tue, 22 Jan 2019 11:29:26 +0100
> > > Cornelia Huck <cohuck@redhat.com> wrote:
> > >   
> > > > On Mon, 21 Jan 2019 21:20:18 +0100
> > > > Halil Pasic <pasic@linux.ibm.com> wrote:
> > > >     
> > > > > On Mon, 21 Jan 2019 12:03:51 +0100
> > > > > Cornelia Huck <cohuck@redhat.com> wrote:
> > > > >     
> > > > > > Rework handling of multiple I/O requests to return -EAGAIN if
> > > > > > we are already processing an I/O request. Introduce a mutex
> > > > > > to disallow concurrent writes to the I/O region.
> > > > > > 
> > > > > > The expectation is that userspace simply retries the operation
> > > > > > if it gets -EAGAIN.
> > > > > > 
> > > > > > We currently don't allow multiple ssch requests at the same
> > > > > > time, as we don't have support for keeping channel programs
> > > > > > around for more than one request.
> > > > > > 
> > > > > > Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> > > > > > ---      
> > > > > 
> > > > > [..]
> > > > >     
> > > > > >  static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > > > > > @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > > > > >  {
> > > > > >  	struct vfio_ccw_private *private;
> > > > > >  	struct ccw_io_region *region;
> > > > > > +	int ret;
> > > > > >  
> > > > > >  	if (*ppos + count > sizeof(*region))
> > > > > >  		return -EINVAL;
> > > > > >  
> > > > > >  	private = dev_get_drvdata(mdev_parent_dev(mdev));
> > > > > > -	if (private->state != VFIO_CCW_STATE_IDLE)
> > > > > > +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> > > > > > +	    private->state == VFIO_CCW_STATE_STANDBY)
> > > > > >  		return -EACCES;
> > > > > > +	if (!mutex_trylock(&private->io_mutex))
> > > > > > +		return -EAGAIN;
> > > > > >  
> > > > > >  	region = private->io_region;
> > > > > > -	if (copy_from_user((void *)region + *ppos, buf, count))
> > > > > > -		return -EFAULT;
> > > > > > +	if (copy_from_user((void *)region + *ppos, buf, count)) {      
> > > > > 
> > > > > This might race with vfio_ccw_sch_io_todo() on
> > > > > private->io_region->irb_area, or?    
> > > > 
> > > > Ah yes, this should also take the mutex (should work because we're on a
> > > > workqueue).
> > > >     
> > > 
> > > I'm not sure that will do the trick (assumed I understood the
> > > intention correctly). Let's say the things happen in this order:
> > > 1) vfio_ccw_sch_io_todo() goes first, I guess updates
> > > private->io_region->irb_area and releases the mutex.
> > > 2) Then vfio_ccw_mdev_write() destroys the irb_area by zeriong it out,
> > > and finally,
> > > 3) userspace reads the destroyed irb_area using vfio_ccw_mdev_read().
> > > 
> > > Or am I misunderstanding something?   
> > 
> > You're not, but dealing with that race is outside the scope of this
> > patch. If userspace submits a request and then tries to get the old
> > data for a prior request, I suggest that userspace needs to fix their
> > sequencing.
> >   
> 
> I tend to disagree, because I think this would be a degradation compared
> to what we have right now.
> 
> Let me explain. I guess the current idea is that the private->state !=
> VFIO_CCW_STATE_IDLE check safeguards against this. Yes we lack proper
> synchronization (atomic/interlocked access or locks) that would guarantee
> that different thread observe state transitions as required -- no
> splint brain. But if state were atomic the scenario I have in mind can
> not happen, because we get the solicited interrupt in BUSY state (and
> set IDLE in vfio_ccw_sch_io_todo()). 

This BUSY handling is broken for another case: If the guest requests
intermediate interrupts, there may be more than one interrupt by the
hardware -- and we still go out of BUSY state. (Freeing the cp is also
broken in that case.) However, the Linux dasd driver does not seem to
do that.

> Unsolicited interrupts are another
> piece of cake -- I have no idea how may of those do we get.

They at least don't have the "free the cp before we got final state"
bug. But I think both are reasons to get away from "use the BUSY state
to ensure the right sequence".

> And because
> of this the broken sequencing in userspace could actually be the kernels
> fault.

Here, I can't follow you at all :(
Halil Pasic Jan. 22, 2019, 6:33 p.m. UTC | #7
On Mon, 21 Jan 2019 12:03:51 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> --- a/drivers/s390/cio/vfio_ccw_private.h
> +++ b/drivers/s390/cio/vfio_ccw_private.h
> @@ -28,6 +28,7 @@
>   * @mdev: pointer to the mediated device
>   * @nb: notifier for vfio events
>   * @io_region: MMIO region to input/output I/O arguments/results
> + * @io_mutex: protect against concurrent update of I/O structures

We could be a bit more specific about what does this mutex guard.
Is it only io_region, or cp, irb and the new regions a well? ->state does
not seem to be covered, but should need some sort of synchronisation
too, or?

>   * @cp: channel program for the current I/O operation
>   * @irb: irb info received from interrupt
>   * @scsw: scsw info
> @@ -42,6 +43,7 @@ struct vfio_ccw_private {
>  	struct mdev_device	*mdev;
>  	struct notifier_block	nb;
>  	struct ccw_io_region	*io_region;
> +	struct mutex		io_mutex;
>  
>  	struct channel_program	cp;
>  	struct irb		irb;
> --
Halil Pasic Jan. 22, 2019, 7:03 p.m. UTC | #8
On Tue, 22 Jan 2019 18:26:17 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Tue, 22 Jan 2019 13:46:12 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > On Tue, 22 Jan 2019 12:53:22 +0100
> > Cornelia Huck <cohuck@redhat.com> wrote:
> > 
> > > On Tue, 22 Jan 2019 12:17:37 +0100
> > > Halil Pasic <pasic@linux.ibm.com> wrote:
> > >   
> > > > On Tue, 22 Jan 2019 11:29:26 +0100
> > > > Cornelia Huck <cohuck@redhat.com> wrote:
> > > >   
> > > > > On Mon, 21 Jan 2019 21:20:18 +0100
> > > > > Halil Pasic <pasic@linux.ibm.com> wrote:
> > > > >     
> > > > > > On Mon, 21 Jan 2019 12:03:51 +0100
> > > > > > Cornelia Huck <cohuck@redhat.com> wrote:
> > > > > >     
> > > > > > > Rework handling of multiple I/O requests to return -EAGAIN if
> > > > > > > we are already processing an I/O request. Introduce a mutex
> > > > > > > to disallow concurrent writes to the I/O region.
> > > > > > > 
> > > > > > > The expectation is that userspace simply retries the operation
> > > > > > > if it gets -EAGAIN.
> > > > > > > 
> > > > > > > We currently don't allow multiple ssch requests at the same
> > > > > > > time, as we don't have support for keeping channel programs
> > > > > > > around for more than one request.
> > > > > > > 
> > > > > > > Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> > > > > > > ---      
> > > > > > 
> > > > > > [..]
> > > > > >     
> > > > > > >  static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > > > > > > @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> > > > > > >  {
> > > > > > >  	struct vfio_ccw_private *private;
> > > > > > >  	struct ccw_io_region *region;
> > > > > > > +	int ret;
> > > > > > >  
> > > > > > >  	if (*ppos + count > sizeof(*region))
> > > > > > >  		return -EINVAL;
> > > > > > >  
> > > > > > >  	private = dev_get_drvdata(mdev_parent_dev(mdev));
> > > > > > > -	if (private->state != VFIO_CCW_STATE_IDLE)
> > > > > > > +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> > > > > > > +	    private->state == VFIO_CCW_STATE_STANDBY)
> > > > > > >  		return -EACCES;
> > > > > > > +	if (!mutex_trylock(&private->io_mutex))
> > > > > > > +		return -EAGAIN;
> > > > > > >  
> > > > > > >  	region = private->io_region;
> > > > > > > -	if (copy_from_user((void *)region + *ppos, buf, count))
> > > > > > > -		return -EFAULT;
> > > > > > > +	if (copy_from_user((void *)region + *ppos, buf, count)) {      
> > > > > > 
> > > > > > This might race with vfio_ccw_sch_io_todo() on
> > > > > > private->io_region->irb_area, or?    
> > > > > 
> > > > > Ah yes, this should also take the mutex (should work because we're on a
> > > > > workqueue).
> > > > >     
> > > > 
> > > > I'm not sure that will do the trick (assumed I understood the
> > > > intention correctly). Let's say the things happen in this order:
> > > > 1) vfio_ccw_sch_io_todo() goes first, I guess updates
> > > > private->io_region->irb_area and releases the mutex.
> > > > 2) Then vfio_ccw_mdev_write() destroys the irb_area by zeriong it out,
> > > > and finally,
> > > > 3) userspace reads the destroyed irb_area using vfio_ccw_mdev_read().
> > > > 
> > > > Or am I misunderstanding something?   
> > > 
> > > You're not, but dealing with that race is outside the scope of this
> > > patch. If userspace submits a request and then tries to get the old
> > > data for a prior request, I suggest that userspace needs to fix their
> > > sequencing.
> > >   
> > 
> > I tend to disagree, because I think this would be a degradation compared
> > to what we have right now.
> > 
> > Let me explain. I guess the current idea is that the private->state !=
> > VFIO_CCW_STATE_IDLE check safeguards against this. Yes we lack proper
> > synchronization (atomic/interlocked access or locks) that would guarantee
> > that different thread observe state transitions as required -- no
> > splint brain. But if state were atomic the scenario I have in mind can
> > not happen, because we get the solicited interrupt in BUSY state (and
> > set IDLE in vfio_ccw_sch_io_todo()). 
> 
> This BUSY handling is broken for another case: If the guest requests
> intermediate interrupts, there may be more than one interrupt by the
> hardware -- and we still go out of BUSY state. (Freeing the cp is also
> broken in that case.) However, the Linux dasd driver does not seem to
> do that.
> 

Nod.

> > Unsolicited interrupts are another
> > piece of cake -- I have no idea how may of those do we get.
> 
> They at least don't have the "free the cp before we got final state"
> bug. But I think both are reasons to get away from "use the BUSY state
> to ensure the right sequence".
> 

I'm not sure I understand you correctly. I was under the impression that
the whole point in having a state machine was to ensure the states are
traversed in the right sequence with the right stuff being done on each
transition. At least in theory.

You've probably figured out that IMHO vfio-ccw is not in a good shape
(to put it mildly). I have a hard time reviewing a non-holistic
concurrency fix. Please tell sould I get perceived as non-constructive,
I will try to cut back on criticism. 

> > And because
> > of this the broken sequencing in userspace could actually be the kernels
> > fault.
> 
> Here, I can't follow you at all :(
> 

Should we ever deliver a zeroed out IRB to the userspace, for the next
ioinst it would look like we have no status nor FC bit set. That is, the
guest could end up with stuff in parallel that was never supposed to
be in parallel (i.e. broken sequencing because kernel feeds false
information due to race with unsolicited interrupt).

Does that help?

Regards,
Halil
Cornelia Huck Jan. 23, 2019, 10:21 a.m. UTC | #9
On Tue, 22 Jan 2019 19:33:46 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Mon, 21 Jan 2019 12:03:51 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > --- a/drivers/s390/cio/vfio_ccw_private.h
> > +++ b/drivers/s390/cio/vfio_ccw_private.h
> > @@ -28,6 +28,7 @@
> >   * @mdev: pointer to the mediated device
> >   * @nb: notifier for vfio events
> >   * @io_region: MMIO region to input/output I/O arguments/results
> > + * @io_mutex: protect against concurrent update of I/O structures  
> 
> We could be a bit more specific about what does this mutex guard.
> Is it only io_region, or cp, irb and the new regions a well? ->state does
> not seem to be covered, but should need some sort of synchronisation
> too, or?

I'm not sure. IIRC Pierre had some ideas about locking in the fsm?

> 
> >   * @cp: channel program for the current I/O operation
> >   * @irb: irb info received from interrupt
> >   * @scsw: scsw info
> > @@ -42,6 +43,7 @@ struct vfio_ccw_private {
> >  	struct mdev_device	*mdev;
> >  	struct notifier_block	nb;
> >  	struct ccw_io_region	*io_region;
> > +	struct mutex		io_mutex;
> >  
> >  	struct channel_program	cp;
> >  	struct irb		irb;
> > --   
>
Cornelia Huck Jan. 23, 2019, 10:34 a.m. UTC | #10
On Tue, 22 Jan 2019 20:03:31 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 22 Jan 2019 18:26:17 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Tue, 22 Jan 2019 13:46:12 +0100
> > Halil Pasic <pasic@linux.ibm.com> wrote:

> > > Unsolicited interrupts are another
> > > piece of cake -- I have no idea how may of those do we get.  
> > 
> > They at least don't have the "free the cp before we got final state"
> > bug. But I think both are reasons to get away from "use the BUSY state
> > to ensure the right sequence".
> >   
> 
> I'm not sure I understand you correctly. I was under the impression that
> the whole point in having a state machine was to ensure the states are
> traversed in the right sequence with the right stuff being done on each
> transition. At least in theory.

Sequence in user space programs, not in the state machine.

> 
> You've probably figured out that IMHO vfio-ccw is not in a good shape
> (to put it mildly). I have a hard time reviewing a non-holistic
> concurrency fix. Please tell sould I get perceived as non-constructive,
> I will try to cut back on criticism. 

I'm afraid this is just confusing me :(

> 
> > > And because
> > > of this the broken sequencing in userspace could actually be the kernels
> > > fault.  
> > 
> > Here, I can't follow you at all :(
> >   
> 
> Should we ever deliver a zeroed out IRB to the userspace, for the next
> ioinst it would look like we have no status nor FC bit set. That is, the
> guest could end up with stuff in parallel that was never supposed to
> be in parallel (i.e. broken sequencing because kernel feeds false
> information due to race with unsolicited interrupt).
> 
> Does that help?

Not at all, I'm afraid :( User space programs still need to make sure
they poke the interfaces in the right order IMO...

At this point, I'm mostly confused... I'd prefer to simply fix things
as they come up so that we can finally move forward with the halt/clear
handling (and probably rework the state machine on top of that.)
Halil Pasic Jan. 23, 2019, 1:06 p.m. UTC | #11
On Wed, 23 Jan 2019 11:34:47 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Tue, 22 Jan 2019 20:03:31 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > On Tue, 22 Jan 2019 18:26:17 +0100
> > Cornelia Huck <cohuck@redhat.com> wrote:
> > 
> > > On Tue, 22 Jan 2019 13:46:12 +0100
> > > Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > > > Unsolicited interrupts are another
> > > > piece of cake -- I have no idea how may of those do we get.  
> > > 
> > > They at least don't have the "free the cp before we got final state"
> > > bug. But I think both are reasons to get away from "use the BUSY state
> > > to ensure the right sequence".
> > >   
> > 
> > I'm not sure I understand you correctly. I was under the impression that
> > the whole point in having a state machine was to ensure the states are
> > traversed in the right sequence with the right stuff being done on each
> > transition. At least in theory.
> 
> Sequence in user space programs, not in the state machine.
> 

I'm a bit confused.

> > 
> > You've probably figured out that IMHO vfio-ccw is not in a good shape
> > (to put it mildly). I have a hard time reviewing a non-holistic
> > concurrency fix. Please tell sould I get perceived as non-constructive,
> > I will try to cut back on criticism. 
> 
> I'm afraid this is just confusing me :(
> 
> > 
> > > > And because
> > > > of this the broken sequencing in userspace could actually be the kernels
> > > > fault.  
> > > 
> > > Here, I can't follow you at all :(
> > >   
> > 
> > Should we ever deliver a zeroed out IRB to the userspace, for the next
> > ioinst it would look like we have no status nor FC bit set. That is, the
> > guest could end up with stuff in parallel that was never supposed to
> > be in parallel (i.e. broken sequencing because kernel feeds false
> > information due to race with unsolicited interrupt).
> > 
> > Does that help?
> 
> Not at all, I'm afraid :( User space programs still need to make sure
> they poke the interfaces in the right order IMO...
> 

Yes, one can usually think of interfaces as contracts: both sides need
to keep their end for things to work as intended. Unfortunately the
vfio-ccw iterface is not a very well specified one, and that makes
reasoning about right order so much harder.

I was under the impression that the right ordering is dictated by the
SCSW in userspace. E.g. if there is an FC bit set there userspace is not
ought to issue a SSCH request (write to the io_region). The kernel part
however may say 'userspace read the actual SCSW' by signaling
the io_trigger eventfd. Userspace is supposed to read the IRB from the
region and update it's SCSW.

Now if userspace reads a broken SCSW from the IRB, because of a race
(due to poorly written kernel part -- userspace not at fault), it is
going to make wrong assumptions about currently legal and illegal
operations (ordering).

Previously I described a scenario where IRB can break without userspace
being at fault (race between unsolicited interrupt -- can happen at any
time -- and a legit io request). I was under the impression we agreed on
this.

This in turn could lead to userspace violating the contract, as perceived
by the kernel side.

> At this point, I'm mostly confused... I'd prefer to simply fix things
> as they come up so that we can finally move forward with the halt/clear
> handling (and probably rework the state machine on top of that.)
> 

I understand. I guess you will want to send a new version because of the
stuff that got lost in the rebase, or?

Regards,
Halil
Halil Pasic Jan. 23, 2019, 1:30 p.m. UTC | #12
On Wed, 23 Jan 2019 11:21:12 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Tue, 22 Jan 2019 19:33:46 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > On Mon, 21 Jan 2019 12:03:51 +0100
> > Cornelia Huck <cohuck@redhat.com> wrote:
> > 
> > > --- a/drivers/s390/cio/vfio_ccw_private.h
> > > +++ b/drivers/s390/cio/vfio_ccw_private.h
> > > @@ -28,6 +28,7 @@
> > >   * @mdev: pointer to the mediated device
> > >   * @nb: notifier for vfio events
> > >   * @io_region: MMIO region to input/output I/O arguments/results
> > > + * @io_mutex: protect against concurrent update of I/O structures  
> > 
> > We could be a bit more specific about what does this mutex guard.
> > Is it only io_region, or cp, irb and the new regions a well? ->state does
> > not seem to be covered, but should need some sort of synchronisation
> > too, or?
> 
> I'm not sure. IIRC Pierre had some ideas about locking in the fsm?
> 

Yes there was something. I usually do review by sanity checking the
resulting code. IMHO the fsm stuff is broken now and differently broken
after this series. If we don't want to fix what we are touching, maybe a
pointing out ignored problems in patch descriptions and a
minimal-invasive approach could help ease review.

Regards,
Halil

> > 
> > >   * @cp: channel program for the current I/O operation
> > >   * @irb: irb info received from interrupt
> > >   * @scsw: scsw info
> > > @@ -42,6 +43,7 @@ struct vfio_ccw_private {
> > >  	struct mdev_device	*mdev;
> > >  	struct notifier_block	nb;
> > >  	struct ccw_io_region	*io_region;
> > > +	struct mutex		io_mutex;
> > >  
> > >  	struct channel_program	cp;
> > >  	struct irb		irb;
> > > --   
> > 
> 
>
Cornelia Huck Jan. 23, 2019, 1:34 p.m. UTC | #13
On Wed, 23 Jan 2019 14:06:01 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Wed, 23 Jan 2019 11:34:47 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:

> Yes, one can usually think of interfaces as contracts: both sides need
> to keep their end for things to work as intended. Unfortunately the
> vfio-ccw iterface is not a very well specified one, and that makes
> reasoning about right order so much harder.

That's probably where our disconnect comes from.

> 
> I was under the impression that the right ordering is dictated by the
> SCSW in userspace. E.g. if there is an FC bit set there userspace is not
> ought to issue a SSCH request (write to the io_region). The kernel part
> however may say 'userspace read the actual SCSW' by signaling
> the io_trigger eventfd. Userspace is supposed to read the IRB from the
> region and update it's SCSW.
> 
> Now if userspace reads a broken SCSW from the IRB, because of a race
> (due to poorly written kernel part -- userspace not at fault), it is
> going to make wrong assumptions about currently legal and illegal
> operations (ordering).

My understanding of the interface was that writing to the I/O region
triggers a ssch (unless rejected with error) and that reading it just
gets whatever the kernel wrote there the last time it updated its
internal structures. The eventfd simply triggers to say "the region has
been updated with an IRB", not to say "userspace, read this".

> 
> Previously I described a scenario where IRB can break without userspace
> being at fault (race between unsolicited interrupt -- can happen at any
> time -- and a legit io request). I was under the impression we agreed on
> this.

There is a bug in there (clearing the cp for non-final interrupts), and
it needs to be fixed. I'm not so sure if the unsolicited interrupt
thing is a bug (beyond that the internal state machine is confused).

> 
> This in turn could lead to userspace violating the contract, as perceived
> by the kernel side.

Which contract? ;)

Also, I'm not sure if we'd rather get a deferred cc 1?

> 
> > At this point, I'm mostly confused... I'd prefer to simply fix things
> > as they come up so that we can finally move forward with the halt/clear
> > handling (and probably rework the state machine on top of that.)
> >   
> 
> I understand. I guess you will want to send a new version because of the
> stuff that got lost in the rebase, or?

Yes, I'll send a new version; but I'll wait for more feedback for a bit.
Cornelia Huck Jan. 24, 2019, 10:05 a.m. UTC | #14
On Wed, 23 Jan 2019 14:30:51 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Wed, 23 Jan 2019 11:21:12 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Tue, 22 Jan 2019 19:33:46 +0100
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> > > On Mon, 21 Jan 2019 12:03:51 +0100
> > > Cornelia Huck <cohuck@redhat.com> wrote:
> > >   
> > > > --- a/drivers/s390/cio/vfio_ccw_private.h
> > > > +++ b/drivers/s390/cio/vfio_ccw_private.h
> > > > @@ -28,6 +28,7 @@
> > > >   * @mdev: pointer to the mediated device
> > > >   * @nb: notifier for vfio events
> > > >   * @io_region: MMIO region to input/output I/O arguments/results
> > > > + * @io_mutex: protect against concurrent update of I/O structures    
> > > 
> > > We could be a bit more specific about what does this mutex guard.
> > > Is it only io_region, or cp, irb and the new regions a well? ->state does
> > > not seem to be covered, but should need some sort of synchronisation
> > > too, or?  
> > 
> > I'm not sure. IIRC Pierre had some ideas about locking in the fsm?
> >   
> 
> Yes there was something. I usually do review by sanity checking the
> resulting code. IMHO the fsm stuff is broken now and differently broken
> after this series. If we don't want to fix what we are touching, maybe a
> pointing out ignored problems in patch descriptions and a
> minimal-invasive approach could help ease review.

So, would changing the description above to reference "the I/O
regions" (as it will also be taken when writing to the async region)
and stating that this handles concurrent reading/writing of the regions
help? I really don't want to enumerate everything I don't fix...

> 
> Regards,
> Halil
> 
> > >   
> > > >   * @cp: channel program for the current I/O operation
> > > >   * @irb: irb info received from interrupt
> > > >   * @scsw: scsw info
> > > > @@ -42,6 +43,7 @@ struct vfio_ccw_private {
> > > >  	struct mdev_device	*mdev;
> > > >  	struct notifier_block	nb;
> > > >  	struct ccw_io_region	*io_region;
> > > > +	struct mutex		io_mutex;
> > > >  
> > > >  	struct channel_program	cp;
> > > >  	struct irb		irb;
> > > > --     
> > >   
> > 
> >   
>
Pierre Morel Jan. 24, 2019, 10:08 a.m. UTC | #15
On 23/01/2019 11:21, Cornelia Huck wrote:
> On Tue, 22 Jan 2019 19:33:46 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
>> On Mon, 21 Jan 2019 12:03:51 +0100
>> Cornelia Huck <cohuck@redhat.com> wrote:
>>
>>> --- a/drivers/s390/cio/vfio_ccw_private.h
>>> +++ b/drivers/s390/cio/vfio_ccw_private.h
>>> @@ -28,6 +28,7 @@
>>>    * @mdev: pointer to the mediated device
>>>    * @nb: notifier for vfio events
>>>    * @io_region: MMIO region to input/output I/O arguments/results
>>> + * @io_mutex: protect against concurrent update of I/O structures
>>
>> We could be a bit more specific about what does this mutex guard.
>> Is it only io_region, or cp, irb and the new regions a well? ->state does
>> not seem to be covered, but should need some sort of synchronisation
>> too, or?
> 
> I'm not sure. IIRC Pierre had some ideas about locking in the fsm?
> 

Yes I postponed this work to not collide with your patch series.

Do you think I should provide a new version of the FSM reworking series 
based on the last comment I got?

I would take into account that the asynchronous commands will come with 
your patch series and only provide the framework changes.


Regards,
Pierre
Cornelia Huck Jan. 24, 2019, 10:19 a.m. UTC | #16
On Thu, 24 Jan 2019 11:08:02 +0100
Pierre Morel <pmorel@linux.ibm.com> wrote:

> On 23/01/2019 11:21, Cornelia Huck wrote:
> > On Tue, 22 Jan 2019 19:33:46 +0100
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> >> On Mon, 21 Jan 2019 12:03:51 +0100
> >> Cornelia Huck <cohuck@redhat.com> wrote:
> >>  
> >>> --- a/drivers/s390/cio/vfio_ccw_private.h
> >>> +++ b/drivers/s390/cio/vfio_ccw_private.h
> >>> @@ -28,6 +28,7 @@
> >>>    * @mdev: pointer to the mediated device
> >>>    * @nb: notifier for vfio events
> >>>    * @io_region: MMIO region to input/output I/O arguments/results
> >>> + * @io_mutex: protect against concurrent update of I/O structures  
> >>
> >> We could be a bit more specific about what does this mutex guard.
> >> Is it only io_region, or cp, irb and the new regions a well? ->state does
> >> not seem to be covered, but should need some sort of synchronisation
> >> too, or?  
> > 
> > I'm not sure. IIRC Pierre had some ideas about locking in the fsm?
> >   
> 
> Yes I postponed this work to not collide with your patch series.
> 
> Do you think I should provide a new version of the FSM reworking series 
> based on the last comment I got?
> 
> I would take into account that the asynchronous commands will come with 
> your patch series and only provide the framework changes.

This was more an answer to Halil's concerns around state
synchronization. I would prefer to first get this series (or a
variation) into decent shape, and then address state machine handling
on top of that (when we know more about the transitions involved), just
to avoid confusion.

Does that sound reasonable?
Pierre Morel Jan. 24, 2019, 11:18 a.m. UTC | #17
On 24/01/2019 11:19, Cornelia Huck wrote:
> On Thu, 24 Jan 2019 11:08:02 +0100
> Pierre Morel <pmorel@linux.ibm.com> wrote:
> 
>> On 23/01/2019 11:21, Cornelia Huck wrote:
>>> On Tue, 22 Jan 2019 19:33:46 +0100
>>> Halil Pasic <pasic@linux.ibm.com> wrote:
>>>    
>>>> On Mon, 21 Jan 2019 12:03:51 +0100
>>>> Cornelia Huck <cohuck@redhat.com> wrote:
>>>>   
>>>>> --- a/drivers/s390/cio/vfio_ccw_private.h
>>>>> +++ b/drivers/s390/cio/vfio_ccw_private.h
>>>>> @@ -28,6 +28,7 @@
>>>>>     * @mdev: pointer to the mediated device
>>>>>     * @nb: notifier for vfio events
>>>>>     * @io_region: MMIO region to input/output I/O arguments/results
>>>>> + * @io_mutex: protect against concurrent update of I/O structures
>>>>
>>>> We could be a bit more specific about what does this mutex guard.
>>>> Is it only io_region, or cp, irb and the new regions a well? ->state does
>>>> not seem to be covered, but should need some sort of synchronisation
>>>> too, or?
>>>
>>> I'm not sure. IIRC Pierre had some ideas about locking in the fsm?
>>>    
>>
>> Yes I postponed this work to not collide with your patch series.
>>
>> Do you think I should provide a new version of the FSM reworking series
>> based on the last comment I got?
>>
>> I would take into account that the asynchronous commands will come with
>> your patch series and only provide the framework changes.
> 
> This was more an answer to Halil's concerns around state
> synchronization. I would prefer to first get this series (or a
> variation) into decent shape, and then address state machine handling
> on top of that (when we know more about the transitions involved), just
> to avoid confusion.
> 
> Does that sound reasonable?
> 

Absolutely, this was why I waited with my series. :)
Halil Pasic Jan. 24, 2019, 11:45 a.m. UTC | #18
On Thu, 24 Jan 2019 11:19:34 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Thu, 24 Jan 2019 11:08:02 +0100
> Pierre Morel <pmorel@linux.ibm.com> wrote:
> 
> > On 23/01/2019 11:21, Cornelia Huck wrote:
> > > On Tue, 22 Jan 2019 19:33:46 +0100
> > > Halil Pasic <pasic@linux.ibm.com> wrote:
> > >   
> > >> On Mon, 21 Jan 2019 12:03:51 +0100
> > >> Cornelia Huck <cohuck@redhat.com> wrote:
> > >>  
> > >>> --- a/drivers/s390/cio/vfio_ccw_private.h
> > >>> +++ b/drivers/s390/cio/vfio_ccw_private.h
> > >>> @@ -28,6 +28,7 @@
> > >>>    * @mdev: pointer to the mediated device
> > >>>    * @nb: notifier for vfio events
> > >>>    * @io_region: MMIO region to input/output I/O arguments/results
> > >>> + * @io_mutex: protect against concurrent update of I/O structures  
> > >>
> > >> We could be a bit more specific about what does this mutex guard.
> > >> Is it only io_region, or cp, irb and the new regions a well? ->state does
> > >> not seem to be covered, but should need some sort of synchronisation
> > >> too, or?  
> > > 
> > > I'm not sure. IIRC Pierre had some ideas about locking in the fsm?
> > >   
> > 
> > Yes I postponed this work to not collide with your patch series.
> > 
> > Do you think I should provide a new version of the FSM reworking series 
> > based on the last comment I got?
> > 
> > I would take into account that the asynchronous commands will come with 
> > your patch series and only provide the framework changes.
> 
> This was more an answer to Halil's concerns around state
> synchronization. I would prefer to first get this series (or a
> variation) into decent shape, and then address state machine handling
> on top of that (when we know more about the transitions involved), just
> to avoid confusion.
> 
> Does that sound reasonable?
> 

I would like the two hitting the same kernel release. In that case I'm
fine with deferring some of the concurrency fixes after the csch/hsch
stuff. Otherwise I would have a bad feeling about increasing the
complexity without fixing known bugs.

Regards,
Halil
Eric Farman Jan. 24, 2019, 7:14 p.m. UTC | #19
On 01/24/2019 05:19 AM, Cornelia Huck wrote:
> On Thu, 24 Jan 2019 11:08:02 +0100
> Pierre Morel <pmorel@linux.ibm.com> wrote:
> 
>> On 23/01/2019 11:21, Cornelia Huck wrote:
>>> On Tue, 22 Jan 2019 19:33:46 +0100
>>> Halil Pasic <pasic@linux.ibm.com> wrote:
>>>    
>>>> On Mon, 21 Jan 2019 12:03:51 +0100
>>>> Cornelia Huck <cohuck@redhat.com> wrote:
>>>>   
>>>>> --- a/drivers/s390/cio/vfio_ccw_private.h
>>>>> +++ b/drivers/s390/cio/vfio_ccw_private.h
>>>>> @@ -28,6 +28,7 @@
>>>>>     * @mdev: pointer to the mediated device
>>>>>     * @nb: notifier for vfio events
>>>>>     * @io_region: MMIO region to input/output I/O arguments/results
>>>>> + * @io_mutex: protect against concurrent update of I/O structures
>>>>
>>>> We could be a bit more specific about what does this mutex guard.
>>>> Is it only io_region, or cp, irb and the new regions a well? ->state does
>>>> not seem to be covered, but should need some sort of synchronisation
>>>> too, or?
>>>
>>> I'm not sure. IIRC Pierre had some ideas about locking in the fsm?
>>>    
>>
>> Yes I postponed this work to not collide with your patch series.
>>
>> Do you think I should provide a new version of the FSM reworking series
>> based on the last comment I got?
>>
>> I would take into account that the asynchronous commands will come with
>> your patch series and only provide the framework changes.
> 
> This was more an answer to Halil's concerns around state
> synchronization. I would prefer to first get this series (or a
> variation) into decent shape, and then address state machine handling
> on top of that (when we know more about the transitions involved), just
> to avoid confusion.
> 
> Does that sound reasonable?
> 

It does to me.

<Sorry for my silence; we teach our daughter to share, and she shares 
whatever bug is passed around daycare.  I'm catching up on my "todo" 
emails now!>
Eric Farman Jan. 24, 2019, 7:16 p.m. UTC | #20
On 01/23/2019 08:34 AM, Cornelia Huck wrote:
> On Wed, 23 Jan 2019 14:06:01 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
>> On Wed, 23 Jan 2019 11:34:47 +0100
>> Cornelia Huck <cohuck@redhat.com> wrote:
> 
>> Yes, one can usually think of interfaces as contracts: both sides need
>> to keep their end for things to work as intended. Unfortunately the
>> vfio-ccw iterface is not a very well specified one, and that makes
>> reasoning about right order so much harder.
> 
> That's probably where our disconnect comes from.
> 
>>
>> I was under the impression that the right ordering is dictated by the
>> SCSW in userspace. E.g. if there is an FC bit set there userspace is not
>> ought to issue a SSCH request (write to the io_region). The kernel part
>> however may say 'userspace read the actual SCSW' by signaling
>> the io_trigger eventfd. Userspace is supposed to read the IRB from the
>> region and update it's SCSW.
>>
>> Now if userspace reads a broken SCSW from the IRB, because of a race
>> (due to poorly written kernel part -- userspace not at fault), it is
>> going to make wrong assumptions about currently legal and illegal
>> operations (ordering).
> 
> My understanding of the interface was that writing to the I/O region
> triggers a ssch (unless rejected with error) and that reading it just
> gets whatever the kernel wrote there the last time it updated its
> internal structures. The eventfd simply triggers to say "the region has
> been updated with an IRB", not to say "userspace, read this".
> 
>>
>> Previously I described a scenario where IRB can break without userspace
>> being at fault (race between unsolicited interrupt -- can happen at any
>> time -- and a legit io request). I was under the impression we agreed on
>> this.
> 
> There is a bug in there (clearing the cp for non-final interrupts), and
> it needs to be fixed. I'm not so sure if the unsolicited interrupt
> thing is a bug (beyond that the internal state machine is confused).
> 
>>
>> This in turn could lead to userspace violating the contract, as perceived
>> by the kernel side.
> 
> Which contract? ;)
> 
> Also, I'm not sure if we'd rather get a deferred cc 1?

As I'm encountering dcc=1 quite regularly lately, it's a nice error. 
But we don't have a good way of recovering from it, and so my test tends 
to go down in a heap quite quickly.  This patch set will probably help; 
I should really get it applied and try it out.

> 
>>
>>> At this point, I'm mostly confused... I'd prefer to simply fix things
>>> as they come up so that we can finally move forward with the halt/clear
>>> handling (and probably rework the state machine on top of that.)

+1 for fixing things as we go.  I hear the complaints about this code 
(and probably say them too), but remain convinced that a large rewrite 
is unnecessary.  Lots of opportunities for improvement, with lots of 
willing and motivated participants, means it can only get better!

>>>    
>>
>> I understand. I guess you will want to send a new version because of the
>> stuff that got lost in the rebase, or?
> 
> Yes, I'll send a new version; but I'll wait for more feedback for a bit.
> 

I'll try to provide some now.  Still digging through the emails marked 
"todo" :)

  - Eric
Eric Farman Jan. 25, 2019, 2:25 a.m. UTC | #21
On 01/21/2019 06:03 AM, Cornelia Huck wrote:
> Rework handling of multiple I/O requests to return -EAGAIN if
> we are already processing an I/O request. Introduce a mutex
> to disallow concurrent writes to the I/O region.
> 
> The expectation is that userspace simply retries the operation
> if it gets -EAGAIN.
> 
> We currently don't allow multiple ssch requests at the same
> time, as we don't have support for keeping channel programs
> around for more than one request.
> 
> Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> ---
>   drivers/s390/cio/vfio_ccw_drv.c     |  1 +
>   drivers/s390/cio/vfio_ccw_fsm.c     |  8 +++-----
>   drivers/s390/cio/vfio_ccw_ops.c     | 31 +++++++++++++++++++----------
>   drivers/s390/cio/vfio_ccw_private.h |  2 ++
>   4 files changed, 26 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
> index a10cec0e86eb..2ef189fe45ed 100644
> --- a/drivers/s390/cio/vfio_ccw_drv.c
> +++ b/drivers/s390/cio/vfio_ccw_drv.c
> @@ -125,6 +125,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
>   
>   	private->sch = sch;
>   	dev_set_drvdata(&sch->dev, private);
> +	mutex_init(&private->io_mutex);
>   
>   	spin_lock_irq(sch->lock);
>   	private->state = VFIO_CCW_STATE_NOT_OPER;
> diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
> index cab17865aafe..f6ed934cc565 100644
> --- a/drivers/s390/cio/vfio_ccw_fsm.c
> +++ b/drivers/s390/cio/vfio_ccw_fsm.c
> @@ -28,7 +28,6 @@ static int fsm_io_helper(struct vfio_ccw_private *private)
>   	sch = private->sch;
>   
>   	spin_lock_irqsave(sch->lock, flags);
> -	private->state = VFIO_CCW_STATE_BUSY;

[1]

>   
>   	orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm);
>   
> @@ -42,6 +41,8 @@ static int fsm_io_helper(struct vfio_ccw_private *private)
>   		 */
>   		sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND;
>   		ret = 0;
> +		/* Don't allow another ssch for now */
> +		private->state = VFIO_CCW_STATE_BUSY;

[1]

>   		break;
>   	case 1:		/* Status pending */
>   	case 2:		/* Busy */
> @@ -99,7 +100,7 @@ static void fsm_io_error(struct vfio_ccw_private *private,
>   static void fsm_io_busy(struct vfio_ccw_private *private,
>   			enum vfio_ccw_event event)
>   {
> -	private->io_region->ret_code = -EBUSY;
> +	private->io_region->ret_code = -EAGAIN;
>   }
>   
>   static void fsm_disabled_irq(struct vfio_ccw_private *private,
> @@ -130,8 +131,6 @@ static void fsm_io_request(struct vfio_ccw_private *private,
>   	struct mdev_device *mdev = private->mdev;
>   	char *errstr = "request";
>   
> -	private->state = VFIO_CCW_STATE_BUSY;
> -

[1]

>   	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
>   
>   	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
> @@ -176,7 +175,6 @@ static void fsm_io_request(struct vfio_ccw_private *private,
>   	}
>   
>   err_out:
> -	private->state = VFIO_CCW_STATE_IDLE;

[1] I think these changes are cool.  We end up going into (and staying 
in) state=BUSY if we get cc=0 on the SSCH, rather than in/out as we 
bumble along.

But why can't these be separated out from this patch?  It does change 
the behavior of the state machine, and seem distinct from the addition 
of the mutex you otherwise add here?  At the very least, this behavior 
change should be documented in the commit since it's otherwise lost in 
the mutex/EAGAIN stuff.

>   	trace_vfio_ccw_io_fctl(scsw->cmd.fctl, get_schid(private),
>   			       io_region->ret_code, errstr);
>   }
> diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
> index f673e106c041..3fa9fc570400 100644
> --- a/drivers/s390/cio/vfio_ccw_ops.c
> +++ b/drivers/s390/cio/vfio_ccw_ops.c
> @@ -169,16 +169,20 @@ static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev,
>   {
>   	struct vfio_ccw_private *private;
>   	struct ccw_io_region *region;
> +	int ret;
>   
>   	if (*ppos + count > sizeof(*region))
>   		return -EINVAL;
>   
>   	private = dev_get_drvdata(mdev_parent_dev(mdev));
> +	mutex_lock(&private->io_mutex);
>   	region = private->io_region;
>   	if (copy_to_user(buf, (void *)region + *ppos, count))
> -		return -EFAULT;
> -
> -	return count;
> +		ret = -EFAULT;
> +	else
> +		ret = count;
> +	mutex_unlock(&private->io_mutex);
> +	return ret;
>   }
>   
>   static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
>   {
>   	struct vfio_ccw_private *private;
>   	struct ccw_io_region *region;
> +	int ret;
>   
>   	if (*ppos + count > sizeof(*region))
>   		return -EINVAL;
>   
>   	private = dev_get_drvdata(mdev_parent_dev(mdev));
> -	if (private->state != VFIO_CCW_STATE_IDLE)
> +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> +	    private->state == VFIO_CCW_STATE_STANDBY)
>   		return -EACCES;
> +	if (!mutex_trylock(&private->io_mutex))
> +		return -EAGAIN;

Ah, I see Halil's difficulty here.

It is true there is a race condition today, and that this doesn't 
address it.  That's fine, add it to the todo list.  But even with that, 
I don't see what the mutex is enforcing?  Two simultaneous SSCHs will be 
serialized (one will get kicked out with a failed trylock() call), while 
still leaving the window open between cc=0 on the SSCH and the 
subsequent interrupt.  In the latter case, a second SSCH will come 
through here, do the copy_from_user below, and then jump to fsm_io_busy 
to return EAGAIN.  Do we really want to stomp on io_region in that case? 
  Why can't we simply return EAGAIN if state==BUSY?

>   
>   	region = private->io_region;
> -	if (copy_from_user((void *)region + *ppos, buf, count))
> -		return -EFAULT;
> +	if (copy_from_user((void *)region + *ppos, buf, count)) {
> +		ret = -EFAULT;
> +		goto out_unlock;
> +	}
>   
>   	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ)
> -	if (region->ret_code != 0) {
> -		private->state = VFIO_CCW_STATE_IDLE;

[1] (above)

> -		return region->ret_code;
> -	}
> +	ret = (region->ret_code != 0) ? region->ret_code : count;
>   
> -	return count;
> +out_unlock:
> +	mutex_unlock(&private->io_mutex);
> +	return ret;
>   }
>   
>   static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info)
> diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
> index 08e9a7dc9176..e88237697f83 100644
> --- a/drivers/s390/cio/vfio_ccw_private.h
> +++ b/drivers/s390/cio/vfio_ccw_private.h
> @@ -28,6 +28,7 @@
>    * @mdev: pointer to the mediated device
>    * @nb: notifier for vfio events
>    * @io_region: MMIO region to input/output I/O arguments/results
> + * @io_mutex: protect against concurrent update of I/O structures
>    * @cp: channel program for the current I/O operation
>    * @irb: irb info received from interrupt
>    * @scsw: scsw info
> @@ -42,6 +43,7 @@ struct vfio_ccw_private {
>   	struct mdev_device	*mdev;
>   	struct notifier_block	nb;
>   	struct ccw_io_region	*io_region;
> +	struct mutex		io_mutex;
>   
>   	struct channel_program	cp;
>   	struct irb		irb;
>
Eric Farman Jan. 25, 2019, 2:37 a.m. UTC | #22
On 01/24/2019 09:25 PM, Eric Farman wrote:
> 
> 
> On 01/21/2019 06:03 AM, Cornelia Huck wrote:
>> Rework handling of multiple I/O requests to return -EAGAIN if
>> we are already processing an I/O request. Introduce a mutex
>> to disallow concurrent writes to the I/O region.
>>
>> The expectation is that userspace simply retries the operation
>> if it gets -EAGAIN.
>>
>> We currently don't allow multiple ssch requests at the same
>> time, as we don't have support for keeping channel programs
>> around for more than one request.
>>
>> Signed-off-by: Cornelia Huck <cohuck@redhat.com>
>> ---
>>   drivers/s390/cio/vfio_ccw_drv.c     |  1 +
>>   drivers/s390/cio/vfio_ccw_fsm.c     |  8 +++-----
>>   drivers/s390/cio/vfio_ccw_ops.c     | 31 +++++++++++++++++++----------
>>   drivers/s390/cio/vfio_ccw_private.h |  2 ++
>>   4 files changed, 26 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/s390/cio/vfio_ccw_drv.c 
>> b/drivers/s390/cio/vfio_ccw_drv.c
>> index a10cec0e86eb..2ef189fe45ed 100644
>> --- a/drivers/s390/cio/vfio_ccw_drv.c
>> +++ b/drivers/s390/cio/vfio_ccw_drv.c
>> @@ -125,6 +125,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
>>       private->sch = sch;
>>       dev_set_drvdata(&sch->dev, private);
>> +    mutex_init(&private->io_mutex);
>>       spin_lock_irq(sch->lock);
>>       private->state = VFIO_CCW_STATE_NOT_OPER;
>> diff --git a/drivers/s390/cio/vfio_ccw_fsm.c 
>> b/drivers/s390/cio/vfio_ccw_fsm.c
>> index cab17865aafe..f6ed934cc565 100644
>> --- a/drivers/s390/cio/vfio_ccw_fsm.c
>> +++ b/drivers/s390/cio/vfio_ccw_fsm.c
>> @@ -28,7 +28,6 @@ static int fsm_io_helper(struct vfio_ccw_private 
>> *private)
>>       sch = private->sch;
>>       spin_lock_irqsave(sch->lock, flags);
>> -    private->state = VFIO_CCW_STATE_BUSY;
> 
> [1]
> 
>>       orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm);
>> @@ -42,6 +41,8 @@ static int fsm_io_helper(struct vfio_ccw_private 
>> *private)
>>            */
>>           sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND;
>>           ret = 0;
>> +        /* Don't allow another ssch for now */
>> +        private->state = VFIO_CCW_STATE_BUSY;
> 
> [1]
> 
>>           break;
>>       case 1:        /* Status pending */
>>       case 2:        /* Busy */
>> @@ -99,7 +100,7 @@ static void fsm_io_error(struct vfio_ccw_private 
>> *private,
>>   static void fsm_io_busy(struct vfio_ccw_private *private,
>>               enum vfio_ccw_event event)
>>   {
>> -    private->io_region->ret_code = -EBUSY;
>> +    private->io_region->ret_code = -EAGAIN;
>>   }
>>   static void fsm_disabled_irq(struct vfio_ccw_private *private,
>> @@ -130,8 +131,6 @@ static void fsm_io_request(struct vfio_ccw_private 
>> *private,
>>       struct mdev_device *mdev = private->mdev;
>>       char *errstr = "request";
>> -    private->state = VFIO_CCW_STATE_BUSY;
>> -
> 
> [1]
> 
>>       memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
>>       if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
>> @@ -176,7 +175,6 @@ static void fsm_io_request(struct vfio_ccw_private 
>> *private,
>>       }
>>   err_out:
>> -    private->state = VFIO_CCW_STATE_IDLE;
> 
> [1] I think these changes are cool.  We end up going into (and staying 
> in) state=BUSY if we get cc=0 on the SSCH, rather than in/out as we 
> bumble along.
> 
> But why can't these be separated out from this patch?  It does change 
> the behavior of the state machine, and seem distinct from the addition 
> of the mutex you otherwise add here?  At the very least, this behavior 
> change should be documented in the commit since it's otherwise lost in 
> the mutex/EAGAIN stuff.
> 
>>       trace_vfio_ccw_io_fctl(scsw->cmd.fctl, get_schid(private),
>>                      io_region->ret_code, errstr);
>>   }
>> diff --git a/drivers/s390/cio/vfio_ccw_ops.c 
>> b/drivers/s390/cio/vfio_ccw_ops.c
>> index f673e106c041..3fa9fc570400 100644
>> --- a/drivers/s390/cio/vfio_ccw_ops.c
>> +++ b/drivers/s390/cio/vfio_ccw_ops.c
>> @@ -169,16 +169,20 @@ static ssize_t vfio_ccw_mdev_read(struct 
>> mdev_device *mdev,
>>   {
>>       struct vfio_ccw_private *private;
>>       struct ccw_io_region *region;
>> +    int ret;
>>       if (*ppos + count > sizeof(*region))
>>           return -EINVAL;
>>       private = dev_get_drvdata(mdev_parent_dev(mdev));
>> +    mutex_lock(&private->io_mutex);
>>       region = private->io_region;
>>       if (copy_to_user(buf, (void *)region + *ppos, count))
>> -        return -EFAULT;
>> -
>> -    return count;
>> +        ret = -EFAULT;
>> +    else
>> +        ret = count;
>> +    mutex_unlock(&private->io_mutex);
>> +    return ret;
>>   }
>>   static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
>> @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct 
>> mdev_device *mdev,
>>   {
>>       struct vfio_ccw_private *private;
>>       struct ccw_io_region *region;
>> +    int ret;
>>       if (*ppos + count > sizeof(*region))
>>           return -EINVAL;
>>       private = dev_get_drvdata(mdev_parent_dev(mdev));
>> -    if (private->state != VFIO_CCW_STATE_IDLE)
>> +    if (private->state == VFIO_CCW_STATE_NOT_OPER ||
>> +        private->state == VFIO_CCW_STATE_STANDBY)
>>           return -EACCES;
>> +    if (!mutex_trylock(&private->io_mutex))
>> +        return -EAGAIN;
> 
> Ah, I see Halil's difficulty here.
> 
> It is true there is a race condition today, and that this doesn't 
> address it.  That's fine, add it to the todo list.  But even with that, 
> I don't see what the mutex is enforcing?  Two simultaneous SSCHs will be 
> serialized (one will get kicked out with a failed trylock() call), while 
> still leaving the window open between cc=0 on the SSCH and the 
> subsequent interrupt.  In the latter case, a second SSCH will come 
> through here, do the copy_from_user below, and then jump to fsm_io_busy 
> to return EAGAIN.  Do we really want to stomp on io_region in that case? 
>   Why can't we simply return EAGAIN if state==BUSY?

(Answering my own questions as I skim patch 5...)

Because of course this series is for async handling, while I was looking 
specifically at the synchronous code that exists today.  I guess then my 
question just remains on how the mutex is adding protection in the sync 
case, because that's still not apparent to me.  (Perhaps I missed it in 
a reply to Halil; if so I apologize, there were a lot when I returned.)

> 
>>       region = private->io_region;
>> -    if (copy_from_user((void *)region + *ppos, buf, count))
>> -        return -EFAULT;
>> +    if (copy_from_user((void *)region + *ppos, buf, count)) {
>> +        ret = -EFAULT;
>> +        goto out_unlock;
>> +    }
>>       vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ)
>> -    if (region->ret_code != 0) {
>> -        private->state = VFIO_CCW_STATE_IDLE;
> 
> [1] (above)
> 
>> -        return region->ret_code;
>> -    }
>> +    ret = (region->ret_code != 0) ? region->ret_code : count;
>> -    return count;
>> +out_unlock:
>> +    mutex_unlock(&private->io_mutex);
>> +    return ret;
>>   }
>>   static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info)
>> diff --git a/drivers/s390/cio/vfio_ccw_private.h 
>> b/drivers/s390/cio/vfio_ccw_private.h
>> index 08e9a7dc9176..e88237697f83 100644
>> --- a/drivers/s390/cio/vfio_ccw_private.h
>> +++ b/drivers/s390/cio/vfio_ccw_private.h
>> @@ -28,6 +28,7 @@
>>    * @mdev: pointer to the mediated device
>>    * @nb: notifier for vfio events
>>    * @io_region: MMIO region to input/output I/O arguments/results
>> + * @io_mutex: protect against concurrent update of I/O structures
>>    * @cp: channel program for the current I/O operation
>>    * @irb: irb info received from interrupt
>>    * @scsw: scsw info
>> @@ -42,6 +43,7 @@ struct vfio_ccw_private {
>>       struct mdev_device    *mdev;
>>       struct notifier_block    nb;
>>       struct ccw_io_region    *io_region;
>> +    struct mutex        io_mutex;
>>       struct channel_program    cp;
>>       struct irb        irb;
>>
Cornelia Huck Jan. 25, 2019, 10:13 a.m. UTC | #23
On Thu, 24 Jan 2019 14:16:21 -0500
Eric Farman <farman@linux.ibm.com> wrote:

> On 01/23/2019 08:34 AM, Cornelia Huck wrote:
> > On Wed, 23 Jan 2019 14:06:01 +0100
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> >> On Wed, 23 Jan 2019 11:34:47 +0100
> >> Cornelia Huck <cohuck@redhat.com> wrote:  
> >   
> >> Yes, one can usually think of interfaces as contracts: both sides need
> >> to keep their end for things to work as intended. Unfortunately the
> >> vfio-ccw iterface is not a very well specified one, and that makes
> >> reasoning about right order so much harder.  
> > 
> > That's probably where our disconnect comes from.
> >   
> >>
> >> I was under the impression that the right ordering is dictated by the
> >> SCSW in userspace. E.g. if there is an FC bit set there userspace is not
> >> ought to issue a SSCH request (write to the io_region). The kernel part
> >> however may say 'userspace read the actual SCSW' by signaling
> >> the io_trigger eventfd. Userspace is supposed to read the IRB from the
> >> region and update it's SCSW.
> >>
> >> Now if userspace reads a broken SCSW from the IRB, because of a race
> >> (due to poorly written kernel part -- userspace not at fault), it is
> >> going to make wrong assumptions about currently legal and illegal
> >> operations (ordering).  
> > 
> > My understanding of the interface was that writing to the I/O region
> > triggers a ssch (unless rejected with error) and that reading it just
> > gets whatever the kernel wrote there the last time it updated its
> > internal structures. The eventfd simply triggers to say "the region has
> > been updated with an IRB", not to say "userspace, read this".
> >   
> >>
> >> Previously I described a scenario where IRB can break without userspace
> >> being at fault (race between unsolicited interrupt -- can happen at any
> >> time -- and a legit io request). I was under the impression we agreed on
> >> this.  
> > 
> > There is a bug in there (clearing the cp for non-final interrupts), and
> > it needs to be fixed. I'm not so sure if the unsolicited interrupt
> > thing is a bug (beyond that the internal state machine is confused).
> >   
> >>
> >> This in turn could lead to userspace violating the contract, as perceived
> >> by the kernel side.  
> > 
> > Which contract? ;)
> > 
> > Also, I'm not sure if we'd rather get a deferred cc 1?  
> 
> As I'm encountering dcc=1 quite regularly lately, it's a nice error. 
> But we don't have a good way of recovering from it, and so my test tends 
> to go down in a heap quite quickly.  This patch set will probably help; 
> I should really get it applied and try it out.

The deferred cc 1 is probably more likely simply due to the overhead we
get from intercepting the I/O calls.

> 
> >   
> >>  
> >>> At this point, I'm mostly confused... I'd prefer to simply fix things
> >>> as they come up so that we can finally move forward with the halt/clear
> >>> handling (and probably rework the state machine on top of that.)  
> 
> +1 for fixing things as we go.  I hear the complaints about this code 
> (and probably say them too), but remain convinced that a large rewrite 
> is unnecessary.  Lots of opportunities for improvement, with lots of 
> willing and motivated participants, means it can only get better!

Yeah, the code would probably look a bit different if I started writing
it from scratch now, but I don't think the basic design is unfixably
broken.

> 
> >>>      
> >>
> >> I understand. I guess you will want to send a new version because of the
> >> stuff that got lost in the rebase, or?  
> > 
> > Yes, I'll send a new version; but I'll wait for more feedback for a bit.
> >   
> 
> I'll try to provide some now.  Still digging through the emails marked 
> "todo" :)

Ok, I'll wait for a bit more :)
Cornelia Huck Jan. 25, 2019, 10:24 a.m. UTC | #24
On Thu, 24 Jan 2019 21:37:44 -0500
Eric Farman <farman@linux.ibm.com> wrote:

> On 01/24/2019 09:25 PM, Eric Farman wrote:
> > 
> > 
> > On 01/21/2019 06:03 AM, Cornelia Huck wrote:  

> > [1] I think these changes are cool.  We end up going into (and staying 
> > in) state=BUSY if we get cc=0 on the SSCH, rather than in/out as we 
> > bumble along.
> > 
> > But why can't these be separated out from this patch?  It does change 
> > the behavior of the state machine, and seem distinct from the addition 
> > of the mutex you otherwise add here?  At the very least, this behavior 
> > change should be documented in the commit since it's otherwise lost in 
> > the mutex/EAGAIN stuff.

That's a very good idea. I'll factor them out into a separate patch.

> >   
> >>       trace_vfio_ccw_io_fctl(scsw->cmd.fctl, get_schid(private),
> >>                      io_region->ret_code, errstr);
> >>   }
> >> diff --git a/drivers/s390/cio/vfio_ccw_ops.c 
> >> b/drivers/s390/cio/vfio_ccw_ops.c
> >> index f673e106c041..3fa9fc570400 100644
> >> --- a/drivers/s390/cio/vfio_ccw_ops.c
> >> +++ b/drivers/s390/cio/vfio_ccw_ops.c
> >> @@ -169,16 +169,20 @@ static ssize_t vfio_ccw_mdev_read(struct 
> >> mdev_device *mdev,
> >>   {
> >>       struct vfio_ccw_private *private;
> >>       struct ccw_io_region *region;
> >> +    int ret;
> >>       if (*ppos + count > sizeof(*region))
> >>           return -EINVAL;
> >>       private = dev_get_drvdata(mdev_parent_dev(mdev));
> >> +    mutex_lock(&private->io_mutex);
> >>       region = private->io_region;
> >>       if (copy_to_user(buf, (void *)region + *ppos, count))
> >> -        return -EFAULT;
> >> -
> >> -    return count;
> >> +        ret = -EFAULT;
> >> +    else
> >> +        ret = count;
> >> +    mutex_unlock(&private->io_mutex);
> >> +    return ret;
> >>   }
> >>   static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> >> @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct 
> >> mdev_device *mdev,
> >>   {
> >>       struct vfio_ccw_private *private;
> >>       struct ccw_io_region *region;
> >> +    int ret;
> >>       if (*ppos + count > sizeof(*region))
> >>           return -EINVAL;
> >>       private = dev_get_drvdata(mdev_parent_dev(mdev));
> >> -    if (private->state != VFIO_CCW_STATE_IDLE)
> >> +    if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> >> +        private->state == VFIO_CCW_STATE_STANDBY)
> >>           return -EACCES;
> >> +    if (!mutex_trylock(&private->io_mutex))
> >> +        return -EAGAIN;  
> > 
> > Ah, I see Halil's difficulty here.
> > 
> > It is true there is a race condition today, and that this doesn't 
> > address it.  That's fine, add it to the todo list.  But even with that, 
> > I don't see what the mutex is enforcing?  Two simultaneous SSCHs will be 
> > serialized (one will get kicked out with a failed trylock() call), while 
> > still leaving the window open between cc=0 on the SSCH and the 
> > subsequent interrupt.  In the latter case, a second SSCH will come 
> > through here, do the copy_from_user below, and then jump to fsm_io_busy 
> > to return EAGAIN.  Do we really want to stomp on io_region in that case? 
> >   Why can't we simply return EAGAIN if state==BUSY?  
> 
> (Answering my own questions as I skim patch 5...)
> 
> Because of course this series is for async handling, while I was looking 
> specifically at the synchronous code that exists today.  I guess then my 
> question just remains on how the mutex is adding protection in the sync 
> case, because that's still not apparent to me.  (Perhaps I missed it in 
> a reply to Halil; if so I apologize, there were a lot when I returned.)

My idea behind the mutex was to make sure that we get consistent data
when reading/writing (e.g. if one user space thread is reading the I/O
region while another is writing to it).
Halil Pasic Jan. 25, 2019, 12:58 p.m. UTC | #25
On Thu, 24 Jan 2019 21:25:10 -0500
Eric Farman <farman@linux.ibm.com> wrote:

> >   	private = dev_get_drvdata(mdev_parent_dev(mdev));
> > -	if (private->state != VFIO_CCW_STATE_IDLE)
> > +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> > +	    private->state == VFIO_CCW_STATE_STANDBY)
> >   		return -EACCES;
> > +	if (!mutex_trylock(&private->io_mutex))
> > +		return -EAGAIN;  
> 
> Ah, I see Halil's difficulty here.
> 
> It is true there is a race condition today, and that this doesn't 
> address it.  That's fine, add it to the todo list.  But even with that, 
> I don't see what the mutex is enforcing?  

It is protecting the io regions. AFAIU the idea was that only one
thread is accessing the io region(s) at a time to prevent corruption and
reading half-morphed data.

> Two simultaneous SSCHs will be 
> serialized (one will get kicked out with a failed trylock() call), while 
> still leaving the window open between cc=0 on the SSCH and the 
> subsequent interrupt.  In the latter case, a second SSCH will come 
> through here, do the copy_from_user below, and then jump to fsm_io_busy 
> to return EAGAIN.  Do we really want to stomp on io_region in that case?

I'm not sure I understood you correctly. The interrupt handler does not
take the lock before writing to the io_region. That is one race but it is
easy to fix.

The bigger problem is that between the interrupt handler has written IRB
area and userspace has read it we may end up destroying it by stomping on
it (to use your words). The userspace reading a wrong (given todays qemu
zeroed out) IRB could lead to follow on problems.
 
>   Why can't we simply return EAGAIN if state==BUSY?
> 

Sure we can. That would essentially go back to the old way of things:
if not idle return with error. Just the error code returned would change
form EACCESS to EAGAIN. Which Isn't necessarily a win, because
conceptually here should be never two interleaved io_requests/start
commands hitting the module.


> >   
> >   	region = private->io_region;
> > -	if (copy_from_user((void *)region + *ppos, buf, count))
> > -		return -EFAULT;
> > +	if (copy_from_user((void *)region + *ppos, buf, count)) {
> > +		ret = -EFAULT;
> > +		goto out_unlock;
> > +	}
Cornelia Huck Jan. 25, 2019, 12:58 p.m. UTC | #26
On Fri, 25 Jan 2019 11:24:37 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Thu, 24 Jan 2019 21:37:44 -0500
> Eric Farman <farman@linux.ibm.com> wrote:
> 
> > On 01/24/2019 09:25 PM, Eric Farman wrote:  
> > > 
> > > 
> > > On 01/21/2019 06:03 AM, Cornelia Huck wrote:    
> 
> > > [1] I think these changes are cool.  We end up going into (and staying 
> > > in) state=BUSY if we get cc=0 on the SSCH, rather than in/out as we 
> > > bumble along.
> > > 
> > > But why can't these be separated out from this patch?  It does change 
> > > the behavior of the state machine, and seem distinct from the addition 
> > > of the mutex you otherwise add here?  At the very least, this behavior 
> > > change should be documented in the commit since it's otherwise lost in 
> > > the mutex/EAGAIN stuff.  
> 
> That's a very good idea. I'll factor them out into a separate patch.

And now that I've factored it out, I noticed some more problems.

What we basically need is the following, I think:

- The code should not be interrupted while we process the channel
  program, do the ssch etc. We want the caller to try again later (i.e.
  return -EAGAIN)
- We currently do not want the user space to submit another channel
  program while the first one is still in flight. As submitting another
  one is a valid request, however, we should allow this in the future
  (once we have the code to handle that in place).
- With the async interface, we want user space to be able to submit a
  halt/clear while a start request is still in flight, but not while
  we're processing a start request with translation etc. We probably
  want to do -EAGAIN in that case.

My idea would be:

- The BUSY state denotes "I'm busy processing a request right now, try
  again". We hold it while processing the cp and doing the ssch and
  leave it afterwards (i.e., while the start request is processed by
  the hardware). I/O requests and async requests get -EAGAIN in that
  state.
- A new state (CP_PENDING?) is entered after ssch returned with cc 0
  (from the BUSY state). We stay in there as long as no final state for
  that request has been received and delivered. (This may be final
  interrupt for that request, a deferred cc, or successful halt/clear.)
  I/O requests get -EBUSY, async requests are processed. This state can
  be removed again once we are able to handle more than one outstanding
  cp.

Does that make sense?
Halil Pasic Jan. 25, 2019, 1:09 p.m. UTC | #27
On Thu, 24 Jan 2019 21:37:44 -0500
Eric Farman <farman@linux.ibm.com> wrote:

> >> @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct 
> >> mdev_device *mdev,
> >>   {
> >>       struct vfio_ccw_private *private;
> >>       struct ccw_io_region *region;
> >> +    int ret;
> >>       if (*ppos + count > sizeof(*region))
> >>           return -EINVAL;
> >>       private = dev_get_drvdata(mdev_parent_dev(mdev));
> >> -    if (private->state != VFIO_CCW_STATE_IDLE)
> >> +    if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> >> +        private->state == VFIO_CCW_STATE_STANDBY)
> >>           return -EACCES;
> >> +    if (!mutex_trylock(&private->io_mutex))
> >> +        return -EAGAIN;  
> > 
> > Ah, I see Halil's difficulty here.
> > 
> > It is true there is a race condition today, and that this doesn't 
> > address it.  That's fine, add it to the todo list.  But even with that, 
> > I don't see what the mutex is enforcing?  Two simultaneous SSCHs will be 
> > serialized (one will get kicked out with a failed trylock() call), while 
> > still leaving the window open between cc=0 on the SSCH and the 
> > subsequent interrupt.  In the latter case, a second SSCH will come 
> > through here, do the copy_from_user below, and then jump to fsm_io_busy 
> > to return EAGAIN.  Do we really want to stomp on io_region in that case? 
> >   Why can't we simply return EAGAIN if state==BUSY?  
> 
> (Answering my own questions as I skim patch 5...)
> 
> Because of course this series is for async handling, while I was looking 
> specifically at the synchronous code that exists today.  I guess then my 
> question just remains on how the mutex is adding protection in the sync 
> case, because that's still not apparent to me.  (Perhaps I missed it in 
> a reply to Halil; if so I apologize, there were a lot when I returned.)

Careful, at the end we have vfio_ccw_mdev_write_io_region() and the
write callback for the capchain regions. We could return EAGAIN if
state==BUSY in the vfio_ccw_mdev_write_io_region() (but I would prefer a
different error code -- see my other response).

I answered your mutex question as well. Just a small addendum the mutex
is not only for the cases the userspace acts sane (but also when it acts
insane;).

Halil
Halil Pasic Jan. 25, 2019, 2:01 p.m. UTC | #28
On Fri, 25 Jan 2019 13:58:35 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Fri, 25 Jan 2019 11:24:37 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Thu, 24 Jan 2019 21:37:44 -0500
> > Eric Farman <farman@linux.ibm.com> wrote:
> > 
> > > On 01/24/2019 09:25 PM, Eric Farman wrote:  
> > > > 
> > > > 
> > > > On 01/21/2019 06:03 AM, Cornelia Huck wrote:    
> > 
> > > > [1] I think these changes are cool.  We end up going into (and staying 
> > > > in) state=BUSY if we get cc=0 on the SSCH, rather than in/out as we 
> > > > bumble along.
> > > > 
> > > > But why can't these be separated out from this patch?  It does change 
> > > > the behavior of the state machine, and seem distinct from the addition 
> > > > of the mutex you otherwise add here?  At the very least, this behavior 
> > > > change should be documented in the commit since it's otherwise lost in 
> > > > the mutex/EAGAIN stuff.  
> > 
> > That's a very good idea. I'll factor them out into a separate patch.
> 
> And now that I've factored it out, I noticed some more problems.
> 
> What we basically need is the following, I think:
> 
> - The code should not be interrupted while we process the channel
>   program, do the ssch etc. We want the caller to try again later (i.e.
>   return -EAGAIN)

We could also interrupt it e.g. by a TRANSLATE -> REQ_ABORT_TRANSLATE
state transition. The thread doing the translation could pick that up
and make sure we don't do the ssch(). Would match the architecture
better, but would be more complicated. And can be done any time later.

> - We currently do not want the user space to submit another channel
>   program while the first one is still in flight. As submitting another
>   one is a valid request, however, we should allow this in the future
>   (once we have the code to handle that in place).

I don't agree. There is at most one channel program processed by a
subchannel at any time. I would prefer an early error code if channel
programs are issued on top of each other (our virtual subchannel
is channel pending or FC start function bit set or similar).

Of course the interface exposed by the vfio-ccw module does not need to
look like the architecture interface. But IMHO any
unjustified deviation from the good old architecure ways of things will
just make it harder to reason about stuff. In the end you have that
interface both as input (guests passed-thorough subchannel) and as
output (the subchannel in the host that's being passed-thorough).


> - With the async interface, we want user space to be able to submit a
>   halt/clear while a start request is still in flight, but not while
>   we're processing a start request with translation etc. We probably
>   want to do -EAGAIN in that case.

This reads very similar to your first point.

> 
> My idea would be:
> 
> - The BUSY state denotes "I'm busy processing a request right now, try
>   again". We hold it while processing the cp and doing the ssch and
>   leave it afterwards (i.e., while the start request is processed by
>   the hardware). I/O requests and async requests get -EAGAIN in that
>   state.
> - A new state (CP_PENDING?) is entered after ssch returned with cc 0
>   (from the BUSY state). We stay in there as long as no final state for
>   that request has been received and delivered. (This may be final
>   interrupt for that request, a deferred cc, or successful halt/clear.)
>   I/O requests get -EBUSY, async requests are processed. This state can
>   be removed again once we are able to handle more than one outstanding
>   cp.
> 
> Does that make sense?
> 

AFAIU your idea is to split up the busy state into two states: CP_PENDING
and of busy without CP_PENDING called BUSY. I like the idea of having a
separate state for CP_PENDING but I don't like the new semantic of BUSY.

Hm mashing a conceptual state machine and the jumptabe stuff ain't
making reasoning about this simpler either. I'm taking about the
conceptual state machine. It would be nice to have a picture of it and
then think about how to express that in code.

Regards,
Halil
Cornelia Huck Jan. 25, 2019, 2:21 p.m. UTC | #29
On Fri, 25 Jan 2019 15:01:01 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Fri, 25 Jan 2019 13:58:35 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:

> > - We currently do not want the user space to submit another channel
> >   program while the first one is still in flight. As submitting another
> >   one is a valid request, however, we should allow this in the future
> >   (once we have the code to handle that in place).  
> 
> I don't agree. There is at most one channel program processed by a
> subchannel at any time. I would prefer an early error code if channel
> programs are issued on top of each other (our virtual subchannel
> is channel pending or FC start function bit set or similar).

You can submit a new request if the subchannel is pending with primary,
but not with secondary state.

Regardless of that, I think it is much easier to push as much as
possible of sorting out of requests to the hardware.
Eric Farman Jan. 25, 2019, 3:57 p.m. UTC | #30
On 01/25/2019 07:58 AM, Cornelia Huck wrote:
> On Fri, 25 Jan 2019 11:24:37 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
>> On Thu, 24 Jan 2019 21:37:44 -0500
>> Eric Farman <farman@linux.ibm.com> wrote:
>>
>>> On 01/24/2019 09:25 PM, Eric Farman wrote:
>>>>
>>>>
>>>> On 01/21/2019 06:03 AM, Cornelia Huck wrote:
>>
>>>> [1] I think these changes are cool.  We end up going into (and staying
>>>> in) state=BUSY if we get cc=0 on the SSCH, rather than in/out as we
>>>> bumble along.
>>>>
>>>> But why can't these be separated out from this patch?  It does change
>>>> the behavior of the state machine, and seem distinct from the addition
>>>> of the mutex you otherwise add here?  At the very least, this behavior
>>>> change should be documented in the commit since it's otherwise lost in
>>>> the mutex/EAGAIN stuff.
>>
>> That's a very good idea. I'll factor them out into a separate patch.
> 
> And now that I've factored it out, I noticed some more problems.

That's good!  Maybe it helps us with the circles we're on :)

> 
> What we basically need is the following, I think:
> 
> - The code should not be interrupted while we process the channel
>    program, do the ssch etc. We want the caller to try again later (i.e.
>    return -EAGAIN)
> - We currently do not want the user space to submit another channel
>    program while the first one is still in flight. 

These two seem to contradict one another.  I think you're saying is that 
we don't _want_ userspace to issue another channel program, even though 
its _allowed_ to as far as vfio-ccw is concerned.

As submitting another
>    one is a valid request, however, we should allow this in the future
>    (once we have the code to handle that in place).
> - With the async interface, we want user space to be able to submit a
>    halt/clear while a start request is still in flight, but not while
>    we're processing a start request with translation etc. We probably
>    want to do -EAGAIN in that case.
> 
> My idea would be:
> 
> - The BUSY state denotes "I'm busy processing a request right now, try
>    again". We hold it while processing the cp and doing the ssch and
>    leave it afterwards (i.e., while the start request is processed by
>    the hardware). I/O requests and async requests get -EAGAIN in that
>    state.
> - A new state (CP_PENDING?) is entered after ssch returned with cc 0
>    (from the BUSY state). We stay in there as long as no final state for
>    that request has been received and delivered. (This may be final
>    interrupt for that request, a deferred cc, or successful halt/clear.)
>    I/O requests get -EBUSY

I liked CP_PENDING, since it corresponds to the subchannel being marked 
"start pending" as described in POPS, but this statement suggests that 
the BUSY/PENDING state to be swapped, such that state=PENDING returns 
-EAGAIN and state=BUSY returns -EBUSY.  Not super-concerned with the 
terminology though.

, async requests are processed. This state can
>    be removed again once we are able to handle more than one outstanding
>    cp.
> 
> Does that make sense?
> 

I think so, and I think I like it.  So you want to distinguish between 
(I have swapped BUSY/PENDING in this example per my above comment):

A) SSCH issued by userspace (IDLE->PENDING)
B) SSCH issued (successfully) by kernel (PENDING->BUSY)
B') SSCH issued (unsuccessfully) by kernel (PENDING->IDLE?)
C) Interrupt received by kernel (no change?)
D) Interrupt given to userspace (BUSY->IDLE)

If we receive A and A, the second A gets EAGAIN

If we do A+B and A, the second A gets EBUSY (unless async, which is 
processed)

Does the boundary of "in flight" in the interrupt side (C and D) need to 
be defined, such that we go BUSY->PENDING->IDLE instead of BUSY->IDLE ?
Halil Pasic Jan. 25, 2019, 4:04 p.m. UTC | #31
On Fri, 25 Jan 2019 15:21:54 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Fri, 25 Jan 2019 15:01:01 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > On Fri, 25 Jan 2019 13:58:35 +0100
> > Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > > - We currently do not want the user space to submit another channel
> > >   program while the first one is still in flight. As submitting another
> > >   one is a valid request, however, we should allow this in the future
> > >   (once we have the code to handle that in place).  
> > 
> > I don't agree. There is at most one channel program processed by a
> > subchannel at any time. I would prefer an early error code if channel
> > programs are issued on top of each other (our virtual subchannel
> > is channel pending or FC start function bit set or similar).
> 
> You can submit a new request if the subchannel is pending with primary,
> but not with secondary state.
> 
> Regardless of that, I think it is much easier to push as much as
> possible of sorting out of requests to the hardware.
> 

Do we expect userspace/QEMU to fence the bad scenarios as tries to do
today, or is this supposed to change to hardware should sort out
requests whenever possible.

The problem I see with the let the hardware sort it out is that, for that
to work, we need to juggle multiple translations simultaneously (or am I
wrong?). Doing that does not appear particularly simple to me.
Furthermore we would go through all that hassle knowingly that the sole
reason is working around bugs. We still expect our Linux guests
serializing it's ssch() stuff as it does today. Thus I would except this
code not getting the love nor the coverage that would guard against bugs
in that code.

Regards,
Halil
Eric Farman Jan. 25, 2019, 8:21 p.m. UTC | #32
On 01/25/2019 07:58 AM, Halil Pasic wrote:
> On Thu, 24 Jan 2019 21:25:10 -0500
> Eric Farman <farman@linux.ibm.com> wrote:
> 
>>>    	private = dev_get_drvdata(mdev_parent_dev(mdev));
>>> -	if (private->state != VFIO_CCW_STATE_IDLE)
>>> +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
>>> +	    private->state == VFIO_CCW_STATE_STANDBY)
>>>    		return -EACCES;
>>> +	if (!mutex_trylock(&private->io_mutex))
>>> +		return -EAGAIN;
>>
>> Ah, I see Halil's difficulty here.
>>
>> It is true there is a race condition today, and that this doesn't
>> address it.  That's fine, add it to the todo list.  But even with that,
>> I don't see what the mutex is enforcing?
> 
> It is protecting the io regions. AFAIU the idea was that only one
> thread is accessing the io region(s) at a time to prevent corruption and
> reading half-morphed data.
> 
>> Two simultaneous SSCHs will be
>> serialized (one will get kicked out with a failed trylock() call), while
>> still leaving the window open between cc=0 on the SSCH and the
>> subsequent interrupt.  In the latter case, a second SSCH will come
>> through here, do the copy_from_user below, and then jump to fsm_io_busy
>> to return EAGAIN.  Do we really want to stomp on io_region in that case?
> 
> I'm not sure I understood you correctly. The interrupt handler does not
> take the lock before writing to the io_region. That is one race but it is
> easy to fix.
> 
> The bigger problem is that between the interrupt handler has written IRB
> area and userspace has read it we may end up destroying it by stomping on
> it (to use your words). The userspace reading a wrong (given todays qemu
> zeroed out) IRB could lead to follow on problems.

I wasn't thinking about a race between the start and interrupt handler, 
but rather between two near-simultaneous starts.  Looking at it more 
closely, the orb and scsw structs as well as the ret_code field in 
ccw_io_region are only referenced under the protection of the new mutex 
(within fsm_io_request, for example), which I guess is the point.

So that leaves us with just the irb fields, which you'd mentioned a 
couple days ago (and which I was trying to ignore since it'd seems to 
have been discussed enough at the time).  So I withdraw my concerns on 
this point.  For now.  ;-)

>   
>>    Why can't we simply return EAGAIN if state==BUSY?
>>
> 
> Sure we can. That would essentially go back to the old way of things:
> if not idle return with error. 

I think this happens both before and after this series.  With this 
series, we just update the io_region with things that are never used 
because we're busy.

Just the error code returned would change
> form EACCESS to EAGAIN. Which Isn't necessarily a win, because
> conceptually here should be never two interleaved io_requests/start
> commands hitting the module.
> 
> 
>>>    
>>>    	region = private->io_region;
>>> -	if (copy_from_user((void *)region + *ppos, buf, count))
>>> -		return -EFAULT;
>>> +	if (copy_from_user((void *)region + *ppos, buf, count)) {
>>> +		ret = -EFAULT;
>>> +		goto out_unlock;
>>> +	}
>
Eric Farman Jan. 25, 2019, 8:22 p.m. UTC | #33
On 01/25/2019 05:24 AM, Cornelia Huck wrote:
> On Thu, 24 Jan 2019 21:37:44 -0500
> Eric Farman <farman@linux.ibm.com> wrote:
> 
>> On 01/24/2019 09:25 PM, Eric Farman wrote:
>>>
>>>
>>> On 01/21/2019 06:03 AM, Cornelia Huck wrote:
> 
>>> [1] I think these changes are cool.  We end up going into (and staying
>>> in) state=BUSY if we get cc=0 on the SSCH, rather than in/out as we
>>> bumble along.
>>>
>>> But why can't these be separated out from this patch?  It does change
>>> the behavior of the state machine, and seem distinct from the addition
>>> of the mutex you otherwise add here?  At the very least, this behavior
>>> change should be documented in the commit since it's otherwise lost in
>>> the mutex/EAGAIN stuff.
> 
> That's a very good idea. I'll factor them out into a separate patch.
> 
>>>    
>>>>        trace_vfio_ccw_io_fctl(scsw->cmd.fctl, get_schid(private),
>>>>                       io_region->ret_code, errstr);
>>>>    }
>>>> diff --git a/drivers/s390/cio/vfio_ccw_ops.c
>>>> b/drivers/s390/cio/vfio_ccw_ops.c
>>>> index f673e106c041..3fa9fc570400 100644
>>>> --- a/drivers/s390/cio/vfio_ccw_ops.c
>>>> +++ b/drivers/s390/cio/vfio_ccw_ops.c
>>>> @@ -169,16 +169,20 @@ static ssize_t vfio_ccw_mdev_read(struct
>>>> mdev_device *mdev,
>>>>    {
>>>>        struct vfio_ccw_private *private;
>>>>        struct ccw_io_region *region;
>>>> +    int ret;
>>>>        if (*ppos + count > sizeof(*region))
>>>>            return -EINVAL;
>>>>        private = dev_get_drvdata(mdev_parent_dev(mdev));
>>>> +    mutex_lock(&private->io_mutex);
>>>>        region = private->io_region;
>>>>        if (copy_to_user(buf, (void *)region + *ppos, count))
>>>> -        return -EFAULT;
>>>> -
>>>> -    return count;
>>>> +        ret = -EFAULT;
>>>> +    else
>>>> +        ret = count;
>>>> +    mutex_unlock(&private->io_mutex);
>>>> +    return ret;
>>>>    }
>>>>    static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
>>>> @@ -188,25 +192,30 @@ static ssize_t vfio_ccw_mdev_write(struct
>>>> mdev_device *mdev,
>>>>    {
>>>>        struct vfio_ccw_private *private;
>>>>        struct ccw_io_region *region;
>>>> +    int ret;
>>>>        if (*ppos + count > sizeof(*region))
>>>>            return -EINVAL;
>>>>        private = dev_get_drvdata(mdev_parent_dev(mdev));
>>>> -    if (private->state != VFIO_CCW_STATE_IDLE)
>>>> +    if (private->state == VFIO_CCW_STATE_NOT_OPER ||
>>>> +        private->state == VFIO_CCW_STATE_STANDBY)
>>>>            return -EACCES;
>>>> +    if (!mutex_trylock(&private->io_mutex))
>>>> +        return -EAGAIN;
>>>
>>> Ah, I see Halil's difficulty here.
>>>
>>> It is true there is a race condition today, and that this doesn't
>>> address it.  That's fine, add it to the todo list.  But even with that,
>>> I don't see what the mutex is enforcing?  Two simultaneous SSCHs will be
>>> serialized (one will get kicked out with a failed trylock() call), while
>>> still leaving the window open between cc=0 on the SSCH and the
>>> subsequent interrupt.  In the latter case, a second SSCH will come
>>> through here, do the copy_from_user below, and then jump to fsm_io_busy
>>> to return EAGAIN.  Do we really want to stomp on io_region in that case?
>>>    Why can't we simply return EAGAIN if state==BUSY?
>>
>> (Answering my own questions as I skim patch 5...)
>>
>> Because of course this series is for async handling, while I was looking
>> specifically at the synchronous code that exists today.  I guess then my
>> question just remains on how the mutex is adding protection in the sync
>> case, because that's still not apparent to me.  (Perhaps I missed it in
>> a reply to Halil; if so I apologize, there were a lot when I returned.)
> 
> My idea behind the mutex was to make sure that we get consistent data
> when reading/writing (e.g. if one user space thread is reading the I/O
> region while another is writing to it).
> 

And from that angle, this accomplishes that.  It just wasn't apparent to 
me at first.

I'm still not certain of how we handle mdev_write when state=BUSY, so 
let me ask my question a different way...

If we come into mdev_write with state=BUSY and we get the lock, 
copy_from_user, and do our jump table we go to fsm_io_busy to set 
ret_code and return -EAGAIN.  Why then don't we set the jump table for 
state=NOT_OPER||STANDBY to do something that will return -EACCES instead 
of how we currently do a direct return of -EACCES before all the 
lock/copy stuff (and the jump table that would take us to fsm_io_error 
and an error message before returning -EIO)?
Cornelia Huck Jan. 28, 2019, 5:09 p.m. UTC | #34
On Fri, 25 Jan 2019 15:01:01 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Fri, 25 Jan 2019 13:58:35 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:

> > - The code should not be interrupted while we process the channel
> >   program, do the ssch etc. We want the caller to try again later (i.e.
> >   return -EAGAIN)  

(...)

> > - With the async interface, we want user space to be able to submit a
> >   halt/clear while a start request is still in flight, but not while
> >   we're processing a start request with translation etc. We probably
> >   want to do -EAGAIN in that case.  
> 
> This reads very similar to your first point.

Not quite. ssch() means that we have a cp around; for hsch()/csch() we
don't have such a thing. So we want to protect the process of
translating the cp etc., but we don't need such protection for the
halt/clear processing.

> 
> > 
> > My idea would be:
> > 
> > - The BUSY state denotes "I'm busy processing a request right now, try
> >   again". We hold it while processing the cp and doing the ssch and
> >   leave it afterwards (i.e., while the start request is processed by
> >   the hardware). I/O requests and async requests get -EAGAIN in that
> >   state.
> > - A new state (CP_PENDING?) is entered after ssch returned with cc 0
> >   (from the BUSY state). We stay in there as long as no final state for
> >   that request has been received and delivered. (This may be final
> >   interrupt for that request, a deferred cc, or successful halt/clear.)
> >   I/O requests get -EBUSY, async requests are processed. This state can
> >   be removed again once we are able to handle more than one outstanding
> >   cp.
> > 
> > Does that make sense?
> >   
> 
> AFAIU your idea is to split up the busy state into two states: CP_PENDING
> and of busy without CP_PENDING called BUSY. I like the idea of having a
> separate state for CP_PENDING but I don't like the new semantic of BUSY.
> 
> Hm mashing a conceptual state machine and the jumptabe stuff ain't
> making reasoning about this simpler either. I'm taking about the
> conceptual state machine. It would be nice to have a picture of it and
> then think about how to express that in code.

Sorry, I'm having a hard time parsing your comments. Are you looking
for something like the below?

IDLE --- IO_REQ --> BUSY ---> CP_PENDING --- IRQ ---> IDLE (if final
state for I/O)
(normal ssch)

BUSY --- IO_REQ ---> return -EAGAIN, stay in BUSY
(user space is supposed to retry, as we'll eventually progress from
BUSY)

CP_PENDING --- IO_REQ ---> return -EBUSY, stay in CP_PENDING
(user space is supposed to map this to the appropriate cc for the guest)

IDLE --- ASYNC_REQ ---> IDLE
(user space is welcome to do anything else right away)

BUSY --- ASYNC_REQ ---> return -EAGAIN, stay in BUSY
(user space is supposed to retry, as above)

CP_PENDING --- ASYNC_REQ ---> return success, stay in CP_PENDING
(the interrupt will get us out of CP_PENDING eventually)
Cornelia Huck Jan. 28, 2019, 5:13 p.m. UTC | #35
On Fri, 25 Jan 2019 17:04:04 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> Do we expect userspace/QEMU to fence the bad scenarios as tries to do
> today, or is this supposed to change to hardware should sort out
> requests whenever possible.

Does my other mail answer that?

> The problem I see with the let the hardware sort it out is that, for that
> to work, we need to juggle multiple translations simultaneously (or am I
> wrong?). Doing that does not appear particularly simple to me.

None in the first stage, at most two in the second stage, I guess.

> Furthermore we would go through all that hassle knowingly that the sole
> reason is working around bugs. We still expect our Linux guests
> serializing it's ssch() stuff as it does today. Thus I would except this
> code not getting the love nor the coverage that would guard against bugs
> in that code.

So, we should have test code for that? (Any IBM-internal channel I/O
exercisers that may help?)

We should not rely on the guest being sane, although Linux probably is
in that respect.
Cornelia Huck Jan. 28, 2019, 5:24 p.m. UTC | #36
On Fri, 25 Jan 2019 10:57:38 -0500
Eric Farman <farman@linux.ibm.com> wrote:

> On 01/25/2019 07:58 AM, Cornelia Huck wrote:
> > On Fri, 25 Jan 2019 11:24:37 +0100
> > Cornelia Huck <cohuck@redhat.com> wrote:
> >   
> >> On Thu, 24 Jan 2019 21:37:44 -0500
> >> Eric Farman <farman@linux.ibm.com> wrote:
> >>  
> >>> On 01/24/2019 09:25 PM, Eric Farman wrote:  
> >>>>
> >>>>
> >>>> On 01/21/2019 06:03 AM, Cornelia Huck wrote:  
> >>  
> >>>> [1] I think these changes are cool.  We end up going into (and staying
> >>>> in) state=BUSY if we get cc=0 on the SSCH, rather than in/out as we
> >>>> bumble along.
> >>>>
> >>>> But why can't these be separated out from this patch?  It does change
> >>>> the behavior of the state machine, and seem distinct from the addition
> >>>> of the mutex you otherwise add here?  At the very least, this behavior
> >>>> change should be documented in the commit since it's otherwise lost in
> >>>> the mutex/EAGAIN stuff.  
> >>
> >> That's a very good idea. I'll factor them out into a separate patch.  
> > 
> > And now that I've factored it out, I noticed some more problems.  
> 
> That's good!  Maybe it helps us with the circles we're on :)

:)

> 
> > 
> > What we basically need is the following, I think:
> > 
> > - The code should not be interrupted while we process the channel
> >    program, do the ssch etc. We want the caller to try again later (i.e.
> >    return -EAGAIN)
> > - We currently do not want the user space to submit another channel
> >    program while the first one is still in flight.   
> 
> These two seem to contradict one another.  I think you're saying is that 
> we don't _want_ userspace to issue another channel program, even though 
> its _allowed_ to as far as vfio-ccw is concerned.

What I'm trying to say is that we want to distinguish two things:
- The code is currently doing translation etc. We probably want to keep
  that atomic, in order not to make things too complicated.
- We have sent the ssch() to the hardware, but have not yet received
  the final interrupt for that request (that's what I meant with "in
  flight"). It's easier for the first shot to disallow a second ssch()
  as that would need handling of more than one cp request, but we may
  want to allow it in the future.
  A hsch()/csch() (which does not generate a new cp) should be fine.

(see also my reply to Halil's mail)

> 
> As submitting another
> >    one is a valid request, however, we should allow this in the future
> >    (once we have the code to handle that in place).
> > - With the async interface, we want user space to be able to submit a
> >    halt/clear while a start request is still in flight, but not while
> >    we're processing a start request with translation etc. We probably
> >    want to do -EAGAIN in that case.
> > 
> > My idea would be:
> > 
> > - The BUSY state denotes "I'm busy processing a request right now, try
> >    again". We hold it while processing the cp and doing the ssch and
> >    leave it afterwards (i.e., while the start request is processed by
> >    the hardware). I/O requests and async requests get -EAGAIN in that
> >    state.
> > - A new state (CP_PENDING?) is entered after ssch returned with cc 0
> >    (from the BUSY state). We stay in there as long as no final state for
> >    that request has been received and delivered. (This may be final
> >    interrupt for that request, a deferred cc, or successful halt/clear.)
> >    I/O requests get -EBUSY  
> 
> I liked CP_PENDING, since it corresponds to the subchannel being marked 
> "start pending" as described in POPS, but this statement suggests that 
> the BUSY/PENDING state to be swapped, such that state=PENDING returns 
> -EAGAIN and state=BUSY returns -EBUSY.  Not super-concerned with the 
> terminology though.

What about s/BUSY/CP_PROCESSING/ ?

> 
> , async requests are processed. This state can
> >    be removed again once we are able to handle more than one outstanding
> >    cp.
> > 
> > Does that make sense?
> >   
> 
> I think so, and I think I like it.  So you want to distinguish between 
> (I have swapped BUSY/PENDING in this example per my above comment):
> 
> A) SSCH issued by userspace (IDLE->PENDING)
> B) SSCH issued (successfully) by kernel (PENDING->BUSY)
> B') SSCH issued (unsuccessfully) by kernel (PENDING->IDLE?)

I think so.

> C) Interrupt received by kernel (no change?)
> D) Interrupt given to userspace (BUSY->IDLE)

Only if that is the final interrupt for that cp.

> 
> If we receive A and A, the second A gets EAGAIN
> 
> If we do A+B and A, the second A gets EBUSY (unless async, which is 
> processed)

Nod.

> Does the boundary of "in flight" in the interrupt side (C and D) need to 
> be defined, such that we go BUSY->PENDING->IDLE instead of BUSY->IDLE ?

I don't think we can go BUSY->PENDING (in your terminology), at that
would imply a retry of the ssch()?
Cornelia Huck Jan. 28, 2019, 5:31 p.m. UTC | #37
On Fri, 25 Jan 2019 15:22:56 -0500
Eric Farman <farman@linux.ibm.com> wrote:

> If we come into mdev_write with state=BUSY and we get the lock, 
> copy_from_user, and do our jump table we go to fsm_io_busy to set 
> ret_code and return -EAGAIN.  Why then don't we set the jump table for 
> state=NOT_OPER||STANDBY to do something that will return -EACCES instead 
> of how we currently do a direct return of -EACCES before all the 
> lock/copy stuff (and the jump table that would take us to fsm_io_error 
> and an error message before returning -EIO)?

If you phrase it like that, I'm wondering why we're not already doing
it that way :) We just need to make sure to revert to the previous
state on error instead of IDLE.
Halil Pasic Jan. 28, 2019, 7:15 p.m. UTC | #38
On Mon, 28 Jan 2019 18:09:48 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Fri, 25 Jan 2019 15:01:01 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > On Fri, 25 Jan 2019 13:58:35 +0100
> > Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > > - The code should not be interrupted while we process the channel
> > >   program, do the ssch etc. We want the caller to try again later (i.e.
> > >   return -EAGAIN)  
> 
> (...)
> 
> > > - With the async interface, we want user space to be able to submit a
> > >   halt/clear while a start request is still in flight, but not while
> > >   we're processing a start request with translation etc. We probably
> > >   want to do -EAGAIN in that case.  
> > 
> > This reads very similar to your first point.
> 
> Not quite. ssch() means that we have a cp around; for hsch()/csch() we
> don't have such a thing. So we want to protect the process of
> translating the cp etc., but we don't need such protection for the
> halt/clear processing.
> 

What does this don't 'need such protection' mean in terms of code,
moving the unlock of the io_mutex upward (in
vfio_ccw_async_region_write())?

Here the function in question for reference:

+static ssize_t vfio_ccw_async_region_write(struct vfio_ccw_private
*private,
+					   const char __user *buf,
size_t count,
+					   loff_t *ppos)
+{
+	unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) -
VFIO_CCW_NUM_REGIONS;
+	loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK;
+	struct ccw_cmd_region *region;
+	int ret;
+
+	if (pos + count > sizeof(*region))
+		return -EINVAL;
+
+	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
+	    private->state == VFIO_CCW_STATE_STANDBY)
+		return -EACCES;
+	if (!mutex_trylock(&private->io_mutex))
+		return -EAGAIN;
+
+	region = private->region[i].data;
+	if (copy_from_user((void *)region + pos, buf, count)) {
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_ASYNC_REQ);
+
+	ret = region->ret_code ? region->ret_code : count;
+
+out_unlock:
+	mutex_unlock(&private->io_mutex);
+	return ret;
+}

That does not make much sense to me at the moment (so I guess I
misunderstood again).

> > 
> > > 
> > > My idea would be:
> > > 
> > > - The BUSY state denotes "I'm busy processing a request right now, try
> > >   again". We hold it while processing the cp and doing the ssch and
> > >   leave it afterwards (i.e., while the start request is processed by
> > >   the hardware). I/O requests and async requests get -EAGAIN in that
> > >   state.
> > > - A new state (CP_PENDING?) is entered after ssch returned with cc 0
> > >   (from the BUSY state). We stay in there as long as no final state for
> > >   that request has been received and delivered. (This may be final
> > >   interrupt for that request, a deferred cc, or successful halt/clear.)
> > >   I/O requests get -EBUSY, async requests are processed. This state can
> > >   be removed again once we are able to handle more than one outstanding
> > >   cp.
> > > 
> > > Does that make sense?
> > >   
> > 
> > AFAIU your idea is to split up the busy state into two states: CP_PENDING
> > and of busy without CP_PENDING called BUSY. I like the idea of having a
> > separate state for CP_PENDING but I don't like the new semantic of BUSY.
> > 
> > Hm mashing a conceptual state machine and the jumptabe stuff ain't
> > making reasoning about this simpler either. I'm taking about the
> > conceptual state machine. It would be nice to have a picture of it and
> > then think about how to express that in code.
> 
> Sorry, I'm having a hard time parsing your comments. Are you looking
> for something like the below?

I had more something like this 
https://en.wikipedia.org/wiki/UML_state_machine,
in mind but the lists of state transitions are also useful.

> 
> IDLE --- IO_REQ --> BUSY ---> CP_PENDING --- IRQ ---> IDLE (if final

There ain't no trigger/action list  between BUSY and CP_PENDING.
I'm also in the  dark about where the issuing of the ssch() happen
here (is it an internal transition within CP_PENDING?). I guess if
the ssch() returns with non cc == 0 the CP_PENDING ---IRQ---> IDLE
transition
won't take place. And I guess the IRQ is a final one.

Sorry abstraction is not a concept unknown to me. But this is too much
abstraction for me in this context. The devil is in the details, and
AFAIU we are discussing these details right now.
 

> state for I/O)
> (normal ssch)
> 
> BUSY --- IO_REQ ---> return -EAGAIN, stay in BUSY
> (user space is supposed to retry, as we'll eventually progress from
> BUSY)
> 
> CP_PENDING --- IO_REQ ---> return -EBUSY, stay in CP_PENDING
> (user space is supposed to map this to the appropriate cc for the guest)

From this it seems you don't intend to issue the second  requested ssch()
any more (and don't want to do any translation). Is that right? (If it
is, that what I was asking for for a while, but then it's a pity for the
retries.)

> 
> IDLE --- ASYNC_REQ ---> IDLE
> (user space is welcome to do anything else right away)

Your idea is to not issue a requested hsch() if we think we are IDLE
it seems. Do I understand this right? We would end up with a different
semantic for hsch()/and csch() (compared to PoP) in the guest with this
(AFAICT).

> 
> BUSY --- ASYNC_REQ ---> return -EAGAIN, stay in BUSY
> (user space is supposed to retry, as above)
> 
> CP_PENDING --- ASYNC_REQ ---> return success, stay in CP_PENDING
> (the interrupt will get us out of CP_PENDING eventually)

Issue (c|h)sch() is an action that is done on this internal 
transition (within CP_PENDING).

Thank you very much for investing into this description of the state
machine. I'm afraid I'm acting like a not so nice person (self censored)
at the moment. I can't help myself, sorry. Maybe Farhan and Eric can take
this as a starting point and come up with something that we can integrate
into our documentation. Maybe not...

Regards,
Halil
Halil Pasic Jan. 28, 2019, 7:30 p.m. UTC | #39
On Mon, 28 Jan 2019 18:13:55 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> On Fri, 25 Jan 2019 17:04:04 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > Do we expect userspace/QEMU to fence the bad scenarios as tries to do
> > today, or is this supposed to change to hardware should sort out
> > requests whenever possible.
> 
> Does my other mail answer that?

Sorry, I can't find the answer in your other (Date: Mon, 28 Jan 2019
17:59:10 +0100, Message-Id: <20190128175910.5d9677e7@oc2783563651>) mail.
AFAIU that mail talks abut the kernel and not about the userspace.

I guess the answer is we don't expect changes to userspace, so we do
expect userspace to fence bad scenarios.

> 
> > The problem I see with the let the hardware sort it out is that, for
> > that to work, we need to juggle multiple translations simultaneously
> > (or am I wrong?). Doing that does not appear particularly simple to
> > me.
> 
> None in the first stage, at most two in the second stage, I guess.
> 

Expected benefit of the second stage over the first stage? (I see none.)

> > Furthermore we would go through all that hassle knowingly that the
> > sole reason is working around bugs. We still expect our Linux guests
> > serializing it's ssch() stuff as it does today. Thus I would except
> > this code not getting the love nor the coverage that would guard
> > against bugs in that code.
> 
> So, we should have test code for that? (Any IBM-internal channel I/O
> exercisers that may help?)
>

None that I'm aware of. Anyone else? 

But the point I was trying to make is the following: I prefer keeping
the handling for the case "ssch()'s on top of each other" as trivial as
possible. (E.g. bail out if CP_PENDING without doing any translation.)
 
> We should not rely on the guest being sane, although Linux probably is
> in that respect.
> 

I agree 100%: we should not rely on either guest or userspace emulator
being sane. But IMHO we should handle insanity with the least possible
investment.

Regards,
Halil
Eric Farman Jan. 28, 2019, 9:48 p.m. UTC | #40
On 01/28/2019 02:15 PM, Halil Pasic wrote:
> On Mon, 28 Jan 2019 18:09:48 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
>> On Fri, 25 Jan 2019 15:01:01 +0100
>> Halil Pasic <pasic@linux.ibm.com> wrote:
>>
>>> On Fri, 25 Jan 2019 13:58:35 +0100
>>> Cornelia Huck <cohuck@redhat.com> wrote:
>>
>>>> - The code should not be interrupted while we process the channel
>>>>    program, do the ssch etc. We want the caller to try again later (i.e.
>>>>    return -EAGAIN)
>>
>> (...)
>>
>>>> - With the async interface, we want user space to be able to submit a
>>>>    halt/clear while a start request is still in flight, but not while
>>>>    we're processing a start request with translation etc. We probably
>>>>    want to do -EAGAIN in that case.
>>>
>>> This reads very similar to your first point.
>>
>> Not quite. ssch() means that we have a cp around; for hsch()/csch() we
>> don't have such a thing. So we want to protect the process of
>> translating the cp etc., but we don't need such protection for the
>> halt/clear processing.
>>
> 
> What does this don't 'need such protection' mean in terms of code,
> moving the unlock of the io_mutex upward (in
> vfio_ccw_async_region_write())?
> 
> Here the function in question for reference:
> 
> +static ssize_t vfio_ccw_async_region_write(struct vfio_ccw_private
> *private,
> +					   const char __user *buf,
> size_t count,
> +					   loff_t *ppos)
> +{
> +	unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) -
> VFIO_CCW_NUM_REGIONS;
> +	loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK;
> +	struct ccw_cmd_region *region;
> +	int ret;
> +
> +	if (pos + count > sizeof(*region))
> +		return -EINVAL;
> +
> +	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
> +	    private->state == VFIO_CCW_STATE_STANDBY)
> +		return -EACCES;
> +	if (!mutex_trylock(&private->io_mutex))
> +		return -EAGAIN;
> +
> +	region = private->region[i].data;
> +	if (copy_from_user((void *)region + pos, buf, count)) {
> +		ret = -EFAULT;
> +		goto out_unlock;
> +	}
> +
> +	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_ASYNC_REQ);
> +
> +	ret = region->ret_code ? region->ret_code : count;
> +
> +out_unlock:
> +	mutex_unlock(&private->io_mutex);
> +	return ret;
> +}
> 
> That does not make much sense to me at the moment (so I guess I
> misunderstood again).
> 
>>>
>>>>
>>>> My idea would be:
>>>>
>>>> - The BUSY state denotes "I'm busy processing a request right now, try
>>>>    again". We hold it while processing the cp and doing the ssch and
>>>>    leave it afterwards (i.e., while the start request is processed by
>>>>    the hardware). I/O requests and async requests get -EAGAIN in that
>>>>    state.
>>>> - A new state (CP_PENDING?) is entered after ssch returned with cc 0
>>>>    (from the BUSY state). We stay in there as long as no final state for
>>>>    that request has been received and delivered. (This may be final
>>>>    interrupt for that request, a deferred cc, or successful halt/clear.)
>>>>    I/O requests get -EBUSY, async requests are processed. This state can
>>>>    be removed again once we are able to handle more than one outstanding
>>>>    cp.
>>>>
>>>> Does that make sense?
>>>>    
>>>
>>> AFAIU your idea is to split up the busy state into two states: CP_PENDING
>>> and of busy without CP_PENDING called BUSY. I like the idea of having a
>>> separate state for CP_PENDING but I don't like the new semantic of BUSY.
>>>
>>> Hm mashing a conceptual state machine and the jumptabe stuff ain't
>>> making reasoning about this simpler either. I'm taking about the
>>> conceptual state machine. It would be nice to have a picture of it and
>>> then think about how to express that in code.
>>
>> Sorry, I'm having a hard time parsing your comments. Are you looking
>> for something like the below?
> 
> I had more something like this
> https://en.wikipedia.org/wiki/UML_state_machine,
> in mind but the lists of state transitions are also useful.
> 

I think the picture Connie paints below is just as useful as any 
formalized UML diagram.

>>
>> IDLE --- IO_REQ --> BUSY ---> CP_PENDING --- IRQ ---> IDLE (if final
> 
> There ain't no trigger/action list  between BUSY and CP_PENDING.

Right, because BUSY means "KVM started processing a SSCH" and CP_PENDING 
means "KVM finished processing the SSCH and issued it to the hardware, 
and got cc=0."

> I'm also in the  dark about where the issuing of the ssch() happen
> here (is it an internal transition within CP_PENDING?). 

Connie said...

 >>>> - A new state (CP_PENDING?) is entered after ssch returned with cc 0
 >>>>    (from the BUSY state).

...and I agree with that.

I guess if
> the ssch() returns with non cc == 0 the CP_PENDING ---IRQ---> IDLE
> transition
> won't take place. And I guess the IRQ is a final one.

Yes this is the one point I hadn't seen explicitly stated.  We shouldn't 
remain in state=BUSY if the ssch got cc!=0, and probably return to IDLE 
when processing the failure.  In Connie's response (Mon, 28 Jan 2019 
18:24:24 +0100) to my note, she expressed some agreement to that.

> 
> Sorry abstraction is not a concept unknown to me. But this is too much
> abstraction for me in this context. The devil is in the details, and
> AFAIU we are discussing these details right now.
>   
> 
>> state for I/O)
>> (normal ssch)
>>
>> BUSY --- IO_REQ ---> return -EAGAIN, stay in BUSY
>> (user space is supposed to retry, as we'll eventually progress from
>> BUSY)
>>
>> CP_PENDING --- IO_REQ ---> return -EBUSY, stay in CP_PENDING
>> (user space is supposed to map this to the appropriate cc for the guest)
> 
>  From this it seems you don't intend to issue the second  requested ssch()
> any more (and don't want to do any translation). Is that right? (If it
> is, that what I was asking for for a while, but then it's a pity for the
> retries.)
> 
>>
>> IDLE --- ASYNC_REQ ---> IDLE
>> (user space is welcome to do anything else right away)
> 
> Your idea is to not issue a requested hsch() if we think we are IDLE
> it seems. Do I understand this right? We would end up with a different
> semantic for hsch()/and csch() (compared to PoP) in the guest with this
> (AFAICT).
> 
>>
>> BUSY --- ASYNC_REQ ---> return -EAGAIN, stay in BUSY
>> (user space is supposed to retry, as above)
>>
>> CP_PENDING --- ASYNC_REQ ---> return success, stay in CP_PENDING
>> (the interrupt will get us out of CP_PENDING eventually)
> 
> Issue (c|h)sch() is an action that is done on this internal
> transition (within CP_PENDING).

These three do read like CSCH/HSCH are subject to the same rules as 
SSCH, when in fact they would be (among other reasons) issued to clean 
up a lost interrupt from a previous SSCH.  So maybe return -EAGAIN on 
state=BUSY (don't race ourselves with the start), but issue to hardware 
if CP_PENDING.

If we get an async request when state=IDLE, then maybe just issue it for 
fun, as if it were an SSCH?

> 
> Thank you very much for investing into this description of the state
> machine. I'm afraid I'm acting like a not so nice person (self censored)
> at the moment. I can't help myself, sorry. Maybe Farhan and Eric can take
> this as a starting point and come up with something that we can integrate
> into our documentation. Maybe not...
> 
> Regards,
> Halil
>
Eric Farman Jan. 28, 2019, 9:50 p.m. UTC | #41
On 01/28/2019 12:24 PM, Cornelia Huck wrote:
> On Fri, 25 Jan 2019 10:57:38 -0500
> Eric Farman <farman@linux.ibm.com> wrote:
> 
>> On 01/25/2019 07:58 AM, Cornelia Huck wrote:
>>> On Fri, 25 Jan 2019 11:24:37 +0100
>>> Cornelia Huck <cohuck@redhat.com> wrote:
>>>    
>>>> On Thu, 24 Jan 2019 21:37:44 -0500
>>>> Eric Farman <farman@linux.ibm.com> wrote:
>>>>   
>>>>> On 01/24/2019 09:25 PM, Eric Farman wrote:
>>>>>>
>>>>>>
>>>>>> On 01/21/2019 06:03 AM, Cornelia Huck wrote:
>>>>   
>>>>>> [1] I think these changes are cool.  We end up going into (and staying
>>>>>> in) state=BUSY if we get cc=0 on the SSCH, rather than in/out as we
>>>>>> bumble along.
>>>>>>
>>>>>> But why can't these be separated out from this patch?  It does change
>>>>>> the behavior of the state machine, and seem distinct from the addition
>>>>>> of the mutex you otherwise add here?  At the very least, this behavior
>>>>>> change should be documented in the commit since it's otherwise lost in
>>>>>> the mutex/EAGAIN stuff.
>>>>
>>>> That's a very good idea. I'll factor them out into a separate patch.
>>>
>>> And now that I've factored it out, I noticed some more problems.
>>
>> That's good!  Maybe it helps us with the circles we're on :)
> 
> :)
> 
>>
>>>
>>> What we basically need is the following, I think:
>>>
>>> - The code should not be interrupted while we process the channel
>>>     program, do the ssch etc. We want the caller to try again later (i.e.
>>>     return -EAGAIN)
>>> - We currently do not want the user space to submit another channel
>>>     program while the first one is still in flight.
>>
>> These two seem to contradict one another.  I think you're saying is that
>> we don't _want_ userspace to issue another channel program, even though
>> its _allowed_ to as far as vfio-ccw is concerned.
> 
> What I'm trying to say is that we want to distinguish two things:
> - The code is currently doing translation etc. We probably want to keep
>    that atomic, in order not to make things too complicated.
> - We have sent the ssch() to the hardware, but have not yet received
>    the final interrupt for that request (that's what I meant with "in
>    flight"). It's easier for the first shot to disallow a second ssch()
>    as that would need handling of more than one cp request, but we may
>    want to allow it in the future.
>    A hsch()/csch() (which does not generate a new cp) should be fine.
> 
> (see also my reply to Halil's mail)
> 
>>
>> As submitting another
>>>     one is a valid request, however, we should allow this in the future
>>>     (once we have the code to handle that in place).
>>> - With the async interface, we want user space to be able to submit a
>>>     halt/clear while a start request is still in flight, but not while
>>>     we're processing a start request with translation etc. We probably
>>>     want to do -EAGAIN in that case.
>>>
>>> My idea would be:
>>>
>>> - The BUSY state denotes "I'm busy processing a request right now, try
>>>     again". We hold it while processing the cp and doing the ssch and
>>>     leave it afterwards (i.e., while the start request is processed by
>>>     the hardware). I/O requests and async requests get -EAGAIN in that
>>>     state.
>>> - A new state (CP_PENDING?) is entered after ssch returned with cc 0
>>>     (from the BUSY state). We stay in there as long as no final state for
>>>     that request has been received and delivered. (This may be final
>>>     interrupt for that request, a deferred cc, or successful halt/clear.)
>>>     I/O requests get -EBUSY
>>
>> I liked CP_PENDING, since it corresponds to the subchannel being marked
>> "start pending" as described in POPS, but this statement suggests that
>> the BUSY/PENDING state to be swapped, such that state=PENDING returns
>> -EAGAIN and state=BUSY returns -EBUSY.  Not super-concerned with the
>> terminology though.
> 
> What about s/BUSY/CP_PROCESSING/ ?

So we go IDLE -> CP_PROCESSING -> CP_PENDING -> (IRQ) -> IDLE right? 
Seems good to me.

> 
>>
>> , async requests are processed. This state can
>>>     be removed again once we are able to handle more than one outstanding
>>>     cp.
>>>
>>> Does that make sense?
>>>    
>>
>> I think so, and I think I like it.  So you want to distinguish between
>> (I have swapped BUSY/PENDING in this example per my above comment):
>>
>> A) SSCH issued by userspace (IDLE->PENDING)
>> B) SSCH issued (successfully) by kernel (PENDING->BUSY)
>> B') SSCH issued (unsuccessfully) by kernel (PENDING->IDLE?)
> 
> I think so.
> 
>> C) Interrupt received by kernel (no change?)
>> D) Interrupt given to userspace (BUSY->IDLE)
> 
> Only if that is the final interrupt for that cp.

Agreed.

> 
>>
>> If we receive A and A, the second A gets EAGAIN
>>
>> If we do A+B and A, the second A gets EBUSY (unless async, which is
>> processed)
> 
> Nod.
> 
>> Does the boundary of "in flight" in the interrupt side (C and D) need to
>> be defined, such that we go BUSY->PENDING->IDLE instead of BUSY->IDLE ?
> 
> I don't think we can go BUSY->PENDING (in your terminology), at that
> would imply a retry of the ssch()?
> 

I didn't think so, but figured it's worth asking while we're already 
confused.  :)
Cornelia Huck Jan. 29, 2019, 9:58 a.m. UTC | #42
On Mon, 28 Jan 2019 20:30:00 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Mon, 28 Jan 2019 18:13:55 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Fri, 25 Jan 2019 17:04:04 +0100
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> > > Do we expect userspace/QEMU to fence the bad scenarios as tries to do
> > > today, or is this supposed to change to hardware should sort out
> > > requests whenever possible.  
> > 
> > Does my other mail answer that?  
> 
> Sorry, I can't find the answer in your other (Date: Mon, 28 Jan 2019
> 17:59:10 +0100, Message-Id: <20190128175910.5d9677e7@oc2783563651>) mail.
> AFAIU that mail talks abut the kernel and not about the userspace.
> 
> I guess the answer is we don't expect changes to userspace, so we do
> expect userspace to fence bad scenarios.

Then, I really have no idea what you are aiming at with your comment :(

> 
> >   
> > > The problem I see with the let the hardware sort it out is that, for
> > > that to work, we need to juggle multiple translations simultaneously
> > > (or am I wrong?). Doing that does not appear particularly simple to
> > > me.  
> > 
> > None in the first stage, at most two in the second stage, I guess.
> >   
> 
> Expected benefit of the second stage over the first stage? (I see none.)

Making something possible that is allowed by the architecture. Not
really important, though.

> 
> > > Furthermore we would go through all that hassle knowingly that the
> > > sole reason is working around bugs. We still expect our Linux guests
> > > serializing it's ssch() stuff as it does today. Thus I would except
> > > this code not getting the love nor the coverage that would guard
> > > against bugs in that code.  
> > 
> > So, we should have test code for that? (Any IBM-internal channel I/O
> > exercisers that may help?)
> >  
> 
> None that I'm aware of. Anyone else? 
> 
> But the point I was trying to make is the following: I prefer keeping
> the handling for the case "ssch()'s on top of each other" as trivial as
> possible. (E.g. bail out if CP_PENDING without doing any translation.)
>  
> > We should not rely on the guest being sane, although Linux probably is
> > in that respect.
> >   
> 
> I agree 100%: we should not rely on either guest or userspace emulator
> being sane. But IMHO we should handle insanity with the least possible
> investment.

We probably disagree what the least possible investment is.
Cornelia Huck Jan. 29, 2019, 10:10 a.m. UTC | #43
On Mon, 28 Jan 2019 20:15:48 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Mon, 28 Jan 2019 18:09:48 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Fri, 25 Jan 2019 15:01:01 +0100
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> > > On Fri, 25 Jan 2019 13:58:35 +0100
> > > Cornelia Huck <cohuck@redhat.com> wrote:  
> >   
> > > > - The code should not be interrupted while we process the channel
> > > >   program, do the ssch etc. We want the caller to try again later (i.e.
> > > >   return -EAGAIN)    
> > 
> > (...)
> >   
> > > > - With the async interface, we want user space to be able to submit a
> > > >   halt/clear while a start request is still in flight, but not while
> > > >   we're processing a start request with translation etc. We probably
> > > >   want to do -EAGAIN in that case.    
> > > 
> > > This reads very similar to your first point.  
> > 
> > Not quite. ssch() means that we have a cp around; for hsch()/csch() we
> > don't have such a thing. So we want to protect the process of
> > translating the cp etc., but we don't need such protection for the
> > halt/clear processing.
> >   
> 
> What does this don't 'need such protection' mean in terms of code,
> moving the unlock of the io_mutex upward (in
> vfio_ccw_async_region_write())?

We don't have a cp that we need to process, so we don't need protection
for that.

> > 
> > IDLE --- IO_REQ --> BUSY ---> CP_PENDING --- IRQ ---> IDLE (if final  
> 
> There ain't no trigger/action list  between BUSY and CP_PENDING.
> I'm also in the  dark about where the issuing of the ssch() happen
> here (is it an internal transition within CP_PENDING?). I guess if
> the ssch() returns with non cc == 0 the CP_PENDING ---IRQ---> IDLE
> transition
> won't take place. And I guess the IRQ is a final one.

Please refer to the original ideas. This is obviously not supposed to
be a complete description of every case we might encounter.

> > state for I/O)
> > (normal ssch)
> > 
> > BUSY --- IO_REQ ---> return -EAGAIN, stay in BUSY
> > (user space is supposed to retry, as we'll eventually progress from
> > BUSY)
> > 
> > CP_PENDING --- IO_REQ ---> return -EBUSY, stay in CP_PENDING
> > (user space is supposed to map this to the appropriate cc for the guest)  
> 
> From this it seems you don't intend to issue the second  requested ssch()
> any more (and don't want to do any translation). Is that right? (If it
> is, that what I was asking for for a while, but then it's a pity for the
> retries.)

Which "second requested ssch"? In the first case, user space is
supposed to retry; in the second case, it should map it to a cc (and
the guest does whatever it does on busy conditions). We can't issue a
ssch if we're not able to handle multiple cps.

> 
> > 
> > IDLE --- ASYNC_REQ ---> IDLE
> > (user space is welcome to do anything else right away)  
> 
> Your idea is to not issue a requested hsch() if we think we are IDLE
> it seems. Do I understand this right? We would end up with a different
> semantic for hsch()/and csch() (compared to PoP) in the guest with this
> (AFAICT).

Nope, we're doing hsch/csch. We're just not moving out of IDLE, as we
(a) don't have any cp processing we need to protect and (b) no need to
fence of multiple attempts of hsch/csch.

> 
> > 
> > BUSY --- ASYNC_REQ ---> return -EAGAIN, stay in BUSY
> > (user space is supposed to retry, as above)
> > 
> > CP_PENDING --- ASYNC_REQ ---> return success, stay in CP_PENDING
> > (the interrupt will get us out of CP_PENDING eventually)  
> 
> Issue (c|h)sch() is an action that is done on this internal 
> transition (within CP_PENDING).

Yes. hsch/csch do not trigger a state change (other than possibly
dropping into NOT_OPER for cc 3).
Cornelia Huck Jan. 29, 2019, 10:20 a.m. UTC | #44
On Mon, 28 Jan 2019 16:48:10 -0500
Eric Farman <farman@linux.ibm.com> wrote:

> On 01/28/2019 02:15 PM, Halil Pasic wrote:
> > On Mon, 28 Jan 2019 18:09:48 +0100
> > Cornelia Huck <cohuck@redhat.com> wrote:

> I guess if
> > the ssch() returns with non cc == 0 the CP_PENDING ---IRQ---> IDLE
> > transition
> > won't take place. And I guess the IRQ is a final one.  
> 
> Yes this is the one point I hadn't seen explicitly stated.  We shouldn't 
> remain in state=BUSY if the ssch got cc!=0, and probably return to IDLE 
> when processing the failure.  In Connie's response (Mon, 28 Jan 2019 
> 18:24:24 +0100) to my note, she expressed some agreement to that.

Yes, I think that's what should happen.


> >> state for I/O)
> >> (normal ssch)
> >>
> >> BUSY --- IO_REQ ---> return -EAGAIN, stay in BUSY
> >> (user space is supposed to retry, as we'll eventually progress from
> >> BUSY)
> >>
> >> CP_PENDING --- IO_REQ ---> return -EBUSY, stay in CP_PENDING
> >> (user space is supposed to map this to the appropriate cc for the guest)  
> > 
> >  From this it seems you don't intend to issue the second  requested ssch()
> > any more (and don't want to do any translation). Is that right? (If it
> > is, that what I was asking for for a while, but then it's a pity for the
> > retries.)
> >   
> >>
> >> IDLE --- ASYNC_REQ ---> IDLE
> >> (user space is welcome to do anything else right away)  
> > 
> > Your idea is to not issue a requested hsch() if we think we are IDLE
> > it seems. Do I understand this right? We would end up with a different
> > semantic for hsch()/and csch() (compared to PoP) in the guest with this
> > (AFAICT).
> >   
> >>
> >> BUSY --- ASYNC_REQ ---> return -EAGAIN, stay in BUSY
> >> (user space is supposed to retry, as above)
> >>
> >> CP_PENDING --- ASYNC_REQ ---> return success, stay in CP_PENDING
> >> (the interrupt will get us out of CP_PENDING eventually)  
> > 
> > Issue (c|h)sch() is an action that is done on this internal
> > transition (within CP_PENDING).  
> 
> These three do read like CSCH/HSCH are subject to the same rules as 
> SSCH, when in fact they would be (among other reasons) issued to clean 
> up a lost interrupt from a previous SSCH.  So maybe return -EAGAIN on 
> state=BUSY (don't race ourselves with the start), but issue to hardware 
> if CP_PENDING.

I think there are some devices which require a certain hsch/csch
sequence during device bringup, so it's not just cleaning up after a
ssch. Therefore, we should always try to do the requested hsch/csch,
unless things like "we're in the process of translating a cp, and can't
deal with another request right now" prevent it.

> 
> If we get an async request when state=IDLE, then maybe just issue it for 
> fun, as if it were an SSCH?

For fun, but mainly because the guest wants it :)
Eric Farman Jan. 29, 2019, 2:14 p.m. UTC | #45
On 01/29/2019 05:20 AM, Cornelia Huck wrote:
> On Mon, 28 Jan 2019 16:48:10 -0500
> Eric Farman <farman@linux.ibm.com> wrote:
> 
>> On 01/28/2019 02:15 PM, Halil Pasic wrote:
>>> On Mon, 28 Jan 2019 18:09:48 +0100
>>> Cornelia Huck <cohuck@redhat.com> wrote:
> 
>> I guess if
>>> the ssch() returns with non cc == 0 the CP_PENDING ---IRQ---> IDLE
>>> transition
>>> won't take place. And I guess the IRQ is a final one.
>>
>> Yes this is the one point I hadn't seen explicitly stated.  We shouldn't
>> remain in state=BUSY if the ssch got cc!=0, and probably return to IDLE
>> when processing the failure.  In Connie's response (Mon, 28 Jan 2019
>> 18:24:24 +0100) to my note, she expressed some agreement to that.
> 
> Yes, I think that's what should happen.
> 
> 
>>>> state for I/O)
>>>> (normal ssch)
>>>>
>>>> BUSY --- IO_REQ ---> return -EAGAIN, stay in BUSY
>>>> (user space is supposed to retry, as we'll eventually progress from
>>>> BUSY)
>>>>
>>>> CP_PENDING --- IO_REQ ---> return -EBUSY, stay in CP_PENDING
>>>> (user space is supposed to map this to the appropriate cc for the guest)
>>>
>>>   From this it seems you don't intend to issue the second  requested ssch()
>>> any more (and don't want to do any translation). Is that right? (If it
>>> is, that what I was asking for for a while, but then it's a pity for the
>>> retries.)
>>>    
>>>>
>>>> IDLE --- ASYNC_REQ ---> IDLE
>>>> (user space is welcome to do anything else right away)
>>>
>>> Your idea is to not issue a requested hsch() if we think we are IDLE
>>> it seems. Do I understand this right? We would end up with a different
>>> semantic for hsch()/and csch() (compared to PoP) in the guest with this
>>> (AFAICT).
>>>    
>>>>
>>>> BUSY --- ASYNC_REQ ---> return -EAGAIN, stay in BUSY
>>>> (user space is supposed to retry, as above)
>>>>
>>>> CP_PENDING --- ASYNC_REQ ---> return success, stay in CP_PENDING
>>>> (the interrupt will get us out of CP_PENDING eventually)
>>>
>>> Issue (c|h)sch() is an action that is done on this internal
>>> transition (within CP_PENDING).
>>
>> These three do read like CSCH/HSCH are subject to the same rules as
>> SSCH, when in fact they would be (among other reasons) issued to clean
>> up a lost interrupt from a previous SSCH.  So maybe return -EAGAIN on
>> state=BUSY (don't race ourselves with the start), but issue to hardware
>> if CP_PENDING.
> 
> I think there are some devices which require a certain hsch/csch
> sequence during device bringup, so it's not just cleaning up after a
> ssch. 

Ah, yes.

Therefore, we should always try to do the requested hsch/csch,
> unless things like "we're in the process of translating a cp, and can't
> deal with another request right now" prevent it.

Agreed.  I'm in support of all of this.

> 
>>
>> If we get an async request when state=IDLE, then maybe just issue it for
>> fun, as if it were an SSCH?
> 
> For fun, but mainly because the guest wants it :)
> 

Well, that too.  ;-)
Cornelia Huck Jan. 29, 2019, 6:53 p.m. UTC | #46
On Tue, 29 Jan 2019 09:14:40 -0500
Eric Farman <farman@linux.ibm.com> wrote:

> On 01/29/2019 05:20 AM, Cornelia Huck wrote:
> > On Mon, 28 Jan 2019 16:48:10 -0500
> > Eric Farman <farman@linux.ibm.com> wrote:
> >   
> >> On 01/28/2019 02:15 PM, Halil Pasic wrote:  
> >>> On Mon, 28 Jan 2019 18:09:48 +0100
> >>> Cornelia Huck <cohuck@redhat.com> wrote:  
> >   
> >> I guess if  
> >>> the ssch() returns with non cc == 0 the CP_PENDING ---IRQ---> IDLE
> >>> transition
> >>> won't take place. And I guess the IRQ is a final one.  
> >>
> >> Yes this is the one point I hadn't seen explicitly stated.  We shouldn't
> >> remain in state=BUSY if the ssch got cc!=0, and probably return to IDLE
> >> when processing the failure.  In Connie's response (Mon, 28 Jan 2019
> >> 18:24:24 +0100) to my note, she expressed some agreement to that.  
> > 
> > Yes, I think that's what should happen.
> > 
> >   
> >>>> state for I/O)
> >>>> (normal ssch)
> >>>>
> >>>> BUSY --- IO_REQ ---> return -EAGAIN, stay in BUSY
> >>>> (user space is supposed to retry, as we'll eventually progress from
> >>>> BUSY)
> >>>>
> >>>> CP_PENDING --- IO_REQ ---> return -EBUSY, stay in CP_PENDING
> >>>> (user space is supposed to map this to the appropriate cc for the guest)  
> >>>
> >>>   From this it seems you don't intend to issue the second  requested ssch()
> >>> any more (and don't want to do any translation). Is that right? (If it
> >>> is, that what I was asking for for a while, but then it's a pity for the
> >>> retries.)
> >>>      
> >>>>
> >>>> IDLE --- ASYNC_REQ ---> IDLE
> >>>> (user space is welcome to do anything else right away)  
> >>>
> >>> Your idea is to not issue a requested hsch() if we think we are IDLE
> >>> it seems. Do I understand this right? We would end up with a different
> >>> semantic for hsch()/and csch() (compared to PoP) in the guest with this
> >>> (AFAICT).
> >>>      
> >>>>
> >>>> BUSY --- ASYNC_REQ ---> return -EAGAIN, stay in BUSY
> >>>> (user space is supposed to retry, as above)
> >>>>
> >>>> CP_PENDING --- ASYNC_REQ ---> return success, stay in CP_PENDING
> >>>> (the interrupt will get us out of CP_PENDING eventually)  
> >>>
> >>> Issue (c|h)sch() is an action that is done on this internal
> >>> transition (within CP_PENDING).  
> >>
> >> These three do read like CSCH/HSCH are subject to the same rules as
> >> SSCH, when in fact they would be (among other reasons) issued to clean
> >> up a lost interrupt from a previous SSCH.  So maybe return -EAGAIN on
> >> state=BUSY (don't race ourselves with the start), but issue to hardware
> >> if CP_PENDING.  
> > 
> > I think there are some devices which require a certain hsch/csch
> > sequence during device bringup, so it's not just cleaning up after a
> > ssch.   
> 
> Ah, yes.
> 
> Therefore, we should always try to do the requested hsch/csch,
> > unless things like "we're in the process of translating a cp, and can't
> > deal with another request right now" prevent it.  
> 
> Agreed.  I'm in support of all of this.

Cool. In the meantime, I've coded the changes, and I think the result
looks reasonable. I'll give it some testing and then send it out; it's
probably easier to discuss it with some code in front of us.

[The QEMU part should not need any changes.]

> 
> >   
> >>
> >> If we get an async request when state=IDLE, then maybe just issue it for
> >> fun, as if it were an SSCH?  
> > 
> > For fun, but mainly because the guest wants it :)
> >   
> 
> Well, that too.  ;-)
>
Halil Pasic Jan. 29, 2019, 7:39 p.m. UTC | #47
On Tue, 29 Jan 2019 10:58:40 +0100
Cornelia Huck <cohuck@redhat.com> wrote:

> > > > The problem I see with the let the hardware sort it out is that, for
> > > > that to work, we need to juggle multiple translations simultaneously
> > > > (or am I wrong?). Doing that does not appear particularly simple to
> > > > me.    
> > > 
> > > None in the first stage, at most two in the second stage, I guess.
> > >     
> > 
> > Expected benefit of the second stage over the first stage? (I see none.)  
> 
> Making something possible that is allowed by the architecture. Not
> really important, though.
> 

I had a chat with Farhan, and he suggested that by 'allowed by
architecture' you mean " You can submit a new request if the subchannel
is pending with primary, but not with secondary state." (from Message-ID:
<20190125152154.05120461.cohuck@redhat.com>).

So I re-read the PoP.

From the description of the start subchannel instruction:
"""
Special Conditions

Condition code 1 is set, and no other action is
taken, when the subchannel is status pending when
START SUBCHANNEL is executed. On some mod-
els, condition code 1 is not set when the subchannel
is status pending with only secondary status; instead,
the status-pending condition is discarded.

Condition code 2 is set, and no other action is
taken, when a start, halt, or clear function is currently
in progress at the subchannel (see “Function Control
(FC)” on page 13).

"""

So I guess you mixed primary and secondary up and wanted to say:
"You can submit a new request if the subchannel
is pending with _secondary_, but not with _primary_ _status_."

But does that really mean architecture allows the subchannel
to accept multiple ssch() instructions so that it ends up processing
two or more channel programs in parallel.

My answer to that question is: no it does not, and furthermore
it would not make sense.

So let me provide some quotes that explain how this ominous accepting
the ssch() with a status pending can occur.

"""
Conclusion of I/O Operations

The conclusion of an I/O operation normally is indi-
cated by two status conditions: channel end and
device end. The channel-end condition indicates that
the I/O device has received or provided all data asso-
ciated with the operation and no longer needs chan-
nel-subsystem facilities. This condition is called the
primary interruption condition, and the channel end
in this case is the primary status. Generally, the pri-
mary interruption condition is any interruption condi-
tion that relates to an I/O operation and that signals
the conclusion at the subchannel of the I/O operation
or chain of I/O operations.

The device-end signal indicates that the I/O device
has concluded execution and is ready to perform
another operation. This condition is called the sec-
ondary interruption condition, and the device end in
this case is the secondary status. Generally, the sec-
ondary interruption condition is any interruption con-
dition that relates to an I/O operation and that signals
the conclusion at the device of the I/O operation or
chain of operations. The secondary interruption con-
dition can occur concurrently with, or later than, the
primary interruption condition.
"""

In my reading this means that a device may lag with signaling
that it is done (with respect to the conclusion at the subchannel).

It that window between primary and secondary the driver could do the
proper driving of a ccw device stuff, do the store subchannel and clear
the primary status. See  And happily start the next ssch(). If that
ssch() now hit a subchannel thah just got the secondary status, for some
modes we apparently don't need to wait for the secondary status before
issuing the next ssch(), nor do we need to do we need to do the clear the
pending status dance because ssch() says cc == 1. The subchannel will
discard the secondary status and process the second ssch(). But the
previous ssch() has long concluded. Because as the quoted text says
the primary status is either simultaneous with or precedes the secondary
status.

Also if the subchannel were to process more than one channel program
at the time, questions would arise like what happens if one of them fails
(does that affect the other one?). It's something I find very hard to
even thing about.

BTW we would have to deal with these problems as well, if we were
to implement your second stage.

> >   
> > > > Furthermore we would go through all that hassle knowingly that
> > > > the sole reason is working around bugs. We still expect our
> > > > Linux guests serializing it's ssch() stuff as it does today.
> > > > Thus I would except this code not getting the love nor the
> > > > coverage that would guard against bugs in that code.    
> > > 
> > > So, we should have test code for that? (Any IBM-internal channel
> > > I/O exercisers that may help?)
> > >    
> > 
> > None that I'm aware of. Anyone else? 
> > 
> > But the point I was trying to make is the following: I prefer keeping
> > the handling for the case "ssch()'s on top of each other" as trivial
> > as possible. (E.g. bail out if CP_PENDING without doing any
> > translation.) 
> > > We should not rely on the guest being sane, although Linux
> > > probably is in that respect.
> > >     
> > 
> > I agree 100%: we should not rely on either guest or userspace
> > emulator being sane. But IMHO we should handle insanity with the
> > least possible investment.  
> 
> We probably disagree what the least possible investment is.
> 

Yes. IMHO making sure that we accept io_requests only if we are in
an appropriate state (currently the IDLE) and rejecting requests with
the appropriate error code is easy. And juggling parallel translations is
hard. My intuition. I can try to prove my point should anybody ever try
to submit patches that attempt this juggling parallel translations. But
I'm loosing my confidence in my ability to convince people.

Farhan, Eric any opinions?

Regards,
Halil
Cornelia Huck Jan. 30, 2019, 1:29 p.m. UTC | #48
On Tue, 29 Jan 2019 20:39:33 +0100
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 29 Jan 2019 10:58:40 +0100
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > > > > The problem I see with the let the hardware sort it out is that, for
> > > > > that to work, we need to juggle multiple translations simultaneously
> > > > > (or am I wrong?). Doing that does not appear particularly simple to
> > > > > me.      
> > > > 
> > > > None in the first stage, at most two in the second stage, I guess.
> > > >       
> > > 
> > > Expected benefit of the second stage over the first stage? (I see none.)    
> > 
> > Making something possible that is allowed by the architecture. Not
> > really important, though.
> >   
> 
> I had a chat with Farhan, and he suggested that by 'allowed by
> architecture' you mean " You can submit a new request if the subchannel
> is pending with primary, but not with secondary state." (from Message-ID:
> <20190125152154.05120461.cohuck@redhat.com>).

Yes. I might have mixed things up, though.

> 
> So I re-read the PoP.
> 
> From the description of the start subchannel instruction:
> """
> Special Conditions
> 
> Condition code 1 is set, and no other action is
> taken, when the subchannel is status pending when
> START SUBCHANNEL is executed. On some mod-
> els, condition code 1 is not set when the subchannel
> is status pending with only secondary status; instead,
> the status-pending condition is discarded.
> 
> Condition code 2 is set, and no other action is
> taken, when a start, halt, or clear function is currently
> in progress at the subchannel (see “Function Control
> (FC)” on page 13).
> 
> """
> 
> So I guess you mixed primary and secondary up and wanted to say:
> "You can submit a new request if the subchannel
> is pending with _secondary_, but not with _primary_ _status_."
> 
> But does that really mean architecture allows the subchannel
> to accept multiple ssch() instructions so that it ends up processing
> two or more channel programs in parallel.

That's not what I meant. The vfio-ccw driver still holds on to one cp,
while a second one could be submitted.

But let's just end discussing this here, and continue with discussing
the reworked state machine, ok? It's not really relevant for going
forward with halt/clear.
Farhan Ali Jan. 30, 2019, 2:32 p.m. UTC | #49
On 01/30/2019 08:29 AM, Cornelia Huck wrote:
> On Tue, 29 Jan 2019 20:39:33 +0100
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
>> On Tue, 29 Jan 2019 10:58:40 +0100
>> Cornelia Huck <cohuck@redhat.com> wrote:
>>
>>>>>> The problem I see with the let the hardware sort it out is that, for
>>>>>> that to work, we need to juggle multiple translations simultaneously
>>>>>> (or am I wrong?). Doing that does not appear particularly simple to
>>>>>> me.
>>>>>
>>>>> None in the first stage, at most two in the second stage, I guess.
>>>>>        
>>>>
>>>> Expected benefit of the second stage over the first stage? (I see none.)
>>>
>>> Making something possible that is allowed by the architecture. Not
>>> really important, though.
>>>    
>>
>> I had a chat with Farhan, and he suggested that by 'allowed by
>> architecture' you mean " You can submit a new request if the subchannel
>> is pending with primary, but not with secondary state." (from Message-ID:
>> <20190125152154.05120461.cohuck@redhat.com>).
> 
> Yes. I might have mixed things up, though.
> 
>>
>> So I re-read the PoP.
>>
>>  From the description of the start subchannel instruction:
>> """
>> Special Conditions
>>
>> Condition code 1 is set, and no other action is
>> taken, when the subchannel is status pending when
>> START SUBCHANNEL is executed. On some mod-
>> els, condition code 1 is not set when the subchannel
>> is status pending with only secondary status; instead,
>> the status-pending condition is discarded.
>>
>> Condition code 2 is set, and no other action is
>> taken, when a start, halt, or clear function is currently
>> in progress at the subchannel (see “Function Control
>> (FC)” on page 13).
>>
>> """
>>
>> So I guess you mixed primary and secondary up and wanted to say:
>> "You can submit a new request if the subchannel
>> is pending with _secondary_, but not with _primary_ _status_."
>>
>> But does that really mean architecture allows the subchannel
>> to accept multiple ssch() instructions so that it ends up processing
>> two or more channel programs in parallel.
> 
> That's not what I meant. The vfio-ccw driver still holds on to one cp,
> while a second one could be submitted.
> 
> But let's just end discussing this here, and continue with discussing
> the reworked state machine, ok? It's not really relevant for going
> forward with halt/clear.
> 
> 
+1
  I think we should move forward with halt/clear.

Thanks
Farhan
diff mbox series

Patch

diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index a10cec0e86eb..2ef189fe45ed 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -125,6 +125,7 @@  static int vfio_ccw_sch_probe(struct subchannel *sch)
 
 	private->sch = sch;
 	dev_set_drvdata(&sch->dev, private);
+	mutex_init(&private->io_mutex);
 
 	spin_lock_irq(sch->lock);
 	private->state = VFIO_CCW_STATE_NOT_OPER;
diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
index cab17865aafe..f6ed934cc565 100644
--- a/drivers/s390/cio/vfio_ccw_fsm.c
+++ b/drivers/s390/cio/vfio_ccw_fsm.c
@@ -28,7 +28,6 @@  static int fsm_io_helper(struct vfio_ccw_private *private)
 	sch = private->sch;
 
 	spin_lock_irqsave(sch->lock, flags);
-	private->state = VFIO_CCW_STATE_BUSY;
 
 	orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm);
 
@@ -42,6 +41,8 @@  static int fsm_io_helper(struct vfio_ccw_private *private)
 		 */
 		sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND;
 		ret = 0;
+		/* Don't allow another ssch for now */
+		private->state = VFIO_CCW_STATE_BUSY;
 		break;
 	case 1:		/* Status pending */
 	case 2:		/* Busy */
@@ -99,7 +100,7 @@  static void fsm_io_error(struct vfio_ccw_private *private,
 static void fsm_io_busy(struct vfio_ccw_private *private,
 			enum vfio_ccw_event event)
 {
-	private->io_region->ret_code = -EBUSY;
+	private->io_region->ret_code = -EAGAIN;
 }
 
 static void fsm_disabled_irq(struct vfio_ccw_private *private,
@@ -130,8 +131,6 @@  static void fsm_io_request(struct vfio_ccw_private *private,
 	struct mdev_device *mdev = private->mdev;
 	char *errstr = "request";
 
-	private->state = VFIO_CCW_STATE_BUSY;
-
 	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
 
 	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
@@ -176,7 +175,6 @@  static void fsm_io_request(struct vfio_ccw_private *private,
 	}
 
 err_out:
-	private->state = VFIO_CCW_STATE_IDLE;
 	trace_vfio_ccw_io_fctl(scsw->cmd.fctl, get_schid(private),
 			       io_region->ret_code, errstr);
 }
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index f673e106c041..3fa9fc570400 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -169,16 +169,20 @@  static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev,
 {
 	struct vfio_ccw_private *private;
 	struct ccw_io_region *region;
+	int ret;
 
 	if (*ppos + count > sizeof(*region))
 		return -EINVAL;
 
 	private = dev_get_drvdata(mdev_parent_dev(mdev));
+	mutex_lock(&private->io_mutex);
 	region = private->io_region;
 	if (copy_to_user(buf, (void *)region + *ppos, count))
-		return -EFAULT;
-
-	return count;
+		ret = -EFAULT;
+	else
+		ret = count;
+	mutex_unlock(&private->io_mutex);
+	return ret;
 }
 
 static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
@@ -188,25 +192,30 @@  static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
 {
 	struct vfio_ccw_private *private;
 	struct ccw_io_region *region;
+	int ret;
 
 	if (*ppos + count > sizeof(*region))
 		return -EINVAL;
 
 	private = dev_get_drvdata(mdev_parent_dev(mdev));
-	if (private->state != VFIO_CCW_STATE_IDLE)
+	if (private->state == VFIO_CCW_STATE_NOT_OPER ||
+	    private->state == VFIO_CCW_STATE_STANDBY)
 		return -EACCES;
+	if (!mutex_trylock(&private->io_mutex))
+		return -EAGAIN;
 
 	region = private->io_region;
-	if (copy_from_user((void *)region + *ppos, buf, count))
-		return -EFAULT;
+	if (copy_from_user((void *)region + *ppos, buf, count)) {
+		ret = -EFAULT;
+		goto out_unlock;
+	}
 
 	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ);
-	if (region->ret_code != 0) {
-		private->state = VFIO_CCW_STATE_IDLE;
-		return region->ret_code;
-	}
+	ret = (region->ret_code != 0) ? region->ret_code : count;
 
-	return count;
+out_unlock:
+	mutex_unlock(&private->io_mutex);
+	return ret;
 }
 
 static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info)
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index 08e9a7dc9176..e88237697f83 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -28,6 +28,7 @@ 
  * @mdev: pointer to the mediated device
  * @nb: notifier for vfio events
  * @io_region: MMIO region to input/output I/O arguments/results
+ * @io_mutex: protect against concurrent update of I/O structures
  * @cp: channel program for the current I/O operation
  * @irb: irb info received from interrupt
  * @scsw: scsw info
@@ -42,6 +43,7 @@  struct vfio_ccw_private {
 	struct mdev_device	*mdev;
 	struct notifier_block	nb;
 	struct ccw_io_region	*io_region;
+	struct mutex		io_mutex;
 
 	struct channel_program	cp;
 	struct irb		irb;