diff mbox

[RFC,2/2] vfio-ccw: support for halt/clear subchannel

Message ID 20180509154822.23510-3-cohuck@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Cornelia Huck May 9, 2018, 3:48 p.m. UTC
Currently, vfio-ccw only relays start subchannel requests to the real
hardware, which is enough in many cases but falls short e.g. during
error recovery.

Fortunately, it is easy to add support for halt and clear subchannel
requests to the existing infrastructure. User space can detect
support for halt/clear subchannel easily, as we always returned
-EOPNOTSUPP before and therefore we do not need any capability to
make this support discoverable.

Signed-off-by: Cornelia Huck <cohuck@redhat.com>
---
 drivers/s390/cio/vfio_ccw_drv.c | 10 ++++-
 drivers/s390/cio/vfio_ccw_fsm.c | 94 ++++++++++++++++++++++++++++++++++++-----
 2 files changed, 92 insertions(+), 12 deletions(-)

Comments

Pierre Morel May 11, 2018, 9:33 a.m. UTC | #1
On 09/05/2018 17:48, Cornelia Huck wrote:
> Currently, vfio-ccw only relays start subchannel requests to the real
> hardware, which is enough in many cases but falls short e.g. during
> error recovery.
>
> Fortunately, it is easy to add support for halt and clear subchannel
> requests to the existing infrastructure. User space can detect
> support for halt/clear subchannel easily, as we always returned
> -EOPNOTSUPP before and therefore we do not need any capability to
> make this support discoverable.
>
> Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> ---
>   drivers/s390/cio/vfio_ccw_drv.c | 10 ++++-
>   drivers/s390/cio/vfio_ccw_fsm.c | 94 ++++++++++++++++++++++++++++++++++++-----
>   2 files changed, 92 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
> index ea6a2d0b2894..6212d577117f 100644
> --- a/drivers/s390/cio/vfio_ccw_drv.c
> +++ b/drivers/s390/cio/vfio_ccw_drv.c
> @@ -76,8 +76,14 @@ static void vfio_ccw_sch_io_todo(struct work_struct *work)
>   	irb = &private->irb;
>   
>   	if (scsw_is_solicited(&irb->scsw)) {
> -		cp_update_scsw(&private->cp, &irb->scsw);
> -		cp_free(&private->cp);
> +		/*
> +		 * For the start function, we need to update based on the
> +		 * channel program. Nothing needs to be done for halt/clear.
> +		 */
> +		if (scsw_fctl(&irb->scsw) & SCSW_FCTL_START_FUNC) {
> +			cp_update_scsw(&private->cp, &irb->scsw);
> +			cp_free(&private->cp);
> +		}
>   	}
>   	memcpy(private->io_region.irb_area, irb, sizeof(*irb));
>   
> diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
> index 3c800642134e..94cae2984c35 100644
> --- a/drivers/s390/cio/vfio_ccw_fsm.c
> +++ b/drivers/s390/cio/vfio_ccw_fsm.c
> @@ -3,8 +3,10 @@
>    * Finite state machine for vfio-ccw device handling
>    *
>    * Copyright IBM Corp. 2017
> + * Copyright Red Hat, Inc. 2018
>    *
>    * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
> + *            Cornelia Huck <cohuck@redhat.com>
>    */
>   
>   #include <linux/vfio.h>
> @@ -65,6 +67,70 @@ static int fsm_io_helper(struct vfio_ccw_private *private)
>   	return ret;
>   }
>   
> +static int fsm_halt_helper(struct vfio_ccw_private *private)
> +{
> +	struct subchannel *sch;
> +	int ccode;
> +	unsigned long flags;
> +	int ret;
> +
> +	sch = private->sch;
> +
> +	spin_lock_irqsave(sch->lock, flags);
> +	private->state = VFIO_CCW_STATE_BUSY;
> +
> +	/* Issue "Halt Subchannel" */
> +	ccode = hsch(sch->schid);
> +
> +	switch (ccode) {
> +	case 0:
> +		/*
> +		 * Initialize device status information
> +		 */
> +		sch->schib.scsw.cmd.actl |= SCSW_ACTL_HALT_PEND;
> +		ret = 0;
> +		break;
> +	case 1:		/* Status pending */

shouldn't we make a difference between status pending
and having halt in progress?

The guest can examine the SCSW, but couldn't it introduce
a race condition?


> +	case 2:		/* Busy */
> +		ret = -EBUSY;
> +		break;
> +	default:	/* Device not operational */
> +		ret = -ENODEV;
> +	}
> +	spin_unlock_irqrestore(sch->lock, flags);
> +	return ret;
> +}
> +
> +static int fsm_clear_helper(struct vfio_ccw_private *private)
> +{
> +	struct subchannel *sch;
> +	int ccode;
> +	unsigned long flags;
> +	int ret;
> +
> +	sch = private->sch;
> +
> +	spin_lock_irqsave(sch->lock, flags);
> +	private->state = VFIO_CCW_STATE_BUSY;
> +
> +	/* Issue "Clear Subchannel" */
> +	ccode = csch(sch->schid);
> +
> +	switch (ccode) {
> +	case 0:
> +		/*
> +		 * Initialize device status information
> +		 */
> +		sch->schib.scsw.cmd.actl |= SCSW_ACTL_CLEAR_PEND;
> +		ret = 0;
> +		break;
> +	default:	/* Device not operational */
> +		ret = -ENODEV;
> +	}
> +	spin_unlock_irqrestore(sch->lock, flags);
> +	return ret;
> +}
> +
>   static void fsm_notoper(struct vfio_ccw_private *private,
>   			enum vfio_ccw_event event)
>   {
> @@ -126,7 +192,24 @@ static void fsm_io_request(struct vfio_ccw_private *private,
>   
>   	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
>   
> -	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
> +	/*
> +	 * Start processing with the clear function, then halt, then start.
> +	 * We may still be start pending when the caller wants to clean
> +	 * up things via halt/clear.
> +	 */

hum. The scsw here does not reflect the hardware state but the
command passed from the user interface.
Can we and should we authorize multiple commands in one call?

If not, the comment is not appropriate and a switch on cmd.fctl
would be a clearer.

> +	if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
> +		/* issue clear and wait for interupt */
> +		io_region->ret_code = fsm_clear_helper(private);
> +		if (io_region->ret_code)
> +			goto err_out;
> +		return;
> +	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
> +		/* issue halt and wait for interrupt */
> +		io_region->ret_code = fsm_halt_helper(private);
> +		if (io_region->ret_code)
> +			goto err_out;
> +		return;
> +	} else if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
>   		orb = (union orb *)io_region->orb_area;
>   
>   		/* Don't try to build a cp if transport mode is specified. */
> @@ -152,16 +235,7 @@ static void fsm_io_request(struct vfio_ccw_private *private,
>   			goto err_out;
>   		}
>   		return;
> -	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
> -		/* XXX: Handle halt. */
> -		io_region->ret_code = -EOPNOTSUPP;
> -		goto err_out;
> -	} else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
> -		/* XXX: Handle clear. */
> -		io_region->ret_code = -EOPNOTSUPP;
> -		goto err_out;
>   	}
> -
>   err_out:
>   	private->state = VFIO_CCW_STATE_IDLE;
>   }
Cornelia Huck May 15, 2018, 4:10 p.m. UTC | #2
On Fri, 11 May 2018 11:33:35 +0200
Pierre Morel <pmorel@linux.ibm.com> wrote:

> On 09/05/2018 17:48, Cornelia Huck wrote:
> > Currently, vfio-ccw only relays start subchannel requests to the real
> > hardware, which is enough in many cases but falls short e.g. during
> > error recovery.
> >
> > Fortunately, it is easy to add support for halt and clear subchannel
> > requests to the existing infrastructure. User space can detect
> > support for halt/clear subchannel easily, as we always returned
> > -EOPNOTSUPP before and therefore we do not need any capability to
> > make this support discoverable.
> >
> > Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> > ---
> >   drivers/s390/cio/vfio_ccw_drv.c | 10 ++++-
> >   drivers/s390/cio/vfio_ccw_fsm.c | 94 ++++++++++++++++++++++++++++++++++++-----
> >   2 files changed, 92 insertions(+), 12 deletions(-)

> > @@ -65,6 +67,70 @@ static int fsm_io_helper(struct vfio_ccw_private *private)
> >   	return ret;
> >   }
> >   
> > +static int fsm_halt_helper(struct vfio_ccw_private *private)
> > +{
> > +	struct subchannel *sch;
> > +	int ccode;
> > +	unsigned long flags;
> > +	int ret;
> > +
> > +	sch = private->sch;
> > +
> > +	spin_lock_irqsave(sch->lock, flags);
> > +	private->state = VFIO_CCW_STATE_BUSY;
> > +
> > +	/* Issue "Halt Subchannel" */
> > +	ccode = hsch(sch->schid);
> > +
> > +	switch (ccode) {
> > +	case 0:
> > +		/*
> > +		 * Initialize device status information
> > +		 */
> > +		sch->schib.scsw.cmd.actl |= SCSW_ACTL_HALT_PEND;
> > +		ret = 0;
> > +		break;
> > +	case 1:		/* Status pending */  
> 
> shouldn't we make a difference between status pending
> and having halt in progress?
> 
> The guest can examine the SCSW, but couldn't it introduce
> a race condition?

Yes, good point. Especially as the guest might want to do different
things.

Regarding race conditions: The scsw can already be outdated after the
operation that stored it finished, which is true even on LPAR. That's
especially true for tsch which clears some status at the subchannel.
The guest must already be able to deal with this, the race window is
just larger.

> 
> 
> > +	case 2:		/* Busy */
> > +		ret = -EBUSY;
> > +		break;
> > +	default:	/* Device not operational */
> > +		ret = -ENODEV;
> > +	}
> > +	spin_unlock_irqrestore(sch->lock, flags);
> > +	return ret;
> > +}
> > +
> > +static int fsm_clear_helper(struct vfio_ccw_private *private)
> > +{
> > +	struct subchannel *sch;
> > +	int ccode;
> > +	unsigned long flags;
> > +	int ret;
> > +
> > +	sch = private->sch;
> > +
> > +	spin_lock_irqsave(sch->lock, flags);
> > +	private->state = VFIO_CCW_STATE_BUSY;
> > +
> > +	/* Issue "Clear Subchannel" */
> > +	ccode = csch(sch->schid);
> > +
> > +	switch (ccode) {
> > +	case 0:
> > +		/*
> > +		 * Initialize device status information
> > +		 */
> > +		sch->schib.scsw.cmd.actl |= SCSW_ACTL_CLEAR_PEND;
> > +		ret = 0;
> > +		break;
> > +	default:	/* Device not operational */
> > +		ret = -ENODEV;
> > +	}
> > +	spin_unlock_irqrestore(sch->lock, flags);
> > +	return ret;
> > +}
> > +
> >   static void fsm_notoper(struct vfio_ccw_private *private,
> >   			enum vfio_ccw_event event)
> >   {
> > @@ -126,7 +192,24 @@ static void fsm_io_request(struct vfio_ccw_private *private,
> >   
> >   	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
> >   
> > -	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
> > +	/*
> > +	 * Start processing with the clear function, then halt, then start.
> > +	 * We may still be start pending when the caller wants to clean
> > +	 * up things via halt/clear.
> > +	 */  
> 
> hum. The scsw here does not reflect the hardware state but the
> command passed from the user interface.
> Can we and should we authorize multiple commands in one call?
> 
> If not, the comment is not appropriate and a switch on cmd.fctl
> would be a clearer.

There may be multiple functions specified, but we need to process them
in precedence order (and clear wins over the others, so to speak).
Would adding a sentence like "we always process just one function" help?

> 
> > +	if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
> > +		/* issue clear and wait for interupt */
> > +		io_region->ret_code = fsm_clear_helper(private);
> > +		if (io_region->ret_code)
> > +			goto err_out;
> > +		return;
> > +	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
> > +		/* issue halt and wait for interrupt */
> > +		io_region->ret_code = fsm_halt_helper(private);
> > +		if (io_region->ret_code)
> > +			goto err_out;
> > +		return;
> > +	} else if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
> >   		orb = (union orb *)io_region->orb_area;
> >   
> >   		/* Don't try to build a cp if transport mode is specified. */
> > @@ -152,16 +235,7 @@ static void fsm_io_request(struct vfio_ccw_private *private,
> >   			goto err_out;
> >   		}
> >   		return;
> > -	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
> > -		/* XXX: Handle halt. */
> > -		io_region->ret_code = -EOPNOTSUPP;
> > -		goto err_out;
> > -	} else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
> > -		/* XXX: Handle clear. */
> > -		io_region->ret_code = -EOPNOTSUPP;
> > -		goto err_out;
> >   	}
> > -
> >   err_out:
> >   	private->state = VFIO_CCW_STATE_IDLE;
> >   }  
> 
>
Pierre Morel May 16, 2018, 1:32 p.m. UTC | #3
On 15/05/2018 18:10, Cornelia Huck wrote:
> On Fri, 11 May 2018 11:33:35 +0200
> Pierre Morel <pmorel@linux.ibm.com> wrote:
>
>> On 09/05/2018 17:48, Cornelia Huck wrote:
>>> Currently, vfio-ccw only relays start subchannel requests to the real
>>> hardware, which is enough in many cases but falls short e.g. during
>>> error recovery.
>>>
>>> Fortunately, it is easy to add support for halt and clear subchannel
>>> requests to the existing infrastructure. User space can detect
>>> support for halt/clear subchannel easily, as we always returned
>>> -EOPNOTSUPP before and therefore we do not need any capability to
>>> make this support discoverable.
>>>
>>> Signed-off-by: Cornelia Huck <cohuck@redhat.com>
>>> ---
>>>    drivers/s390/cio/vfio_ccw_drv.c | 10 ++++-
>>>    drivers/s390/cio/vfio_ccw_fsm.c | 94 ++++++++++++++++++++++++++++++++++++-----
>>>    2 files changed, 92 insertions(+), 12 deletions(-)
>>> @@ -65,6 +67,70 @@ static int fsm_io_helper(struct vfio_ccw_private *private)
>>>    	return ret;
>>>    }
>>>    
>>> +static int fsm_halt_helper(struct vfio_ccw_private *private)
>>> +{
>>> +	struct subchannel *sch;
>>> +	int ccode;
>>> +	unsigned long flags;
>>> +	int ret;
>>> +
>>> +	sch = private->sch;
>>> +
>>> +	spin_lock_irqsave(sch->lock, flags);
>>> +	private->state = VFIO_CCW_STATE_BUSY;
>>> +
>>> +	/* Issue "Halt Subchannel" */
>>> +	ccode = hsch(sch->schid);
>>> +
>>> +	switch (ccode) {
>>> +	case 0:
>>> +		/*
>>> +		 * Initialize device status information
>>> +		 */
>>> +		sch->schib.scsw.cmd.actl |= SCSW_ACTL_HALT_PEND;
>>> +		ret = 0;
>>> +		break;
>>> +	case 1:		/* Status pending */
>> shouldn't we make a difference between status pending
>> and having halt in progress?
>>
>> The guest can examine the SCSW, but couldn't it introduce
>> a race condition?
> Yes, good point. Especially as the guest might want to do different
> things.
>
> Regarding race conditions: The scsw can already be outdated after the
> operation that stored it finished, which is true even on LPAR. That's
> especially true for tsch which clears some status at the subchannel.
> The guest must already be able to deal with this, the race window is
> just larger.

This is the kind of race I try to avoid with the mutex protected
state changes patch.

>
>>
>>> +	case 2:		/* Busy */
>>> +		ret = -EBUSY;
>>> +		break;
>>> +	default:	/* Device not operational */
>>> +		ret = -ENODEV;
>>> +	}
>>> +	spin_unlock_irqrestore(sch->lock, flags);
>>> +	return ret;
>>> +}
>>> +
>>> +static int fsm_clear_helper(struct vfio_ccw_private *private)
>>> +{
>>> +	struct subchannel *sch;
>>> +	int ccode;
>>> +	unsigned long flags;
>>> +	int ret;
>>> +
>>> +	sch = private->sch;
>>> +
>>> +	spin_lock_irqsave(sch->lock, flags);
>>> +	private->state = VFIO_CCW_STATE_BUSY;
>>> +
>>> +	/* Issue "Clear Subchannel" */
>>> +	ccode = csch(sch->schid);
>>> +
>>> +	switch (ccode) {
>>> +	case 0:
>>> +		/*
>>> +		 * Initialize device status information
>>> +		 */
>>> +		sch->schib.scsw.cmd.actl |= SCSW_ACTL_CLEAR_PEND;
>>> +		ret = 0;
>>> +		break;
>>> +	default:	/* Device not operational */
>>> +		ret = -ENODEV;
>>> +	}
>>> +	spin_unlock_irqrestore(sch->lock, flags);
>>> +	return ret;
>>> +}
>>> +
>>>    static void fsm_notoper(struct vfio_ccw_private *private,
>>>    			enum vfio_ccw_event event)
>>>    {
>>> @@ -126,7 +192,24 @@ static void fsm_io_request(struct vfio_ccw_private *private,
>>>    
>>>    	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
>>>    
>>> -	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
>>> +	/*
>>> +	 * Start processing with the clear function, then halt, then start.
>>> +	 * We may still be start pending when the caller wants to clean
>>> +	 * up things via halt/clear.
>>> +	 */
>> hum. The scsw here does not reflect the hardware state but the
>> command passed from the user interface.
>> Can we and should we authorize multiple commands in one call?
>>
>> If not, the comment is not appropriate and a switch on cmd.fctl
>> would be a clearer.
> There may be multiple functions specified, but we need to process them
> in precedence order (and clear wins over the others, so to speak).
> Would adding a sentence like "we always process just one function" help?

Why should we allow multiple commands in a single call ?
It brings no added value.
Is there a use case?
Currently QEMU does not do this and since we only have the SCSH there
is no difference having the bit set alone or not alone.
Cornelia Huck May 22, 2018, 12:52 p.m. UTC | #4
On Wed, 16 May 2018 15:32:48 +0200
Pierre Morel <pmorel@linux.ibm.com> wrote:

> On 15/05/2018 18:10, Cornelia Huck wrote:
> > On Fri, 11 May 2018 11:33:35 +0200
> > Pierre Morel <pmorel@linux.ibm.com> wrote:
> >  
> >> On 09/05/2018 17:48, Cornelia Huck wrote:  

> >>> @@ -126,7 +192,24 @@ static void fsm_io_request(struct vfio_ccw_private *private,
> >>>    
> >>>    	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
> >>>    
> >>> -	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
> >>> +	/*
> >>> +	 * Start processing with the clear function, then halt, then start.
> >>> +	 * We may still be start pending when the caller wants to clean
> >>> +	 * up things via halt/clear.
> >>> +	 */  
> >> hum. The scsw here does not reflect the hardware state but the
> >> command passed from the user interface.
> >> Can we and should we authorize multiple commands in one call?
> >>
> >> If not, the comment is not appropriate and a switch on cmd.fctl
> >> would be a clearer.  
> > There may be multiple functions specified, but we need to process them
> > in precedence order (and clear wins over the others, so to speak).
> > Would adding a sentence like "we always process just one function" help?  
> 
> Why should we allow multiple commands in a single call ?
> It brings no added value.
> Is there a use case?
> Currently QEMU does not do this and since we only have the SCSH there
> is no difference having the bit set alone or not alone.

I found this to be a very easy way to implement halt/clear. This still
holds true if we switch to some kind of capabilities for this (did not
have time to look at this further, though).

As we have the fctl field anyway, I'm in favour of processing this all
in one function.
Pierre Morel May 22, 2018, 3:10 p.m. UTC | #5
On 22/05/2018 14:52, Cornelia Huck wrote:
> On Wed, 16 May 2018 15:32:48 +0200
> Pierre Morel <pmorel@linux.ibm.com> wrote:
>
>> On 15/05/2018 18:10, Cornelia Huck wrote:
>>> On Fri, 11 May 2018 11:33:35 +0200
>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>   
>>>> On 09/05/2018 17:48, Cornelia Huck wrote:
>>>>> @@ -126,7 +192,24 @@ static void fsm_io_request(struct vfio_ccw_private *private,
>>>>>     
>>>>>     	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
>>>>>     
>>>>> -	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
>>>>> +	/*
>>>>> +	 * Start processing with the clear function, then halt, then start.
>>>>> +	 * We may still be start pending when the caller wants to clean
>>>>> +	 * up things via halt/clear.
>>>>> +	 */
>>>> hum. The scsw here does not reflect the hardware state but the
>>>> command passed from the user interface.
>>>> Can we and should we authorize multiple commands in one call?
>>>>
>>>> If not, the comment is not appropriate and a switch on cmd.fctl
>>>> would be a clearer.
>>> There may be multiple functions specified, but we need to process them
>>> in precedence order (and clear wins over the others, so to speak).
>>> Would adding a sentence like "we always process just one function" help?
>> Why should we allow multiple commands in a single call ?
>> It brings no added value.
>> Is there a use case?
>> Currently QEMU does not do this and since we only have the SCSH there
>> is no difference having the bit set alone or not alone.
> I found this to be a very easy way to implement halt/clear. This still
> holds true if we switch to some kind of capabilities for this (did not
> have time to look at this further, though).
>
> As we have the fctl field anyway, I'm in favour of processing this all
> in one function.
>

Sorry, I do not understand if we agree or not.

I agree we have the fctl field and we must continue to use it
for backward compatibility.

I do not understand the "processing all in one function".

Since yo already have 3 function to process these three instructions.

Do you mean the if .. else if .. else if ?

Then I come back to what you said earlier on the precedence of the clear 
instruction:

1) do we have a use case to have more than one bit set in the fctl field?

- if no, there is no need for precedence

- if yes, why should clear have precedence ?
   How do QEMU set more than one bit in fctl?
   why should we alter the order of the instructions given by the guest?
   How can we know this order if there are multiple instructions at once?
Cornelia Huck June 5, 2018, 1:14 p.m. UTC | #6
On Tue, 22 May 2018 17:10:44 +0200
Pierre Morel <pmorel@linux.ibm.com> wrote:

> On 22/05/2018 14:52, Cornelia Huck wrote:
> > On Wed, 16 May 2018 15:32:48 +0200
> > Pierre Morel <pmorel@linux.ibm.com> wrote:
> >  
> >> On 15/05/2018 18:10, Cornelia Huck wrote:  
> >>> On Fri, 11 May 2018 11:33:35 +0200
> >>> Pierre Morel <pmorel@linux.ibm.com> wrote:
> >>>     
> >>>> On 09/05/2018 17:48, Cornelia Huck wrote:  
> >>>>> @@ -126,7 +192,24 @@ static void fsm_io_request(struct vfio_ccw_private *private,
> >>>>>     
> >>>>>     	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
> >>>>>     
> >>>>> -	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
> >>>>> +	/*
> >>>>> +	 * Start processing with the clear function, then halt, then start.
> >>>>> +	 * We may still be start pending when the caller wants to clean
> >>>>> +	 * up things via halt/clear.
> >>>>> +	 */  
> >>>> hum. The scsw here does not reflect the hardware state but the
> >>>> command passed from the user interface.
> >>>> Can we and should we authorize multiple commands in one call?
> >>>>
> >>>> If not, the comment is not appropriate and a switch on cmd.fctl
> >>>> would be a clearer.  
> >>> There may be multiple functions specified, but we need to process them
> >>> in precedence order (and clear wins over the others, so to speak).
> >>> Would adding a sentence like "we always process just one function" help?  
> >> Why should we allow multiple commands in a single call ?
> >> It brings no added value.
> >> Is there a use case?
> >> Currently QEMU does not do this and since we only have the SCSH there
> >> is no difference having the bit set alone or not alone.  
> > I found this to be a very easy way to implement halt/clear. This still
> > holds true if we switch to some kind of capabilities for this (did not
> > have time to look at this further, though).
> >
> > As we have the fctl field anyway, I'm in favour of processing this all
> > in one function.
> >  

[starting to look at this again]

> 
> Sorry, I do not understand if we agree or not.
> 
> I agree we have the fctl field and we must continue to use it
> for backward compatibility.

It also mirrors the hardware, no?

> 
> I do not understand the "processing all in one function".
> 
> Since yo already have 3 function to process these three instructions.
> 
> Do you mean the if .. else if .. else if ?

Yes. There is a lot of common handling for each of these.

> 
> Then I come back to what you said earlier on the precedence of the clear 
> instruction:
> 
> 1) do we have a use case to have more than one bit set in the fctl field?
> 
> - if no, there is no need for precedence

It mirrors what the hardware does: you just set an additional bit if
processing has not yet finished.

> 
> - if yes, why should clear have precedence ?

Because it does on the hardware?

>    How do QEMU set more than one bit in fctl?
>    why should we alter the order of the instructions given by the guest?
>    How can we know this order if there are multiple instructions at once?

In the future, we should return after we fired off the start etc.
request even if we did not receive an interrupt yet, so that the guest
might do a halt or clear before the start has finished. IOW, make this
as asynchronous as the hardware. That's why I'd like to simply
accumulate the things. The architecture already specified what happens
in the response.

Do you think that is feasible?
Pierre Morel June 5, 2018, 3:23 p.m. UTC | #7
On 05/06/2018 15:14, Cornelia Huck wrote:
> On Tue, 22 May 2018 17:10:44 +0200
> Pierre Morel <pmorel@linux.ibm.com> wrote:
>
>> On 22/05/2018 14:52, Cornelia Huck wrote:
>>> On Wed, 16 May 2018 15:32:48 +0200
>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>   
>>>> On 15/05/2018 18:10, Cornelia Huck wrote:
>>>>> On Fri, 11 May 2018 11:33:35 +0200
>>>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>>>      
>>>>>> On 09/05/2018 17:48, Cornelia Huck wrote:
>>>>>>> @@ -126,7 +192,24 @@ static void fsm_io_request(struct vfio_ccw_private *private,
>>>>>>>      
>>>>>>>      	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
>>>>>>>      
>>>>>>> -	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
>>>>>>> +	/*
>>>>>>> +	 * Start processing with the clear function, then halt, then start.
>>>>>>> +	 * We may still be start pending when the caller wants to clean
>>>>>>> +	 * up things via halt/clear.
>>>>>>> +	 */
>>>>>> hum. The scsw here does not reflect the hardware state but the
>>>>>> command passed from the user interface.
>>>>>> Can we and should we authorize multiple commands in one call?
>>>>>>
>>>>>> If not, the comment is not appropriate and a switch on cmd.fctl
>>>>>> would be a clearer.
>>>>> There may be multiple functions specified, but we need to process them
>>>>> in precedence order (and clear wins over the others, so to speak).
>>>>> Would adding a sentence like "we always process just one function" help?
>>>> Why should we allow multiple commands in a single call ?
>>>> It brings no added value.
>>>> Is there a use case?
>>>> Currently QEMU does not do this and since we only have the SCSH there
>>>> is no difference having the bit set alone or not alone.
>>> I found this to be a very easy way to implement halt/clear. This still
>>> holds true if we switch to some kind of capabilities for this (did not
>>> have time to look at this further, though).
>>>
>>> As we have the fctl field anyway, I'm in favour of processing this all
>>> in one function.
>>>   
> [starting to look at this again]
>
>> Sorry, I do not understand if we agree or not.
>>
>> I agree we have the fctl field and we must continue to use it
>> for backward compatibility.
> It also mirrors the hardware, no?

No, in the hardware this is the result of the instruction in the SCSW.
Not the instruction itself.

>
>> I do not understand the "processing all in one function".
>>
>> Since yo already have 3 function to process these three instructions.
>>
>> Do you mean the if .. else if .. else if ?
> Yes. There is a lot of common handling for each of these.

There are also differences and it breaks the FSM

>
>> Then I come back to what you said earlier on the precedence of the clear
>> instruction:
>>
>> 1) do we have a use case to have more than one bit set in the fctl field?
>>
>> - if no, there is no need for precedence
> It mirrors what the hardware does: you just set an additional bit if
> processing has not yet finished.

I do not agree, this is true for the SCSW but not for instructions.
We receive instructions in VFIO and give back status.
The name used to provide the command is misleading.

>
>> - if yes, why should clear have precedence ?
> Because it does on the hardware?

What you say is right if we would have a register inside the subchannel
where we write the commands.
But this is not what we handle we handle separate instructions coming
from an instruction stream.

We do never receive two instructions at the same time, but each after
the other.
If the sub-channel is busy on IO a clear or a cancel must be able to 
stop the IO.
I agree upon this.
But we do not have any other command in the same call.

If we would construct the interface differently, for example using an 
mmap() system
call and let the user ORing the command bitfield before using an ioctl 
to inform
us from the change, or if we poll on the command bitfield we should 
implement
it like you say.
But this is not what we do, and this is not what the architecture does.
does it?

>
>>     How do QEMU set more than one bit in fctl?
>>     why should we alter the order of the instructions given by the guest?
>>     How can we know this order if there are multiple instructions at once?
> In the future, we should return after we fired off the start etc.
> request even if we did not receive an interrupt yet, so that the guest
> might do a halt or clear before the start has finished.

This is already what is done here:
We fire off the start (go to BUSY state) and return

If the guest want to start another command it polls on
the vfio_write() untill the channel isn't BUSY anymore.

On interrupt we set the channel back to IDLE and the next
command shall be proceed.

(I must enhance the cover letter (already said))

>   IOW, make this
> as asynchronous as the hardware. That's why I'd like to simply
> accumulate the things. The architecture already specified what happens
> in the response.
>
> Do you think that is feasible?
>
Yes I think it is feasible and it is what we need to do.

CLEAR, CANCEL and HALT must be able to overtake the
START and really stop the IO transfer.

They just should be able to proceed in the BUSY state
on the opposite of the START.
This is easily done with new events in the FSM inside
both IDLE and BUSY states

May be I should try to integrate your patches so we have a better view.

Thanks.

Best regards,

Pierre
Cornelia Huck June 5, 2018, 3:36 p.m. UTC | #8
On Tue, 5 Jun 2018 17:23:02 +0200
Pierre Morel <pmorel@linux.ibm.com> wrote:

<snip lots of comments - I still need to think about this>

> May be I should try to integrate your patches so we have a better view.

Not sure whether it is worth trying to do so in the current state.
(Also, I really need to get the information into my personal cache
again, before I write too many weird things.)
Cornelia Huck June 6, 2018, 12:21 p.m. UTC | #9
On Tue, 5 Jun 2018 17:23:02 +0200
Pierre Morel <pmorel@linux.ibm.com> wrote:

> On 05/06/2018 15:14, Cornelia Huck wrote:
> > On Tue, 22 May 2018 17:10:44 +0200
> > Pierre Morel <pmorel@linux.ibm.com> wrote:
> >  
> >> On 22/05/2018 14:52, Cornelia Huck wrote:  
> >>> On Wed, 16 May 2018 15:32:48 +0200
> >>> Pierre Morel <pmorel@linux.ibm.com> wrote:
> >>>     
> >>>> On 15/05/2018 18:10, Cornelia Huck wrote:  
> >>>>> On Fri, 11 May 2018 11:33:35 +0200
> >>>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
> >>>>>        
> >>>>>> On 09/05/2018 17:48, Cornelia Huck wrote:  
> >>>>>>> @@ -126,7 +192,24 @@ static void fsm_io_request(struct vfio_ccw_private *private,
> >>>>>>>      
> >>>>>>>      	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
> >>>>>>>      
> >>>>>>> -	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
> >>>>>>> +	/*
> >>>>>>> +	 * Start processing with the clear function, then halt, then start.
> >>>>>>> +	 * We may still be start pending when the caller wants to clean
> >>>>>>> +	 * up things via halt/clear.
> >>>>>>> +	 */  
> >>>>>> hum. The scsw here does not reflect the hardware state but the
> >>>>>> command passed from the user interface.
> >>>>>> Can we and should we authorize multiple commands in one call?
> >>>>>>
> >>>>>> If not, the comment is not appropriate and a switch on cmd.fctl
> >>>>>> would be a clearer.  
> >>>>> There may be multiple functions specified, but we need to process them
> >>>>> in precedence order (and clear wins over the others, so to speak).
> >>>>> Would adding a sentence like "we always process just one function" help?  
> >>>> Why should we allow multiple commands in a single call ?
> >>>> It brings no added value.
> >>>> Is there a use case?
> >>>> Currently QEMU does not do this and since we only have the SCSH there
> >>>> is no difference having the bit set alone or not alone.  
> >>> I found this to be a very easy way to implement halt/clear. This still
> >>> holds true if we switch to some kind of capabilities for this (did not
> >>> have time to look at this further, though).
> >>>
> >>> As we have the fctl field anyway, I'm in favour of processing this all
> >>> in one function.
> >>>     
> > [starting to look at this again]
> >  
> >> Sorry, I do not understand if we agree or not.
> >>
> >> I agree we have the fctl field and we must continue to use it
> >> for backward compatibility.  
> > It also mirrors the hardware, no?  
> 
> No, in the hardware this is the result of the instruction in the SCSW.
> Not the instruction itself.

Not sure if I parse this correctly... but the architecture says that
the subchannel has the {start,halt,clear} function set as a result of
{start,halt,clear} subchannel, doesn't it?

> 
> >  
> >> I do not understand the "processing all in one function".
> >>
> >> Since yo already have 3 function to process these three instructions.
> >>
> >> Do you mean the if .. else if .. else if ?  
> > Yes. There is a lot of common handling for each of these.  
> 
> There are also differences and it breaks the FSM

Depends on what we will do with the fsm ;)

> 
> >  
> >> Then I come back to what you said earlier on the precedence of the clear
> >> instruction:
> >>
> >> 1) do we have a use case to have more than one bit set in the fctl field?
> >>
> >> - if no, there is no need for precedence  
> > It mirrors what the hardware does: you just set an additional bit if
> > processing has not yet finished.  
> 
> I do not agree, this is true for the SCSW but not for instructions.
> We receive instructions in VFIO and give back status.
> The name used to provide the command is misleading.

Confused. What we get over the interface is an scsw (the current scsw
for the subchannel in QEMU). A halt does set an additional bit in the
fctl if the start is not yet finished.

But that had me re-reading the PoP, and clear is indeed different (it,
ah, clears the other bits). So, clear handling is different enough from
the others, and I'm not sure anymore whether it makes sense to handle
start and halt together. I'll rework this.

> 
> >  
> >> - if yes, why should clear have precedence ?  
> > Because it does on the hardware?  
> 
> What you say is right if we would have a register inside the subchannel
> where we write the commands.
> But this is not what we handle we handle separate instructions coming
> from an instruction stream.
> 
> We do never receive two instructions at the same time, but each after
> the other.
> If the sub-channel is busy on IO a clear or a cancel must be able to 
> stop the IO.
> I agree upon this.
> But we do not have any other command in the same call.
> 
> If we would construct the interface differently, for example using an 
> mmap() system
> call and let the user ORing the command bitfield before using an ioctl 
> to inform
> us from the change, or if we poll on the command bitfield we should 
> implement
> it like you say.
> But this is not what we do, and this is not what the architecture does.
> does it?

The thing is that the guest does not interact with this interface at
all, it is just the backend implementation. The instructions set the
bits in the scsw fctl field, and we get the scsw from QEMU. By the
architecture, both start and halt may be set in the fctl at the same
time. [That this currently does not happen because QEMU is not really
handling things asynchronously is an implementation detail.]

> 
> >  
> >>     How do QEMU set more than one bit in fctl?
> >>     why should we alter the order of the instructions given by the guest?
> >>     How can we know this order if there are multiple instructions at once?  
> > In the future, we should return after we fired off the start etc.
> > request even if we did not receive an interrupt yet, so that the guest
> > might do a halt or clear before the start has finished.  
> 
> This is already what is done here:
> We fire off the start (go to BUSY state) and return
> 
> If the guest want to start another command it polls on
> the vfio_write() untill the channel isn't BUSY anymore.

It's not the guest that polls, but QEMU (resp. another user space
program). And it should be able to fire a halt etc. even if the start
function is still active, as the architecture allows that. (I would
expect the real hardware to give us a busy if applicable anyway.)

> 
> On interrupt we set the channel back to IDLE and the next
> command shall be proceed.
> 
> (I must enhance the cover letter (already said))
> 
> >   IOW, make this
> > as asynchronous as the hardware. That's why I'd like to simply
> > accumulate the things. The architecture already specified what happens
> > in the response.
> >
> > Do you think that is feasible?
> >  
> Yes I think it is feasible and it is what we need to do.
> 
> CLEAR, CANCEL and HALT must be able to overtake the
> START and really stop the IO transfer.
> 
> They just should be able to proceed in the BUSY state
> on the opposite of the START.
> This is easily done with new events in the FSM inside
> both IDLE and BUSY states

On re-reading the PoP, the three of them do have some different
requirements:

- cancel is currently handled completely in QEMU, and it also has quite
  different semantics as it is not asynchronous etc.
- halt can be made pending in addition to a start function
- clear will need to clear anything that's currently going on

I'll need to rethink some of this; probably does not make sense for you
to try to integrate my patches.
Pierre Morel June 6, 2018, 2:15 p.m. UTC | #10
On 06/06/2018 14:21, Cornelia Huck wrote:
> On Tue, 5 Jun 2018 17:23:02 +0200
> Pierre Morel <pmorel@linux.ibm.com> wrote:
>
>> On 05/06/2018 15:14, Cornelia Huck wrote:
>>> On Tue, 22 May 2018 17:10:44 +0200
>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>   
>>>> On 22/05/2018 14:52, Cornelia Huck wrote:
>>>>> On Wed, 16 May 2018 15:32:48 +0200
>>>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>>>      
>>>>>> On 15/05/2018 18:10, Cornelia Huck wrote:
>>>>>>> On Fri, 11 May 2018 11:33:35 +0200
>>>>>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>>>>>         
>>>>>>>> On 09/05/2018 17:48, Cornelia Huck wrote:
...snip...
> Not sure if I parse this correctly... but the architecture says that
> the subchannel has the {start,halt,clear} function set as a result of
> {start,halt,clear} subchannel, doesn't it?
The fc field of the SCSW indicates the functions pending or in progress
resulting of the execution of the instructions START, CLEAR or HALT which
have been accepted by the subchannel.
Up to 2 functions being pending or in progress.
The fc field is only updated when the condition code is set to 0 during 
the execution
of the instruction.

1) The guest do a SSCH instruction
2) we intercept and QEMU issue the write with SSCH bit set
3) In the driver we are called in write and issue the SSCH instruction
4) the subchannel (the real one) set the start bit in fc field

later
1) The guest do a CSCH instruction
2) we intercept and QEMU issue the write with CSCH bit set
3) In the driver we are called in write and issue the CSCH instruction
4) the subchannel (the real one) set the clear bit in fc and clear the 
start bit


The subchannel accept to handle functions (start, halt, clear) 
asynchronously
otherwise it could not accept a CLEAR instruction to stop a previous START
instruction.
But the instructions are issued one after the other to the sub-channel.

I think we can only agree on this and that we had at some point a 
misunderstanding.


>
>>>   
>>>> I do not understand the "processing all in one function".
>>>>
>>>> Since yo already have 3 function to process these three instructions.
>>>>
>>>> Do you mean the if .. else if .. else if ?
>>> Yes. There is a lot of common handling for each of these.
>> There are also differences and it breaks the FSM
> Depends on what we will do with the fsm ;)

Yes :)

>
>>>   
>>>> Then I come back to what you said earlier on the precedence of the clear
>>>> instruction:
>>>>
>>>> 1) do we have a use case to have more than one bit set in the fctl field?
>>>>
>>>> - if no, there is no need for precedence
>>> It mirrors what the hardware does: you just set an additional bit if
>>> processing has not yet finished.
>> I do not agree, this is true for the SCSW but not for instructions.
>> We receive instructions in VFIO and give back status.
>> The name used to provide the command is misleading.
> Confused. What we get over the interface is an scsw (the current scsw
> for the subchannel in QEMU). A halt does set an additional bit in the
> fctl if the start is not yet finished.
>
> But that had me re-reading the PoP, and clear is indeed different (it,
> ah, clears the other bits). So, clear handling is different enough from
> the others, and I'm not sure anymore whether it makes sense to handle
> start and halt together. I'll rework this.

OK, it matches with my comprehension.

>
>>>   
>>>> - if yes, why should clear have precedence ?
>>> Because it does on the hardware?
>> What you say is right if we would have a register inside the subchannel
>> where we write the commands.
>> But this is not what we handle we handle separate instructions coming
>> from an instruction stream.
>>
>> We do never receive two instructions at the same time, but each after
>> the other.
>> If the sub-channel is busy on IO a clear or a cancel must be able to
>> stop the IO.
>> I agree upon this.
>> But we do not have any other command in the same call.
>>
>> If we would construct the interface differently, for example using an
>> mmap() system
>> call and let the user ORing the command bitfield before using an ioctl
>> to inform
>> us from the change, or if we poll on the command bitfield we should
>> implement
>> it like you say.
>> But this is not what we do, and this is not what the architecture does.
>> does it?
> The thing is that the guest does not interact with this interface at
> all, it is just the backend implementation. The instructions set the
> bits in the scsw fctl field, and we get the scsw from QEMU. By the
> architecture, both start and halt may be set in the fctl at the same
> time. [That this currently does not happen because QEMU is not really
> handling things asynchronously is an implementation detail.]

I do not understand the point of sending a halt and a start to the same
sub-channel at the same time?

If QEMU, once asynchronous, can do this, it could just
halt the start before asking this to the backend. Don't it?

Another point is that the start must have been accepted by the
sub-channel for the start bit in the fc field of the SCSW to be set.

>
>>>   
>>>>      How do QEMU set more than one bit in fctl?
>>>>      why should we alter the order of the instructions given by the guest?
>>>>      How can we know this order if there are multiple instructions at once?
>>> In the future, we should return after we fired off the start etc.
>>> request even if we did not receive an interrupt yet, so that the guest
>>> might do a halt or clear before the start has finished.
>> This is already what is done here:
>> We fire off the start (go to BUSY state) and return
>>
>> If the guest want to start another command it polls on
>> the vfio_write() untill the channel isn't BUSY anymore.
> It's not the guest that polls, but QEMU (resp. another user space
> program). And it should be able to fire a halt etc. even if the start
> function is still active, as the architecture allows that. (I would
> expect the real hardware to give us a busy if applicable anyway.)

I do not find where QEMU is waiting when the sub-channel is busy.
I found a loop on EAGAIN (which is never returned but by the system AFAIU)
and I missed the place where a retry is implemented when I followed
the back calls.

>
...snip...
> On re-reading the PoP, the three of them do have some different
> requirements:
>
> - cancel is currently handled completely in QEMU, and it also has quite
>    different semantics as it is not asynchronous etc.
> - halt can be made pending in addition to a start function
> - clear will need to clear anything that's currently going on
>
> I'll need to rethink some of this; probably does not make sense for you
> to try to integrate my patches.
>

Make sense (that it doesn't (make sense)).

Regards,

Pierre
Cornelia Huck June 7, 2018, 9:54 a.m. UTC | #11
On Wed, 6 Jun 2018 16:15:31 +0200
Pierre Morel <pmorel@linux.ibm.com> wrote:

> On 06/06/2018 14:21, Cornelia Huck wrote:
> > On Tue, 5 Jun 2018 17:23:02 +0200
> > Pierre Morel <pmorel@linux.ibm.com> wrote:
> >  
> >> On 05/06/2018 15:14, Cornelia Huck wrote:  
> >>> On Tue, 22 May 2018 17:10:44 +0200
> >>> Pierre Morel <pmorel@linux.ibm.com> wrote:
> >>>     
> >>>> On 22/05/2018 14:52, Cornelia Huck wrote:  
> >>>>> On Wed, 16 May 2018 15:32:48 +0200
> >>>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
> >>>>>        
> >>>>>> On 15/05/2018 18:10, Cornelia Huck wrote:  
> >>>>>>> On Fri, 11 May 2018 11:33:35 +0200
> >>>>>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
> >>>>>>>           
> >>>>>>>> On 09/05/2018 17:48, Cornelia Huck wrote:  
> ...snip...
> > Not sure if I parse this correctly... but the architecture says that
> > the subchannel has the {start,halt,clear} function set as a result of
> > {start,halt,clear} subchannel, doesn't it?  
> The fc field of the SCSW indicates the functions pending or in progress
> resulting of the execution of the instructions START, CLEAR or HALT which
> have been accepted by the subchannel.
> Up to 2 functions being pending or in progress.
> The fc field is only updated when the condition code is set to 0 during 
> the execution
> of the instruction.
> 
> 1) The guest do a SSCH instruction
> 2) we intercept and QEMU issue the write with SSCH bit set
> 3) In the driver we are called in write and issue the SSCH instruction
> 4) the subchannel (the real one) set the start bit in fc field
> 
> later
> 1) The guest do a CSCH instruction
> 2) we intercept and QEMU issue the write with CSCH bit set
> 3) In the driver we are called in write and issue the CSCH instruction
> 4) the subchannel (the real one) set the clear bit in fc and clear the 
> start bit
> 
> 
> The subchannel accept to handle functions (start, halt, clear) 
> asynchronously
> otherwise it could not accept a CLEAR instruction to stop a previous START
> instruction.
> But the instructions are issued one after the other to the sub-channel.
> 
> I think we can only agree on this and that we had at some point a 
> misunderstanding.

Yes. I think much of the confusion comes from the fact that QEMU is
performing the 'async' function while it is still processing the
instruction -- IOW, it is already done with the I/O when it sets cc 0.


> >>>> - if yes, why should clear have precedence ?  
> >>> Because it does on the hardware?  
> >> What you say is right if we would have a register inside the subchannel
> >> where we write the commands.
> >> But this is not what we handle we handle separate instructions coming
> >> from an instruction stream.
> >>
> >> We do never receive two instructions at the same time, but each after
> >> the other.
> >> If the sub-channel is busy on IO a clear or a cancel must be able to
> >> stop the IO.
> >> I agree upon this.
> >> But we do not have any other command in the same call.
> >>
> >> If we would construct the interface differently, for example using an
> >> mmap() system
> >> call and let the user ORing the command bitfield before using an ioctl
> >> to inform
> >> us from the change, or if we poll on the command bitfield we should
> >> implement
> >> it like you say.
> >> But this is not what we do, and this is not what the architecture does.
> >> does it?  
> > The thing is that the guest does not interact with this interface at
> > all, it is just the backend implementation. The instructions set the
> > bits in the scsw fctl field, and we get the scsw from QEMU. By the
> > architecture, both start and halt may be set in the fctl at the same
> > time. [That this currently does not happen because QEMU is not really
> > handling things asynchronously is an implementation detail.]  
> 
> I do not understand the point of sending a halt and a start to the same
> sub-channel at the same time?
> 
> If QEMU, once asynchronous, can do this, it could just
> halt the start before asking this to the backend. Don't it?
> 
> Another point is that the start must have been accepted by the
> sub-channel for the start bit in the fc field of the SCSW to be set.

Hm, I think we need to be more precise as to what scsw we're talking
about. Bad ascii art time:

--------------
|   scsw(g)  |  ssch
--------------   |
                 |                                       guest
--------------------------------------------------------------
                 |                                        qemu
--------------   v
|   scsw(q)  | emulate
--------------   |
                 |
--------------   v
|   scsw(r)  | pwrite()
--------------   |
                 |
--------------------------------------------------------------
                 |                                        vfio
                 v
                ssch
                 |
--------------------------------------------------------------
                 |                                    hardware
--------------   v
|   scsw(h)  | actually do something
--------------

The guest issues a ssch (which gets intercepted; it won't get control
back until ssch finishes with a cc set.) scsw(g) won't change, unless
the guest does a stsch for the subchannel on another vcpu, in which
case it will get whatever information qemu holds in scsw(q) at that
point in time.

When qemu starts to emulate the guest's ssch, it will set the start
function bit in the fctl field of scsw(q). It then copies scsw(q) to
scsw(r) in the vfio region.

The vfio code will then proceed to call ssch on the real subchannel.
This is the first time we get really asynchronous, as the ssch will
return with cc set and the start function will be performed at some
point in time. If we would do a stsch on the real subchannel, we would
see that scsw(h) now has the start function bit set.

Currently, we won't return back up the chain until we get an interrupt
from the hardware, at which time we update the scsw(r) from the irb.
This will propagate into the scsw(q). At the time we finish handling
the guest's ssch and return control to it, we're all done and if the
guest does a stsch to update its scsw(g), it will get the current
scsw(q) which will already contain the scsw from the interrupt's irb
(indicating that the start function is already finished).

Now let's imagine we have a future implementation that handles actually
performing the start on the hardware asynchronously, i.e. it returns
control to the guest without the interrupt having been posted (let's
say that it is a longer-running I/O request). If the guest now did a
stsch to update scsw(g), it would get the current state of scsw(q),
which would be "start function set, but not done yet".

If the guest now does a hsch, it would trap in the same way as the ssch
before. When qemu gets control, it adds the halt bit in scsw(q) (which
is in accordance with the architecture). My proposal is to do the same
copying to scsw(r) again, which would mean we get a request with both
the halt and the start bit set. The vfio code now needs to do a hsch
(instead of a ssch). The real channel subsystem should figure this out,
as we can't reliably check whether the start function has concluded
already (there's always a race window).

For csch, things are a bit different (which the code posted here did
not take into account). The qemu emulation of csch needs to clear any
start/halt bits in scsw(q) when setting the clear bit there, and
therefore scsw(r) will only have the clear bit set in that case. We
still should do an unconditional csch for the same reasons as above;
the hardware will do the same things (clearing start/halt, setting
clear) in the scsw(h).

Congratulations, you've reached the end :) I hope that was helpful and
not too confusing.
Halil Pasic June 7, 2018, 4:17 p.m. UTC | #12
On 06/07/2018 11:54 AM, Cornelia Huck wrote:
> Hm, I think we need to be more precise as to what scsw we're talking
> about. Bad ascii art time:
> 
> --------------
> |   scsw(g)  |  ssch
> --------------   |
>                   |                                       guest
> --------------------------------------------------------------
>                   |                                        qemu
> --------------   v
> |   scsw(q)  | emulate
> --------------   |
>                   |
> --------------   v
> |   scsw(r)  | pwrite()
> --------------   |
>                   |
> --------------------------------------------------------------
>                   |                                        vfio
>                   v
>                  ssch
>                   |
> --------------------------------------------------------------
>                   |                                    hardware
> --------------   v
> |   scsw(h)  | actually do something
> --------------
> 
> The guest issues a ssch (which gets intercepted; it won't get control
> back until ssch finishes with a cc set.) scsw(g) won't change, unless
> the guest does a stsch for the subchannel on another vcpu, in which
> case it will get whatever information qemu holds in scsw(q) at that
> point in time.

(1) I think BQL make other cpu or not other kind of the same. We will
effectively start processing the stsch in QEMU after we are done
with the ssch in QEMU.

> 
> When qemu starts to emulate the guest's ssch, it will set the start
> function bit in the fctl field of scsw(q). It then copies scsw(q) to
> scsw(r) in the vfio region.
> 

(2) This is architecturally wrong AFAIK. The fctl bit is supposed to be set on
cc 0. But because of (1) this might not be a observable by the guest --
we can fix it up.

(3)IMHO scsw(r) is not a real scsw as defined by the architecture but
a strange communication structure (not) defined vfio-ccw.

> The vfio code will then proceed to call ssch on the real subchannel.
> This is the first time we get really asynchronous, as the ssch will
> return with cc set and the start function will be performed at some
> point in time. If we would do a stsch on the real subchannel, we would
> see that scsw(h) now has the start function bit set.
> 

(4) I guess only if cc 0.

> Currently, we won't return back up the chain until we get an interrupt
> from the hardware, at which time we update the scsw(r) from the irb.
> This will propagate into the scsw(q). At the time we finish handling
> the guest's ssch and return control to it, we're all done and if the
> guest does a stsch to update its scsw(g), it will get the current
> scsw(q) which will already contain the scsw from the interrupt's irb
> (indicating that the start function is already finished).
> 
> Now let's imagine we have a future implementation that handles actually
> performing the start on the hardware asynchronously, i.e. it returns
> control to the guest without the interrupt having been posted (let's
> say that it is a longer-running I/O request). If the guest now did a
> stsch to update scsw(g), it would get the current state of scsw(q),
> which would be "start function set, but not done yet".

(5) AFAIK this is how the current implementation works. We don't wait
for the I/O interrupt on the host to present a cc to the guest for it's
ssch.

> 
> If the guest now does a hsch, it would trap in the same way as the ssch
> before. When qemu gets control, it adds the halt bit in scsw(q) (which
> is in accordance with the architecture).

(7) Again it's when is fctl set according to the architecture...

> My proposal is to do the same
> copying to scsw(r) again, which would mean we get a request with both
> the halt and the start bit set.

(8) IMHO when receiving the 'request' we are and should be in instruction
context -- opposed to basic io function context. So we should not set fctl
before we know what will our guest cc be. But since scsw(r) is not a real
scsw it is just strange.

> The vfio code now needs to do a hsch
> (instead of a ssch). The real channel subsystem should figure this out,
> as we can't reliably check whether the start function has concluded
> already (there's always a race window).
> 

(9) Yes we can't tell for sure if the start function is still being performed
by the stuff below.

Regards,
Halil

> For csch, things are a bit different (which the code posted here did
> not take into account). The qemu emulation of csch needs to clear any
> start/halt bits in scsw(q) when setting the clear bit there, and
> therefore scsw(r) will only have the clear bit set in that case. We
> still should do an unconditional csch for the same reasons as above;
> the hardware will do the same things (clearing start/halt, setting
> clear) in the scsw(h).
> 
> Congratulations, you've reached the end:)  I hope that was helpful and
> not too confusing.
>
Cornelia Huck June 7, 2018, 4:34 p.m. UTC | #13
On Thu, 7 Jun 2018 18:17:57 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On 06/07/2018 11:54 AM, Cornelia Huck wrote:
> > Hm, I think we need to be more precise as to what scsw we're talking
> > about. Bad ascii art time:
> > 
> > --------------
> > |   scsw(g)  |  ssch
> > --------------   |
> >                   |                                       guest
> > --------------------------------------------------------------
> >                   |                                        qemu
> > --------------   v
> > |   scsw(q)  | emulate
> > --------------   |
> >                   |
> > --------------   v
> > |   scsw(r)  | pwrite()
> > --------------   |
> >                   |
> > --------------------------------------------------------------
> >                   |                                        vfio
> >                   v
> >                  ssch
> >                   |
> > --------------------------------------------------------------
> >                   |                                    hardware
> > --------------   v
> > |   scsw(h)  | actually do something
> > --------------
> > 
> > The guest issues a ssch (which gets intercepted; it won't get control
> > back until ssch finishes with a cc set.) scsw(g) won't change, unless
> > the guest does a stsch for the subchannel on another vcpu, in which
> > case it will get whatever information qemu holds in scsw(q) at that
> > point in time.  
> 
> (1) I think BQL make other cpu or not other kind of the same. We will
> effectively start processing the stsch in QEMU after we are done
> with the ssch in QEMU.

Yeah, but my main point was that the change is in scsw(q) only.

> 
> > 
> > When qemu starts to emulate the guest's ssch, it will set the start
> > function bit in the fctl field of scsw(q). It then copies scsw(q) to
> > scsw(r) in the vfio region.
> >   
> 
> (2) This is architecturally wrong AFAIK. The fctl bit is supposed to be set on
> cc 0. But because of (1) this might not be a observable by the guest --
> we can fix it up.

The bit is set some time during the processing of the instruction - we
need finite time to do the processing, but it should not be observable
by the guest. We should not set the bit if we won't set cc 0.

> 
> (3)IMHO scsw(r) is not a real scsw as defined by the architecture but
> a strange communication structure (not) defined vfio-ccw.

IIRC it was intended as a real scsw; we just did not want to define the
whole structure as both Linux and QEMU have scsw definitions that map
to the same hardware structure but look different.

> 
> > The vfio code will then proceed to call ssch on the real subchannel.
> > This is the first time we get really asynchronous, as the ssch will
> > return with cc set and the start function will be performed at some
> > point in time. If we would do a stsch on the real subchannel, we would
> > see that scsw(h) now has the start function bit set.
> >   
> 
> (4) I guess only if cc 0.

Yes, obviously.

> 
> > Currently, we won't return back up the chain until we get an interrupt
> > from the hardware, at which time we update the scsw(r) from the irb.
> > This will propagate into the scsw(q). At the time we finish handling
> > the guest's ssch and return control to it, we're all done and if the
> > guest does a stsch to update its scsw(g), it will get the current
> > scsw(q) which will already contain the scsw from the interrupt's irb
> > (indicating that the start function is already finished).
> > 
> > Now let's imagine we have a future implementation that handles actually
> > performing the start on the hardware asynchronously, i.e. it returns
> > control to the guest without the interrupt having been posted (let's
> > say that it is a longer-running I/O request). If the guest now did a
> > stsch to update scsw(g), it would get the current state of scsw(q),
> > which would be "start function set, but not done yet".  
> 
> (5) AFAIK this is how the current implementation works. We don't wait
> for the I/O interrupt on the host to present a cc to the guest for it's
> ssch.

But the vfio code does wait, no? We just signal the interrupt via
eventfd as well.

> 
> > 
> > If the guest now does a hsch, it would trap in the same way as the ssch
> > before. When qemu gets control, it adds the halt bit in scsw(q) (which
> > is in accordance with the architecture).  
> 
> (7) Again it's when is fctl set according to the architecture...

Same comment as above. If we do a hsch for a subchannel with the start
function set, we'll set cc 0.

> 
> > My proposal is to do the same
> > copying to scsw(r) again, which would mean we get a request with both
> > the halt and the start bit set.  
> 
> (8) IMHO when receiving the 'request' we are and should be in instruction
> context -- opposed to basic io function context. So we should not set fctl
> before we know what will our guest cc be. But since scsw(r) is not a real
> scsw it is just strange.

I think what we are doing is really 'performing the start function' -
it's just not asynchronous in the current implementation. So we already
know that ssch will return with cc 0.

> 
> > The vfio code now needs to do a hsch
> > (instead of a ssch). The real channel subsystem should figure this out,
> > as we can't reliably check whether the start function has concluded
> > already (there's always a race window).
> >   
> 
> (9) Yes we can't tell for sure if the start function is still being performed
> by the stuff below.

We'll need to figure out a way to outsource most of those decisions to
the real hardware. If we're not sure whether we can set cc 0, we should
probably just set cc 2 and be done with it. (Serialization with regard
to interrupts needed, of course.)

> 
> Regards,
> Halil

Thanks for reading!

> 
> > For csch, things are a bit different (which the code posted here did
> > not take into account). The qemu emulation of csch needs to clear any
> > start/halt bits in scsw(q) when setting the clear bit there, and
> > therefore scsw(r) will only have the clear bit set in that case. We
> > still should do an unconditional csch for the same reasons as above;
> > the hardware will do the same things (clearing start/halt, setting
> > clear) in the scsw(h).
> > 
> > Congratulations, you've reached the end:)  I hope that was helpful and
> > not too confusing.
> >   
>
Pierre Morel June 7, 2018, 4:37 p.m. UTC | #14
On 07/06/2018 11:54, Cornelia Huck wrote:
> On Wed, 6 Jun 2018 16:15:31 +0200
> Pierre Morel <pmorel@linux.ibm.com> wrote:
>
>> On 06/06/2018 14:21, Cornelia Huck wrote:
>>> On Tue, 5 Jun 2018 17:23:02 +0200
>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>   
>>>> On 05/06/2018 15:14, Cornelia Huck wrote:
>>>>> On Tue, 22 May 2018 17:10:44 +0200
>>>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>>>      
>>>>>> On 22/05/2018 14:52, Cornelia Huck wrote:
>>>>>>> On Wed, 16 May 2018 15:32:48 +0200
>>>>>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>>>>>         
>>>>>>>> On 15/05/2018 18:10, Cornelia Huck wrote:
>>>>>>>>> On Fri, 11 May 2018 11:33:35 +0200
>>>>>>>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
>>>>>>>>>            
>>>>>>>>>> On 09/05/2018 17:48, Cornelia Huck wrote:
>>

snip

>> sub-channel at the same time?
>>
>> If QEMU, once asynchronous, can do this, it could just
>> halt the start before asking this to the backend. Don't it?
>>
>> Another point is that the start must have been accepted by the
>> sub-channel for the start bit in the fc field of the SCSW to be set.
> Hm, I think we need to be more precise as to what scsw we're talking
> about. Bad ascii art time:
>
> --------------
> |   scsw(g)  |  ssch
> --------------   |
>                   |                                       guest
> --------------------------------------------------------------
>                   |                                        qemu
> --------------   v
> |   scsw(q)  | emulate
> --------------   |
>                   |
> --------------   v
> |   scsw(r)  | pwrite()
> --------------   |
>                   |
> --------------------------------------------------------------
>                   |                                        vfio
>                   v
>                  ssch
>                   |
> --------------------------------------------------------------
>                   |                                    hardware
> --------------   v
> |   scsw(h)  | actually do something
> --------------
>
> The guest issues a ssch (which gets intercepted; it won't get control
> back until ssch finishes with a cc set.) scsw(g) won't change, unless
> the guest does a stsch for the subchannel on another vcpu, in which
> case it will get whatever information qemu holds in scsw(q) at that
> point in time.
>
> When qemu starts to emulate the guest's ssch, it will set the start
> function bit in the fctl field of scsw(q). It then copies scsw(q) to
> scsw(r) in the vfio region.
>
> The vfio code will then proceed to call ssch on the real subchannel.
> This is the first time we get really asynchronous, as the ssch will
> return with cc set and the start function will be performed at some
> point in time. If we would do a stsch on the real subchannel, we would
> see that scsw(h) now has the start function bit set.
>
> Currently, we won't return back up the chain until we get an interrupt
> from the hardware, at which time we update the scsw(r) from the irb.

I do not find where the thread waits for interrupt inside the
write->fsm_io_request->fsm_io_helper->ssch chain.

I must have miss it ten times. Can you point it to me?

> This will propagate into the scsw(q). At the time we finish handling
> the guest's ssch and return control to it, we're all done and if the
> guest does a stsch to update its scsw(g), it will get the current
> scsw(q) which will already contain the scsw from the interrupt's irb
> (indicating that the start function is already finished).
>
> Now let's imagine we have a future implementation that handles actually
> performing the start on the hardware asynchronously, i.e. it returns
> control to the guest without the interrupt having been posted (let's
> say that it is a longer-running I/O request). If the guest now did a
> stsch to update scsw(g), it would get the current state of scsw(q),
> which would be "start function set, but not done yet".
>
> If the guest now does a hsch, it would trap in the same way as the ssch
> before. When qemu gets control, it adds the halt bit in scsw(q) (which
> is in accordance with the architecture).

This I agree.
The scsw(q) is part of the QEMU device.

>   My proposal is to do the same
> copying to scsw(r) again, which would mean we get a request with both
> the halt and the start bit set. The vfio code now needs to do a hsch
> (instead of a ssch). The real channel subsystem should figure this out,
> as we can't reliably check whether the start function has concluded
> already (there's always a race window).

This I do not agree scsw(r) is part of the driver.
The interface here is not a device interface anymore but a driver 
interface.
SCSW is a status, it is at its place in QEMU device interface with the 
guest
but here pwrite() sends a command.


Since we do not do a STSCH on each command, scsw(q), should be
updated by QEMU depending on syscall return value.
This is not done currently, only the success path is right
because it is set before the call.

If I read right and the IRQ is asynchronous, the scsw(q) is not right,
because not updated, after the interrupt is received from the guest.

>
> For csch, things are a bit different (which the code posted here did
> not take into account). The qemu emulation of csch needs to clear any
> start/halt bits in scsw(q) when setting the clear bit there, and
> therefore scsw(r) will only have the clear bit set in that case. We
> still should do an unconditional csch for the same reasons as above;
> the hardware will do the same things (clearing start/halt, setting
> clear) in the scsw(h).
>
> Congratulations, you've reached the end :) I hope that was helpful and
> not too confusing.
>
Yes it was, thank you, and it is clear, but still... questions ... ;)

Best regards,

Pierre
Cornelia Huck June 8, 2018, 12:20 p.m. UTC | #15
On Thu, 7 Jun 2018 18:37:16 +0200
Pierre Morel <pmorel@linux.ibm.com> wrote:

> On 07/06/2018 11:54, Cornelia Huck wrote:

> > --------------
> > |   scsw(g)  |  ssch
> > --------------   |
> >                   |                                       guest
> > --------------------------------------------------------------
> >                   |                                        qemu
> > --------------   v
> > |   scsw(q)  | emulate
> > --------------   |
> >                   |
> > --------------   v
> > |   scsw(r)  | pwrite()
> > --------------   |
> >                   |
> > --------------------------------------------------------------
> >                   |                                        vfio
> >                   v
> >                  ssch
> >                   |
> > --------------------------------------------------------------
> >                   |                                    hardware
> > --------------   v
> > |   scsw(h)  | actually do something
> > --------------
> >
> > The guest issues a ssch (which gets intercepted; it won't get control
> > back until ssch finishes with a cc set.) scsw(g) won't change, unless
> > the guest does a stsch for the subchannel on another vcpu, in which
> > case it will get whatever information qemu holds in scsw(q) at that
> > point in time.
> >
> > When qemu starts to emulate the guest's ssch, it will set the start
> > function bit in the fctl field of scsw(q). It then copies scsw(q) to
> > scsw(r) in the vfio region.
> >
> > The vfio code will then proceed to call ssch on the real subchannel.
> > This is the first time we get really asynchronous, as the ssch will
> > return with cc set and the start function will be performed at some
> > point in time. If we would do a stsch on the real subchannel, we would
> > see that scsw(h) now has the start function bit set.
> >
> > Currently, we won't return back up the chain until we get an interrupt
> > from the hardware, at which time we update the scsw(r) from the irb.  
> 
> I do not find where the thread waits for interrupt inside the
> write->fsm_io_request->fsm_io_helper->ssch chain.
> 
> I must have miss it ten times. Can you point it to me?

I'm not surprised you did not find it, because it isn't there :)

I've let myself be confused by the /* Start channel program and wait
for I/O interrupt. */ comment -- it is wrong and should be removed (but
we did have that waiting initially). So we already have the async
situation :)

> 
> > This will propagate into the scsw(q). At the time we finish handling
> > the guest's ssch and return control to it, we're all done and if the
> > guest does a stsch to update its scsw(g), it will get the current
> > scsw(q) which will already contain the scsw from the interrupt's irb
> > (indicating that the start function is already finished).
> >
> > Now let's imagine we have a future implementation that handles actually
> > performing the start on the hardware asynchronously, i.e. it returns
> > control to the guest without the interrupt having been posted (let's
> > say that it is a longer-running I/O request). If the guest now did a
> > stsch to update scsw(g), it would get the current state of scsw(q),
> > which would be "start function set, but not done yet".
> >
> > If the guest now does a hsch, it would trap in the same way as the ssch
> > before. When qemu gets control, it adds the halt bit in scsw(q) (which
> > is in accordance with the architecture).  
> 
> This I agree.
> The scsw(q) is part of the QEMU device.
> 
> >   My proposal is to do the same
> > copying to scsw(r) again, which would mean we get a request with both
> > the halt and the start bit set. The vfio code now needs to do a hsch
> > (instead of a ssch). The real channel subsystem should figure this out,
> > as we can't reliably check whether the start function has concluded
> > already (there's always a race window).  
> 
> This I do not agree scsw(r) is part of the driver.
> The interface here is not a device interface anymore but a driver 
> interface.
> SCSW is a status, it is at its place in QEMU device interface with the 
> guest
> but here pwrite() sends a command.

Hm, I rather consider that "we write a status, and the backend figures
out what to do based on that status".

> 
> 
> Since we do not do a STSCH on each command, scsw(q), should be
> updated by QEMU depending on syscall return value.
> This is not done currently, only the success path is right
> because it is set before the call.
> 
> If I read right and the IRQ is asynchronous, the scsw(q) is not right,
> because not updated, after the interrupt is received from the guest.

Yes, we're probably missing some updates.
Halil Pasic June 8, 2018, 1:13 p.m. UTC | #16
On 06/08/2018 02:20 PM, Cornelia Huck wrote:
>>> My proposal is to do the same
>>> copying to scsw(r) again, which would mean we get a request with both
>>> the halt and the start bit set. The vfio code now needs to do a hsch
>>> (instead of a ssch). The real channel subsystem should figure this out,
>>> as we can't reliably check whether the start function has concluded
>>> already (there's always a race window).
>> This I do not agree scsw(r) is part of the driver.
>> The interface here is not a device interface anymore but a driver
>> interface.
>> SCSW is a status, it is at its place in QEMU device interface with the
>> guest
>> but here pwrite() sends a command.
> Hm, I rather consider that "we write a status, and the backend figures
> out what to do based on that status".
> 

The status of what? Kind of a target status?

I think this approach is the source of lots of complications. For instance
take xsch. How are we supposed to react to a guest xsch (in QEMU and
in the kernel module)? My guess is that the right thing to do is to issue
an xsch in the vfio-ccw kernel module on the passed through subchannel.
But there is no bit in fctl for cancel.

Bottom line is: I'm not happy with the current design but I'm not sure
if it's practical to do something about it (i.e. change it radically).

Regards,
Halil
Cornelia Huck June 8, 2018, 2:45 p.m. UTC | #17
On Fri, 8 Jun 2018 15:13:28 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On 06/08/2018 02:20 PM, Cornelia Huck wrote:
> >>> My proposal is to do the same
> >>> copying to scsw(r) again, which would mean we get a request with both
> >>> the halt and the start bit set. The vfio code now needs to do a hsch
> >>> (instead of a ssch). The real channel subsystem should figure this out,
> >>> as we can't reliably check whether the start function has concluded
> >>> already (there's always a race window).  
> >> This I do not agree scsw(r) is part of the driver.
> >> The interface here is not a device interface anymore but a driver
> >> interface.
> >> SCSW is a status, it is at its place in QEMU device interface with the
> >> guest
> >> but here pwrite() sends a command.  
> > Hm, I rather consider that "we write a status, and the backend figures
> > out what to do based on that status".
> >   
> 
> The status of what? Kind of a target status?
> 
> I think this approach is the source of lots of complications. For instance
> take xsch. How are we supposed to react to a guest xsch (in QEMU and
> in the kernel module)? My guess is that the right thing to do is to issue
> an xsch in the vfio-ccw kernel module on the passed through subchannel.
> But there is no bit in fctl for cancel.
> 
> Bottom line is: I'm not happy with the current design but I'm not sure
> if it's practical to do something about it (i.e. change it radically).

It might make sense to keep this for ssch, maybe reuse it for hsch/csch,
and think about something else for other things we want to handle
(xsch, channel monitoring, the path handling stuff for which we already
had a prototype etc.) It's probably not practical to do radical surgery
on the existing code.

[Speaking of which: Is there any current effort on the path handling
things?]
Pierre Morel June 8, 2018, 3:51 p.m. UTC | #18
On 08/06/2018 16:45, Cornelia Huck wrote:
> On Fri, 8 Jun 2018 15:13:28 +0200
> Halil Pasic <pasic@linux.ibm.com> wrote:
>
>> On 06/08/2018 02:20 PM, Cornelia Huck wrote:
>>>>> My proposal is to do the same
>>>>> copying to scsw(r) again, which would mean we get a request with both
>>>>> the halt and the start bit set. The vfio code now needs to do a hsch
>>>>> (instead of a ssch). The real channel subsystem should figure this out,
>>>>> as we can't reliably check whether the start function has concluded
>>>>> already (there's always a race window).
>>>> This I do not agree scsw(r) is part of the driver.
>>>> The interface here is not a device interface anymore but a driver
>>>> interface.
>>>> SCSW is a status, it is at its place in QEMU device interface with the
>>>> guest
>>>> but here pwrite() sends a command.
>>> Hm, I rather consider that "we write a status, and the backend figures
>>> out what to do based on that status".
>>>    
>> The status of what? Kind of a target status?
>>
>> I think this approach is the source of lots of complications. For instance
>> take xsch. How are we supposed to react to a guest xsch (in QEMU and
>> in the kernel module)? My guess is that the right thing to do is to issue
>> an xsch in the vfio-ccw kernel module on the passed through subchannel.
>> But there is no bit in fctl for cancel.
>>
>> Bottom line is: I'm not happy with the current design but I'm not sure
>> if it's practical to do something about it (i.e. change it radically).
> It might make sense to keep this for ssch, maybe reuse it for hsch/csch,

I do not think we need to change the interface radically but
I also do not thing we should extend it by using multiple commands
in a single syscall.

Currently:
   - only SSCH bit is used
   - only the SSCH instruction is implemented
   - all other bits, CSCH,HSCH produce an error when used alone
     or are ignored in conjonction with SSCH
    - there is no implementation using the other bits
    - It is not specified in the documentation that multiple commands
      can be used.

Looking at these, I think there is no trouble to modify the way
the Kernel interface is implemented without impact on current QEMU.

But if we begin to allow ssch/hsch/csch in a single command
in a new implementation we will be stuck with it.

> and think about something else for other things we want to handle

Yes we will need to have another interface, ioctl, or new region,
all possible, but really more complex.

> (xsch, channel monitoring, the path handling stuff for which we already

We can use another region for getting up information on path handling
or monitoring, as does the patch IIRC.
This is not a problem.

> had a prototype etc.) It's probably not practical to do radical surgery
> on the existing code.

There is no need for radical surgery, no change is required to older or
current QEMU code.
My concern is to avoid a future implementation merging multiple commands
in a single syscall.
It is not only a problem of beauty of the interface,
using a status is for the up-stream, from device to program.
Using the same construct, same name and same location, to produce commands
for the down stream is misleading and source of incoherence.

Sorry to have insisted so much but it seems so obvious to me.
May be I missed something.


Regards,
Pierre

>
> [Speaking of which: Is there any current effort on the path handling
> things?]
>
Halil Pasic June 8, 2018, 8:40 p.m. UTC | #19
On 06/07/2018 06:34 PM, Cornelia Huck wrote:
> On Thu, 7 Jun 2018 18:17:57 +0200
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
>> On 06/07/2018 11:54 AM, Cornelia Huck wrote:
>>> Hm, I think we need to be more precise as to what scsw we're talking
>>> about. Bad ascii art time:
>>>
>>> --------------
>>> |   scsw(g)  |  ssch
>>> --------------   |
>>>                    |                                       guest
[..]

>> (5) AFAIK this is how the current implementation works. We don't wait
>> for the I/O interrupt on the host to present a cc to the guest for it's
>> ssch.
> 
> But the vfio code does wait, no? We just signal the interrupt via
> eventfd as well.
> 

We have sorted this out in the other thread.

>>
>>>
>>> If the guest now does a hsch, it would trap in the same way as the ssch
>>> before. When qemu gets control, it adds the halt bit in scsw(q) (which
>>> is in accordance with the architecture).
>>
>> (7) Again it's when is fctl set according to the architecture...
> 
> Same comment as above. If we do a hsch for a subchannel with the start
> function set, we'll set cc 0.
> 
>>
>>> My proposal is to do the same
>>> copying to scsw(r) again, which would mean we get a request with both
>>> the halt and the start bit set.
>>
>> (8) IMHO when receiving the 'request' we are and should be in instruction
>> context -- opposed to basic io function context. So we should not set fctl
>> before we know what will our guest cc be. But since scsw(r) is not a real
>> scsw it is just strange.
> 
> I think what we are doing is really 'performing the start function' -
> it's just not asynchronous in the current implementation. 

The code is written as if, especially in QEMU. But this was in my current
understanding a bad decision. The why is the following. It makes reasoning
both about architectural correctness and the code a lot trickier compared
to the interpretation of the guest instruction finishes after the host
instruction finishes (unless we can prove we don't need any) approach.


> So we already know that ssch will return with cc 0.
> 

I will use your example, and another example to explain what I mean
by tricky.

One can probably argue that setting cc 0 even if the host device
responds to the host ssch with cc 3 because the device is not any more
on the given subchannel or simply just disabled. It is probably true
that the guest would not have any means to prove that we were 'lying'
to it.

But AFAIR this is not how the current implementation works. The pwrite
in qemu basically depends on the cc of the host ssch. So if the host
ssch completes with cc 3 the vfio-ccw kernel module map ist to pwrite
reporting -ENODEV and vfio_ccw_handle_request makes sure that the
guest instruction completes with cc 3 by mapping it to return code
IOINST_CC_NOT_OPERATIONAL.

I mentioned xsch in the other thread. I don't think we can tell if
cc 0 or cc 2. In my reading xsch in simple words xsch completes with
cc 2 and does nothing else if the channel subsystem already started talking
to the cu/device. If in time it makes sure we don't start talking to the
device, and clear away stuff. So if we don't consider cc of the xsch
to be issued by the host the only safe bet seems to be cc 2. But that's
effectively getting around implementing the desired functionality of
xsch and still staying architecturally correct. Which however might
be good enough for vfio-ccw. But I think I demonstrated it's kinda
tricky business.

I prefer to avoid tricky if there is no good reason not to.

[..]

> 
> Thanks for reading!
> 

Your welcome. The discussion is kind of taking place all over the
place. I'm actively trying to find the best place to answer, and avoid
overtalking topics -- but it does not seem to work. Please bear with me.

Regards,
Halil
  
[..]
Halil Pasic June 8, 2018, 9:10 p.m. UTC | #20
On 06/08/2018 04:45 PM, Cornelia Huck wrote:
> On Fri, 8 Jun 2018 15:13:28 +0200
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
>> On 06/08/2018 02:20 PM, Cornelia Huck wrote:
>>>>> My proposal is to do the same
>>>>> copying to scsw(r) again, which would mean we get a request with both
>>>>> the halt and the start bit set. The vfio code now needs to do a hsch
>>>>> (instead of a ssch). The real channel subsystem should figure this out,
>>>>> as we can't reliably check whether the start function has concluded
>>>>> already (there's always a race window).
>>>> This I do not agree scsw(r) is part of the driver.
>>>> The interface here is not a device interface anymore but a driver
>>>> interface.
>>>> SCSW is a status, it is at its place in QEMU device interface with the
>>>> guest
>>>> but here pwrite() sends a command.
>>> Hm, I rather consider that "we write a status, and the backend figures
>>> out what to do based on that status".
>>>    
>>
>> The status of what? Kind of a target status?
>>
>> I think this approach is the source of lots of complications. For instance
>> take xsch. How are we supposed to react to a guest xsch (in QEMU and
>> in the kernel module)? My guess is that the right thing to do is to issue
>> an xsch in the vfio-ccw kernel module on the passed through subchannel.
>> But there is no bit in fctl for cancel.
>>
>> Bottom line is: I'm not happy with the current design but I'm not sure
>> if it's practical to do something about it (i.e. change it radically).
> 
> It might make sense to keep this for ssch, maybe reuse it for hsch/csch,
> and think about something else for other things we want to handle
> (xsch, channel monitoring, the path handling stuff for which we already
> had a prototype etc.) It's probably not practical to do radical surgery
> on the existing code.
> 


I'm reluctant to have a strong opinion. As far as i can tell ssch is
functionally quite good (see in the other sub-thread the part about host
ssch cc being reflected in the guest cc). I have the feeling the
implementation is at places unnecessarily complicated and at places
confusing and misleading (e.g.  the stale comment you have mentioned).
That feeling obviously has an impact on my confidence, e.g. the
confidence of my  'quite good' above.

I definitely don't have the time for even evaluating the prospects of a
radical surgery, let alone for making it happen. IMHO the key is not
making things worse as we proceed.

But I try to keep in touch and at least voice concern when I disagree. I
have been neglecting this series of yours and I feel bad about it. I even
lost track of the discussion and the conclusions (mainly between You and
Pierre). Your scsw write-up gave me the opportunity to connect.

I will try to do more for the next version, but it really depends on what
else do I have to do in parallel.

> [Speaking of which: Is there any current effort on the path handling
> things?]
> 

Dong Jia is the person with the best answers for this question. I hope
he will give us a piece of his mind about the design questions discussed
here too -- as the author he should have the best understanding of the
design decisions made.

Regards,
Halil
Cornelia Huck June 11, 2018, 11:12 a.m. UTC | #21
On Fri, 8 Jun 2018 22:40:39 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> Your welcome. The discussion is kind of taking place all over the
> place. I'm actively trying to find the best place to answer, and avoid
> overtalking topics -- but it does not seem to work. Please bear with me.

To help with this discussion (and with vfio-ccw design and development
in general), I've created
https://wiki.qemu.org/ToDo/Channel_I/O_Passthrough to collect some
things. Let me know if any of you folks still needs a wiki account.
Cornelia Huck June 11, 2018, 4 p.m. UTC | #22
On Fri, 8 Jun 2018 22:40:39 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

[I'm trying to not make the discussion branch out too much. Just
replying one more time here (and I'll add something to the wiki).]

> One can probably argue that setting cc 0 even if the host device
> responds to the host ssch with cc 3 because the device is not any more
> on the given subchannel or simply just disabled. It is probably true
> that the guest would not have any means to prove that we were 'lying'
> to it.

I would not frame this as 'lying'. We have an additional layer
inbetween, adding additional complications. As long as we reflect
something to the guest that (a) is covered by the architecture, and (b)
enables the guest to make reasonable decisions, we're fine.

> 
> But AFAIR this is not how the current implementation works. The pwrite
> in qemu basically depends on the cc of the host ssch. So if the host
> ssch completes with cc 3 the vfio-ccw kernel module map ist to pwrite
> reporting -ENODEV and vfio_ccw_handle_request makes sure that the
> guest instruction completes with cc 3 by mapping it to return code
> IOINST_CC_NOT_OPERATIONAL.

Will need to review the code again. But this behaviour is fine as well
by my reasoning above :)

> 
> I mentioned xsch in the other thread. I don't think we can tell if
> cc 0 or cc 2. In my reading xsch in simple words xsch completes with
> cc 2 and does nothing else if the channel subsystem already started talking
> to the cu/device. If in time it makes sure we don't start talking to the
> device, and clear away stuff. So if we don't consider cc of the xsch
> to be issued by the host the only safe bet seems to be cc 2. But that's
> effectively getting around implementing the desired functionality of
> xsch and still staying architecturally correct. Which however might
> be good enough for vfio-ccw. But I think I demonstrated it's kinda
> tricky business.

I'm wondering if we should simply give the guest a cc 2 in any case as
well.
> 
> I prefer to avoid tricky if there is no good reason not to.
Cornelia Huck June 12, 2018, 9:59 a.m. UTC | #23
On Fri, 8 Jun 2018 17:51:27 +0200
Pierre Morel <pmorel@linux.ibm.com> wrote:

> On 08/06/2018 16:45, Cornelia Huck wrote:
> > On Fri, 8 Jun 2018 15:13:28 +0200
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >  
> >> On 06/08/2018 02:20 PM, Cornelia Huck wrote:  
> >>>>> My proposal is to do the same
> >>>>> copying to scsw(r) again, which would mean we get a request with both
> >>>>> the halt and the start bit set. The vfio code now needs to do a hsch
> >>>>> (instead of a ssch). The real channel subsystem should figure this out,
> >>>>> as we can't reliably check whether the start function has concluded
> >>>>> already (there's always a race window).  
> >>>> This I do not agree scsw(r) is part of the driver.
> >>>> The interface here is not a device interface anymore but a driver
> >>>> interface.
> >>>> SCSW is a status, it is at its place in QEMU device interface with the
> >>>> guest
> >>>> but here pwrite() sends a command.  
> >>> Hm, I rather consider that "we write a status, and the backend figures
> >>> out what to do based on that status".
> >>>      
> >> The status of what? Kind of a target status?
> >>
> >> I think this approach is the source of lots of complications. For instance
> >> take xsch. How are we supposed to react to a guest xsch (in QEMU and
> >> in the kernel module)? My guess is that the right thing to do is to issue
> >> an xsch in the vfio-ccw kernel module on the passed through subchannel.
> >> But there is no bit in fctl for cancel.
> >>
> >> Bottom line is: I'm not happy with the current design but I'm not sure
> >> if it's practical to do something about it (i.e. change it radically).  
> > It might make sense to keep this for ssch, maybe reuse it for hsch/csch,  
> 
> I do not think we need to change the interface radically but
> I also do not thing we should extend it by using multiple commands
> in a single syscall.
> 
> Currently:
>    - only SSCH bit is used
>    - only the SSCH instruction is implemented
>    - all other bits, CSCH,HSCH produce an error when used alone
>      or are ignored in conjonction with SSCH
>     - there is no implementation using the other bits
>     - It is not specified in the documentation that multiple commands
>       can be used.

Looking at Documentation/s390/vfio-ccw.txt, it states that "scsw_area
should be filled with the SCSW of the Virtual Subchannel". This seems
to indicate that this is really intended to be a scsw... but I agree,
it does lack details.

> 
> Looking at these, I think there is no trouble to modify the way
> the Kernel interface is implemented without impact on current QEMU.
> 
> But if we begin to allow ssch/hsch/csch in a single command
> in a new implementation we will be stuck with it.

Yes, we're currently still free to go in different directions; adding
support for hsch/csch to the interface in the way I did would fix it.
In any case, we need better documentation :/

> 
> > and think about something else for other things we want to handle  
> 
> Yes we will need to have another interface, ioctl, or new region,
> all possible, but really more complex.
> 
> > (xsch, channel monitoring, the path handling stuff for which we already  
> 
> We can use another region for getting up information on path handling
> or monitoring, as does the patch IIRC.
> This is not a problem.

Not a problem, I agree (and yes, the patch did that).

For xsch, I like Halil's suggestion of simply always setting cc 2.

Channel monitoring is a difficult beast (need to pass through msch, mix
of virtual and passthrough devices on the machine which have different
semantics etc.) I put some of my concerns into
https://wiki.qemu.org/ToDo/Channel_I/O_Passthrough; please add to that
if you have further thoughts.

We should keep those requirements in mind, even if we won't implement
support for it right now.

> 
> > had a prototype etc.) It's probably not practical to do radical surgery
> > on the existing code.  
> 
> There is no need for radical surgery, no change is required to older or
> current QEMU code.
> My concern is to avoid a future implementation merging multiple commands
> in a single syscall.
> It is not only a problem of beauty of the interface,
> using a status is for the up-stream, from device to program.
> Using the same construct, same name and same location, to produce commands
> for the down stream is misleading and source of incoherence.

OK, I think I see your concern now.

What happens on the real hardware is that we do a ssch etc. and this
triggers a change that is visible in the scsw when we do a stsch.

What happens here is that the guest doing a ssch triggers a change in
our virtual scsw in QEMU (so far, so good); then we send this scsw to
the vfio module, which looks at the scsw to figure out that it should
do a ssch on the host. This works fine to figure out that a ssch needs
to be done, and would also work for hsch and csch, but it is really a
bit of reverse engineering, and it would probably fail for rsch
(haven't thought about that yet). To parse the intention of doing a
halt or clear out of the scsw_area, we need to rely (a) on user space
doing the right thing, and (b) on us implementing the rules for which
function can be initiated when correctly. If we treat fctl as a simple
command field, we'll just do what user space asks of us directly.

So, what are you proposing? Being more specific and stating that the
scsw is not necessarily a real scsw, but merely a vehicle for sending a
command? Or keeping it as it is now for ssch, and adding a second
interface for hsch/csch (and maybe rsch, msch, ...)?
Pierre Morel June 12, 2018, 1:56 p.m. UTC | #24
On 12/06/2018 11:59, Cornelia Huck wrote:
> On Fri, 8 Jun 2018 17:51:27 +0200
> Pierre Morel <pmorel@linux.ibm.com> wrote:
>
>> On 08/06/2018 16:45, Cornelia Huck wrote:
>>> On Fri, 8 Jun 2018 15:13:28 +0200
>>> Halil Pasic <pasic@linux.ibm.com> wrote:
>>>   
>>>> On 06/08/2018 02:20 PM, Cornelia Huck wrote:
>>>>>>> My proposal is to do the same
>>>>>>> copying to scsw(r) again, which would mean we get a request with both
>>>>>>> the halt and the start bit set. The vfio code now needs to do a hsch
>>>>>>> (instead of a ssch). The real channel subsystem should figure this out,
>>>>>>> as we can't reliably check whether the start function has concluded
>>>>>>> already (there's always a race window).
>>>>>> This I do not agree scsw(r) is part of the driver.
>>>>>> The interface here is not a device interface anymore but a driver
>>>>>> interface.
>>>>>> SCSW is a status, it is at its place in QEMU device interface with the
>>>>>> guest
>>>>>> but here pwrite() sends a command.
>>>>> Hm, I rather consider that "we write a status, and the backend figures
>>>>> out what to do based on that status".
>>>>>       
>>>> The status of what? Kind of a target status?
>>>>
>>>> I think this approach is the source of lots of complications. For instance
>>>> take xsch. How are we supposed to react to a guest xsch (in QEMU and
>>>> in the kernel module)? My guess is that the right thing to do is to issue
>>>> an xsch in the vfio-ccw kernel module on the passed through subchannel.
>>>> But there is no bit in fctl for cancel.
>>>>
>>>> Bottom line is: I'm not happy with the current design but I'm not sure
>>>> if it's practical to do something about it (i.e. change it radically).
>>> It might make sense to keep this for ssch, maybe reuse it for hsch/csch,
>> I do not think we need to change the interface radically but
>> I also do not thing we should extend it by using multiple commands
>> in a single syscall.
>>
>> Currently:
>>     - only SSCH bit is used
>>     - only the SSCH instruction is implemented
>>     - all other bits, CSCH,HSCH produce an error when used alone
>>       or are ignored in conjonction with SSCH
>>      - there is no implementation using the other bits
>>      - It is not specified in the documentation that multiple commands
>>        can be used.
> Looking at Documentation/s390/vfio-ccw.txt, it states that "scsw_area
> should be filled with the SCSW of the Virtual Subchannel". This seems
> to indicate that this is really intended to be a scsw... but I agree,
> it does lack details.
>
>> Looking at these, I think there is no trouble to modify the way
>> the Kernel interface is implemented without impact on current QEMU.
>>
>> But if we begin to allow ssch/hsch/csch in a single command
>> in a new implementation we will be stuck with it.
> Yes, we're currently still free to go in different directions; adding
> support for hsch/csch to the interface in the way I did would fix it.
> In any case, we need better documentation :/
>
>>> and think about something else for other things we want to handle
>> Yes we will need to have another interface, ioctl, or new region,
>> all possible, but really more complex.
>>
>>> (xsch, channel monitoring, the path handling stuff for which we already
>> We can use another region for getting up information on path handling
>> or monitoring, as does the patch IIRC.
>> This is not a problem.
> Not a problem, I agree (and yes, the patch did that).
>
> For xsch, I like Halil's suggestion of simply always setting cc 2.
>
> Channel monitoring is a difficult beast (need to pass through msch, mix
> of virtual and passthrough devices on the machine which have different
> semantics etc.) I put some of my concerns into
> https://wiki.qemu.org/ToDo/Channel_I/O_Passthrough; please add to that
> if you have further thoughts.
>
> We should keep those requirements in mind, even if we won't implement
> support for it right now.
>
>>> had a prototype etc.) It's probably not practical to do radical surgery
>>> on the existing code.
>> There is no need for radical surgery, no change is required to older or
>> current QEMU code.
>> My concern is to avoid a future implementation merging multiple commands
>> in a single syscall.
>> It is not only a problem of beauty of the interface,
>> using a status is for the up-stream, from device to program.
>> Using the same construct, same name and same location, to produce commands
>> for the down stream is misleading and source of incoherence.
> OK, I think I see your concern now.
>
> What happens on the real hardware is that we do a ssch etc. and this
> triggers a change that is visible in the scsw when we do a stsch.
>
> What happens here is that the guest doing a ssch triggers a change in
> our virtual scsw in QEMU (so far, so good); then we send this scsw to
> the vfio module, which looks at the scsw to figure out that it should
> do a ssch on the host. This works fine to figure out that a ssch needs
> to be done, and would also work for hsch and csch, but it is really a
> bit of reverse engineering, and it would probably fail for rsch
> (haven't thought about that yet). To parse the intention of doing a
> halt or clear out of the scsw_area, we need to rely (a) on user space
> doing the right thing, and (b) on us implementing the rules for which
> function can be initiated when correctly. If we treat fctl as a simple
> command field, we'll just do what user space asks of us directly.
>
> So, what are you proposing? Being more specific and stating that the
> scsw is not necessarily a real scsw, but merely a vehicle for sending a
> command? Or keeping it as it is now for ssch, and adding a second
> interface for hsch/csch (and maybe rsch, msch, ...)?
>


I said no radical surgery, but after thinking more about it...
I am not sure.

Let's explain this:

I see 2 ways to proceed but my favorite is definitively to introduce 
versioning.


Way 1)

This was the way I first thought about.
We keep the existing IO Regionand structures, and are more
specific by stating that the io_region is a command region during write
entry and a status region during interrupt handling:
This allow us to use the 3 bits of the FCTL field of the SCSW to pass
commands to the kernel and stay backward compatible.
The FCTL field has 3 bits => we can have 8 commands.

PRO: small change

CONTRA: This is still confusing, we do not really solve this
         unclarity problem since QEMU view / documentation and
         Linux view / documentation differ or we update QEMU.
         Moreover this does not allow for long term extensions
         and/or re-design.



Way 2)

We use the device VFIO versioning using the capability chain to advertise
a new interface.

This the preferred way, it is sane, allows for the userland backward
compatibility and allows to have a new command interface, extensible
for future use.

In this case we can extend the interface to be any kind we want
in a next version, pwrite with new io_region, mmap on new
IO regions, status region...

PRO: Extensible and also goes in the VFIO INFO extension direction
      proposed by Alex

CONTRA: I see none outer more work to do (but is it a problem?)


====================

Extra argumentation for versioning support


Suppose a future implementation with 4 mapped regions like
the following:

- A Status region (RO updated as result of command and IRQ)

- A command region (WO where the user send its commands)
   userland write here to trigger IO (quite as currently)

- A CCW program region (RW where the CCW chain is handled)
   most handling done from userspace, last translations in kernel,
   double buffering...

- A performance / measurement / statistics region (RO)
   This is updated asynchronously by hardware and/or driver

This is purely theoretical and we do not need to do all at once
but if we want to extend the implementation without problems
and continue backward compatibility the versioning and capability
handling is a must.

====================


Regards,

Pierre
Halil Pasic June 12, 2018, 2:08 p.m. UTC | #25
On 06/12/2018 03:56 PM, Pierre Morel wrote:
>> So, what are you proposing? Being more specific and stating that the
>> scsw is not necessarily a real scsw, but merely a vehicle for sending a
>> command? Or keeping it as it is now for ssch, and adding a second
>> interface for hsch/csch (and maybe rsch, msch, ...)?
>>
> 
> 
> I said no radical surgery, but after thinking more about it...
> I am not sure.
> 
> Let's explain this:
> 
> I see 2 ways to proceed but my favorite is definitively to introduce versioning.
> 
> 
> Way 1)
> 
> This was the way I first thought about.
> We keep the existing IO Regionand structures, and are more
> specific by stating that the io_region is a command region during write
> entry and a status region during interrupt handling:
> This allow us to use the 3 bits of the FCTL field of the SCSW to pass
> commands to the kernel and stay backward compatible.
> The FCTL field has 3 bits => we can have 8 commands.
> 
> PRO: small change
> 
> CONTRA: This is still confusing, we do not really solve this
>          unclarity problem since QEMU view / documentation and
>          Linux view / documentation differ or we update QEMU.
>          Moreover this does not allow for long term extensions
>          and/or re-design.
> 
> 

I'm not really in favor of way 1. Conie's point with RSCH is a good one.
And IMHO it speaks for a new interface for commands. Squeezing the RSCH
command into the SCSW does not seem to be a good idea. Considering your
proposal with the 3 bits, we could do something like: if in FCTL the
start and the clear and the halt bits are set then we postulate that is
request for a resume. But that would be quite confusing, and we would end
up re-defining the semantic of the scsw_area -- in respect to what is
documented Documentation/s390/vfio-ccw.txt, and also what is intuitive
based on the uapi header.

> 
> Way 2)
> 
> We use the device VFIO versioning using the capability chain to advertise
> a new interface.
> 
> This the preferred way, it is sane, allows for the userland backward
> compatibility and allows to have a new command interface, extensible
> for future use.
> 
> In this case we can extend the interface to be any kind we want
> in a next version, pwrite with new io_region, mmap on new
> IO regions, status region...
> 
> PRO: Extensible and also goes in the VFIO INFO extension direction
>       proposed by Alex
> 


Sounds much better. I imagine the coexistence of old and new like this.
Both the old and the new QEMU would supply the the SCSW area with the old
documented semantics -- the SCSW of the virtual subchannel. But with the
new version an explicit command would be supplied via the command region
(also for  SSCH). Maybe the SCSW can still end up being useful for
something in the kernel module too (maybe there are some  optimization
that can be done based on the QEMU SCSW).


> CONTRA: I see none outer more work to do (but is it a problem?)
> 
> 
The problem I see is that designing a good interface usually hard.
I could help with review, but I don't have the resources to commit
to more than that.

> ====================
> 
> Extra argumentation for versioning support
> 
> 
> Suppose a future implementation with 4 mapped regions like
> the following:
> 
> - A Status region (RO updated as result of command and IRQ)
> 
> - A command region (WO where the user send its commands)
>    userland write here to trigger IO (quite as currently)
> 
> - A CCW program region (RW where the CCW chain is handled)
>    most handling done from userspace, last translations in kernel,
>    double buffering...
> 
> - A performance / measurement / statistics region (RO)
>    This is updated asynchronously by hardware and/or driver
> 
> This is purely theoretical and we do not need to do all at once
> but if we want to extend the implementation without problems
> and continue backward compatibility the versioning and capability
> handling is a must.

I'm not sure about this.


Regards,
Halil
[..]
Cornelia Huck June 12, 2018, 3:25 p.m. UTC | #26
On Tue, 12 Jun 2018 16:08:42 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On 06/12/2018 03:56 PM, Pierre Morel wrote:
> >> So, what are you proposing? Being more specific and stating that the
> >> scsw is not necessarily a real scsw, but merely a vehicle for sending a
> >> command? Or keeping it as it is now for ssch, and adding a second
> >> interface for hsch/csch (and maybe rsch, msch, ...)?
> >>  
> > 
> > 
> > I said no radical surgery, but after thinking more about it...
> > I am not sure.
> > 
> > Let's explain this:
> > 
> > I see 2 ways to proceed but my favorite is definitively to introduce versioning.
> > 
> > 
> > Way 1)
> > 
> > This was the way I first thought about.
> > We keep the existing IO Regionand structures, and are more
> > specific by stating that the io_region is a command region during write
> > entry and a status region during interrupt handling:
> > This allow us to use the 3 bits of the FCTL field of the SCSW to pass
> > commands to the kernel and stay backward compatible.  
> > The FCTL field has 3 bits => we can have 8 commands.  
> > 
> > PRO: small change
> > 
> > CONTRA: This is still confusing, we do not really solve this
> >          unclarity problem since QEMU view / documentation and
> >          Linux view / documentation differ or we update QEMU.
> >          Moreover this does not allow for long term extensions
> >          and/or re-design.
> > 
> >   
> 
> I'm not really in favor of way 1. Conie's point with RSCH is a good one.
> And IMHO it speaks for a new interface for commands. Squeezing the RSCH
> command into the SCSW does not seem to be a good idea. Considering your
> proposal with the 3 bits, we could do something like: if in FCTL the
> start and the clear and the halt bits are set then we postulate that is
> request for a resume. But that would be quite confusing, and we would end
> up re-defining the semantic of the scsw_area -- in respect to what is
> documented Documentation/s390/vfio-ccw.txt, and also what is intuitive
> based on the uapi header.

Agreed. Making scsw_area something like an scsw but still different is
bound to be confusing, even if documented, and I'm not sure it covers
all our bases anyway. Just using the halt/clear bits might have been
feasible, but as that does not cover rsch, we need something different
anyway.

> 
> > 
> > Way 2)
> > 
> > We use the device VFIO versioning using the capability chain to advertise
> > a new interface.
> > 
> > This the preferred way, it is sane, allows for the userland backward
> > compatibility and allows to have a new command interface, extensible
> > for future use.
> > 
> > In this case we can extend the interface to be any kind we want
> > in a next version, pwrite with new io_region, mmap on new
> > IO regions, status region...
> > 
> > PRO: Extensible and also goes in the VFIO INFO extension direction
> >       proposed by Alex
> >   
> 
> 
> Sounds much better. I imagine the coexistence of old and new like this.
> Both the old and the new QEMU would supply the the SCSW area with the old
> documented semantics -- the SCSW of the virtual subchannel. But with the
> new version an explicit command would be supplied via the command region
> (also for  SSCH). Maybe the SCSW can still end up being useful for
> something in the kernel module too (maybe there are some  optimization
> that can be done based on the QEMU SCSW).

We need to keep the old interface anyway. But yes, I think capabilities
are the way to go.

> 
> 
> > CONTRA: I see none outer more work to do (but is it a problem?)
> > 
> >   
> The problem I see is that designing a good interface usually hard.

I fear that this is always the case :)

> I could help with review, but I don't have the resources to commit
> to more than that.

I'm looking into the halt/clear thing anyway. But review is appreciated.

> 
> > ====================
> > 
> > Extra argumentation for versioning support
> > 
> > 
> > Suppose a future implementation with 4 mapped regions like
> > the following:
> > 
> > - A Status region (RO updated as result of command and IRQ)

scsw/pmcw/anything else? Would also accommodate the path handling
stuff, I think.

> > 
> > - A command region (WO where the user send its commands)
> >    userland write here to trigger IO (quite as currently)
> > 
> > - A CCW program region (RW where the CCW chain is handled)
> >    most handling done from userspace, last translations in kernel,
> >    double buffering...

I'm not sure about that. But in any case, we can add this later on. We
need to keep the orb as it is now, and that should already cover our
current use cases.

> > 
> > - A performance / measurement / statistics region (RO)
> >    This is updated asynchronously by hardware and/or driver

For channel measurements, for example? Makes sense. (I recall that
there's also a measurement infrastructure triggered via CHSC, but I
don't have the documentation.)

> > 
> > This is purely theoretical and we do not need to do all at once
> > but if we want to extend the implementation without problems
> > and continue backward compatibility the versioning and capability
> > handling is a must.  
> 
> I'm not sure about this.

We can think about this later, the capabilities infrastructure enables
us to do so.
diff mbox

Patch

diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index ea6a2d0b2894..6212d577117f 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -76,8 +76,14 @@  static void vfio_ccw_sch_io_todo(struct work_struct *work)
 	irb = &private->irb;
 
 	if (scsw_is_solicited(&irb->scsw)) {
-		cp_update_scsw(&private->cp, &irb->scsw);
-		cp_free(&private->cp);
+		/*
+		 * For the start function, we need to update based on the
+		 * channel program. Nothing needs to be done for halt/clear.
+		 */
+		if (scsw_fctl(&irb->scsw) & SCSW_FCTL_START_FUNC) {
+			cp_update_scsw(&private->cp, &irb->scsw);
+			cp_free(&private->cp);
+		}
 	}
 	memcpy(private->io_region.irb_area, irb, sizeof(*irb));
 
diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
index 3c800642134e..94cae2984c35 100644
--- a/drivers/s390/cio/vfio_ccw_fsm.c
+++ b/drivers/s390/cio/vfio_ccw_fsm.c
@@ -3,8 +3,10 @@ 
  * Finite state machine for vfio-ccw device handling
  *
  * Copyright IBM Corp. 2017
+ * Copyright Red Hat, Inc. 2018
  *
  * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ *            Cornelia Huck <cohuck@redhat.com>
  */
 
 #include <linux/vfio.h>
@@ -65,6 +67,70 @@  static int fsm_io_helper(struct vfio_ccw_private *private)
 	return ret;
 }
 
+static int fsm_halt_helper(struct vfio_ccw_private *private)
+{
+	struct subchannel *sch;
+	int ccode;
+	unsigned long flags;
+	int ret;
+
+	sch = private->sch;
+
+	spin_lock_irqsave(sch->lock, flags);
+	private->state = VFIO_CCW_STATE_BUSY;
+
+	/* Issue "Halt Subchannel" */
+	ccode = hsch(sch->schid);
+
+	switch (ccode) {
+	case 0:
+		/*
+		 * Initialize device status information
+		 */
+		sch->schib.scsw.cmd.actl |= SCSW_ACTL_HALT_PEND;
+		ret = 0;
+		break;
+	case 1:		/* Status pending */
+	case 2:		/* Busy */
+		ret = -EBUSY;
+		break;
+	default:	/* Device not operational */
+		ret = -ENODEV;
+	}
+	spin_unlock_irqrestore(sch->lock, flags);
+	return ret;
+}
+
+static int fsm_clear_helper(struct vfio_ccw_private *private)
+{
+	struct subchannel *sch;
+	int ccode;
+	unsigned long flags;
+	int ret;
+
+	sch = private->sch;
+
+	spin_lock_irqsave(sch->lock, flags);
+	private->state = VFIO_CCW_STATE_BUSY;
+
+	/* Issue "Clear Subchannel" */
+	ccode = csch(sch->schid);
+
+	switch (ccode) {
+	case 0:
+		/*
+		 * Initialize device status information
+		 */
+		sch->schib.scsw.cmd.actl |= SCSW_ACTL_CLEAR_PEND;
+		ret = 0;
+		break;
+	default:	/* Device not operational */
+		ret = -ENODEV;
+	}
+	spin_unlock_irqrestore(sch->lock, flags);
+	return ret;
+}
+
 static void fsm_notoper(struct vfio_ccw_private *private,
 			enum vfio_ccw_event event)
 {
@@ -126,7 +192,24 @@  static void fsm_io_request(struct vfio_ccw_private *private,
 
 	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
 
-	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
+	/*
+	 * Start processing with the clear function, then halt, then start.
+	 * We may still be start pending when the caller wants to clean
+	 * up things via halt/clear.
+	 */
+	if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
+		/* issue clear and wait for interupt */
+		io_region->ret_code = fsm_clear_helper(private);
+		if (io_region->ret_code)
+			goto err_out;
+		return;
+	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
+		/* issue halt and wait for interrupt */
+		io_region->ret_code = fsm_halt_helper(private);
+		if (io_region->ret_code)
+			goto err_out;
+		return;
+	} else if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
 		orb = (union orb *)io_region->orb_area;
 
 		/* Don't try to build a cp if transport mode is specified. */
@@ -152,16 +235,7 @@  static void fsm_io_request(struct vfio_ccw_private *private,
 			goto err_out;
 		}
 		return;
-	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
-		/* XXX: Handle halt. */
-		io_region->ret_code = -EOPNOTSUPP;
-		goto err_out;
-	} else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
-		/* XXX: Handle clear. */
-		io_region->ret_code = -EOPNOTSUPP;
-		goto err_out;
 	}
-
 err_out:
 	private->state = VFIO_CCW_STATE_IDLE;
 }