diff mbox series

[kvm-unit-tests,v1,3/6] s390x: lib: css: upgrading IRQ handling

Message ID 1616073988-10381-4-git-send-email-pmorel@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series Testing SSCH, CSCH and HSCH for errors | expand

Commit Message

Pierre Morel March 18, 2021, 1:26 p.m. UTC
Until now we had very few usage of interrupts, to be able to handle
several interrupts coming up asynchronously we need to take care
to save the previous interrupt before handling the next one.

Signed-off-by: Pierre Morel <pmorel@linux.ibm.com>
---
 lib/s390x/css.h     |  29 +++++++++++
 lib/s390x/css_lib.c | 117 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 120 insertions(+), 26 deletions(-)

Comments

Pierre Morel March 18, 2021, 2:22 p.m. UTC | #1
hum, I just saw I forgot to upgrade the report of the residual count.
This is not really relevant for this series.
Of course I will adapt it for the v2.


On 3/18/21 2:26 PM, Pierre Morel wrote:
> Until now we had very few usage of interrupts, to be able to handle
> several interrupts coming up asynchronously we need to take care
> to save the previous interrupt before handling the next one.
> 
> Signed-off-by: Pierre Morel <pmorel@linux.ibm.com>
> ---
>   lib/s390x/css.h     |  29 +++++++++++
>   lib/s390x/css_lib.c | 117 ++++++++++++++++++++++++++++++++++----------
>   2 files changed, 120 insertions(+), 26 deletions(-)
> 
> diff --git a/lib/s390x/css.h b/lib/s390x/css.h
> index 460b0bd..65fc335 100644
> --- a/lib/s390x/css.h
> +++ b/lib/s390x/css.h
> @@ -425,4 +425,33 @@ struct measurement_block_format1 {
>   	uint32_t irq_prio_delay_time;
>   };
>   
> +struct irq_entry {
> +	struct irq_entry *next;
> +	struct irb irb;
> +	uint32_t sid;
> +};
> +
> +struct irq_entry *alloc_irq(void);
> +struct irq_entry *get_irq(void);
> +void put_irq(struct irq_entry *irq);
> +
> +#include <asm/arch_def.h>
> +static inline void disable_io_irq(void)
> +{
> +	uint64_t mask;
> +
> +	mask = extract_psw_mask();
> +	mask &= ~PSW_MASK_IO;
> +	load_psw_mask(mask);
> +}
> +
> +static inline void enable_io_irq(void)
> +{
> +	uint64_t mask;
> +
> +	mask = extract_psw_mask();
> +	mask |= PSW_MASK_IO;
> +	load_psw_mask(mask);
> +}
> +
>   #endif
> diff --git a/lib/s390x/css_lib.c b/lib/s390x/css_lib.c
> index f8db205..211c73c 100644
> --- a/lib/s390x/css_lib.c
> +++ b/lib/s390x/css_lib.c
> @@ -9,6 +9,8 @@
>    */
>   #include <libcflat.h>
>   #include <alloc_phys.h>
> +#include <util.h>
> +#include <alloc.h>
>   #include <asm/page.h>
>   #include <string.h>
>   #include <interrupt.h>
> @@ -22,6 +24,46 @@
>   struct schib schib;
>   struct chsc_scsc *chsc_scsc;
>   
> +static struct irq_entry *irqs;
> +
> +struct irq_entry *get_irq(void)
> +{
> +	struct irq_entry *irq = NULL;
> +
> +	if (irqs) {
> +		irq = irqs;
> +		irqs = irq->next;
> +	}
> +	return irq;
> +}
> +
> +void put_irq(struct irq_entry *irq)
> +{
> +	free(irq);
> +}
> +
> +static void save_irq(struct irq_entry *irq)
> +{
> +	struct irq_entry *e;
> +
> +	if (!irqs) {
> +		irqs = irq;
> +	} else {
> +		e = irqs;
> +		while (e && e->next)
> +			e = e->next;
> +		e->next = irq;
> +	}
> +}
> +
> +struct irq_entry *alloc_irq(void)
> +{
> +	struct irq_entry *irq;
> +
> +	irq = calloc(1, sizeof(*irq));
> +	return irq;
> +}
> +
>   static const char * const chsc_rsp_description[] = {
>   	"CHSC unknown error",
>   	"Command executed",
> @@ -422,38 +464,38 @@ static struct irb irb;
>   void css_irq_io(void)
>   {
>   	int ret = 0;
> -	char *flags;
> -	int sid;
> +	struct irq_entry *irq;
>   
>   	report_prefix_push("Interrupt");
> -	sid = lowcore_ptr->subsys_id_word;
> +	irq = alloc_irq();
> +	assert(irq);
> +
> +	irq->sid = lowcore_ptr->subsys_id_word;
>   	/* Lowlevel set the SID as interrupt parameter. */
> -	if (lowcore_ptr->io_int_param != sid) {
> +	if (lowcore_ptr->io_int_param != irq->sid) {
>   		report(0,
>   		       "io_int_param: %x differs from subsys_id_word: %x",
> -		       lowcore_ptr->io_int_param, sid);
> +		       lowcore_ptr->io_int_param, irq->sid);
>   		goto pop;
>   	}
>   	report_prefix_pop();
>   
>   	report_prefix_push("tsch");
> -	ret = tsch(sid, &irb);
> +	ret = tsch(irq->sid, &irq->irb);
>   	switch (ret) {
>   	case 1:
> -		dump_irb(&irb);
> -		flags = dump_scsw_flags(irb.scsw.ctrl);
> -		report(0,
> -		       "I/O interrupt, but tsch returns CC 1 for subchannel %08x. SCSW flags: %s",
> -		       sid, flags);
> +		report_info("no status pending on %08x : %s", irq->sid,
> +			    dump_scsw_flags(irq->irb.scsw.ctrl));
>   		break;
>   	case 2:
>   		report(0, "tsch returns unexpected CC 2");
>   		break;
>   	case 3:
> -		report(0, "tsch reporting sch %08x as not operational", sid);
> +		report(0, "tsch reporting sch %08x as not operational", irq->sid);
>   		break;
>   	case 0:
>   		/* Stay humble on success */
> +		save_irq(irq);
>   		break;
>   	}
>   pop:
> @@ -498,47 +540,70 @@ struct ccw1 *ccw_alloc(int code, void *data, int count, unsigned char flags)
>   int wait_and_check_io_completion(int schid)
>   {
>   	int ret = 0;
> -
> -	wait_for_interrupt(PSW_MASK_IO);
> +	struct irq_entry *irq = NULL;
>   
>   	report_prefix_push("check I/O completion");
>   
> -	if (lowcore_ptr->io_int_param != schid) {
> +	disable_io_irq();
> +	irq = get_irq();
> +	while (!irq) {
> +		wait_for_interrupt(PSW_MASK_IO);
> +		disable_io_irq();
> +		irq = get_irq();
> +		report_info("next try");
> +	}
> +	enable_io_irq();
> +
> +	assert(irq);
> +
> +	if (irq->sid != schid) {
>   		report(0, "interrupt parameter: expected %08x got %08x",
> -		       schid, lowcore_ptr->io_int_param);
> +		       schid, irq->sid);
>   		ret = -1;
>   		goto end;
>   	}
>   
>   	/* Verify that device status is valid */
> -	if (!(irb.scsw.ctrl & SCSW_SC_PENDING)) {
> -		report(0, "No status pending after interrupt. Subch Ctrl: %08x",
> -		       irb.scsw.ctrl);
> -		ret = -1;
> +	if (!(irq->irb.scsw.ctrl & SCSW_SC_PENDING)) {
> +		ret = 0;
>   		goto end;
>   	}
>   
> -	if (!(irb.scsw.ctrl & (SCSW_SC_SECONDARY | SCSW_SC_PRIMARY))) {
> +	/* clear and halt pending are valid even without secondary or primary status */
> +	if (irq->irb.scsw.ctrl & (SCSW_FC_CLEAR | SCSW_FC_HALT)) {
> +		ret = 0;
> +		goto end;
> +	}
> +
> +	/* For start pending we need at least one of primary or secondary status */
> +	if (!(irq->irb.scsw.ctrl & (SCSW_SC_SECONDARY | SCSW_SC_PRIMARY))) {
>   		report(0, "Primary or secondary status missing. Subch Ctrl: %08x",
> -		       irb.scsw.ctrl);
> +		       irq->irb.scsw.ctrl);
>   		ret = -1;
>   		goto end;
>   	}
>   
> -	if (!(irb.scsw.dev_stat & (SCSW_DEVS_DEV_END | SCSW_DEVS_SCH_END))) {
> +	/* For start pending we also need to have device or channel end information */
> +	if (!(irq->irb.scsw.dev_stat & (SCSW_DEVS_DEV_END | SCSW_DEVS_SCH_END))) {
>   		report(0, "No device end or sch end. Dev. status: %02x",
> -		       irb.scsw.dev_stat);
> +		       irq->irb.scsw.dev_stat);
>   		ret = -1;
>   		goto end;
>   	}
>   
> -	if (irb.scsw.sch_stat & ~SCSW_SCHS_IL) {
> -		report_info("Unexpected Subch. status %02x", irb.scsw.sch_stat);
> +	/* We only accept the SubCHannel Status for Illegal Length */
> +	if (irq->irb.scsw.sch_stat & ~SCSW_SCHS_IL) {
> +		report_info("Unexpected Subch. status %02x",
> +			    irq->irb.scsw.sch_stat);
>   		ret = -1;
>   		goto end;
>   	}
>   
>   end:
> +	if (ret)
> +		dump_irb(&irq->irb);
> +
> +	put_irq(irq);
>   	report_prefix_pop();
>   	return ret;
>   }
>
Cornelia Huck March 19, 2021, 11:01 a.m. UTC | #2
On Thu, 18 Mar 2021 14:26:25 +0100
Pierre Morel <pmorel@linux.ibm.com> wrote:

> Until now we had very few usage of interrupts, to be able to handle
> several interrupts coming up asynchronously we need to take care
> to save the previous interrupt before handling the next one.

An alternative would be to keep I/O interrupts disabled until you are
done with processing any information that might be overwritten.

> 
> Signed-off-by: Pierre Morel <pmorel@linux.ibm.com>
> ---
>  lib/s390x/css.h     |  29 +++++++++++
>  lib/s390x/css_lib.c | 117 ++++++++++++++++++++++++++++++++++----------
>  2 files changed, 120 insertions(+), 26 deletions(-)
> 
> diff --git a/lib/s390x/css.h b/lib/s390x/css.h
> index 460b0bd..65fc335 100644
> --- a/lib/s390x/css.h
> +++ b/lib/s390x/css.h
> @@ -425,4 +425,33 @@ struct measurement_block_format1 {
>  	uint32_t irq_prio_delay_time;
>  };
>  
> +struct irq_entry {
> +	struct irq_entry *next;
> +	struct irb irb;
> +	uint32_t sid;

I'm wondering whether that set of information make sense for saving.

We basically have some things in the lowcore that get overwritten by
subsequent I/O interrupts (in addition to the sid the intparm and the
interrupt identification word which contains the isc), and the irb,
which only gets overwritten if you do a tsch into the same memory area.
So, if you need to save some things, I'd suggest to add the intparm and
the interrupt identification word to it. Not sure whether the irb can
be handled independently? Need to read code first :)

> +};

(...)

> @@ -422,38 +464,38 @@ static struct irb irb;
>  void css_irq_io(void)
>  {
>  	int ret = 0;
> -	char *flags;
> -	int sid;
> +	struct irq_entry *irq;
>  
>  	report_prefix_push("Interrupt");
> -	sid = lowcore_ptr->subsys_id_word;
> +	irq = alloc_irq();
> +	assert(irq);
> +
> +	irq->sid = lowcore_ptr->subsys_id_word;
>  	/* Lowlevel set the SID as interrupt parameter. */
> -	if (lowcore_ptr->io_int_param != sid) {
> +	if (lowcore_ptr->io_int_param != irq->sid) {
>  		report(0,
>  		       "io_int_param: %x differs from subsys_id_word: %x",
> -		       lowcore_ptr->io_int_param, sid);
> +		       lowcore_ptr->io_int_param, irq->sid);
>  		goto pop;
>  	}
>  	report_prefix_pop();
>  
>  	report_prefix_push("tsch");
> -	ret = tsch(sid, &irb);
> +	ret = tsch(irq->sid, &irq->irb);
>  	switch (ret) {
>  	case 1:
> -		dump_irb(&irb);
> -		flags = dump_scsw_flags(irb.scsw.ctrl);
> -		report(0,
> -		       "I/O interrupt, but tsch returns CC 1 for subchannel %08x. SCSW flags: %s",
> -		       sid, flags);
> +		report_info("no status pending on %08x : %s", irq->sid,
> +			    dump_scsw_flags(irq->irb.scsw.ctrl));

This is not what you are looking at here, though?

The problem is that the hypervisor gave you cc 1 (stored, not status
pending) while you just got an interrupt; the previous message logged
that, while the new one does not. (The scsw flags are still
interesting, as it gives further information about the mismatch.)

>  		break;
>  	case 2:
>  		report(0, "tsch returns unexpected CC 2");
>  		break;
>  	case 3:
> -		report(0, "tsch reporting sch %08x as not operational", sid);
> +		report(0, "tsch reporting sch %08x as not operational", irq->sid);
>  		break;
>  	case 0:
>  		/* Stay humble on success */
> +		save_irq(irq);
>  		break;
>  	}
>  pop:
> @@ -498,47 +540,70 @@ struct ccw1 *ccw_alloc(int code, void *data, int count, unsigned char flags)
>  int wait_and_check_io_completion(int schid)
>  {
>  	int ret = 0;
> -
> -	wait_for_interrupt(PSW_MASK_IO);
> +	struct irq_entry *irq = NULL;
>  
>  	report_prefix_push("check I/O completion");
>  
> -	if (lowcore_ptr->io_int_param != schid) {
> +	disable_io_irq();
> +	irq = get_irq();
> +	while (!irq) {
> +		wait_for_interrupt(PSW_MASK_IO);
> +		disable_io_irq();

Isn't the disable_io_irq() redundant here?

(In general, I'm a bit confused about the I/O interrupt handling here.
Might need to read through the whole thing again.)

> +		irq = get_irq();
> +		report_info("next try");
> +	}
> +	enable_io_irq();
> +
> +	assert(irq);
> +
> +	if (irq->sid != schid) {
>  		report(0, "interrupt parameter: expected %08x got %08x",
> -		       schid, lowcore_ptr->io_int_param);
> +		       schid, irq->sid);
>  		ret = -1;
>  		goto end;

You're still expecting that there's only one subchannel enabled for I/O
interrupts at the same time, right?

>  	}
>  
>  	/* Verify that device status is valid */
> -	if (!(irb.scsw.ctrl & SCSW_SC_PENDING)) {
> -		report(0, "No status pending after interrupt. Subch Ctrl: %08x",
> -		       irb.scsw.ctrl);
> -		ret = -1;
> +	if (!(irq->irb.scsw.ctrl & SCSW_SC_PENDING)) {

Confused. An I/O interrupt for a subchannel that is not status pending
is surely an issue?

> +		ret = 0;
>  		goto end;
>  	}
>  
> -	if (!(irb.scsw.ctrl & (SCSW_SC_SECONDARY | SCSW_SC_PRIMARY))) {
> +	/* clear and halt pending are valid even without secondary or primary status */
> +	if (irq->irb.scsw.ctrl & (SCSW_FC_CLEAR | SCSW_FC_HALT)) {

Can you factor out the new/changed checks here into a separate patch?
Would make the change easier to follow.

Also, you might want to check other things for halt/clear as well?

> +		ret = 0;
> +		goto end;
> +	}
> +
> +	/* For start pending we need at least one of primary or secondary status */
> +	if (!(irq->irb.scsw.ctrl & (SCSW_SC_SECONDARY | SCSW_SC_PRIMARY))) {
>  		report(0, "Primary or secondary status missing. Subch Ctrl: %08x",
> -		       irb.scsw.ctrl);
> +		       irq->irb.scsw.ctrl);

I'm wondering whether that is actually true. Maybe need to double check
what happens with deferred ccs etc.

>  		ret = -1;
>  		goto end;
>  	}
>  
> -	if (!(irb.scsw.dev_stat & (SCSW_DEVS_DEV_END | SCSW_DEVS_SCH_END))) {
> +	/* For start pending we also need to have device or channel end information */
> +	if (!(irq->irb.scsw.dev_stat & (SCSW_DEVS_DEV_END | SCSW_DEVS_SCH_END))) {
>  		report(0, "No device end or sch end. Dev. status: %02x",
> -		       irb.scsw.dev_stat);
> +		       irq->irb.scsw.dev_stat);

Again, not sure whether that is true in any case (surely for the good
path, and I think for unit check as well; but ISTR that there can be
error conditions where we won't get another interrupt for the same I/O,
but device end is not set, because the error occurred before we even
reached the device... should those be logged?)

>  		ret = -1;
>  		goto end;
>  	}
>  
> -	if (irb.scsw.sch_stat & ~SCSW_SCHS_IL) {
> -		report_info("Unexpected Subch. status %02x", irb.scsw.sch_stat);
> +	/* We only accept the SubCHannel Status for Illegal Length */

It's more like that we just don't deal with any of the other subchannel
status flags, right?

> +	if (irq->irb.scsw.sch_stat & ~SCSW_SCHS_IL) {
> +		report_info("Unexpected Subch. status %02x",
> +			    irq->irb.scsw.sch_stat);
>  		ret = -1;
>  		goto end;
>  	}
>  
>  end:
> +	if (ret)
> +		dump_irb(&irq->irb);
> +
> +	put_irq(irq);
>  	report_prefix_pop();
>  	return ret;
>  }
Pierre Morel March 19, 2021, 3:55 p.m. UTC | #3
On 3/19/21 12:01 PM, Cornelia Huck wrote:
> On Thu, 18 Mar 2021 14:26:25 +0100
> Pierre Morel <pmorel@linux.ibm.com> wrote:
> 
>> Until now we had very few usage of interrupts, to be able to handle
>> several interrupts coming up asynchronously we need to take care
>> to save the previous interrupt before handling the next one.
> 
> An alternative would be to keep I/O interrupts disabled until you are
> done with processing any information that might be overwritten.
> 
>>
>> Signed-off-by: Pierre Morel <pmorel@linux.ibm.com>
>> ---
>>   lib/s390x/css.h     |  29 +++++++++++
>>   lib/s390x/css_lib.c | 117 ++++++++++++++++++++++++++++++++++----------
>>   2 files changed, 120 insertions(+), 26 deletions(-)
>>
>> diff --git a/lib/s390x/css.h b/lib/s390x/css.h
>> index 460b0bd..65fc335 100644
>> --- a/lib/s390x/css.h
>> +++ b/lib/s390x/css.h
>> @@ -425,4 +425,33 @@ struct measurement_block_format1 {
>>   	uint32_t irq_prio_delay_time;
>>   };
>>   
>> +struct irq_entry {
>> +	struct irq_entry *next;
>> +	struct irb irb;
>> +	uint32_t sid;
> 
> I'm wondering whether that set of information make sense for saving.
> 
> We basically have some things in the lowcore that get overwritten by
> subsequent I/O interrupts (in addition to the sid the intparm and the
> interrupt identification word which contains the isc), and the irb,
> which only gets overwritten if you do a tsch into the same memory area.
> So, if you need to save some things, I'd suggest to add the intparm and
> the interrupt identification word to it. Not sure whether the irb can
> be handled independently? Need to read code first :)

That is right.
I only saved what I needed for the moment.

> 
>> +};
> 
> (...)
> 
>> @@ -422,38 +464,38 @@ static struct irb irb;
>>   void css_irq_io(void)
>>   {
>>   	int ret = 0;
>> -	char *flags;
>> -	int sid;
>> +	struct irq_entry *irq;
>>   
>>   	report_prefix_push("Interrupt");
>> -	sid = lowcore_ptr->subsys_id_word;
>> +	irq = alloc_irq();
>> +	assert(irq);
>> +
>> +	irq->sid = lowcore_ptr->subsys_id_word;
>>   	/* Lowlevel set the SID as interrupt parameter. */
>> -	if (lowcore_ptr->io_int_param != sid) {
>> +	if (lowcore_ptr->io_int_param != irq->sid) {
>>   		report(0,
>>   		       "io_int_param: %x differs from subsys_id_word: %x",
>> -		       lowcore_ptr->io_int_param, sid);
>> +		       lowcore_ptr->io_int_param, irq->sid);
>>   		goto pop;
>>   	}
>>   	report_prefix_pop();
>>   
>>   	report_prefix_push("tsch");
>> -	ret = tsch(sid, &irb);
>> +	ret = tsch(irq->sid, &irq->irb);
>>   	switch (ret) {
>>   	case 1:
>> -		dump_irb(&irb);
>> -		flags = dump_scsw_flags(irb.scsw.ctrl);
>> -		report(0,
>> -		       "I/O interrupt, but tsch returns CC 1 for subchannel %08x. SCSW flags: %s",
>> -		       sid, flags);
>> +		report_info("no status pending on %08x : %s", irq->sid,
>> +			    dump_scsw_flags(irq->irb.scsw.ctrl));
> 
> This is not what you are looking at here, though?
> 
> The problem is that the hypervisor gave you cc 1 (stored, not status
> pending) while you just got an interrupt; the previous message logged
> that, while the new one does not. (The scsw flags are still
> interesting, as it gives further information about the mismatch.)

I can keep the old message.
How ever I do not think it is a reason to report a failure.
Do you agree with replaacing report(0,) with report_info()

> 
>>   		break;
>>   	case 2:
>>   		report(0, "tsch returns unexpected CC 2");
>>   		break;
>>   	case 3:
>> -		report(0, "tsch reporting sch %08x as not operational", sid);
>> +		report(0, "tsch reporting sch %08x as not operational", irq->sid);
>>   		break;
>>   	case 0:
>>   		/* Stay humble on success */
>> +		save_irq(irq);
>>   		break;
>>   	}
>>   pop:
>> @@ -498,47 +540,70 @@ struct ccw1 *ccw_alloc(int code, void *data, int count, unsigned char flags)
>>   int wait_and_check_io_completion(int schid)
>>   {
>>   	int ret = 0;
>> -
>> -	wait_for_interrupt(PSW_MASK_IO);
>> +	struct irq_entry *irq = NULL;
>>   
>>   	report_prefix_push("check I/O completion");
>>   
>> -	if (lowcore_ptr->io_int_param != schid) {
>> +	disable_io_irq();
>> +	irq = get_irq();
>> +	while (!irq) {
>> +		wait_for_interrupt(PSW_MASK_IO);
>> +		disable_io_irq();
> 
> Isn't the disable_io_irq() redundant here?

No because wait for interrupt re-enable the interrupts
I will add a comment

> 
> (In general, I'm a bit confused about the I/O interrupt handling here.
> Might need to read through the whole thing again.)
> 
>> +		irq = get_irq();
>> +		report_info("next try");
>> +	}
>> +	enable_io_irq();
>> +
>> +	assert(irq);
>> +
>> +	if (irq->sid != schid) {
>>   		report(0, "interrupt parameter: expected %08x got %08x",
>> -		       schid, lowcore_ptr->io_int_param);
>> +		       schid, irq->sid);
>>   		ret = -1;
>>   		goto end;
> 
> You're still expecting that there's only one subchannel enabled for I/O
> interrupts at the same time, right?

Yes, I plan to introduce multiple channels later.

> 
>>   	}
>>   
>>   	/* Verify that device status is valid */
>> -	if (!(irb.scsw.ctrl & SCSW_SC_PENDING)) {
>> -		report(0, "No status pending after interrupt. Subch Ctrl: %08x",
>> -		       irb.scsw.ctrl);
>> -		ret = -1;
>> +	if (!(irq->irb.scsw.ctrl & SCSW_SC_PENDING)) {
> 
> Confused. An I/O interrupt for a subchannel that is not status pending
> is surely an issue?
> 
>> +		ret = 0;
>>   		goto end;
>>   	}
>>   
>> -	if (!(irb.scsw.ctrl & (SCSW_SC_SECONDARY | SCSW_SC_PRIMARY))) {
>> +	/* clear and halt pending are valid even without secondary or primary status */
>> +	if (irq->irb.scsw.ctrl & (SCSW_FC_CLEAR | SCSW_FC_HALT)) {
> 
> Can you factor out the new/changed checks here into a separate patch?
> Would make the change easier to follow.

yes, these changes should not belong here.
I will rewrite this all

> 
> Also, you might want to check other things for halt/clear as well?

Yes in a further patch

> 
>> +		ret = 0;
>> +		goto end;
>> +	}
>> +
>> +	/* For start pending we need at least one of primary or secondary status */
>> +	if (!(irq->irb.scsw.ctrl & (SCSW_SC_SECONDARY | SCSW_SC_PRIMARY))) {
>>   		report(0, "Primary or secondary status missing. Subch Ctrl: %08x",
>> -		       irb.scsw.ctrl);
>> +		       irq->irb.scsw.ctrl);
> 
> I'm wondering whether that is actually true. Maybe need to double check
> what happens with deferred ccs etc.

Yes,

> 
>>   		ret = -1;
>>   		goto end;
>>   	}
>>   
>> -	if (!(irb.scsw.dev_stat & (SCSW_DEVS_DEV_END | SCSW_DEVS_SCH_END))) {
>> +	/* For start pending we also need to have device or channel end information */
>> +	if (!(irq->irb.scsw.dev_stat & (SCSW_DEVS_DEV_END | SCSW_DEVS_SCH_END))) {
>>   		report(0, "No device end or sch end. Dev. status: %02x",
>> -		       irb.scsw.dev_stat);
>> +		       irq->irb.scsw.dev_stat);
> 
> Again, not sure whether that is true in any case (surely for the good
> path, and I think for unit check as well; but ISTR that there can be
> error conditions where we won't get another interrupt for the same I/O,
> but device end is not set, because the error occurred before we even
> reached the device... should those be logged?)

surely

> 
>>   		ret = -1;
>>   		goto end;
>>   	}
>>   
>> -	if (irb.scsw.sch_stat & ~SCSW_SCHS_IL) {
>> -		report_info("Unexpected Subch. status %02x", irb.scsw.sch_stat);
>> +	/* We only accept the SubCHannel Status for Illegal Length */
> 
> It's more like that we just don't deal with any of the other subchannel
> status flags, right?

OK, I will rework this completely
Thanks for the comment,

Regards,
Pierre
Cornelia Huck March 19, 2021, 4:09 p.m. UTC | #4
On Fri, 19 Mar 2021 16:55:15 +0100
Pierre Morel <pmorel@linux.ibm.com> wrote:

> On 3/19/21 12:01 PM, Cornelia Huck wrote:
> > On Thu, 18 Mar 2021 14:26:25 +0100
> > Pierre Morel <pmorel@linux.ibm.com> wrote:

> >> @@ -422,38 +464,38 @@ static struct irb irb;
> >>   void css_irq_io(void)
> >>   {
> >>   	int ret = 0;
> >> -	char *flags;
> >> -	int sid;
> >> +	struct irq_entry *irq;
> >>   
> >>   	report_prefix_push("Interrupt");
> >> -	sid = lowcore_ptr->subsys_id_word;
> >> +	irq = alloc_irq();
> >> +	assert(irq);
> >> +
> >> +	irq->sid = lowcore_ptr->subsys_id_word;
> >>   	/* Lowlevel set the SID as interrupt parameter. */
> >> -	if (lowcore_ptr->io_int_param != sid) {
> >> +	if (lowcore_ptr->io_int_param != irq->sid) {
> >>   		report(0,
> >>   		       "io_int_param: %x differs from subsys_id_word: %x",
> >> -		       lowcore_ptr->io_int_param, sid);
> >> +		       lowcore_ptr->io_int_param, irq->sid);
> >>   		goto pop;
> >>   	}
> >>   	report_prefix_pop();
> >>   
> >>   	report_prefix_push("tsch");
> >> -	ret = tsch(sid, &irb);
> >> +	ret = tsch(irq->sid, &irq->irb);
> >>   	switch (ret) {
> >>   	case 1:
> >> -		dump_irb(&irb);
> >> -		flags = dump_scsw_flags(irb.scsw.ctrl);
> >> -		report(0,
> >> -		       "I/O interrupt, but tsch returns CC 1 for subchannel %08x. SCSW flags: %s",
> >> -		       sid, flags);
> >> +		report_info("no status pending on %08x : %s", irq->sid,
> >> +			    dump_scsw_flags(irq->irb.scsw.ctrl));  
> > 
> > This is not what you are looking at here, though?
> > 
> > The problem is that the hypervisor gave you cc 1 (stored, not status
> > pending) while you just got an interrupt; the previous message logged
> > that, while the new one does not. (The scsw flags are still
> > interesting, as it gives further information about the mismatch.)  
> 
> I can keep the old message.
> How ever I do not think it is a reason to report a failure.
> Do you agree with replaacing report(0,) with report_info()

I don't really see how we could get an I/O interrupt for a subchannel
that is not status pending, unless we have other code racing with this
one that cleared the status pending already (and that would be a bug in
our test program.) Or are you aware in anything in the architecture
that could make the status pending go away again (other than the
subchannel becoming not operational?)

> 
> >   
> >>   		break;
> >>   	case 2:
> >>   		report(0, "tsch returns unexpected CC 2");
> >>   		break;
> >>   	case 3:
> >> -		report(0, "tsch reporting sch %08x as not operational", sid);
> >> +		report(0, "tsch reporting sch %08x as not operational", irq->sid);
> >>   		break;
> >>   	case 0:
> >>   		/* Stay humble on success */
> >> +		save_irq(irq);
> >>   		break;
> >>   	}
> >>   pop:
> >> @@ -498,47 +540,70 @@ struct ccw1 *ccw_alloc(int code, void *data, int count, unsigned char flags)
> >>   int wait_and_check_io_completion(int schid)
> >>   {
> >>   	int ret = 0;
> >> -
> >> -	wait_for_interrupt(PSW_MASK_IO);
> >> +	struct irq_entry *irq = NULL;
> >>   
> >>   	report_prefix_push("check I/O completion");
> >>   
> >> -	if (lowcore_ptr->io_int_param != schid) {
> >> +	disable_io_irq();
> >> +	irq = get_irq();
> >> +	while (!irq) {
> >> +		wait_for_interrupt(PSW_MASK_IO);
> >> +		disable_io_irq();  
> > 
> > Isn't the disable_io_irq() redundant here?  
> 
> No because wait for interrupt re-enable the interrupts
> I will add a comment

Hm, I thought it restored the previous status.

> 
> > 
> > (In general, I'm a bit confused about the I/O interrupt handling here.
> > Might need to read through the whole thing again.)

But also see this comment :)

> >   
> >> +		irq = get_irq();
> >> +		report_info("next try");
> >> +	}
> >> +	enable_io_irq();
Pierre Morel March 19, 2021, 4:34 p.m. UTC | #5
On 3/19/21 5:09 PM, Cornelia Huck wrote:
> On Fri, 19 Mar 2021 16:55:15 +0100
> Pierre Morel <pmorel@linux.ibm.com> wrote:
> 
>> On 3/19/21 12:01 PM, Cornelia Huck wrote:
>>> On Thu, 18 Mar 2021 14:26:25 +0100
>>> Pierre Morel <pmorel@linux.ibm.com> wrote:
> 
>>>> @@ -422,38 +464,38 @@ static struct irb irb;
>>>>    void css_irq_io(void)
>>>>    {
>>>>    	int ret = 0;
>>>> -	char *flags;
>>>> -	int sid;
>>>> +	struct irq_entry *irq;
>>>>    
>>>>    	report_prefix_push("Interrupt");
>>>> -	sid = lowcore_ptr->subsys_id_word;
>>>> +	irq = alloc_irq();
>>>> +	assert(irq);
>>>> +
>>>> +	irq->sid = lowcore_ptr->subsys_id_word;
>>>>    	/* Lowlevel set the SID as interrupt parameter. */
>>>> -	if (lowcore_ptr->io_int_param != sid) {
>>>> +	if (lowcore_ptr->io_int_param != irq->sid) {
>>>>    		report(0,
>>>>    		       "io_int_param: %x differs from subsys_id_word: %x",
>>>> -		       lowcore_ptr->io_int_param, sid);
>>>> +		       lowcore_ptr->io_int_param, irq->sid);
>>>>    		goto pop;
>>>>    	}
>>>>    	report_prefix_pop();
>>>>    
>>>>    	report_prefix_push("tsch");
>>>> -	ret = tsch(sid, &irb);
>>>> +	ret = tsch(irq->sid, &irq->irb);
>>>>    	switch (ret) {
>>>>    	case 1:
>>>> -		dump_irb(&irb);
>>>> -		flags = dump_scsw_flags(irb.scsw.ctrl);
>>>> -		report(0,
>>>> -		       "I/O interrupt, but tsch returns CC 1 for subchannel %08x. SCSW flags: %s",
>>>> -		       sid, flags);
>>>> +		report_info("no status pending on %08x : %s", irq->sid,
>>>> +			    dump_scsw_flags(irq->irb.scsw.ctrl));
>>>
>>> This is not what you are looking at here, though?
>>>
>>> The problem is that the hypervisor gave you cc 1 (stored, not status
>>> pending) while you just got an interrupt; the previous message logged
>>> that, while the new one does not. (The scsw flags are still
>>> interesting, as it gives further information about the mismatch.)
>>
>> I can keep the old message.
>> How ever I do not think it is a reason to report a failure.
>> Do you agree with replaacing report(0,) with report_info()
> 
> I don't really see how we could get an I/O interrupt for a subchannel
> that is not status pending, unless we have other code racing with this
> one that cleared the status pending already (and that would be a bug in
> our test program.) Or are you aware in anything in the architecture
> that could make the status pending go away again (other than the
> subchannel becoming not operational?)

:) no
I really messed up with this patch.
sorry, can only do better


> 
>>
>>>    
>>>>    		break;
>>>>    	case 2:
>>>>    		report(0, "tsch returns unexpected CC 2");
>>>>    		break;
>>>>    	case 3:
>>>> -		report(0, "tsch reporting sch %08x as not operational", sid);
>>>> +		report(0, "tsch reporting sch %08x as not operational", irq->sid);
>>>>    		break;
>>>>    	case 0:
>>>>    		/* Stay humble on success */
>>>> +		save_irq(irq);
>>>>    		break;
>>>>    	}
>>>>    pop:
>>>> @@ -498,47 +540,70 @@ struct ccw1 *ccw_alloc(int code, void *data, int count, unsigned char flags)
>>>>    int wait_and_check_io_completion(int schid)
>>>>    {
>>>>    	int ret = 0;
>>>> -
>>>> -	wait_for_interrupt(PSW_MASK_IO);
>>>> +	struct irq_entry *irq = NULL;
>>>>    
>>>>    	report_prefix_push("check I/O completion");
>>>>    
>>>> -	if (lowcore_ptr->io_int_param != schid) {
>>>> +	disable_io_irq();
>>>> +	irq = get_irq();
>>>> +	while (!irq) {
>>>> +		wait_for_interrupt(PSW_MASK_IO);
>>>> +		disable_io_irq();
>>>
>>> Isn't the disable_io_irq() redundant here?
>>
>> No because wait for interrupt re-enable the interrupts
>> I will add a comment
> 
> Hm, I thought it restored the previous status.
> 
>>
>>>
>>> (In general, I'm a bit confused about the I/O interrupt handling here.
>>> Might need to read through the whole thing again.)
> 
> But also see this comment :)
> 

Oh you mean the comment were it restores the psw mask.
yes,I see it now.
hum
yes, this patch is awful. really sorry

please do not lose more time I must really rework the all series.

Regards,
Pierre
diff mbox series

Patch

diff --git a/lib/s390x/css.h b/lib/s390x/css.h
index 460b0bd..65fc335 100644
--- a/lib/s390x/css.h
+++ b/lib/s390x/css.h
@@ -425,4 +425,33 @@  struct measurement_block_format1 {
 	uint32_t irq_prio_delay_time;
 };
 
+struct irq_entry {
+	struct irq_entry *next;
+	struct irb irb;
+	uint32_t sid;
+};
+
+struct irq_entry *alloc_irq(void);
+struct irq_entry *get_irq(void);
+void put_irq(struct irq_entry *irq);
+
+#include <asm/arch_def.h>
+static inline void disable_io_irq(void)
+{
+	uint64_t mask;
+
+	mask = extract_psw_mask();
+	mask &= ~PSW_MASK_IO;
+	load_psw_mask(mask);
+}
+
+static inline void enable_io_irq(void)
+{
+	uint64_t mask;
+
+	mask = extract_psw_mask();
+	mask |= PSW_MASK_IO;
+	load_psw_mask(mask);
+}
+
 #endif
diff --git a/lib/s390x/css_lib.c b/lib/s390x/css_lib.c
index f8db205..211c73c 100644
--- a/lib/s390x/css_lib.c
+++ b/lib/s390x/css_lib.c
@@ -9,6 +9,8 @@ 
  */
 #include <libcflat.h>
 #include <alloc_phys.h>
+#include <util.h>
+#include <alloc.h>
 #include <asm/page.h>
 #include <string.h>
 #include <interrupt.h>
@@ -22,6 +24,46 @@ 
 struct schib schib;
 struct chsc_scsc *chsc_scsc;
 
+static struct irq_entry *irqs;
+
+struct irq_entry *get_irq(void)
+{
+	struct irq_entry *irq = NULL;
+
+	if (irqs) {
+		irq = irqs;
+		irqs = irq->next;
+	}
+	return irq;
+}
+
+void put_irq(struct irq_entry *irq)
+{
+	free(irq);
+}
+
+static void save_irq(struct irq_entry *irq)
+{
+	struct irq_entry *e;
+
+	if (!irqs) {
+		irqs = irq;
+	} else {
+		e = irqs;
+		while (e && e->next)
+			e = e->next;
+		e->next = irq;
+	}
+}
+
+struct irq_entry *alloc_irq(void)
+{
+	struct irq_entry *irq;
+
+	irq = calloc(1, sizeof(*irq));
+	return irq;
+}
+
 static const char * const chsc_rsp_description[] = {
 	"CHSC unknown error",
 	"Command executed",
@@ -422,38 +464,38 @@  static struct irb irb;
 void css_irq_io(void)
 {
 	int ret = 0;
-	char *flags;
-	int sid;
+	struct irq_entry *irq;
 
 	report_prefix_push("Interrupt");
-	sid = lowcore_ptr->subsys_id_word;
+	irq = alloc_irq();
+	assert(irq);
+
+	irq->sid = lowcore_ptr->subsys_id_word;
 	/* Lowlevel set the SID as interrupt parameter. */
-	if (lowcore_ptr->io_int_param != sid) {
+	if (lowcore_ptr->io_int_param != irq->sid) {
 		report(0,
 		       "io_int_param: %x differs from subsys_id_word: %x",
-		       lowcore_ptr->io_int_param, sid);
+		       lowcore_ptr->io_int_param, irq->sid);
 		goto pop;
 	}
 	report_prefix_pop();
 
 	report_prefix_push("tsch");
-	ret = tsch(sid, &irb);
+	ret = tsch(irq->sid, &irq->irb);
 	switch (ret) {
 	case 1:
-		dump_irb(&irb);
-		flags = dump_scsw_flags(irb.scsw.ctrl);
-		report(0,
-		       "I/O interrupt, but tsch returns CC 1 for subchannel %08x. SCSW flags: %s",
-		       sid, flags);
+		report_info("no status pending on %08x : %s", irq->sid,
+			    dump_scsw_flags(irq->irb.scsw.ctrl));
 		break;
 	case 2:
 		report(0, "tsch returns unexpected CC 2");
 		break;
 	case 3:
-		report(0, "tsch reporting sch %08x as not operational", sid);
+		report(0, "tsch reporting sch %08x as not operational", irq->sid);
 		break;
 	case 0:
 		/* Stay humble on success */
+		save_irq(irq);
 		break;
 	}
 pop:
@@ -498,47 +540,70 @@  struct ccw1 *ccw_alloc(int code, void *data, int count, unsigned char flags)
 int wait_and_check_io_completion(int schid)
 {
 	int ret = 0;
-
-	wait_for_interrupt(PSW_MASK_IO);
+	struct irq_entry *irq = NULL;
 
 	report_prefix_push("check I/O completion");
 
-	if (lowcore_ptr->io_int_param != schid) {
+	disable_io_irq();
+	irq = get_irq();
+	while (!irq) {
+		wait_for_interrupt(PSW_MASK_IO);
+		disable_io_irq();
+		irq = get_irq();
+		report_info("next try");
+	}
+	enable_io_irq();
+
+	assert(irq);
+
+	if (irq->sid != schid) {
 		report(0, "interrupt parameter: expected %08x got %08x",
-		       schid, lowcore_ptr->io_int_param);
+		       schid, irq->sid);
 		ret = -1;
 		goto end;
 	}
 
 	/* Verify that device status is valid */
-	if (!(irb.scsw.ctrl & SCSW_SC_PENDING)) {
-		report(0, "No status pending after interrupt. Subch Ctrl: %08x",
-		       irb.scsw.ctrl);
-		ret = -1;
+	if (!(irq->irb.scsw.ctrl & SCSW_SC_PENDING)) {
+		ret = 0;
 		goto end;
 	}
 
-	if (!(irb.scsw.ctrl & (SCSW_SC_SECONDARY | SCSW_SC_PRIMARY))) {
+	/* clear and halt pending are valid even without secondary or primary status */
+	if (irq->irb.scsw.ctrl & (SCSW_FC_CLEAR | SCSW_FC_HALT)) {
+		ret = 0;
+		goto end;
+	}
+
+	/* For start pending we need at least one of primary or secondary status */
+	if (!(irq->irb.scsw.ctrl & (SCSW_SC_SECONDARY | SCSW_SC_PRIMARY))) {
 		report(0, "Primary or secondary status missing. Subch Ctrl: %08x",
-		       irb.scsw.ctrl);
+		       irq->irb.scsw.ctrl);
 		ret = -1;
 		goto end;
 	}
 
-	if (!(irb.scsw.dev_stat & (SCSW_DEVS_DEV_END | SCSW_DEVS_SCH_END))) {
+	/* For start pending we also need to have device or channel end information */
+	if (!(irq->irb.scsw.dev_stat & (SCSW_DEVS_DEV_END | SCSW_DEVS_SCH_END))) {
 		report(0, "No device end or sch end. Dev. status: %02x",
-		       irb.scsw.dev_stat);
+		       irq->irb.scsw.dev_stat);
 		ret = -1;
 		goto end;
 	}
 
-	if (irb.scsw.sch_stat & ~SCSW_SCHS_IL) {
-		report_info("Unexpected Subch. status %02x", irb.scsw.sch_stat);
+	/* We only accept the SubCHannel Status for Illegal Length */
+	if (irq->irb.scsw.sch_stat & ~SCSW_SCHS_IL) {
+		report_info("Unexpected Subch. status %02x",
+			    irq->irb.scsw.sch_stat);
 		ret = -1;
 		goto end;
 	}
 
 end:
+	if (ret)
+		dump_irb(&irq->irb);
+
+	put_irq(irq);
 	report_prefix_pop();
 	return ret;
 }