diff mbox series

[kvm-unit-tests,v3,3/4] s390x: add extint loop test

Message ID 20220722060043.733796-4-nrb@linux.ibm.com (mailing list archive)
State Superseded, archived
Headers show
Series Add panic test support | expand

Commit Message

Nico Boehr July 22, 2022, 6 a.m. UTC
The CPU timer interrupt stays pending as long as the CPU timer value is
negative. This can lead to interruption loops when the ext_new_psw mask
has external interrupts enabled.

QEMU is able to detect this situation and panic the guest, so add a test
for it.

Signed-off-by: Nico Boehr <nrb@linux.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
---
 s390x/Makefile            |  1 +
 s390x/panic-loop-extint.c | 60 +++++++++++++++++++++++++++++++++++++++
 s390x/unittests.cfg       |  6 ++++
 3 files changed, 67 insertions(+)
 create mode 100644 s390x/panic-loop-extint.c

Comments

Claudio Imbrenda Aug. 10, 2022, 10:08 a.m. UTC | #1
On Fri, 22 Jul 2022 08:00:42 +0200
Nico Boehr <nrb@linux.ibm.com> wrote:

> The CPU timer interrupt stays pending as long as the CPU timer value is
> negative. This can lead to interruption loops when the ext_new_psw mask
> has external interrupts enabled.
> 
> QEMU is able to detect this situation and panic the guest, so add a test
> for it.
> 
> Signed-off-by: Nico Boehr <nrb@linux.ibm.com>
> Reviewed-by: Thomas Huth <thuth@redhat.com>
> ---
>  s390x/Makefile            |  1 +
>  s390x/panic-loop-extint.c | 60 +++++++++++++++++++++++++++++++++++++++
>  s390x/unittests.cfg       |  6 ++++
>  3 files changed, 67 insertions(+)
>  create mode 100644 s390x/panic-loop-extint.c
> 
> diff --git a/s390x/Makefile b/s390x/Makefile
> index efd5e0c13102..e4649da50d9d 100644
> --- a/s390x/Makefile
> +++ b/s390x/Makefile
> @@ -34,6 +34,7 @@ tests += $(TEST_DIR)/migration.elf
>  tests += $(TEST_DIR)/pv-attest.elf
>  tests += $(TEST_DIR)/migration-cmm.elf
>  tests += $(TEST_DIR)/migration-skey.elf
> +tests += $(TEST_DIR)/panic-loop-extint.elf
>  
>  pv-tests += $(TEST_DIR)/pv-diags.elf
>  
> diff --git a/s390x/panic-loop-extint.c b/s390x/panic-loop-extint.c
> new file mode 100644
> index 000000000000..d3a3f06d9a34
> --- /dev/null
> +++ b/s390x/panic-loop-extint.c
> @@ -0,0 +1,60 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * External interrupt loop test
> + *
> + * Copyright IBM Corp. 2022
> + *
> + * Authors:
> + *  Nico Boehr <nrb@linux.ibm.com>
> + */
> +#include <libcflat.h>
> +#include <asm/interrupt.h>
> +#include <asm/barrier.h>
> +#include <asm/time.h>
> +#include <hardware.h>
> +
> +static void ext_int_handler(void)
> +{
> +	/*
> +	 * return to ext_old_psw. This gives us the chance to print the return_fail
> +	 * in case something goes wrong.
> +	 */
> +	asm volatile (
> +		"lpswe %[ext_old_psw]\n"
> +		:
> +		: [ext_old_psw] "Q"(lowcore.ext_old_psw)
> +		: "memory"
> +	);
> +}

why should ext_old_psw contain a good PSW? wouldn't it contain the
PSW at the time of the interrupt? (which in this case is the new PSW)

but this should never happen anyway, right?

> +
> +int main(void)
> +{
> +	struct psw ext_new_psw_orig;
> +
> +	report_prefix_push("panic-loop-extint");
> +
> +	if (!host_is_qemu() || host_is_tcg()) {
> +		report_skip("QEMU-KVM-only test");
> +		goto out;
> +	}
> +
> +	ext_new_psw_orig = lowcore.ext_new_psw;
> +	lowcore.ext_new_psw.addr = (uint64_t)ext_int_handler;
> +	lowcore.ext_new_psw.mask |= PSW_MASK_EXT;
> +
> +	load_psw_mask(extract_psw_mask() | PSW_MASK_EXT);
> +	ctl_set_bit(0, CTL0_CLOCK_COMPARATOR);
> +
> +	cpu_timer_set_ms(1);
> +
> +	mdelay(2000);
> +
> +	/* restore previous ext_new_psw so QEMU can properly terminate */
> +	lowcore.ext_new_psw = ext_new_psw_orig;
> +
> +	report_fail("survived extint loop");
> +
> +out:
> +	report_prefix_pop();
> +	return report_summary();
> +}
> diff --git a/s390x/unittests.cfg b/s390x/unittests.cfg
> index f7b1fc3dbca1..b1b25f118ff6 100644
> --- a/s390x/unittests.cfg
> +++ b/s390x/unittests.cfg
> @@ -185,3 +185,9 @@ groups = migration
>  [migration-skey]
>  file = migration-skey.elf
>  groups = migration
> +
> +[panic-loop-extint]
> +file = panic-loop-extint.elf
> +groups = panic
> +accel = kvm
> +timeout = 5
Nico Boehr Aug. 10, 2022, 12:29 p.m. UTC | #2
Quoting Claudio Imbrenda (2022-08-10 12:08:22)
> > diff --git a/s390x/panic-loop-extint.c b/s390x/panic-loop-extint.c
> > new file mode 100644
> > index 000000000000..d3a3f06d9a34
> > --- /dev/null
> > +++ b/s390x/panic-loop-extint.c
[...]
> > +static void ext_int_handler(void)
> > +{
> > +     /*
> > +      * return to ext_old_psw. This gives us the chance to print the return_fail
> > +      * in case something goes wrong.
> > +      */
> > +     asm volatile (
> > +             "lpswe %[ext_old_psw]\n"
> > +             :
> > +             : [ext_old_psw] "Q"(lowcore.ext_old_psw)
> > +             : "memory"
> > +     );
> > +}
> 
> why should ext_old_psw contain a good PSW? wouldn't it contain the
> PSW at the time of the interrupt? (which in this case is the new PSW)

That's right in case the interrupt loop occurs, it doesn't make much sense to return to ext_old_psw. But in this case lpswe will never be executed anyways.

> but this should never happen anyway, right?

Exactly, this is just supposed to be a safety net in case the interrupt loop doesn't happen. If you want, we could remove it and just leave an empty function here. Then, we will just run into the kvm-unit-tests timeout and fail as well, but I prefer the fail fast.
Claudio Imbrenda Aug. 10, 2022, 1:13 p.m. UTC | #3
On Wed, 10 Aug 2022 14:29:27 +0200
Nico Boehr <nrb@linux.ibm.com> wrote:

> Quoting Claudio Imbrenda (2022-08-10 12:08:22)
> > > diff --git a/s390x/panic-loop-extint.c b/s390x/panic-loop-extint.c
> > > new file mode 100644
> > > index 000000000000..d3a3f06d9a34
> > > --- /dev/null
> > > +++ b/s390x/panic-loop-extint.c  
> [...]
> > > +static void ext_int_handler(void)
> > > +{
> > > +     /*
> > > +      * return to ext_old_psw. This gives us the chance to print the return_fail
> > > +      * in case something goes wrong.
> > > +      */
> > > +     asm volatile (
> > > +             "lpswe %[ext_old_psw]\n"
> > > +             :
> > > +             : [ext_old_psw] "Q"(lowcore.ext_old_psw)
> > > +             : "memory"
> > > +     );
> > > +}  
> > 
> > why should ext_old_psw contain a good PSW? wouldn't it contain the
> > PSW at the time of the interrupt? (which in this case is the new PSW)  
> 
> That's right in case the interrupt loop occurs, it doesn't make much sense to return to ext_old_psw. But in this case lpswe will never be executed anyways.
> 
> > but this should never happen anyway, right?  
> 
> Exactly, this is just supposed to be a safety net in case the interrupt loop doesn't happen. If you want, we could remove it and just leave an empty function here. Then, we will just run into the kvm-unit-tests timeout and fail as well, but I prefer the fail fast.

just add a comment to explain that
Nico Boehr Aug. 11, 2022, 7:54 a.m. UTC | #4
Quoting Claudio Imbrenda (2022-08-10 15:13:02)
[...]
> just add a comment to explain that

Yes makes sense, thanks. I came up with this:

/*
 * When the interrupt loop occurs, this instruction will never be
 * executed.
 * In case the loop isn't triggered return to pgm_old_psw so we can fail
 * fast and don't have to rely on the kvm-unit-tests timeout.
 */
Claudio Imbrenda Aug. 11, 2022, 8:37 a.m. UTC | #5
On Thu, 11 Aug 2022 09:54:37 +0200
Nico Boehr <nrb@linux.ibm.com> wrote:

> Quoting Claudio Imbrenda (2022-08-10 15:13:02)
> [...]
> > just add a comment to explain that  
> 
> Yes makes sense, thanks. I came up with this:
> 
> /*
>  * When the interrupt loop occurs, this instruction will never be
>  * executed.
>  * In case the loop isn't triggered return to pgm_old_psw so we can fail
>  * fast and don't have to rely on the kvm-unit-tests timeout.
>  */

lgtm
Nico Boehr Aug. 11, 2022, 4:46 p.m. UTC | #6
Quoting Claudio Imbrenda (2022-08-10 12:08:22)
[...]
> > diff --git a/s390x/panic-loop-extint.c b/s390x/panic-loop-extint.c
> > new file mode 100644
> > index 000000000000..d3a3f06d9a34
> > --- /dev/null
> > +++ b/s390x/panic-loop-extint.c
[...]
> > +static void ext_int_handler(void)
> > +{
> > +     /*
> > +      * return to ext_old_psw. This gives us the chance to print the return_fail
> > +      * in case something goes wrong.
> > +      */
> > +     asm volatile (
> > +             "lpswe %[ext_old_psw]\n"
> > +             :
> > +             : [ext_old_psw] "Q"(lowcore.ext_old_psw)
> > +             : "memory"
> > +     );
> > +}
> 
> why should ext_old_psw contain a good PSW? wouldn't it contain the
> PSW at the time of the interrupt? (which in this case is the new PSW)
> 
> but this should never happen anyway, right?

Well, after your remark, I thought about this a little more and found several issues with my implementation:

- I enabled the clock comparator subclass mask, but set the CPU timer. The test also works with the clock comparator since it also stays pending. Doesn't really matter which one to use as long as you stay consistent. :-)
- returning to ext_old_psw is not enough, since the CPU timer subclass mask is still enabled and the CPU timer is negative. This means the CPU timer will fire once we enable external interruptions and hence impede printing the report_fail()
- the whole lpswe is redundant, since the default kvm-unit-test handler already does that. I will refactor this to leave the default kut handler in place, which is much nicer and safer.

So upcoming version will refactor this test a bit and rely on the default kvm-unit-test int handler and the nice new register_ext_cleanup_func().
diff mbox series

Patch

diff --git a/s390x/Makefile b/s390x/Makefile
index efd5e0c13102..e4649da50d9d 100644
--- a/s390x/Makefile
+++ b/s390x/Makefile
@@ -34,6 +34,7 @@  tests += $(TEST_DIR)/migration.elf
 tests += $(TEST_DIR)/pv-attest.elf
 tests += $(TEST_DIR)/migration-cmm.elf
 tests += $(TEST_DIR)/migration-skey.elf
+tests += $(TEST_DIR)/panic-loop-extint.elf
 
 pv-tests += $(TEST_DIR)/pv-diags.elf
 
diff --git a/s390x/panic-loop-extint.c b/s390x/panic-loop-extint.c
new file mode 100644
index 000000000000..d3a3f06d9a34
--- /dev/null
+++ b/s390x/panic-loop-extint.c
@@ -0,0 +1,60 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * External interrupt loop test
+ *
+ * Copyright IBM Corp. 2022
+ *
+ * Authors:
+ *  Nico Boehr <nrb@linux.ibm.com>
+ */
+#include <libcflat.h>
+#include <asm/interrupt.h>
+#include <asm/barrier.h>
+#include <asm/time.h>
+#include <hardware.h>
+
+static void ext_int_handler(void)
+{
+	/*
+	 * return to ext_old_psw. This gives us the chance to print the return_fail
+	 * in case something goes wrong.
+	 */
+	asm volatile (
+		"lpswe %[ext_old_psw]\n"
+		:
+		: [ext_old_psw] "Q"(lowcore.ext_old_psw)
+		: "memory"
+	);
+}
+
+int main(void)
+{
+	struct psw ext_new_psw_orig;
+
+	report_prefix_push("panic-loop-extint");
+
+	if (!host_is_qemu() || host_is_tcg()) {
+		report_skip("QEMU-KVM-only test");
+		goto out;
+	}
+
+	ext_new_psw_orig = lowcore.ext_new_psw;
+	lowcore.ext_new_psw.addr = (uint64_t)ext_int_handler;
+	lowcore.ext_new_psw.mask |= PSW_MASK_EXT;
+
+	load_psw_mask(extract_psw_mask() | PSW_MASK_EXT);
+	ctl_set_bit(0, CTL0_CLOCK_COMPARATOR);
+
+	cpu_timer_set_ms(1);
+
+	mdelay(2000);
+
+	/* restore previous ext_new_psw so QEMU can properly terminate */
+	lowcore.ext_new_psw = ext_new_psw_orig;
+
+	report_fail("survived extint loop");
+
+out:
+	report_prefix_pop();
+	return report_summary();
+}
diff --git a/s390x/unittests.cfg b/s390x/unittests.cfg
index f7b1fc3dbca1..b1b25f118ff6 100644
--- a/s390x/unittests.cfg
+++ b/s390x/unittests.cfg
@@ -185,3 +185,9 @@  groups = migration
 [migration-skey]
 file = migration-skey.elf
 groups = migration
+
+[panic-loop-extint]
+file = panic-loop-extint.elf
+groups = panic
+accel = kvm
+timeout = 5