diff mbox series

hw/scsi/lsi53c895a: add hack to prevent scsi timeouts in HP-UX 10.20

Message ID 20240228211149.1533426-1-svens@stackframe.org (mailing list archive)
State New, archived
Headers show
Series hw/scsi/lsi53c895a: add hack to prevent scsi timeouts in HP-UX 10.20 | expand

Commit Message

Sven Schnelle Feb. 28, 2024, 9:11 p.m. UTC
HP-UX 10.20 seems to make the lsi53c895a spinning on a memory location
under certain circumstances. As the SCSI controller and CPU are not
running at the same time this loop will never finish. After some
time, the check loop interrupts with a unexpected device disconnect.
This works, but is slow because the kernel resets the scsi controller.
Instead of signaling UDC, add an option 'hpux-spin-workaround' which
emulates a INTERRUPT 2 script instruction. This instruction tells the
kernel that the request was fulfilled. With this change, SCSI speeds
improves significantly.

The option can be enabled by adding

-global lsi53c895a.hpux-spin-workaround=on

to the qemu commandline.

Signed-off-by: Sven Schnelle <svens@stackframe.org>
---
 hw/scsi/lsi53c895a.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

Comments

Peter Maydell Feb. 29, 2024, 9:36 a.m. UTC | #1
On Wed, 28 Feb 2024 at 21:12, Sven Schnelle <svens@stackframe.org> wrote:
>
> HP-UX 10.20 seems to make the lsi53c895a spinning on a memory location
> under certain circumstances. As the SCSI controller and CPU are not
> running at the same time this loop will never finish. After some
> time, the check loop interrupts with a unexpected device disconnect.
> This works, but is slow because the kernel resets the scsi controller.
> Instead of signaling UDC, add an option 'hpux-spin-workaround' which
> emulates a INTERRUPT 2 script instruction. This instruction tells the
> kernel that the request was fulfilled. With this change, SCSI speeds
> improves significantly.
>
> The option can be enabled by adding
>
> -global lsi53c895a.hpux-spin-workaround=on
>
> to the qemu commandline.
>
> Signed-off-by: Sven Schnelle <svens@stackframe.org>
> ---
>  hw/scsi/lsi53c895a.c | 20 ++++++++++++++++++--
>  1 file changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
> index d607a5f9fb..20c353f594 100644
> --- a/hw/scsi/lsi53c895a.c
> +++ b/hw/scsi/lsi53c895a.c
> @@ -304,6 +304,7 @@ struct LSIState {
>      uint32_t adder;
>
>      uint8_t script_ram[2048 * sizeof(uint32_t)];
> +    bool hpux_spin_workaround;
>  };
>
>  #define TYPE_LSI53C810  "lsi53c810"
> @@ -1156,8 +1157,17 @@ again:
>              qemu_log_mask(LOG_GUEST_ERROR,
>                            "lsi_scsi: inf. loop with UDC masked");
>          }
> -        lsi_script_scsi_interrupt(s, LSI_SIST0_UDC, 0);
> -        lsi_disconnect(s);
> +        if (s->hpux_spin_workaround) {
> +            /*
> +             * Workaround for HP-UX 10.20: Instead of disconnecting, which
> +             * causes a long delay, emulate a INTERRUPT 2 instruction.
> +             */
> +            s->dsps = 2;
> +            lsi_script_dma_interrupt(s, LSI_DSTAT_SIR);
> +        } else {
> +            lsi_script_scsi_interrupt(s, LSI_SIST0_UDC, 0);
> +            lsi_disconnect(s);
> +        }
>          trace_lsi_execute_script_stop();
>          reentrancy_level--;
>          return;


I see we already have a hacky workaround for other OSes
that do something similar. The ideal fix for both of these
I think would be for lsi_execute_script() to, instead of stopping,
arrange to defer executing more script instructions until
after the guest has had a chance to run a bit more.
I think setting a timer that calls lsi_resume_script() after
a while would have that effect.

-- PMM
Sven Schnelle Feb. 29, 2024, 4:54 p.m. UTC | #2
Peter Maydell <peter.maydell@linaro.org> writes:

> On Wed, 28 Feb 2024 at 21:12, Sven Schnelle <svens@stackframe.org> wrote:
>>
>> HP-UX 10.20 seems to make the lsi53c895a spinning on a memory location
>> under certain circumstances. As the SCSI controller and CPU are not
>> running at the same time this loop will never finish. After some
>> time, the check loop interrupts with a unexpected device disconnect.
>> This works, but is slow because the kernel resets the scsi controller.
>> Instead of signaling UDC, add an option 'hpux-spin-workaround' which
>> emulates a INTERRUPT 2 script instruction. This instruction tells the
>> kernel that the request was fulfilled. With this change, SCSI speeds
>> improves significantly.
>> [..]
> I see we already have a hacky workaround for other OSes
> that do something similar. The ideal fix for both of these
> I think would be for lsi_execute_script() to, instead of stopping,
> arrange to defer executing more script instructions until
> after the guest has had a chance to run a bit more.
> I think setting a timer that calls lsi_resume_script() after
> a while would have that effect.

Thanks, good idea. So something like this?

diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
index d607a5f9fb..9931799d44 100644
--- a/hw/scsi/lsi53c895a.c
+++ b/hw/scsi/lsi53c895a.c
@@ -188,7 +188,7 @@ static const char *names[] = {
 #define LSI_TAG_VALID     (1 << 16)
 
 /* Maximum instructions to process. */
-#define LSI_MAX_INSN    10000
+#define LSI_MAX_INSN    1000
 
 typedef struct lsi_request {
     SCSIRequest *req;
@@ -205,6 +205,7 @@ enum {
     LSI_WAIT_RESELECT, /* Wait Reselect instruction has been issued */
     LSI_DMA_SCRIPTS, /* processing DMA from lsi_execute_script */
     LSI_DMA_IN_PROGRESS, /* DMA operation is in progress */
+    LSI_WAIT_SCRIPTS, /* SCRIPTS where stopped because of instruction count limit */
 };
 
 enum {
@@ -224,6 +225,7 @@ struct LSIState {
     MemoryRegion ram_io;
     MemoryRegion io_io;
     AddressSpace pci_io_as;
+    QEMUTimer *scripts_timer;
 
     int carry; /* ??? Should this be an a visible register somewhere?  */
     int status;
@@ -1152,13 +1154,9 @@ again:
      * which should be enough for all valid use cases).
      */
     if (++insn_processed > LSI_MAX_INSN || reentrancy_level > 8) {
-        if (!(s->sien0 & LSI_SIST0_UDC)) {
-            qemu_log_mask(LOG_GUEST_ERROR,
-                          "lsi_scsi: inf. loop with UDC masked");
-        }
-        lsi_script_scsi_interrupt(s, LSI_SIST0_UDC, 0);
-        lsi_disconnect(s);
-        trace_lsi_execute_script_stop();
+        s->waiting = LSI_WAIT_SCRIPTS;
+        timer_mod(s->scripts_timer, qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + 500);
+        trace_lsi_scripts_timer_arm();
         reentrancy_level--;
         return;
     }
@@ -2294,6 +2292,17 @@ static const struct SCSIBusInfo lsi_scsi_info = {
     .cancel = lsi_request_cancelled
 };
 
+static void scripts_timer_cb(void *opaque)
+{
+    LSIState *s = opaque;
+
+    trace_lsi_scripts_timer_triggered();
+    if (s->waiting == LSI_WAIT_SCRIPTS) {
+        s->waiting = 0;
+        lsi_execute_script(s);
+    }
+}
+
 static void lsi_scsi_realize(PCIDevice *dev, Error **errp)
 {
     LSIState *s = LSI53C895A(dev);
@@ -2313,6 +2322,7 @@ static void lsi_scsi_realize(PCIDevice *dev, Error **errp)
                           "lsi-ram", 0x2000);
     memory_region_init_io(&s->io_io, OBJECT(s), &lsi_io_ops, s,
                           "lsi-io", 256);
+    s->scripts_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, scripts_timer_cb, s);
 
     /*
      * Since we use the address-space API to interact with ram_io, disable the
diff --git a/hw/scsi/trace-events b/hw/scsi/trace-events
index d72f741ed8..4456a08ab0 100644
--- a/hw/scsi/trace-events
+++ b/hw/scsi/trace-events
@@ -302,6 +302,8 @@ lsi_execute_script_stop(void) "SCRIPTS execution stopped"
 lsi_awoken(void) "Woken by SIGP"
 lsi_reg_read(const char *name, int offset, uint8_t ret) "Read reg %s 0x%x = 0x%02x"
 lsi_reg_write(const char *name, int offset, uint8_t val) "Write reg %s 0x%x = 0x%02x"
+lsi_scripts_timer_triggered(void) "SCRIPTS timer triggered"
+lsi_scripts_timer_arm(void) "SCRIPTS timer armed"
 
 # virtio-scsi.c
 virtio_scsi_cmd_req(int lun, uint32_t tag, uint8_t cmd) "virtio_scsi_cmd_req lun=%u tag=0x%x cmd=0x%x"
Peter Maydell Feb. 29, 2024, 5:26 p.m. UTC | #3
On Thu, 29 Feb 2024 at 16:54, Sven Schnelle <svens@stackframe.org> wrote:
>
> Peter Maydell <peter.maydell@linaro.org> writes:
>
> > On Wed, 28 Feb 2024 at 21:12, Sven Schnelle <svens@stackframe.org> wrote:
> >>
> >> HP-UX 10.20 seems to make the lsi53c895a spinning on a memory location
> >> under certain circumstances. As the SCSI controller and CPU are not
> >> running at the same time this loop will never finish. After some
> >> time, the check loop interrupts with a unexpected device disconnect.
> >> This works, but is slow because the kernel resets the scsi controller.
> >> Instead of signaling UDC, add an option 'hpux-spin-workaround' which
> >> emulates a INTERRUPT 2 script instruction. This instruction tells the
> >> kernel that the request was fulfilled. With this change, SCSI speeds
> >> improves significantly.
> >> [..]
> > I see we already have a hacky workaround for other OSes
> > that do something similar. The ideal fix for both of these
> > I think would be for lsi_execute_script() to, instead of stopping,
> > arrange to defer executing more script instructions until
> > after the guest has had a chance to run a bit more.
> > I think setting a timer that calls lsi_resume_script() after
> > a while would have that effect.
>
> Thanks, good idea. So something like this?

Yeah, something like that I think. (You probably want to delete
the timer on reset, and you need to handle migration -- you can
either put the timer state in a new vmstate section, or else
in post-load if the state is 'LSI_WAIT_SCRIPTS' arm the timer.)

Does it work? :-)

-- PMM
Sven Schnelle Feb. 29, 2024, 6:01 p.m. UTC | #4
Peter Maydell <peter.maydell@linaro.org> writes:

> On Thu, 29 Feb 2024 at 16:54, Sven Schnelle <svens@stackframe.org> wrote:
>>
>> Peter Maydell <peter.maydell@linaro.org> writes:
>>
>> > On Wed, 28 Feb 2024 at 21:12, Sven Schnelle <svens@stackframe.org> wrote:
>> >>
>> >> HP-UX 10.20 seems to make the lsi53c895a spinning on a memory location
>> >> under certain circumstances. As the SCSI controller and CPU are not
>> >> running at the same time this loop will never finish. After some
>> >> time, the check loop interrupts with a unexpected device disconnect.
>> >> This works, but is slow because the kernel resets the scsi controller.
>> >> Instead of signaling UDC, add an option 'hpux-spin-workaround' which
>> >> emulates a INTERRUPT 2 script instruction. This instruction tells the
>> >> kernel that the request was fulfilled. With this change, SCSI speeds
>> >> improves significantly.
>> >> [..]
>> > I see we already have a hacky workaround for other OSes
>> > that do something similar. The ideal fix for both of these
>> > I think would be for lsi_execute_script() to, instead of stopping,
>> > arrange to defer executing more script instructions until
>> > after the guest has had a chance to run a bit more.
>> > I think setting a timer that calls lsi_resume_script() after
>> > a while would have that effect.
>>
>> Thanks, good idea. So something like this?
>
> Yeah, something like that I think. (You probably want to delete
> the timer on reset, and you need to handle migration -- you can
> either put the timer state in a new vmstate section, or else
> in post-load if the state is 'LSI_WAIT_SCRIPTS' arm the timer.)

> Does it work? :-)

Yes it does. I only did a quick hack without caring about things like
reset, freeing the timer and migration. I successfully booted HP-UX 10.20
without any scsi errors in the dmesg. So i think this is much better
than the commandline option. I'll prepare a proper patch in the next
days and submit it.

Thanks!
Sven
diff mbox series

Patch

diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
index d607a5f9fb..20c353f594 100644
--- a/hw/scsi/lsi53c895a.c
+++ b/hw/scsi/lsi53c895a.c
@@ -304,6 +304,7 @@  struct LSIState {
     uint32_t adder;
 
     uint8_t script_ram[2048 * sizeof(uint32_t)];
+    bool hpux_spin_workaround;
 };
 
 #define TYPE_LSI53C810  "lsi53c810"
@@ -1156,8 +1157,17 @@  again:
             qemu_log_mask(LOG_GUEST_ERROR,
                           "lsi_scsi: inf. loop with UDC masked");
         }
-        lsi_script_scsi_interrupt(s, LSI_SIST0_UDC, 0);
-        lsi_disconnect(s);
+        if (s->hpux_spin_workaround) {
+            /*
+             * Workaround for HP-UX 10.20: Instead of disconnecting, which
+             * causes a long delay, emulate a INTERRUPT 2 instruction.
+             */
+            s->dsps = 2;
+            lsi_script_dma_interrupt(s, LSI_DSTAT_SIR);
+        } else {
+            lsi_script_scsi_interrupt(s, LSI_SIST0_UDC, 0);
+            lsi_disconnect(s);
+        }
         trace_lsi_execute_script_stop();
         reentrancy_level--;
         return;
@@ -2339,6 +2349,11 @@  static void lsi_scsi_exit(PCIDevice *dev)
     address_space_destroy(&s->pci_io_as);
 }
 
+static Property lsi_props[] = {
+    DEFINE_PROP_BOOL("hpux-spin-workaround", LSIState, hpux_spin_workaround,
+                     false),
+};
+
 static void lsi_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
@@ -2353,6 +2368,7 @@  static void lsi_class_init(ObjectClass *klass, void *data)
     dc->reset = lsi_scsi_reset;
     dc->vmsd = &vmstate_lsi_scsi;
     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+    device_class_set_props(dc, lsi_props);
 }
 
 static const TypeInfo lsi_info = {